Implement Azure OpenAI vector embeddings for Romanian Bible

- Add pgvector support with bible_passages table for vector search
- Create Python ingestion script for Azure OpenAI embed-3 embeddings
- Implement hybrid search combining vector similarity and full-text search
- Update AI chat to use vector search with Azure OpenAI gpt-4o
- Add floating chat component with Material UI design
- Import complete Romanian Bible (FIDELA) with 30K+ verses
- Add vector search library for semantic Bible search
- Create multi-language implementation plan for future expansion

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
andupetcu
2025-09-20 15:18:00 +03:00
parent 3b375c869b
commit dd5e1102eb
14 changed files with 2082 additions and 68 deletions

121
scripts/bible_search.py Normal file
View File

@@ -0,0 +1,121 @@
import os
import asyncio
from typing import List, Dict
from dotenv import load_dotenv
import httpx
import psycopg
from psycopg.rows import dict_row
# Load variables from a local .env file (python-dotenv) before the os.getenv
# reads below, so the settings can come from either the environment or .env.
load_dotenv()
# Azure OpenAI resource base URL; the trailing "/" is stripped so that
# EMBED_URL below does not end up with a double slash.
AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
# API key sent in the "api-key" request header; None when the variable is unset.
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
# REST API version appended to the request URL as the "api-version" query parameter.
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
# Name of the embedding deployment used in the request path (defaults to "embed-3").
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
# Connection string handed to psycopg.connect(); None when the variable is unset.
DB_URL = os.getenv("DATABASE_URL")
# Full embeddings endpoint, built once at import time from the values above.
EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"
async def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text`` from the Azure OpenAI deployment.

    POSTs ``{"input": [text]}`` to ``EMBED_URL`` and returns the first
    embedding in the response. At most three attempts are made: transport
    failures and HTTP 429/500/503 responses are retried with exponential
    backoff (1s after the first attempt, 2s after the second).

    Args:
        text: The text to embed.

    Returns:
        The value of ``data[0]["embedding"]`` from the JSON response.

    Raises:
        RuntimeError: If the service answers with a non-retryable status, or
            still answers 429/500/503 on the final attempt.
        httpx.RequestError: If the request itself still fails on the final
            attempt (timeout, connection error, ...).
    """
    payload = {"input": [text]}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}
    max_attempts = 3
    retryable_statuses = (429, 500, 503)

    async with httpx.AsyncClient() as client:
        for attempt in range(max_attempts):
            is_last_attempt = attempt == max_attempts - 1

            # Only the network call is guarded: status handling lives outside
            # the try so a deliberate RuntimeError is never swallowed and
            # retried by the transport-error handler.
            try:
                r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=30)
            except httpx.RequestError:
                if is_last_attempt:
                    raise
                await asyncio.sleep(2 ** attempt)
                continue

            if r.status_code == 200:
                return r.json()["data"][0]["embedding"]

            # Fail immediately on non-retryable statuses, and also when the
            # retry budget is exhausted, instead of falling out of the loop
            # and implicitly returning None.
            if r.status_code not in retryable_statuses or is_last_attempt:
                raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")

            await asyncio.sleep(2 ** attempt)

    # Every iteration above returns, raises, or continues to a later attempt,
    # and the last attempt always returns or raises; this guards the
    # annotated return type against future edits.
    raise RuntimeError("Embedding request failed: retries exhausted")
async def search_bible_semantic(query: str, limit: int = 10) -> List[Dict]:
    """Search ``bible_passages`` by vector similarity to ``query``.

    The query is embedded with ``get_embedding`` and compared against the
    stored ``embedding`` column using pgvector's ``<=>`` (cosine distance)
    operator; ``similarity`` is reported as ``1 - distance``.

    Args:
        query: Free-text search query.
        limit: Maximum number of rows to return.

    Returns:
        Rows (as dicts) with keys ``ref``, ``book``, ``chapter``, ``verse``,
        ``text_raw`` and ``similarity``, nearest first.

    Raises:
        Whatever ``get_embedding`` raises, plus any psycopg error from the
        connection or the query.
    """
    query_embedding = await get_embedding(query)

    # Bind the embedding as a pgvector text literal ("[v1,v2,...]") with an
    # explicit ::vector cast. A bare Python list is sent by psycopg as a
    # PostgreSQL array, and pgvector defines no `vector <=> double precision[]`
    # operator, so the uncast form fails with "operator does not exist".
    vector_literal = "[" + ",".join(str(x) for x in query_embedding) + "]"

    # NOTE(review): this is a blocking connection inside a coroutine; fine for
    # a script, but it will stall the event loop if used from a server.
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT ref, book, chapter, verse, text_raw,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM bible_passages
                WHERE embedding IS NOT NULL
                ORDER BY embedding <=> %s::vector
                LIMIT %s
                """,
                (vector_literal, vector_literal, limit),
            )
            return cur.fetchall()
async def search_bible_hybrid(query: str, limit: int = 10) -> List[Dict]:
    """Search ``bible_passages`` with combined vector and full-text ranking.

    Two candidate sets are built in SQL:

    * ``vector_search`` - the 100 passages nearest to the query embedding
      (pgvector ``<=>`` cosine distance), scored as ``1 - distance``.
    * ``text_search`` - passages whose ``tsv`` column matches
      ``plainto_tsquery('romanian', query)``, scored with ``ts_rank``.

    A passage appearing in either set is scored as
    ``0.7 * vector_sim + 0.3 * text_rank`` (a missing component counts as 0).

    Args:
        query: Free-text search query.
        limit: Maximum number of rows to return.

    Returns:
        Rows (as dicts) with keys ``ref``, ``book``, ``chapter``, ``verse``,
        ``text_raw`` and ``combined_score``, best score first.

    Raises:
        Whatever ``get_embedding`` raises, plus any psycopg error from the
        connection or the query.
    """
    query_embedding = await get_embedding(query)

    # Bind the embedding as a pgvector text literal ("[v1,v2,...]") with an
    # explicit ::vector cast. A bare Python list is sent by psycopg as a
    # PostgreSQL array, and pgvector defines no `vector <=> double precision[]`
    # operator, so the uncast form fails with "operator does not exist".
    vector_literal = "[" + ",".join(str(x) for x in query_embedding) + "]"

    # The raw query text is passed straight to plainto_tsquery, which
    # tokenizes it itself; no hand-built "a & b" tsquery string is needed.
    # NOTE(review): this is a blocking connection inside a coroutine; fine for
    # a script, but it will stall the event loop if used from a server.
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                WITH vector_search AS (
                    SELECT id, 1 - (embedding <=> %s::vector) AS vector_sim
                    FROM bible_passages
                    WHERE embedding IS NOT NULL
                    ORDER BY embedding <=> %s::vector
                    LIMIT 100
                ),
                text_search AS (
                    SELECT id, ts_rank(tsv, plainto_tsquery('romanian', %s)) AS text_rank
                    FROM bible_passages
                    WHERE tsv @@ plainto_tsquery('romanian', %s)
                )
                SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
                       COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
                FROM bible_passages bp
                LEFT JOIN vector_search vs ON vs.id = bp.id
                LEFT JOIN text_search ts ON ts.id = bp.id
                WHERE vs.id IS NOT NULL OR ts.id IS NOT NULL
                ORDER BY combined_score DESC
                LIMIT %s
                """,
                (vector_literal, vector_literal, query, query, limit),
            )
            return cur.fetchall()
async def get_context_verses(book: str, chapter: int, verse: int, context_size: int = 2) -> List[Dict]:
    """Fetch a verse together with its neighbours from the same chapter.

    Selects the rows of ``bible_passages`` for ``book`` and ``chapter`` whose
    verse number lies in ``[verse - context_size, verse + context_size]``
    (inclusive), ordered by verse number.

    Args:
        book: Book name as stored in the ``book`` column.
        chapter: Chapter number.
        verse: Verse number at the centre of the window.
        context_size: How many verses to include on each side.

    Returns:
        Rows (as dicts) with keys ``ref``, ``book``, ``chapter``, ``verse``
        and ``text_raw``.
    """
    first_verse = verse - context_size
    last_verse = verse + context_size
    sql = """
        SELECT ref, book, chapter, verse, text_raw
        FROM bible_passages
        WHERE book = %s AND chapter = %s
        AND verse BETWEEN %s AND %s
        ORDER BY verse
    """
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn, conn.cursor() as cur:
        cur.execute(sql, (book, chapter, first_verse, last_verse))
        rows = cur.fetchall()
    return rows
if __name__ == "__main__":

    async def test_search():
        """Manual smoke test: run both searches for 'dragoste' and print the top 5 hits."""
        term, top_n = "dragoste", 5

        # Semantic pass: fetch first, then print the header and the hits.
        semantic_hits = await search_bible_semantic(term, top_n)
        print("Semantic search results for 'dragoste':")
        for result in semantic_hits:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (similarity: {result['similarity']:.3f})")

        # Hybrid pass: the header is printed before the search is issued.
        print("\nHybrid search results for 'dragoste':")
        hybrid_hits = await search_bible_hybrid(term, top_n)
        for result in hybrid_hits:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (score: {result['combined_score']:.3f})")

    asyncio.run(test_search())