"""Semantic and hybrid search over Bible passages stored in PostgreSQL.

Query text is embedded with an Azure OpenAI embedding deployment and compared
against the ``embedding`` column of ``bible_passages`` using the ``<=>``
distance operator; the hybrid search additionally uses PostgreSQL full-text
ranking on the ``tsv`` column.
"""

import asyncio
import os
from typing import Dict, List

import httpx
import psycopg
from dotenv import load_dotenv
from psycopg.rows import dict_row

load_dotenv()

AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
DB_URL = os.getenv("DATABASE_URL")

EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"

# HTTP statuses treated as transient and therefore worth retrying.
_RETRYABLE_STATUS = frozenset({429, 500, 503})
# Total number of POST attempts made by get_embedding before giving up.
_MAX_ATTEMPTS = 3


async def get_embedding(text: str) -> List[float]:
    """Get embedding for a text using Azure OpenAI.

    The request is attempted up to ``_MAX_ATTEMPTS`` times. Transport-level
    failures and the statuses in ``_RETRYABLE_STATUS`` are retried with
    exponential backoff (1 s, then 2 s); any other non-200 status fails
    immediately because repeating the same request cannot succeed.

    Args:
        text: The text to embed.

    Returns:
        The embedding vector taken from ``data[0].embedding`` of the response.

    Raises:
        RuntimeError: On a non-retryable HTTP status, or when every attempt
            ended with a retryable status.
        httpx.RequestError: When every attempt failed at the transport level
            (the error from the last attempt is raised).
    """
    payload = {"input": [text]}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}
    last_error: Exception = RuntimeError("Embedding request was never attempted")

    async with httpx.AsyncClient() as client:
        for attempt in range(_MAX_ATTEMPTS):
            try:
                r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=30)
            except httpx.RequestError as exc:
                # Network/timeout problem: remember it and retry below.
                last_error = exc
            else:
                if r.status_code == 200:
                    data = r.json()
                    return data["data"][0]["embedding"]
                error = RuntimeError(f"Embedding error {r.status_code}: {r.text}")
                if r.status_code not in _RETRYABLE_STATUS:
                    # Raised outside the try so it is not swallowed and retried.
                    raise error
                last_error = error

            # Back off only when another attempt will actually follow.
            if attempt < _MAX_ATTEMPTS - 1:
                await asyncio.sleep(2 ** attempt)

    # All attempts failed; never fall through and return None implicitly.
    raise last_error


def _fetch_all(sql: str, params: tuple) -> List[Dict]:
    """Run one parameterised query on a fresh connection and return all rows as dicts."""
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()


async def _fetch_all_async(sql: str, params: tuple) -> List[Dict]:
    """Run ``_fetch_all`` in the default executor so the event loop is not blocked."""
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _fetch_all, sql, params)


async def search_bible_semantic(query: str, limit: int = 10) -> List[Dict]:
    """Search Bible using semantic similarity.

    Args:
        query: Free-text query; it is embedded with ``get_embedding``.
        limit: Maximum number of rows to return.

    Returns:
        Rows (as dicts) with ``ref``, ``book``, ``chapter``, ``verse``,
        ``text_raw`` and ``similarity`` (``1 - distance``), nearest first.
    """
    query_embedding = await get_embedding(query)

    # The explicit ::vector cast is needed because psycopg binds a Python list
    # as a float array, and pgvector's <=> operator is not defined for arrays
    # (assumes `embedding` is a pgvector column - confirm against the schema).
    sql = """
        SELECT ref, book, chapter, verse, text_raw,
               1 - (embedding <=> %s::vector) AS similarity
        FROM bible_passages
        WHERE embedding IS NOT NULL
        ORDER BY embedding <=> %s::vector
        LIMIT %s
    """
    return await _fetch_all_async(sql, (query_embedding, query_embedding, limit))


async def search_bible_hybrid(query: str, limit: int = 10) -> List[Dict]:
    """Search Bible using hybrid semantic + lexical search.

    The 100 nearest passages by embedding distance are combined with every
    passage matching ``plainto_tsquery('romanian', query)``; each result is
    scored as ``0.7 * vector_sim + 0.3 * text_rank`` (a missing component
    counts as 0).

    Args:
        query: Free-text query, used both for the embedding and for the
            full-text match.
        limit: Maximum number of rows to return.

    Returns:
        Rows (as dicts) with ``ref``, ``book``, ``chapter``, ``verse``,
        ``text_raw`` and ``combined_score``, best score first.
    """
    query_embedding = await get_embedding(query)

    # See search_bible_semantic for why the parameters are cast to ::vector.
    sql = """
        WITH vector_search AS (
            SELECT id, 1 - (embedding <=> %s::vector) AS vector_sim
            FROM bible_passages
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> %s::vector
            LIMIT 100
        ),
        text_search AS (
            SELECT id, ts_rank(tsv, plainto_tsquery('romanian', %s)) AS text_rank
            FROM bible_passages
            WHERE tsv @@ plainto_tsquery('romanian', %s)
        )
        SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
               COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
        FROM bible_passages bp
        LEFT JOIN vector_search vs ON vs.id = bp.id
        LEFT JOIN text_search ts ON ts.id = bp.id
        WHERE vs.id IS NOT NULL OR ts.id IS NOT NULL
        ORDER BY combined_score DESC
        LIMIT %s
    """
    params = (query_embedding, query_embedding, query, query, limit)
    return await _fetch_all_async(sql, params)


async def get_context_verses(book: str, chapter: int, verse: int, context_size: int = 2) -> List[Dict]:
    """Get surrounding verses for context.

    Args:
        book: Book identifier as stored in ``bible_passages.book``.
        chapter: Chapter number.
        verse: Verse number at the centre of the window.
        context_size: Number of verses to include on each side of ``verse``.

    Returns:
        Rows (as dicts) with ``ref``, ``book``, ``chapter``, ``verse`` and
        ``text_raw`` for verses ``verse - context_size`` through
        ``verse + context_size`` of the same chapter, ordered by verse.
    """
    sql = """
        SELECT ref, book, chapter, verse, text_raw
        FROM bible_passages
        WHERE book = %s AND chapter = %s
          AND verse BETWEEN %s AND %s
        ORDER BY verse
    """
    params = (book, chapter, verse - context_size, verse + context_size)
    return await _fetch_all_async(sql, params)


if __name__ == "__main__":
    async def test_search():
        """Manual smoke test: run both searches for 'dragoste' and print the hits."""
        results = await search_bible_semantic("dragoste", 5)
        print("Semantic search results for 'dragoste':")
        for result in results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (similarity: {result['similarity']:.3f})")

        print("\nHybrid search results for 'dragoste':")
        hybrid_results = await search_bible_hybrid("dragoste", 5)
        for result in hybrid_results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (score: {result['combined_score']:.3f})")

    asyncio.run(test_search())