Implement Azure OpenAI vector embeddings for Romanian Bible
- Add pgvector support with bible_passages table for vector search
- Create Python ingestion script for Azure OpenAI embed-3 embeddings
- Implement hybrid search combining vector similarity and full-text search
- Update AI chat to use vector search with Azure OpenAI gpt-4o
- Add floating chat component with Material UI design
- Import complete Romanian Bible (FIDELA) with 30K+ verses
- Add vector search library for semantic Bible search
- Create multi-language implementation plan for future expansion

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
121
scripts/bible_search.py
Normal file
121
scripts/bible_search.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import os
|
||||
import asyncio
|
||||
from typing import List, Dict
|
||||
from dotenv import load_dotenv
|
||||
import httpx
|
||||
import psycopg
|
||||
from psycopg.rows import dict_row
|
||||
|
||||
# Pull variables from a local .env file (if present) into the environment
# before any of the settings below are read.
load_dotenv()

# Azure OpenAI connection settings. The endpoint's trailing slash is stripped
# so that EMBED_URL below never contains a double slash.
AZ_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.environ.get("AZURE_OPENAI_KEY")
AZ_API_VER = os.environ.get("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")

# PostgreSQL connection string handed to psycopg.connect().
DB_URL = os.environ.get("DATABASE_URL")

# Full URL of the embeddings endpoint for the configured deployment.
EMBED_URL = (
    f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}"
    f"/embeddings?api-version={AZ_API_VER}"
)
|
||||
|
||||
async def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for *text* from the Azure OpenAI deployment.

    The request is attempted up to three times. Transport errors and the
    HTTP statuses 429, 500 and 503 are retried with exponential backoff
    (1 s, then 2 s); any other non-200 status fails immediately.

    Args:
        text: The string to embed.

    Returns:
        The ``embedding`` list of the first item in the response's ``data``.

    Raises:
        RuntimeError: On a non-retryable HTTP status, or when a retryable
            status is still returned on the last attempt.
        httpx.HTTPError: When the last attempt fails at the transport level.
    """
    max_attempts = 3
    retryable_statuses = (429, 500, 503)

    payload = {"input": [text]}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}

    last_error = None
    async with httpx.AsyncClient() as client:
        for attempt in range(max_attempts):
            if attempt:
                # Back off only *between* attempts: 1 s, then 2 s.
                await asyncio.sleep(2 ** (attempt - 1))

            # Keep the try body minimal so that only transport failures are
            # treated as retryable; status handling happens outside it.
            try:
                response = await client.post(
                    EMBED_URL, headers=headers, json=payload, timeout=30
                )
            except httpx.HTTPError as exc:
                last_error = exc
                continue

            if response.status_code == 200:
                return response.json()["data"][0]["embedding"]

            last_error = RuntimeError(
                f"Embedding error {response.status_code}: {response.text}"
            )
            if response.status_code not in retryable_statuses:
                # Retrying cannot fix e.g. a 401 or 404, so fail right away.
                break

    # Either retries were exhausted or a non-retryable status was returned.
    # Raising here guarantees the function never falls through returning None.
    raise last_error
|
||||
|
||||
async def search_bible_semantic(query: str, limit: int = 10) -> List[Dict]:
    """Search Bible passages by vector similarity to *query*.

    The query is embedded with ``get_embedding`` and compared against the
    ``embedding`` column of ``bible_passages`` using pgvector's ``<=>``
    distance operator.

    Args:
        query: Free-text search query.
        limit: Maximum number of rows to return.

    Returns:
        Rows as dicts with the keys ``ref``, ``book``, ``chapter``, ``verse``,
        ``text_raw`` and ``similarity`` (``1 - distance``), closest first.

    Note:
        The database call uses the synchronous psycopg API, so it blocks the
        event loop while the query runs.
    """
    query_embedding = await get_embedding(query)

    # psycopg sends a Python list as a PostgreSQL array, so the parameter is
    # cast to ``vector`` explicitly for the ``<=>`` operator to resolve.
    sql = """
        SELECT ref, book, chapter, verse, text_raw,
               1 - (embedding <=> %(embedding)s::vector) AS similarity
        FROM bible_passages
        WHERE embedding IS NOT NULL
        ORDER BY embedding <=> %(embedding)s::vector
        LIMIT %(limit)s
    """
    params = {"embedding": query_embedding, "limit": limit}

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()
|
||||
|
||||
async def search_bible_hybrid(query: str, limit: int = 10) -> List[Dict]:
    """Search Bible passages by combining vector and full-text relevance.

    Two candidate sets are built from ``bible_passages``: the 100 rows whose
    ``embedding`` is closest to the query embedding, and every row whose
    ``tsv`` column matches ``plainto_tsquery('romanian', query)``. A row in
    either set is scored as ``0.7 * vector_sim + 0.3 * text_rank``, with a
    missing component counted as 0.

    Args:
        query: Free-text search query; passed verbatim to ``plainto_tsquery``.
        limit: Maximum number of rows to return.

    Returns:
        Rows as dicts with the keys ``ref``, ``book``, ``chapter``, ``verse``,
        ``text_raw`` and ``combined_score``, highest score first.

    Note:
        The database call uses the synchronous psycopg API, so it blocks the
        event loop while the query runs.
    """
    query_embedding = await get_embedding(query)

    # psycopg sends a Python list as a PostgreSQL array, so the parameter is
    # cast to ``vector`` explicitly for the ``<=>`` operator to resolve.
    sql = """
        WITH vector_search AS (
            SELECT id, 1 - (embedding <=> %(embedding)s::vector) AS vector_sim
            FROM bible_passages
            WHERE embedding IS NOT NULL
            ORDER BY embedding <=> %(embedding)s::vector
            LIMIT 100
        ),
        text_search AS (
            SELECT id, ts_rank(tsv, plainto_tsquery('romanian', %(query)s)) AS text_rank
            FROM bible_passages
            WHERE tsv @@ plainto_tsquery('romanian', %(query)s)
        )
        SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
               COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
        FROM bible_passages bp
        LEFT JOIN vector_search vs ON vs.id = bp.id
        LEFT JOIN text_search ts ON ts.id = bp.id
        WHERE vs.id IS NOT NULL OR ts.id IS NOT NULL
        ORDER BY combined_score DESC
        LIMIT %(limit)s
    """
    params = {"embedding": query_embedding, "query": query, "limit": limit}

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()
|
||||
|
||||
async def get_context_verses(book: str, chapter: int, verse: int, context_size: int = 2) -> List[Dict]:
    """Fetch a verse together with its neighbours in the same chapter.

    Args:
        book: Value matched against the ``book`` column.
        chapter: Value matched against the ``chapter`` column.
        verse: Verse number at the centre of the window.
        context_size: How many verse numbers to include on each side.

    Returns:
        Rows from ``bible_passages`` as dicts (``ref``, ``book``, ``chapter``,
        ``verse``, ``text_raw``) whose verse number lies in
        ``[verse - context_size, verse + context_size]``, ordered by verse.
    """
    first_verse = verse - context_size
    last_verse = verse + context_size

    sql = """
        SELECT ref, book, chapter, verse, text_raw
        FROM bible_passages
        WHERE book = %s AND chapter = %s
          AND verse BETWEEN %s AND %s
        ORDER BY verse
    """

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn, conn.cursor() as cur:
        cur.execute(sql, (book, chapter, first_verse, last_verse))
        rows = cur.fetchall()
    return rows
|
||||
|
||||
if __name__ == "__main__":

    async def test_search():
        """Run both search modes for the word 'dragoste' and print the hits."""
        term = "dragoste"

        # Semantic search: results are fetched before the header is printed.
        semantic_hits = await search_bible_semantic(term, 5)
        print("Semantic search results for 'dragoste':")
        for hit in semantic_hits:
            snippet = hit['text_raw'][:100]
            print(f"{hit['ref']}: {snippet}... (similarity: {hit['similarity']:.3f})")

        # Hybrid search: the header is printed before the results are fetched.
        print("\nHybrid search results for 'dragoste':")
        hybrid_hits = await search_bible_hybrid(term, 5)
        for hit in hybrid_hits:
            snippet = hit['text_raw'][:100]
            print(f"{hit['ref']}: {snippet}... (score: {hit['combined_score']:.3f})")

    asyncio.run(test_search())
|
||||
Reference in New Issue
Block a user