Implement Azure OpenAI vector embeddings for Romanian Bible

- Add pgvector support with bible_passages table for vector search
- Create Python ingestion script for Azure OpenAI embed-3 embeddings
- Implement hybrid search combining vector similarity and full-text search
- Update AI chat to use vector search with Azure OpenAI gpt-4o
- Add floating chat component with Material UI design
- Import complete Romanian Bible (FIDELA) with 30K+ verses
- Add vector search library for semantic Bible search
- Create multi-language implementation plan for future expansion

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
andupetcu
2025-09-20 15:18:00 +03:00
parent 3b375c869b
commit dd5e1102eb
14 changed files with 2082 additions and 68 deletions

121
scripts/bible_search.py Normal file
View File

@@ -0,0 +1,121 @@
import os
import asyncio
from typing import List, Dict
from dotenv import load_dotenv
import httpx
import psycopg
from psycopg.rows import dict_row
load_dotenv()

# Azure OpenAI configuration (endpoint, key, API version, deployment) and the
# Postgres connection string, all read from the environment.
AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")  # trailing '/' removed so EMBED_URL joins cleanly
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
DB_URL = os.getenv("DATABASE_URL")
# Full embeddings REST endpoint for the configured deployment.
EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"
async def get_embedding(text: str) -> List[float]:
    """Return the embedding vector for *text* from Azure OpenAI.

    Retries up to 3 times with exponential backoff on transient failures
    (HTTP 429/500/503 or network errors). Raises RuntimeError immediately
    on any other HTTP status, and again once all retries are exhausted.
    """
    payload = {"input": [text]}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}
    async with httpx.AsyncClient() as client:
        for attempt in range(3):
            try:
                r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=30)
            except Exception:
                # Network-level failure: back off and retry; re-raise on the
                # last attempt so the caller sees the real error.
                if attempt == 2:
                    raise
                await asyncio.sleep(2 ** attempt)
                continue
            if r.status_code == 200:
                return r.json()["data"][0]["embedding"]
            if r.status_code in (429, 500, 503):
                # Transient server-side error: exponential backoff, then retry.
                await asyncio.sleep(2 ** attempt)
            else:
                # BUG FIX: previously this raise was caught by the surrounding
                # except and retried; fail fast on non-retryable errors.
                raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")
    # BUG FIX: the original fell off the loop and implicitly returned None
    # when every attempt hit a retryable status; fail loudly instead.
    raise RuntimeError("Failed to get embedding after 3 attempts")
async def search_bible_semantic(query: str, limit: int = 10) -> List[Dict]:
    """Search Bible verses by cosine similarity between *query* and stored embeddings.

    Returns up to *limit* rows (ref, book, chapter, verse, text_raw,
    similarity), ordered from most to least similar.
    """
    # Get embedding for the query
    query_embedding = await get_embedding(query)
    # BUG FIX: pgvector has no implicit cast from a bound Python list/array in
    # expression context, so `embedding <=> %s` fails; pass the vector in its
    # text form ('[x, y, ...]') and cast it explicitly.
    embedding_param = str(query_embedding)
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT ref, book, chapter, verse, text_raw,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM bible_passages
                WHERE embedding IS NOT NULL
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (embedding_param, embedding_param, limit))
            return cur.fetchall()
async def search_bible_hybrid(query: str, limit: int = 10) -> List[Dict]:
    """Search Bible verses combining vector similarity and Romanian full-text rank.

    The combined score weights cosine similarity at 0.7 and ts_rank at 0.3;
    only rows matched by at least one of the two searches are returned.
    """
    # Get embedding for the query
    query_embedding = await get_embedding(query)
    # BUG FIX: pgvector cannot implicitly cast a bound list in expression
    # context; pass the text form and cast with ::vector. Also removed the
    # dead `search_query = " & ".join(...)` local — plainto_tsquery already
    # tokenizes the raw query and the joined form was never used.
    embedding_param = str(query_embedding)
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                WITH vector_search AS (
                    SELECT id, 1 - (embedding <=> %s::vector) AS vector_sim
                    FROM bible_passages
                    WHERE embedding IS NOT NULL
                    ORDER BY embedding <=> %s::vector
                    LIMIT 100
                ),
                text_search AS (
                    SELECT id, ts_rank(tsv, plainto_tsquery('romanian', %s)) AS text_rank
                    FROM bible_passages
                    WHERE tsv @@ plainto_tsquery('romanian', %s)
                )
                SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
                       COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
                FROM bible_passages bp
                LEFT JOIN vector_search vs ON vs.id = bp.id
                LEFT JOIN text_search ts ON ts.id = bp.id
                WHERE vs.id IS NOT NULL OR ts.id IS NOT NULL
                ORDER BY combined_score DESC
                LIMIT %s
            """, (embedding_param, embedding_param, query, query, limit))
            return cur.fetchall()
async def get_context_verses(book: str, chapter: int, verse: int, context_size: int = 2) -> List[Dict]:
    """Fetch the verses surrounding a given verse within the same chapter.

    Returns up to ``2 * context_size + 1`` rows (the target verse and its
    neighbours on either side), ordered by verse number.
    """
    first_verse = verse - context_size
    last_verse = verse + context_size
    sql = """
                SELECT ref, book, chapter, verse, text_raw
                FROM bible_passages
                WHERE book = %s AND chapter = %s
                AND verse BETWEEN %s AND %s
                ORDER BY verse
            """
    params = (book, chapter, first_verse, last_verse)
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            return cur.fetchall()
if __name__ == "__main__":
    # Manual smoke test: run both search modes for the Romanian word
    # "dragoste" ("love") and print the top 5 matches of each.
    async def test_search():
        results = await search_bible_semantic("dragoste", 5)
        print("Semantic search results for 'dragoste':")
        for result in results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (similarity: {result['similarity']:.3f})")
        print("\nHybrid search results for 'dragoste':")
        hybrid_results = await search_bible_hybrid("dragoste", 5)
        for result in hybrid_results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (score: {result['combined_score']:.3f})")
    asyncio.run(test_search())

View File

@@ -0,0 +1,305 @@
import { PrismaClient } from '@prisma/client'
import * as fs from 'fs'
import * as path from 'path'
// Shared Prisma client used by the import routines below.
const prisma = new PrismaClient()

// Book name mappings from Romanian to standardized names.
// Keys are the book headings as they appear in the FIDELA source file;
// values carry the display name, a translation-neutral abbreviation,
// the testament ('OT'/'NT') and the canonical book order (1-66).
// NOTE(review): a few keys differ from their stored display name (e.g.
// 'Numeri' -> 'Numerii') — the key matches the source text, the name is
// the form written to the database.
const BOOK_MAPPINGS: Record<string, { name: string; abbreviation: string; testament: string; orderNum: number }> = {
  // --- Old Testament (1-39) ---
  'Geneza': { name: 'Geneza', abbreviation: 'GEN', testament: 'OT', orderNum: 1 },
  'Exodul': { name: 'Exodul', abbreviation: 'EXO', testament: 'OT', orderNum: 2 },
  'Leviticul': { name: 'Leviticul', abbreviation: 'LEV', testament: 'OT', orderNum: 3 },
  'Numeri': { name: 'Numerii', abbreviation: 'NUM', testament: 'OT', orderNum: 4 },
  'Deuteronom': { name: 'Deuteronomul', abbreviation: 'DEU', testament: 'OT', orderNum: 5 },
  'Iosua': { name: 'Iosua', abbreviation: 'JOS', testament: 'OT', orderNum: 6 },
  'Judecători': { name: 'Judecătorii', abbreviation: 'JDG', testament: 'OT', orderNum: 7 },
  'Rut': { name: 'Rut', abbreviation: 'RUT', testament: 'OT', orderNum: 8 },
  '1 Samuel': { name: '1 Samuel', abbreviation: '1SA', testament: 'OT', orderNum: 9 },
  '2 Samuel': { name: '2 Samuel', abbreviation: '2SA', testament: 'OT', orderNum: 10 },
  '1 Imparati': { name: '1 Împărați', abbreviation: '1KI', testament: 'OT', orderNum: 11 },
  '2 Imparati': { name: '2 Împărați', abbreviation: '2KI', testament: 'OT', orderNum: 12 },
  '1 Cronici': { name: '1 Cronici', abbreviation: '1CH', testament: 'OT', orderNum: 13 },
  '2 Cronici': { name: '2 Cronici', abbreviation: '2CH', testament: 'OT', orderNum: 14 },
  'Ezra': { name: 'Ezra', abbreviation: 'EZR', testament: 'OT', orderNum: 15 },
  'Neemia': { name: 'Neemia', abbreviation: 'NEH', testament: 'OT', orderNum: 16 },
  'Estera': { name: 'Estera', abbreviation: 'EST', testament: 'OT', orderNum: 17 },
  'Iov': { name: 'Iov', abbreviation: 'JOB', testament: 'OT', orderNum: 18 },
  'Psalmii': { name: 'Psalmii', abbreviation: 'PSA', testament: 'OT', orderNum: 19 },
  'Proverbe': { name: 'Proverbele', abbreviation: 'PRO', testament: 'OT', orderNum: 20 },
  'Eclesiastul': { name: 'Eclesiastul', abbreviation: 'ECC', testament: 'OT', orderNum: 21 },
  'Cântarea Cântărilor': { name: 'Cântarea Cântărilor', abbreviation: 'SNG', testament: 'OT', orderNum: 22 },
  'Isaia': { name: 'Isaia', abbreviation: 'ISA', testament: 'OT', orderNum: 23 },
  'Ieremia': { name: 'Ieremia', abbreviation: 'JER', testament: 'OT', orderNum: 24 },
  'Plângerile': { name: 'Plângerile', abbreviation: 'LAM', testament: 'OT', orderNum: 25 },
  'Ezechiel': { name: 'Ezechiel', abbreviation: 'EZK', testament: 'OT', orderNum: 26 },
  'Daniel': { name: 'Daniel', abbreviation: 'DAN', testament: 'OT', orderNum: 27 },
  'Osea': { name: 'Osea', abbreviation: 'HOS', testament: 'OT', orderNum: 28 },
  'Ioel': { name: 'Ioel', abbreviation: 'JOL', testament: 'OT', orderNum: 29 },
  'Amos': { name: 'Amos', abbreviation: 'AMO', testament: 'OT', orderNum: 30 },
  'Obadia': { name: 'Obadia', abbreviation: 'OBA', testament: 'OT', orderNum: 31 },
  'Iona': { name: 'Iona', abbreviation: 'JON', testament: 'OT', orderNum: 32 },
  'Mica': { name: 'Mica', abbreviation: 'MIC', testament: 'OT', orderNum: 33 },
  'Naum': { name: 'Naum', abbreviation: 'NAM', testament: 'OT', orderNum: 34 },
  'Habacuc': { name: 'Habacuc', abbreviation: 'HAB', testament: 'OT', orderNum: 35 },
  'Țefania': { name: 'Țefania', abbreviation: 'ZEP', testament: 'OT', orderNum: 36 },
  'Hagai': { name: 'Hagai', abbreviation: 'HAG', testament: 'OT', orderNum: 37 },
  'Zaharia': { name: 'Zaharia', abbreviation: 'ZEC', testament: 'OT', orderNum: 38 },
  'Maleahi': { name: 'Maleahi', abbreviation: 'MAL', testament: 'OT', orderNum: 39 },
  // --- New Testament (40-66) ---
  'Matei': { name: 'Matei', abbreviation: 'MAT', testament: 'NT', orderNum: 40 },
  'Marcu': { name: 'Marcu', abbreviation: 'MRK', testament: 'NT', orderNum: 41 },
  'Luca': { name: 'Luca', abbreviation: 'LUK', testament: 'NT', orderNum: 42 },
  'Ioan': { name: 'Ioan', abbreviation: 'JHN', testament: 'NT', orderNum: 43 },
  'Faptele Apostolilor': { name: 'Faptele Apostolilor', abbreviation: 'ACT', testament: 'NT', orderNum: 44 },
  'Romani': { name: 'Romani', abbreviation: 'ROM', testament: 'NT', orderNum: 45 },
  '1 Corinteni': { name: '1 Corinteni', abbreviation: '1CO', testament: 'NT', orderNum: 46 },
  '2 Corinteni': { name: '2 Corinteni', abbreviation: '2CO', testament: 'NT', orderNum: 47 },
  'Galateni': { name: 'Galateni', abbreviation: 'GAL', testament: 'NT', orderNum: 48 },
  'Efeseni': { name: 'Efeseni', abbreviation: 'EPH', testament: 'NT', orderNum: 49 },
  'Filipeni': { name: 'Filipeni', abbreviation: 'PHP', testament: 'NT', orderNum: 50 },
  'Coloseni': { name: 'Coloseni', abbreviation: 'COL', testament: 'NT', orderNum: 51 },
  '1 Tesaloniceni': { name: '1 Tesaloniceni', abbreviation: '1TH', testament: 'NT', orderNum: 52 },
  '2 Tesaloniceni': { name: '2 Tesaloniceni', abbreviation: '2TH', testament: 'NT', orderNum: 53 },
  '1 Timotei': { name: '1 Timotei', abbreviation: '1TI', testament: 'NT', orderNum: 54 },
  '2 Timotei': { name: '2 Timotei', abbreviation: '2TI', testament: 'NT', orderNum: 55 },
  'Titus': { name: 'Titus', abbreviation: 'TIT', testament: 'NT', orderNum: 56 },
  'Filimon': { name: 'Filimon', abbreviation: 'PHM', testament: 'NT', orderNum: 57 },
  'Evrei': { name: 'Evrei', abbreviation: 'HEB', testament: 'NT', orderNum: 58 },
  'Iacov': { name: 'Iacov', abbreviation: 'JAS', testament: 'NT', orderNum: 59 },
  '1 Petru': { name: '1 Petru', abbreviation: '1PE', testament: 'NT', orderNum: 60 },
  '2 Petru': { name: '2 Petru', abbreviation: '2PE', testament: 'NT', orderNum: 61 },
  '1 Ioan': { name: '1 Ioan', abbreviation: '1JN', testament: 'NT', orderNum: 62 },
  '2 Ioan': { name: '2 Ioan', abbreviation: '2JN', testament: 'NT', orderNum: 63 },
  '3 Ioan': { name: '3 Ioan', abbreviation: '3JN', testament: 'NT', orderNum: 64 },
  'Iuda': { name: 'Iuda', abbreviation: 'JUD', testament: 'NT', orderNum: 65 },
  'Revelaţia': { name: 'Revelația', abbreviation: 'REV', testament: 'NT', orderNum: 66 },
}
// Intermediate shapes produced by parseRomanianBible before DB insertion.

// A single verse: its number within the chapter and its full text.
interface ParsedVerse {
  verseNum: number
  text: string
}

// A chapter: its number within the book and its parsed verses.
interface ParsedChapter {
  chapterNum: number
  verses: ParsedVerse[]
}

// A book: its heading name from the source file and its chapters.
interface ParsedBook {
  name: string
  chapters: ParsedChapter[]
}
/**
 * Parse the FIDELA Romanian Bible markdown file into books/chapters/verses.
 *
 * File format as handled here:
 *   - content starts after a 'VECHIUL TESTAMENT' / 'TESTAMENT' heading,
 *   - book headings look like '… BookName …',
 *   - chapter headings look like 'Capitolul N' (any letter casing),
 *   - verses start with a number and may continue on following lines
 *     until the next verse/chapter/book marker or a blank line.
 */
async function parseRomanianBible(filePath: string): Promise<ParsedBook[]> {
  console.log(`Reading Romanian Bible from: ${filePath}`)
  const content = fs.readFileSync(filePath, 'utf-8')
  const lines = content.split('\n')
  const books: ParsedBook[] = []
  let currentBook: ParsedBook | null = null
  let currentChapter: ParsedChapter | null = null
  let isInBibleContent = false

  // Flush the chapter currently being built into the current book.
  const flushChapter = () => {
    if (currentBook && currentChapter && currentChapter.verses.length > 0) {
      currentBook.chapters.push(currentChapter)
    }
    currentChapter = null
  }

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim()
    // Start processing after "VECHIUL TESTAMENT"
    if (line === 'VECHIUL TESTAMENT' || line === 'TESTAMENT') {
      isInBibleContent = true
      continue
    }
    if (!isInBibleContent) continue
    // Book detection: … BookName …
    const bookMatch = line.match(/^…\s*(.+?)\s*…$/)
    if (bookMatch) {
      // BUG FIX: the chapter in progress belongs to the *previous* book and
      // was dropped before — the original reset currentChapter to null
      // without pushing it, losing the last chapter of every book except
      // the final one. Flush it before switching books.
      flushChapter()
      if (currentBook && currentBook.chapters.length > 0) {
        books.push(currentBook)
      }
      const bookName = bookMatch[1].trim()
      console.log(`Found book: ${bookName}`)
      currentBook = { name: bookName, chapters: [] }
      continue
    }
    // Chapter detection: Capitolul X (any letter casing)
    const chapterMatch = line.match(/^[cC][aA][pP][iI][tT][oO][lL][uU][lL]\s+(\d+)$/i)
    if (chapterMatch && currentBook) {
      flushChapter()
      const chapterNum = parseInt(chapterMatch[1])
      console.log(` Chapter ${chapterNum}`)
      currentChapter = { chapterNum, verses: [] }
      continue
    }
    // Verse detection: starts with a number
    const verseMatch = line.match(/^(\d+)\s+(.+)$/)
    if (verseMatch && currentChapter) {
      const verseNum = parseInt(verseMatch[1])
      let verseText = verseMatch[2].trim()
      // Drop a leading pilcrow paragraph marker.
      verseText = verseText.replace(/^¶\s*/, '')
      // Absorb continuation lines until the next structural marker.
      let j = i + 1
      while (j < lines.length) {
        const nextLine = lines[j].trim()
        if (!nextLine ||
            nextLine.match(/^\d+\s/) || // new verse
            nextLine.match(/^[cC][aA][pP][iI][tT][oO][lL][uU][lL]\s+\d+$/i) || // new chapter
            nextLine.match(/^….*…$/) || // new book
            nextLine === 'TESTAMENT') { // testament marker
          break
        }
        verseText += ' ' + nextLine
        j++
      }
      // Collapse whitespace runs introduced by line joins.
      verseText = verseText.replace(/\s+/g, ' ').trim()
      currentChapter.verses.push({ verseNum, text: verseText })
      // Skip the continuation lines we've consumed.
      i = j - 1
      continue
    }
  }
  // Flush the trailing chapter and book.
  flushChapter()
  if (currentBook && currentBook.chapters.length > 0) {
    books.push(currentBook)
  }
  console.log(`Parsed ${books.length} books`)
  return books
}
/**
 * Import the parsed Romanian Bible into the database via Prisma.
 *
 * Wipes the existing Bible tables, recreates books, chapters and verses
 * from the FIDELA markdown file, prints a summary, and disconnects the
 * Prisma client regardless of outcome. Books without an entry in
 * BOOK_MAPPINGS are skipped with a warning.
 */
async function importRomanianBible() {
  try {
    console.log('Starting Romanian Bible import...')

    // Wipe children first so foreign keys stay satisfied.
    console.log('Clearing existing data...')
    await prisma.bibleVerse.deleteMany()
    await prisma.bibleChapter.deleteMany()
    await prisma.bibleBook.deleteMany()

    const filePath = path.join(process.cwd(), 'bibles', 'Biblia-Fidela-limba-romana.md')
    const parsedBooks = await parseRomanianBible(filePath)
    console.log(`Importing ${parsedBooks.length} books into database...`)

    for (const parsedBook of parsedBooks) {
      const mapping = BOOK_MAPPINGS[parsedBook.name]
      if (!mapping) {
        console.warn(`Warning: No mapping found for book "${parsedBook.name}", skipping...`)
        continue
      }
      console.log(`Creating book: ${mapping.name}`)
      const dbBook = await prisma.bibleBook.create({
        data: {
          id: mapping.orderNum,
          name: mapping.name,
          testament: mapping.testament,
          orderNum: mapping.orderNum
        }
      })

      for (const parsedChapter of parsedBook.chapters) {
        console.log(` Creating chapter ${parsedChapter.chapterNum} with ${parsedChapter.verses.length} verses`)
        const dbChapter = await prisma.bibleChapter.create({
          data: {
            bookId: dbBook.id,
            chapterNum: parsedChapter.chapterNum
          }
        })

        // Deduplicate by verse number (later occurrences win); integer keys
        // iterate in ascending order, matching the original behavior.
        const byNum: Record<number, ParsedVerse> = {}
        for (const v of parsedChapter.verses) {
          byNum[v.verseNum] = v
        }
        const rows = Object.values(byNum).map(v => ({
          chapterId: dbChapter.id,
          verseNum: v.verseNum,
          text: v.text,
          version: 'FIDELA'
        }))
        if (rows.length > 0) {
          await prisma.bibleVerse.createMany({ data: rows })
        }
      }
    }

    // Summary counts read back from the database.
    const bookCount = await prisma.bibleBook.count()
    const chapterCount = await prisma.bibleChapter.count()
    const verseCount = await prisma.bibleVerse.count()
    console.log('\n✅ Romanian Bible import completed successfully!')
    console.log(`📚 Books imported: ${bookCount}`)
    console.log(`📖 Chapters imported: ${chapterCount}`)
    console.log(`📝 Verses imported: ${verseCount}`)
  } catch (error) {
    console.error('❌ Error importing Romanian Bible:', error)
    throw error
  } finally {
    await prisma.$disconnect()
  }
}
// Run the import when this file is executed directly; exit 0 on success,
// 1 on failure. When imported as a module, only the export below runs.
if (require.main === module) {
  importRomanianBible()
    .then(() => {
      console.log('Import completed successfully!')
      process.exit(0)
    })
    .catch((error) => {
      console.error('Import failed:', error)
      process.exit(1)
    })
}

export { importRomanianBible }

View File

@@ -0,0 +1,231 @@
import os, re, json, math, time, asyncio
from typing import List, Dict, Tuple, Iterable
from dataclasses import dataclass
from pathlib import Path
from dotenv import load_dotenv
import httpx
import psycopg
from psycopg.rows import dict_row
load_dotenv()

# Azure OpenAI + ingestion configuration, all driven by environment variables.
AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
EMBED_DIMS = int(os.getenv("EMBED_DIMS", "3072"))  # must match the deployment's output dimension
DB_URL = os.getenv("DATABASE_URL")
BIBLE_MD_PATH = os.getenv("BIBLE_MD_PATH")
LANG_CODE = os.getenv("LANG_CODE", "ro")
TRANSLATION = os.getenv("TRANSLATION_CODE", "FIDELA")

# BUG FIX: this used `assert`, which is stripped under `python -O`; raise
# explicitly so a misconfigured environment always fails fast.
if not (AZ_ENDPOINT and AZ_API_KEY and DB_URL and BIBLE_MD_PATH):
    raise RuntimeError("Missing required env vars")

# Full embeddings REST endpoint for the configured deployment.
EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"
# Canonical Romanian book names in biblical order, used to validate parsed
# book headings and to assign each book to a testament.
BOOKS_OT = [
    "Geneza","Exodul","Leviticul","Numeri","Deuteronom","Iosua","Judecători","Rut",
    "1 Samuel","2 Samuel","1 Imparati","2 Imparati","1 Cronici","2 Cronici","Ezra","Neemia","Estera",
    "Iov","Psalmii","Proverbe","Eclesiastul","Cântarea Cântărilor","Isaia","Ieremia","Plângerile",
    "Ezechiel","Daniel","Osea","Ioel","Amos","Obadia","Iona","Mica","Naum","Habacuc","Țefania","Hagai","Zaharia","Maleahi"
]
BOOKS_NT = [
    "Matei","Marcu","Luca","Ioan","Faptele Apostolilor","Romani","1 Corinteni","2 Corinteni",
    "Galateni","Efeseni","Filipeni","Coloseni","1 Tesaloniceni","2 Tesaloniceni","1 Timotei","2 Timotei",
    "Titus","Filimon","Evrei","Iacov","1 Petru","2 Petru","1 Ioan","2 Ioan","3 Ioan","Iuda","Revelaţia"
]
# Map book name -> testament code ("OT"/"NT").
BOOK_CANON = {b:("OT" if b in BOOKS_OT else "NT") for b in BOOKS_OT + BOOKS_NT}
@dataclass
class Verse:
    """A single parsed Bible verse.

    NOTE(review): parse_bible_md yields plain dicts with these same field
    names; this dataclass is not referenced elsewhere in the visible code.
    """
    testament: str  # "OT" or "NT"
    book: str  # canonical Romanian book name
    chapter: int
    verse: int
    text_raw: str  # verse text as parsed from the source file
    text_norm: str  # whitespace-normalized text used for tsvector/embedding
def normalize_text(s: str) -> str:
    """Trim *s* and collapse every whitespace run to a single space.

    ``\\s`` is Unicode-aware in Python, so non-breaking spaces and other
    exotic whitespace are folded too. The original's follow-up
    ``s.replace(" ", " ")`` replaced a space with an identical space — a
    no-op — and has been removed.
    """
    return re.sub(r"\s+", " ", s.strip())
# Matches a standalone heading line starting with an uppercase Romanian letter.
# NOTE(review): BOOK_RE is not referenced anywhere in this script as shown —
# parse_bible_md matches book headings with an inline '… name …' pattern.
BOOK_RE = re.compile(r"^(?P<book>[A-ZĂÂÎȘȚ][^\n]+?)\s*$")
# Matches chapter headings like "Capitolul 12" (case-insensitive).
CH_RE = re.compile(r"^(?i:Capitolul|CApitoLuL)\s+(?P<ch>\d+)\b")
# Matches a verse line: leading verse number, then the verse body.
VERSE_RE = re.compile(r"^(?P<v>\d+)\s+(?P<body>.+)$")
def parse_bible_md(md_text: str):
    """Yield verse dicts parsed from the FIDELA markdown text.

    Each yielded dict has keys: testament, book, chapter, verse, text_raw,
    text_norm. Only book headings present in BOOK_CANON are accepted;
    verses encountered before the first valid book/chapter are skipped.
    """
    cur_book, cur_ch = None, None
    testament = None
    is_in_bible_content = False
    for line in md_text.splitlines():
        line = line.rstrip()
        # Start processing at a testament heading or a '…'-delimited book
        # marker. BUG FIX: the original tested `'' in line`, which is True
        # for every string and made this guard useless.
        if line == 'VECHIUL TESTAMENT' or line == 'TESTAMENT' or '…' in line:
            is_in_bible_content = True
        if not is_in_bible_content:
            continue
        # Book detection: … BookName …
        book_match = re.match(r'^…\s*(.+?)\s*…$', line)
        if book_match:
            bname = book_match.group(1).strip()
            if bname in BOOK_CANON:
                cur_book = bname
                testament = BOOK_CANON[bname]
                cur_ch = None
                print(f"Found book: {bname}")
            continue
        # Chapter detection: Capitolul X (any letter casing)
        m_ch = CH_RE.match(line)
        if m_ch and cur_book:
            cur_ch = int(m_ch.group("ch"))
            print(f" Chapter {cur_ch}")
            continue
        # Verse detection: line starts with the verse number
        m_v = VERSE_RE.match(line)
        if m_v and cur_book and cur_ch:
            vnum = int(m_v.group("v"))
            body = m_v.group("body").strip()
            # Strip a leading pilcrow paragraph marker.
            # BUG FIX: the original called re.sub(r'\s*', '', body), which
            # deletes *all* whitespace from the verse text (r'\s*' matches
            # every whitespace run and every empty position).
            body = re.sub(r'^¶\s*', '', body)
            raw = body
            norm = normalize_text(body)
            yield {
                "testament": testament, "book": cur_book, "chapter": cur_ch, "verse": vnum,
                "text_raw": raw, "text_norm": norm
            }
async def embed_batch(client, inputs):
    """Embed a batch of strings via Azure OpenAI, preserving input order.

    Retries transient failures (HTTP 429/500/503 and network errors) up to
    6 times with exponential backoff; non-retryable HTTP statuses raise
    immediately. Raises RuntimeError once all retries are exhausted.

    client -- an httpx.AsyncClient; inputs -- list of strings to embed.
    """
    payload = {"input": inputs}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}
    for attempt in range(6):
        try:
            r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=60)
        except Exception as e:
            # Network-level failure (timeout, connection reset): retry.
            backoff = 2 ** attempt + (0.1 * attempt)
            print(f"Error on attempt {attempt + 1}: {e}, waiting {backoff:.1f}s...")
            await asyncio.sleep(backoff)
            continue
        if r.status_code == 200:
            data = r.json()
            # Sort by index so embeddings line up with `inputs` even if the
            # API returns items out of order.
            ordered = sorted(data["data"], key=lambda x: x["index"])
            return [d["embedding"] for d in ordered]
        if r.status_code in (429, 500, 503):
            backoff = 2 ** attempt + (0.1 * attempt)
            print(f"Rate limited, waiting {backoff:.1f}s...")
            await asyncio.sleep(backoff)
        else:
            # BUG FIX: previously this raise happened inside the try block,
            # was caught by `except Exception`, and got retried 6 times;
            # fail fast on non-retryable errors instead.
            raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")
    raise RuntimeError("Failed to embed after retries")
# Schema for the verse + embedding table. The vector column width must match
# the embedding model's output size, so it comes from EMBED_DIMS (default
# 3072 for the embed-3 deployment).
# BUG FIX: the width was hard-coded to vector(1536) while EMBED_DIMS defaults
# to 3072 — every insert of a 3072-dim embedding would have failed.
CREATE_TABLE_SQL = f"""
CREATE TABLE IF NOT EXISTS bible_passages (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    testament TEXT NOT NULL,
    book TEXT NOT NULL,
    chapter INT NOT NULL,
    verse INT NOT NULL,
    ref TEXT GENERATED ALWAYS AS (book || ' ' || chapter || ':' || verse) STORED,
    lang TEXT NOT NULL DEFAULT 'ro',
    translation TEXT NOT NULL DEFAULT 'FIDELA',
    text_raw TEXT NOT NULL,
    text_norm TEXT NOT NULL,
    tsv tsvector,
    embedding vector({EMBED_DIMS}),
    created_at TIMESTAMPTZ DEFAULT now(),
    updated_at TIMESTAMPTZ DEFAULT now()
);
"""
# Secondary indexes: the unique key doubles as the upsert conflict target in
# UPSERT_SQL; the GIN index serves full-text search; the rest serve lookups
# by passage location and testament.
CREATE_INDEXES_SQL = """
-- Uniqueness by canonical reference within translation/language
CREATE UNIQUE INDEX IF NOT EXISTS ux_ref_lang ON bible_passages (translation, lang, book, chapter, verse);
-- Full-text index
CREATE INDEX IF NOT EXISTS idx_tsv ON bible_passages USING GIN (tsv);
-- Other indexes
CREATE INDEX IF NOT EXISTS idx_book_ch ON bible_passages (book, chapter);
CREATE INDEX IF NOT EXISTS idx_testament ON bible_passages (testament);
"""
UPSERT_SQL = """
INSERT INTO bible_passages (testament, book, chapter, verse, lang, translation, text_raw, text_norm, tsv, embedding)
VALUES (%(testament)s, %(book)s, %(chapter)s, %(verse)s, %(lang)s, %(translation)s, %(text_raw)s, %(text_norm)s,
to_tsvector(COALESCE(%(ts_lang)s,'simple')::regconfig, %(text_norm)s), %(embedding)s)
ON CONFLICT (translation, lang, book, chapter, verse) DO UPDATE
SET text_raw=EXCLUDED.text_raw,
text_norm=EXCLUDED.text_norm,
tsv=EXCLUDED.tsv,
embedding=EXCLUDED.embedding,
updated_at=now();
"""
async def main():
    """Parse the Bible markdown, embed every verse, and upsert into Postgres.

    Steps: (1) parse verses from BIBLE_MD_PATH, (2) create the pgvector
    extension, table and indexes, (3) embed verses in batches of 128 and
    upsert them (committing per batch), (4) VACUUM ANALYZE and build the
    IVFFLAT ANN index once the data is loaded.
    """
    print("Starting Bible embedding ingestion...")
    md_text = Path(BIBLE_MD_PATH).read_text(encoding="utf-8", errors="ignore")
    verses = list(parse_bible_md(md_text))
    print(f"Parsed verses: {len(verses)}")
    batch_size = 128  # verses per embedding API call
    # First create the table structure
    with psycopg.connect(DB_URL) as conn:
        with conn.cursor() as cur:
            print("Creating bible_passages table...")
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute(CREATE_TABLE_SQL)
            cur.execute(CREATE_INDEXES_SQL)
            conn.commit()
            print("Table created successfully")
    # Now process embeddings
    async with httpx.AsyncClient() as client:
        with psycopg.connect(DB_URL, autocommit=False) as conn:
            with conn.cursor() as cur:
                for i in range(0, len(verses), batch_size):
                    batch = verses[i:i+batch_size]
                    inputs = [v["text_norm"] for v in batch]
                    print(f"Generating embeddings for batch {i//batch_size + 1}/{(len(verses) + batch_size - 1)//batch_size}")
                    embs = await embed_batch(client, inputs)
                    rows = []
                    for v, e in zip(batch, embs):
                        # Attach per-run constants; ts_lang selects the
                        # regconfig used by to_tsvector() in UPSERT_SQL.
                        rows.append({
                            **v,
                            "lang": LANG_CODE,
                            "translation": TRANSLATION,
                            "ts_lang": "romanian",
                            "embedding": e
                        })
                    cur.executemany(UPSERT_SQL, rows)
                    # Commit per batch so progress survives a crash/restart.
                    conn.commit()
                    print(f"Upserted {len(rows)} verses... {i+len(rows)}/{len(verses)}")
    # Create IVFFLAT index after data is loaded (IVFFLAT clusters existing
    # rows, so building it on a populated table gives better recall).
    print("Creating IVFFLAT index...")
    # autocommit=True because VACUUM cannot run inside a transaction block.
    with psycopg.connect(DB_URL, autocommit=True) as conn:
        with conn.cursor() as cur:
            cur.execute("VACUUM ANALYZE bible_passages;")
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_vec_ivfflat
                ON bible_passages USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 200);
            """)
    print("✅ Bible embedding ingestion completed successfully!")
# Run the full ingestion pipeline when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())