Implement Azure OpenAI vector embeddings for Romanian Bible
- Add pgvector support with bible_passages table for vector search
- Create Python ingestion script for Azure OpenAI embed-3 embeddings
- Implement hybrid search combining vector similarity and full-text search
- Update AI chat to use vector search with Azure OpenAI gpt-4o
- Add floating chat component with Material UI design
- Import complete Romanian Bible (FIDELA) with 30K+ verses
- Add vector search library for semantic Bible search
- Create multi-language implementation plan for future expansion

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
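All three scripts read their configuration from the environment (via dotenv). A minimal .env sketch, with variable names taken from the scripts below; every value here is a placeholder:

AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
AZURE_OPENAI_KEY=your-api-key
AZURE_OPENAI_API_VERSION=2024-05-01-preview
AZURE_OPENAI_EMBED_DEPLOYMENT=embed-3
EMBED_DIMS=3072
DATABASE_URL=postgresql://user:password@localhost:5432/yourdb
BIBLE_MD_PATH=bibles/Biblia-Fidela-limba-romana.md
LANG_CODE=ro
TRANSLATION_CODE=FIDELA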
scripts/bible_search.py (new file, 121 lines)
import os
import asyncio
from typing import List, Dict
from dotenv import load_dotenv
import httpx
import psycopg
from psycopg.rows import dict_row

load_dotenv()

AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
DB_URL = os.getenv("DATABASE_URL")

EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"


def to_pgvector(embedding: List[float]) -> str:
    """Serialize floats into pgvector's text format, e.g. '[0.1,0.2,...]'.

    psycopg has no built-in adapter for the vector type, so embeddings are
    passed as text and cast with ::vector in the SQL below."""
    return "[" + ",".join(map(str, embedding)) + "]"


async def get_embedding(text: str) -> List[float]:
    """Get an embedding for a text using Azure OpenAI, with exponential backoff."""
    payload = {"input": [text]}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}

    async with httpx.AsyncClient() as client:
        for attempt in range(3):
            try:
                r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=30)
                if r.status_code == 200:
                    data = r.json()
                    return data["data"][0]["embedding"]
                elif r.status_code in (429, 500, 503):
                    # Transient failure: back off and retry
                    await asyncio.sleep(2 ** attempt)
                else:
                    raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")
            except httpx.HTTPError:
                if attempt == 2:
                    raise
                await asyncio.sleep(2 ** attempt)
    raise RuntimeError("Embedding request failed after 3 attempts")


async def search_bible_semantic(query: str, limit: int = 10) -> List[Dict]:
    """Search the Bible using semantic similarity."""
    # Get an embedding for the query, serialized for pgvector
    query_embedding = to_pgvector(await get_embedding(query))

    # Search for similar verses; <=> is pgvector's cosine-distance operator,
    # so 1 - distance is cosine similarity
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT ref, book, chapter, verse, text_raw,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM bible_passages
                WHERE embedding IS NOT NULL
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, query_embedding, limit))
            return cur.fetchall()


async def search_bible_hybrid(query: str, limit: int = 10) -> List[Dict]:
    """Search the Bible using hybrid semantic + lexical search."""
    # Get an embedding for the query, serialized for pgvector
    query_embedding = to_pgvector(await get_embedding(query))

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            # Vector similarity is weighted 0.7, full-text rank 0.3
            cur.execute("""
                WITH vector_search AS (
                    SELECT id, 1 - (embedding <=> %s::vector) AS vector_sim
                    FROM bible_passages
                    WHERE embedding IS NOT NULL
                    ORDER BY embedding <=> %s::vector
                    LIMIT 100
                ),
                text_search AS (
                    SELECT id, ts_rank(tsv, plainto_tsquery('romanian', %s)) AS text_rank
                    FROM bible_passages
                    WHERE tsv @@ plainto_tsquery('romanian', %s)
                )
                SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
                       COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
                FROM bible_passages bp
                LEFT JOIN vector_search vs ON vs.id = bp.id
                LEFT JOIN text_search ts ON ts.id = bp.id
                WHERE vs.id IS NOT NULL OR ts.id IS NOT NULL
                ORDER BY combined_score DESC
                LIMIT %s
            """, (query_embedding, query_embedding, query, query, limit))
            return cur.fetchall()


async def get_context_verses(book: str, chapter: int, verse: int, context_size: int = 2) -> List[Dict]:
    """Get surrounding verses for context."""
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT ref, book, chapter, verse, text_raw
                FROM bible_passages
                WHERE book = %s AND chapter = %s
                  AND verse BETWEEN %s AND %s
                ORDER BY verse
            """, (book, chapter, verse - context_size, verse + context_size))
            return cur.fetchall()


if __name__ == "__main__":
    async def test_search():
        results = await search_bible_semantic("dragoste", 5)
        print("Semantic search results for 'dragoste':")
        for result in results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (similarity: {result['similarity']:.3f})")

        print("\nHybrid search results for 'dragoste':")
        hybrid_results = await search_bible_hybrid("dragoste", 5)
        for result in hybrid_results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (score: {result['combined_score']:.3f})")

    asyncio.run(test_search())
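The commit message says the AI chat now uses this vector search with Azure OpenAI gpt-4o, but the chat code itself is not part of this diff. A minimal sketch of that retrieval-augmented flow, reusing the helpers above; the AZURE_OPENAI_CHAT_DEPLOYMENT variable, the prompt wording, and the import path are assumptions, not taken from this commit:

import os, asyncio, httpx
# Assumes bible_search.py is importable (e.g. run from the scripts/ directory)
from bible_search import search_bible_hybrid, AZ_ENDPOINT, AZ_API_KEY, AZ_API_VER

CHAT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o")  # assumed env var
CHAT_URL = f"{AZ_ENDPOINT}/openai/deployments/{CHAT_DEPLOYMENT}/chat/completions?api-version={AZ_API_VER}"

async def answer(question: str) -> str:
    # Retrieve the top verses, then ground the model's answer in them
    verses = await search_bible_hybrid(question, limit=5)
    context = "\n".join(f"{v['ref']}: {v['text_raw']}" for v in verses)
    payload = {
        "messages": [
            {"role": "system", "content": "Answer using only the Bible verses provided."},
            {"role": "user", "content": f"Verses:\n{context}\n\nQuestion: {question}"},
        ],
        "temperature": 0.2,
    }
    async with httpx.AsyncClient() as client:
        r = await client.post(CHAT_URL, headers={"api-key": AZ_API_KEY}, json=payload, timeout=60)
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(asyncio.run(answer("Ce spune Biblia despre dragoste?")))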
scripts/import-romanian-bible-md.ts (new file, 305 lines)
import { PrismaClient } from '@prisma/client'
import * as fs from 'fs'
import * as path from 'path'

const prisma = new PrismaClient()

// Book name mappings from Romanian to standardized names
const BOOK_MAPPINGS: Record<string, { name: string; abbreviation: string; testament: string; orderNum: number }> = {
  // Old Testament
  'Geneza': { name: 'Geneza', abbreviation: 'GEN', testament: 'OT', orderNum: 1 },
  'Exodul': { name: 'Exodul', abbreviation: 'EXO', testament: 'OT', orderNum: 2 },
  'Leviticul': { name: 'Leviticul', abbreviation: 'LEV', testament: 'OT', orderNum: 3 },
  'Numeri': { name: 'Numerii', abbreviation: 'NUM', testament: 'OT', orderNum: 4 },
  'Deuteronom': { name: 'Deuteronomul', abbreviation: 'DEU', testament: 'OT', orderNum: 5 },
  'Iosua': { name: 'Iosua', abbreviation: 'JOS', testament: 'OT', orderNum: 6 },
  'Judecători': { name: 'Judecătorii', abbreviation: 'JDG', testament: 'OT', orderNum: 7 },
  'Rut': { name: 'Rut', abbreviation: 'RUT', testament: 'OT', orderNum: 8 },
  '1 Samuel': { name: '1 Samuel', abbreviation: '1SA', testament: 'OT', orderNum: 9 },
  '2 Samuel': { name: '2 Samuel', abbreviation: '2SA', testament: 'OT', orderNum: 10 },
  '1 Imparati': { name: '1 Împărați', abbreviation: '1KI', testament: 'OT', orderNum: 11 },
  '2 Imparati': { name: '2 Împărați', abbreviation: '2KI', testament: 'OT', orderNum: 12 },
  '1 Cronici': { name: '1 Cronici', abbreviation: '1CH', testament: 'OT', orderNum: 13 },
  '2 Cronici': { name: '2 Cronici', abbreviation: '2CH', testament: 'OT', orderNum: 14 },
  'Ezra': { name: 'Ezra', abbreviation: 'EZR', testament: 'OT', orderNum: 15 },
  'Neemia': { name: 'Neemia', abbreviation: 'NEH', testament: 'OT', orderNum: 16 },
  'Estera': { name: 'Estera', abbreviation: 'EST', testament: 'OT', orderNum: 17 },
  'Iov': { name: 'Iov', abbreviation: 'JOB', testament: 'OT', orderNum: 18 },
  'Psalmii': { name: 'Psalmii', abbreviation: 'PSA', testament: 'OT', orderNum: 19 },
  'Proverbe': { name: 'Proverbele', abbreviation: 'PRO', testament: 'OT', orderNum: 20 },
  'Eclesiastul': { name: 'Eclesiastul', abbreviation: 'ECC', testament: 'OT', orderNum: 21 },
  'Cântarea Cântărilor': { name: 'Cântarea Cântărilor', abbreviation: 'SNG', testament: 'OT', orderNum: 22 },
  'Isaia': { name: 'Isaia', abbreviation: 'ISA', testament: 'OT', orderNum: 23 },
  'Ieremia': { name: 'Ieremia', abbreviation: 'JER', testament: 'OT', orderNum: 24 },
  'Plângerile': { name: 'Plângerile', abbreviation: 'LAM', testament: 'OT', orderNum: 25 },
  'Ezechiel': { name: 'Ezechiel', abbreviation: 'EZK', testament: 'OT', orderNum: 26 },
  'Daniel': { name: 'Daniel', abbreviation: 'DAN', testament: 'OT', orderNum: 27 },
  'Osea': { name: 'Osea', abbreviation: 'HOS', testament: 'OT', orderNum: 28 },
  'Ioel': { name: 'Ioel', abbreviation: 'JOL', testament: 'OT', orderNum: 29 },
  'Amos': { name: 'Amos', abbreviation: 'AMO', testament: 'OT', orderNum: 30 },
  'Obadia': { name: 'Obadia', abbreviation: 'OBA', testament: 'OT', orderNum: 31 },
  'Iona': { name: 'Iona', abbreviation: 'JON', testament: 'OT', orderNum: 32 },
  'Mica': { name: 'Mica', abbreviation: 'MIC', testament: 'OT', orderNum: 33 },
  'Naum': { name: 'Naum', abbreviation: 'NAM', testament: 'OT', orderNum: 34 },
  'Habacuc': { name: 'Habacuc', abbreviation: 'HAB', testament: 'OT', orderNum: 35 },
  'Țefania': { name: 'Țefania', abbreviation: 'ZEP', testament: 'OT', orderNum: 36 },
  'Hagai': { name: 'Hagai', abbreviation: 'HAG', testament: 'OT', orderNum: 37 },
  'Zaharia': { name: 'Zaharia', abbreviation: 'ZEC', testament: 'OT', orderNum: 38 },
  'Maleahi': { name: 'Maleahi', abbreviation: 'MAL', testament: 'OT', orderNum: 39 },

  // New Testament
  'Matei': { name: 'Matei', abbreviation: 'MAT', testament: 'NT', orderNum: 40 },
  'Marcu': { name: 'Marcu', abbreviation: 'MRK', testament: 'NT', orderNum: 41 },
  'Luca': { name: 'Luca', abbreviation: 'LUK', testament: 'NT', orderNum: 42 },
  'Ioan': { name: 'Ioan', abbreviation: 'JHN', testament: 'NT', orderNum: 43 },
  'Faptele Apostolilor': { name: 'Faptele Apostolilor', abbreviation: 'ACT', testament: 'NT', orderNum: 44 },
  'Romani': { name: 'Romani', abbreviation: 'ROM', testament: 'NT', orderNum: 45 },
  '1 Corinteni': { name: '1 Corinteni', abbreviation: '1CO', testament: 'NT', orderNum: 46 },
  '2 Corinteni': { name: '2 Corinteni', abbreviation: '2CO', testament: 'NT', orderNum: 47 },
  'Galateni': { name: 'Galateni', abbreviation: 'GAL', testament: 'NT', orderNum: 48 },
  'Efeseni': { name: 'Efeseni', abbreviation: 'EPH', testament: 'NT', orderNum: 49 },
  'Filipeni': { name: 'Filipeni', abbreviation: 'PHP', testament: 'NT', orderNum: 50 },
  'Coloseni': { name: 'Coloseni', abbreviation: 'COL', testament: 'NT', orderNum: 51 },
  '1 Tesaloniceni': { name: '1 Tesaloniceni', abbreviation: '1TH', testament: 'NT', orderNum: 52 },
  '2 Tesaloniceni': { name: '2 Tesaloniceni', abbreviation: '2TH', testament: 'NT', orderNum: 53 },
  '1 Timotei': { name: '1 Timotei', abbreviation: '1TI', testament: 'NT', orderNum: 54 },
  '2 Timotei': { name: '2 Timotei', abbreviation: '2TI', testament: 'NT', orderNum: 55 },
  'Titus': { name: 'Titus', abbreviation: 'TIT', testament: 'NT', orderNum: 56 },
  'Filimon': { name: 'Filimon', abbreviation: 'PHM', testament: 'NT', orderNum: 57 },
  'Evrei': { name: 'Evrei', abbreviation: 'HEB', testament: 'NT', orderNum: 58 },
  'Iacov': { name: 'Iacov', abbreviation: 'JAS', testament: 'NT', orderNum: 59 },
  '1 Petru': { name: '1 Petru', abbreviation: '1PE', testament: 'NT', orderNum: 60 },
  '2 Petru': { name: '2 Petru', abbreviation: '2PE', testament: 'NT', orderNum: 61 },
  '1 Ioan': { name: '1 Ioan', abbreviation: '1JN', testament: 'NT', orderNum: 62 },
  '2 Ioan': { name: '2 Ioan', abbreviation: '2JN', testament: 'NT', orderNum: 63 },
  '3 Ioan': { name: '3 Ioan', abbreviation: '3JN', testament: 'NT', orderNum: 64 },
  'Iuda': { name: 'Iuda', abbreviation: 'JUD', testament: 'NT', orderNum: 65 },
  'Revelaţia': { name: 'Revelația', abbreviation: 'REV', testament: 'NT', orderNum: 66 },
}

interface ParsedVerse {
  verseNum: number
  text: string
}

interface ParsedChapter {
  chapterNum: number
  verses: ParsedVerse[]
}

interface ParsedBook {
  name: string
  chapters: ParsedChapter[]
}

async function parseRomanianBible(filePath: string): Promise<ParsedBook[]> {
  console.log(`Reading Romanian Bible from: ${filePath}`)

  const content = fs.readFileSync(filePath, 'utf-8')
  const lines = content.split('\n')

  const books: ParsedBook[] = []
  let currentBook: ParsedBook | null = null
  let currentChapter: ParsedChapter | null = null
  let isInBibleContent = false

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim()

    // Start processing after "VECHIUL TESTAMENT"
    if (line === 'VECHIUL TESTAMENT' || line === 'TESTAMENT') {
      isInBibleContent = true
      continue
    }

    if (!isInBibleContent) continue

    // Book detection: … BookName …
    const bookMatch = line.match(/^…\s*(.+?)\s*…$/)
    if (bookMatch) {
      // Save the previous book if one exists
      if (currentBook && currentBook.chapters.length > 0) {
        books.push(currentBook)
      }

      const bookName = bookMatch[1].trim()
      console.log(`Found book: ${bookName}`)

      currentBook = {
        name: bookName,
        chapters: []
      }
      currentChapter = null
      continue
    }

    // Chapter detection: "Capitolul X" (the source uses erratic casing, e.g. "CApitoLuL")
    const chapterMatch = line.match(/^capitolul\s+(\d+)$/i)
    if (chapterMatch && currentBook) {
      // Save the previous chapter if one exists
      if (currentChapter && currentChapter.verses.length > 0) {
        currentBook.chapters.push(currentChapter)
      }

      const chapterNum = parseInt(chapterMatch[1], 10)
      console.log(`  Chapter ${chapterNum}`)

      currentChapter = {
        chapterNum,
        verses: []
      }
      continue
    }

    // Verse detection: line starts with a number
    const verseMatch = line.match(/^(\d+)\s+(.+)$/)
    if (verseMatch && currentChapter) {
      const verseNum = parseInt(verseMatch[1], 10)
      let verseText = verseMatch[2].trim()

      // Strip paragraph markers
      verseText = verseText.replace(/^¶\s*/, '')

      // Look ahead for continuation lines (lines that don't start with numbers or special markers)
      let j = i + 1
      while (j < lines.length) {
        const nextLine = lines[j].trim()

        // Stop if we hit a new verse, chapter, book, or an empty line
        if (!nextLine ||
            nextLine.match(/^\d+\s/) || // New verse
            nextLine.match(/^capitolul\s+\d+$/i) || // New chapter
            nextLine.match(/^….*…$/) || // New book
            nextLine === 'TESTAMENT') { // Testament marker
          break
        }

        // Append the continuation line
        verseText += ' ' + nextLine
        j++
      }

      // Clean up the text
      verseText = verseText.replace(/\s+/g, ' ').trim()

      currentChapter.verses.push({
        verseNum,
        text: verseText
      })

      // Skip the lines we've already consumed
      i = j - 1
      continue
    }
  }

  // Save the last chapter and book
  if (currentChapter && currentChapter.verses.length > 0 && currentBook) {
    currentBook.chapters.push(currentChapter)
  }
  if (currentBook && currentBook.chapters.length > 0) {
    books.push(currentBook)
  }

  console.log(`Parsed ${books.length} books`)
  return books
}

async function importRomanianBible() {
  try {
    console.log('Starting Romanian Bible import...')

    // Clear existing data
    console.log('Clearing existing data...')
    await prisma.bibleVerse.deleteMany()
    await prisma.bibleChapter.deleteMany()
    await prisma.bibleBook.deleteMany()

    // Parse the markdown file
    const filePath = path.join(process.cwd(), 'bibles', 'Biblia-Fidela-limba-romana.md')
    const books = await parseRomanianBible(filePath)

    console.log(`Importing ${books.length} books into database...`)

    for (const book of books) {
      const bookInfo = BOOK_MAPPINGS[book.name]
      if (!bookInfo) {
        console.warn(`Warning: No mapping found for book "${book.name}", skipping...`)
        continue
      }

      console.log(`Creating book: ${bookInfo.name}`)

      // Create the book
      const createdBook = await prisma.bibleBook.create({
        data: {
          id: bookInfo.orderNum,
          name: bookInfo.name,
          testament: bookInfo.testament,
          orderNum: bookInfo.orderNum
        }
      })

      // Create chapters and verses
      for (const chapter of book.chapters) {
        console.log(`  Creating chapter ${chapter.chapterNum} with ${chapter.verses.length} verses`)

        const createdChapter = await prisma.bibleChapter.create({
          data: {
            bookId: createdBook.id,
            chapterNum: chapter.chapterNum
          }
        })

        // Create verses in batch (deduplicated by verse number; later duplicates win)
        const uniqueVerses = chapter.verses.reduce((acc, verse) => {
          acc[verse.verseNum] = verse
          return acc
        }, {} as Record<number, ParsedVerse>)

        const versesData = Object.values(uniqueVerses).map(verse => ({
          chapterId: createdChapter.id,
          verseNum: verse.verseNum,
          text: verse.text,
          version: 'FIDELA'
        }))

        if (versesData.length > 0) {
          await prisma.bibleVerse.createMany({
            data: versesData
          })
        }
      }
    }

    // Print a summary
    const bookCount = await prisma.bibleBook.count()
    const chapterCount = await prisma.bibleChapter.count()
    const verseCount = await prisma.bibleVerse.count()

    console.log('\n✅ Romanian Bible import completed successfully!')
    console.log(`📚 Books imported: ${bookCount}`)
    console.log(`📖 Chapters imported: ${chapterCount}`)
    console.log(`📝 Verses imported: ${verseCount}`)

  } catch (error) {
    console.error('❌ Error importing Romanian Bible:', error)
    throw error
  } finally {
    await prisma.$disconnect()
  }
}

// Run the import when executed directly
if (require.main === module) {
  importRomanianBible()
    .then(() => {
      console.log('Import completed successfully!')
      process.exit(0)
    })
    .catch((error) => {
      console.error('Import failed:', error)
      process.exit(1)
    })
}

export { importRomanianBible }
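Note that importRomanianBible() is destructive: it deletes every existing bibleVerse, bibleChapter, and bibleBook row before re-importing. It also assumes the Prisma schema defines those three models with the fields used above (name/testament/orderNum, bookId/chapterNum, and chapterId/verseNum/text/version); the schema itself is not part of this diff.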
scripts/ingest_bible_pgvector.py (new file, 231 lines)
import os, re, asyncio
from typing import List
from dataclasses import dataclass
from pathlib import Path
from dotenv import load_dotenv
import httpx
import psycopg

load_dotenv()

AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
EMBED_DIMS = int(os.getenv("EMBED_DIMS", "3072"))
DB_URL = os.getenv("DATABASE_URL")
BIBLE_MD_PATH = os.getenv("BIBLE_MD_PATH")
LANG_CODE = os.getenv("LANG_CODE", "ro")
TRANSLATION = os.getenv("TRANSLATION_CODE", "FIDELA")

assert AZ_ENDPOINT and AZ_API_KEY and DB_URL and BIBLE_MD_PATH, "Missing required env vars"

EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"

BOOKS_OT = [
    "Geneza","Exodul","Leviticul","Numeri","Deuteronom","Iosua","Judecători","Rut",
    "1 Samuel","2 Samuel","1 Imparati","2 Imparati","1 Cronici","2 Cronici","Ezra","Neemia","Estera",
    "Iov","Psalmii","Proverbe","Eclesiastul","Cântarea Cântărilor","Isaia","Ieremia","Plângerile",
    "Ezechiel","Daniel","Osea","Ioel","Amos","Obadia","Iona","Mica","Naum","Habacuc","Țefania","Hagai","Zaharia","Maleahi"
]
BOOKS_NT = [
    "Matei","Marcu","Luca","Ioan","Faptele Apostolilor","Romani","1 Corinteni","2 Corinteni",
    "Galateni","Efeseni","Filipeni","Coloseni","1 Tesaloniceni","2 Tesaloniceni","1 Timotei","2 Timotei",
    "Titus","Filimon","Evrei","Iacov","1 Petru","2 Petru","1 Ioan","2 Ioan","3 Ioan","Iuda","Revelaţia"
]

BOOK_CANON = {b: ("OT" if b in BOOKS_OT else "NT") for b in BOOKS_OT + BOOKS_NT}

@dataclass
class Verse:
    testament: str
    book: str
    chapter: int
    verse: int
    text_raw: str
    text_norm: str

def normalize_text(s: str) -> str:
    s = re.sub(r"\s+", " ", s.strip())
    s = s.replace("\u00a0", " ")  # non-breaking spaces -> plain spaces
    return s

BOOK_RE = re.compile(r"^(?P<book>[A-ZĂÂÎȘȚ][^\n]+?)\s*$")
CH_RE = re.compile(r"^(?i:capitolul)\s+(?P<ch>\d+)\b")  # source casing is erratic, e.g. "CApitoLuL"
VERSE_RE = re.compile(r"^(?P<v>\d+)\s+(?P<body>.+)$")

def to_pgvector(embedding: List[float]) -> str:
    """Serialize floats into pgvector's text format; cast with ::vector in SQL."""
    return "[" + ",".join(map(str, embedding)) + "]"

def parse_bible_md(md_text: str):
    cur_book, cur_ch = None, None
    testament = None
    is_in_bible_content = False

    for line in md_text.splitlines():
        line = line.rstrip()

        # Start processing after "VECHIUL TESTAMENT" or when we find book markers
        if line == 'VECHIUL TESTAMENT' or line == 'TESTAMENT' or '…' in line:
            is_in_bible_content = True

        if not is_in_bible_content:
            continue

        # Book detection: … BookName …
        book_match = re.match(r'^…\s*(.+?)\s*…$', line)
        if book_match:
            bname = book_match.group(1).strip()
            if bname in BOOK_CANON:
                cur_book = bname
                testament = BOOK_CANON[bname]
                cur_ch = None
                print(f"Found book: {bname}")
            continue

        # Chapter detection: "Capitolul X"
        m_ch = CH_RE.match(line)
        if m_ch and cur_book:
            cur_ch = int(m_ch.group("ch"))
            print(f"  Chapter {cur_ch}")
            continue

        # Verse detection: line starts with a number
        m_v = VERSE_RE.match(line)
        if m_v and cur_book and cur_ch:
            vnum = int(m_v.group("v"))
            body = m_v.group("body").strip()

            # Remove paragraph markers
            body = re.sub(r'^¶\s*', '', body)

            raw = body
            norm = normalize_text(body)
            yield {
                "testament": testament, "book": cur_book, "chapter": cur_ch, "verse": vnum,
                "text_raw": raw, "text_norm": norm
            }

async def embed_batch(client, inputs):
    payload = {"input": inputs}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}
    for attempt in range(6):
        try:
            r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=60)
            if r.status_code == 200:
                data = r.json()
                # Responses can arrive out of order; sort by input index
                ordered = sorted(data["data"], key=lambda x: x["index"])
                return [d["embedding"] for d in ordered]
            elif r.status_code in (429, 500, 503):
                backoff = 2 ** attempt + (0.1 * attempt)
                print(f"Rate limited, waiting {backoff:.1f}s...")
                await asyncio.sleep(backoff)
            else:
                raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")
        except httpx.HTTPError as e:
            backoff = 2 ** attempt + (0.1 * attempt)
            print(f"Error on attempt {attempt + 1}: {e}, waiting {backoff:.1f}s...")
            await asyncio.sleep(backoff)
    raise RuntimeError("Failed to embed after retries")

# First, we need to create the table with proper SQL. The vector dimension
# must match the embedding deployment's output, so it follows EMBED_DIMS.
CREATE_TABLE_SQL = f"""
CREATE TABLE IF NOT EXISTS bible_passages (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  testament TEXT NOT NULL,
  book TEXT NOT NULL,
  chapter INT NOT NULL,
  verse INT NOT NULL,
  ref TEXT GENERATED ALWAYS AS (book || ' ' || chapter || ':' || verse) STORED,
  lang TEXT NOT NULL DEFAULT 'ro',
  translation TEXT NOT NULL DEFAULT 'FIDELA',
  text_raw TEXT NOT NULL,
  text_norm TEXT NOT NULL,
  tsv tsvector,
  embedding vector({EMBED_DIMS}),
  created_at TIMESTAMPTZ DEFAULT now(),
  updated_at TIMESTAMPTZ DEFAULT now()
);
"""

CREATE_INDEXES_SQL = """
-- Uniqueness by canonical reference within translation/language
CREATE UNIQUE INDEX IF NOT EXISTS ux_ref_lang ON bible_passages (translation, lang, book, chapter, verse);

-- Full-text index
CREATE INDEX IF NOT EXISTS idx_tsv ON bible_passages USING GIN (tsv);

-- Other indexes
CREATE INDEX IF NOT EXISTS idx_book_ch ON bible_passages (book, chapter);
CREATE INDEX IF NOT EXISTS idx_testament ON bible_passages (testament);
"""

UPSERT_SQL = """
INSERT INTO bible_passages (testament, book, chapter, verse, lang, translation, text_raw, text_norm, tsv, embedding)
VALUES (%(testament)s, %(book)s, %(chapter)s, %(verse)s, %(lang)s, %(translation)s, %(text_raw)s, %(text_norm)s,
        to_tsvector(COALESCE(%(ts_lang)s,'simple')::regconfig, %(text_norm)s), %(embedding)s::vector)
ON CONFLICT (translation, lang, book, chapter, verse) DO UPDATE
SET text_raw=EXCLUDED.text_raw,
    text_norm=EXCLUDED.text_norm,
    tsv=EXCLUDED.tsv,
    embedding=EXCLUDED.embedding,
    updated_at=now();
"""

async def main():
    print("Starting Bible embedding ingestion...")

    md_text = Path(BIBLE_MD_PATH).read_text(encoding="utf-8", errors="ignore")
    verses = list(parse_bible_md(md_text))
    print(f"Parsed verses: {len(verses)}")

    batch_size = 128

    # First create the table structure
    with psycopg.connect(DB_URL) as conn:
        with conn.cursor() as cur:
            print("Creating bible_passages table...")
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute(CREATE_TABLE_SQL)
            cur.execute(CREATE_INDEXES_SQL)
            conn.commit()
            print("Table created successfully")

    # Now generate and store the embeddings
    async with httpx.AsyncClient() as client:
        with psycopg.connect(DB_URL, autocommit=False) as conn:
            with conn.cursor() as cur:
                for i in range(0, len(verses), batch_size):
                    batch = verses[i:i+batch_size]
                    inputs = [v["text_norm"] for v in batch]

                    print(f"Generating embeddings for batch {i//batch_size + 1}/{(len(verses) + batch_size - 1)//batch_size}")
                    embs = await embed_batch(client, inputs)

                    rows = []
                    for v, e in zip(batch, embs):
                        rows.append({
                            **v,
                            "lang": LANG_CODE,
                            "translation": TRANSLATION,
                            "ts_lang": "romanian",
                            "embedding": to_pgvector(e)
                        })

                    cur.executemany(UPSERT_SQL, rows)
                    conn.commit()
                    print(f"Upserted {len(rows)} verses... {i+len(rows)}/{len(verses)}")

    # Create the IVFFLAT index after the data is loaded, so the lists are
    # trained on real data. Note: pgvector's ivfflat supports at most 2000
    # dimensions, so with EMBED_DIMS=3072 this CREATE INDEX will fail and
    # queries fall back to exact (sequential) scans.
    print("Creating IVFFLAT index...")
    with psycopg.connect(DB_URL, autocommit=True) as conn:
        with conn.cursor() as cur:
            cur.execute("VACUUM ANALYZE bible_passages;")
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_vec_ivfflat
                ON bible_passages USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 200);
            """)

    print("✅ Bible embedding ingestion completed successfully!")

if __name__ == "__main__":
    asyncio.run(main())
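After ingestion, a couple of ad-hoc SQL checks are useful. A sketch (the verse reference 'Ioan 3:16' follows the format of the generated ref column; substitute any verse known to exist):

-- Row counts per translation and language
SELECT translation, lang, COUNT(*) FROM bible_passages GROUP BY translation, lang;

-- Nearest neighbours of a known verse via the cosine-distance operator
SELECT ref, LEFT(text_raw, 80) AS preview
FROM bible_passages
ORDER BY embedding <=> (SELECT embedding FROM bible_passages WHERE ref = 'Ioan 3:16' LIMIT 1)
LIMIT 5;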