Implement Azure OpenAI vector embeddings for Romanian Bible
- Add pgvector support with bible_passages table for vector search
- Create Python ingestion script for Azure OpenAI embed-3 embeddings
- Implement hybrid search combining vector similarity and full-text search
- Update AI chat to use vector search with Azure OpenAI gpt-4o
- Add floating chat component with Material UI design
- Import complete Romanian Bible (FIDELA) with 30K+ verses
- Add vector search library for semantic Bible search
- Create multi-language implementation plan for future expansion

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
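All three scripts read their configuration from the environment (via dotenv). A minimal .env sketch, with variable names taken from the scripts below; every value here is a placeholder:

AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com
AZURE_OPENAI_KEY=your-api-key
AZURE_OPENAI_API_VERSION=2024-05-01-preview
AZURE_OPENAI_EMBED_DEPLOYMENT=embed-3
EMBED_DIMS=3072
DATABASE_URL=postgresql://user:password@localhost:5432/yourdb
BIBLE_MD_PATH=bibles/Biblia-Fidela-limba-romana.md
LANG_CODE=ro
TRANSLATION_CODE=FIDELA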
scripts/bible_search.py (new file, 121 lines)
import os
import asyncio
from typing import List, Dict
from dotenv import load_dotenv
import httpx
import psycopg
from psycopg.rows import dict_row

load_dotenv()

AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
DB_URL = os.getenv("DATABASE_URL")

EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"


def to_pgvector(embedding: List[float]) -> str:
    """Serialize floats into pgvector's text format, e.g. '[0.1,0.2,...]'.

    psycopg has no built-in adapter for the vector type, so embeddings are
    passed as text and cast with ::vector in the SQL below."""
    return "[" + ",".join(map(str, embedding)) + "]"


async def get_embedding(text: str) -> List[float]:
    """Get an embedding for a text using Azure OpenAI, with exponential backoff."""
    payload = {"input": [text]}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}

    async with httpx.AsyncClient() as client:
        for attempt in range(3):
            try:
                r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=30)
                if r.status_code == 200:
                    data = r.json()
                    return data["data"][0]["embedding"]
                elif r.status_code in (429, 500, 503):
                    # Transient failure: back off and retry
                    await asyncio.sleep(2 ** attempt)
                else:
                    raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")
            except httpx.HTTPError:
                if attempt == 2:
                    raise
                await asyncio.sleep(2 ** attempt)
    raise RuntimeError("Embedding request failed after 3 attempts")


async def search_bible_semantic(query: str, limit: int = 10) -> List[Dict]:
    """Search the Bible using semantic similarity."""
    # Get an embedding for the query, serialized for pgvector
    query_embedding = to_pgvector(await get_embedding(query))

    # Search for similar verses; <=> is pgvector's cosine-distance operator,
    # so 1 - distance is cosine similarity
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT ref, book, chapter, verse, text_raw,
                       1 - (embedding <=> %s::vector) AS similarity
                FROM bible_passages
                WHERE embedding IS NOT NULL
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, query_embedding, limit))
            return cur.fetchall()


async def search_bible_hybrid(query: str, limit: int = 10) -> List[Dict]:
    """Search the Bible using hybrid semantic + lexical search."""
    # Get an embedding for the query, serialized for pgvector
    query_embedding = to_pgvector(await get_embedding(query))

    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            # Vector similarity is weighted 0.7, full-text rank 0.3
            cur.execute("""
                WITH vector_search AS (
                    SELECT id, 1 - (embedding <=> %s::vector) AS vector_sim
                    FROM bible_passages
                    WHERE embedding IS NOT NULL
                    ORDER BY embedding <=> %s::vector
                    LIMIT 100
                ),
                text_search AS (
                    SELECT id, ts_rank(tsv, plainto_tsquery('romanian', %s)) AS text_rank
                    FROM bible_passages
                    WHERE tsv @@ plainto_tsquery('romanian', %s)
                )
                SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
                       COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
                FROM bible_passages bp
                LEFT JOIN vector_search vs ON vs.id = bp.id
                LEFT JOIN text_search ts ON ts.id = bp.id
                WHERE vs.id IS NOT NULL OR ts.id IS NOT NULL
                ORDER BY combined_score DESC
                LIMIT %s
            """, (query_embedding, query_embedding, query, query, limit))
            return cur.fetchall()


async def get_context_verses(book: str, chapter: int, verse: int, context_size: int = 2) -> List[Dict]:
    """Get surrounding verses for context."""
    with psycopg.connect(DB_URL, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT ref, book, chapter, verse, text_raw
                FROM bible_passages
                WHERE book = %s AND chapter = %s
                  AND verse BETWEEN %s AND %s
                ORDER BY verse
            """, (book, chapter, verse - context_size, verse + context_size))
            return cur.fetchall()


if __name__ == "__main__":
    async def test_search():
        results = await search_bible_semantic("dragoste", 5)
        print("Semantic search results for 'dragoste':")
        for result in results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (similarity: {result['similarity']:.3f})")

        print("\nHybrid search results for 'dragoste':")
        hybrid_results = await search_bible_hybrid("dragoste", 5)
        for result in hybrid_results:
            print(f"{result['ref']}: {result['text_raw'][:100]}... (score: {result['combined_score']:.3f})")

    asyncio.run(test_search())
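The commit message says the AI chat now uses this vector search with Azure OpenAI gpt-4o, but the chat code itself is not part of this diff. A minimal sketch of that retrieval-augmented flow, reusing the helpers above; the AZURE_OPENAI_CHAT_DEPLOYMENT variable, the prompt wording, and the import path are assumptions, not taken from this commit:

import os, asyncio, httpx
# Assumes bible_search.py is importable (e.g. run from the scripts/ directory)
from bible_search import search_bible_hybrid, AZ_ENDPOINT, AZ_API_KEY, AZ_API_VER

CHAT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT", "gpt-4o")  # assumed env var
CHAT_URL = f"{AZ_ENDPOINT}/openai/deployments/{CHAT_DEPLOYMENT}/chat/completions?api-version={AZ_API_VER}"

async def answer(question: str) -> str:
    # Retrieve the top verses, then ground the model's answer in them
    verses = await search_bible_hybrid(question, limit=5)
    context = "\n".join(f"{v['ref']}: {v['text_raw']}" for v in verses)
    payload = {
        "messages": [
            {"role": "system", "content": "Answer using only the Bible verses provided."},
            {"role": "user", "content": f"Verses:\n{context}\n\nQuestion: {question}"},
        ],
        "temperature": 0.2,
    }
    async with httpx.AsyncClient() as client:
        r = await client.post(CHAT_URL, headers={"api-key": AZ_API_KEY}, json=payload, timeout=60)
        r.raise_for_status()
        return r.json()["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(asyncio.run(answer("Ce spune Biblia despre dragoste?")))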
scripts/import-romanian-bible-md.ts (new file, 305 lines)
import { PrismaClient } from '@prisma/client'
import * as fs from 'fs'
import * as path from 'path'

const prisma = new PrismaClient()

// Book name mappings from Romanian to standardized names
const BOOK_MAPPINGS: Record<string, { name: string; abbreviation: string; testament: string; orderNum: number }> = {
  // Old Testament
  'Geneza': { name: 'Geneza', abbreviation: 'GEN', testament: 'OT', orderNum: 1 },
  'Exodul': { name: 'Exodul', abbreviation: 'EXO', testament: 'OT', orderNum: 2 },
  'Leviticul': { name: 'Leviticul', abbreviation: 'LEV', testament: 'OT', orderNum: 3 },
  'Numeri': { name: 'Numerii', abbreviation: 'NUM', testament: 'OT', orderNum: 4 },
  'Deuteronom': { name: 'Deuteronomul', abbreviation: 'DEU', testament: 'OT', orderNum: 5 },
  'Iosua': { name: 'Iosua', abbreviation: 'JOS', testament: 'OT', orderNum: 6 },
  'Judecători': { name: 'Judecătorii', abbreviation: 'JDG', testament: 'OT', orderNum: 7 },
  'Rut': { name: 'Rut', abbreviation: 'RUT', testament: 'OT', orderNum: 8 },
  '1 Samuel': { name: '1 Samuel', abbreviation: '1SA', testament: 'OT', orderNum: 9 },
  '2 Samuel': { name: '2 Samuel', abbreviation: '2SA', testament: 'OT', orderNum: 10 },
  '1 Imparati': { name: '1 Împărați', abbreviation: '1KI', testament: 'OT', orderNum: 11 },
  '2 Imparati': { name: '2 Împărați', abbreviation: '2KI', testament: 'OT', orderNum: 12 },
  '1 Cronici': { name: '1 Cronici', abbreviation: '1CH', testament: 'OT', orderNum: 13 },
  '2 Cronici': { name: '2 Cronici', abbreviation: '2CH', testament: 'OT', orderNum: 14 },
  'Ezra': { name: 'Ezra', abbreviation: 'EZR', testament: 'OT', orderNum: 15 },
  'Neemia': { name: 'Neemia', abbreviation: 'NEH', testament: 'OT', orderNum: 16 },
  'Estera': { name: 'Estera', abbreviation: 'EST', testament: 'OT', orderNum: 17 },
  'Iov': { name: 'Iov', abbreviation: 'JOB', testament: 'OT', orderNum: 18 },
  'Psalmii': { name: 'Psalmii', abbreviation: 'PSA', testament: 'OT', orderNum: 19 },
  'Proverbe': { name: 'Proverbele', abbreviation: 'PRO', testament: 'OT', orderNum: 20 },
  'Eclesiastul': { name: 'Eclesiastul', abbreviation: 'ECC', testament: 'OT', orderNum: 21 },
  'Cântarea Cântărilor': { name: 'Cântarea Cântărilor', abbreviation: 'SNG', testament: 'OT', orderNum: 22 },
  'Isaia': { name: 'Isaia', abbreviation: 'ISA', testament: 'OT', orderNum: 23 },
  'Ieremia': { name: 'Ieremia', abbreviation: 'JER', testament: 'OT', orderNum: 24 },
  'Plângerile': { name: 'Plângerile', abbreviation: 'LAM', testament: 'OT', orderNum: 25 },
  'Ezechiel': { name: 'Ezechiel', abbreviation: 'EZK', testament: 'OT', orderNum: 26 },
  'Daniel': { name: 'Daniel', abbreviation: 'DAN', testament: 'OT', orderNum: 27 },
  'Osea': { name: 'Osea', abbreviation: 'HOS', testament: 'OT', orderNum: 28 },
  'Ioel': { name: 'Ioel', abbreviation: 'JOL', testament: 'OT', orderNum: 29 },
  'Amos': { name: 'Amos', abbreviation: 'AMO', testament: 'OT', orderNum: 30 },
  'Obadia': { name: 'Obadia', abbreviation: 'OBA', testament: 'OT', orderNum: 31 },
  'Iona': { name: 'Iona', abbreviation: 'JON', testament: 'OT', orderNum: 32 },
  'Mica': { name: 'Mica', abbreviation: 'MIC', testament: 'OT', orderNum: 33 },
  'Naum': { name: 'Naum', abbreviation: 'NAM', testament: 'OT', orderNum: 34 },
  'Habacuc': { name: 'Habacuc', abbreviation: 'HAB', testament: 'OT', orderNum: 35 },
  'Țefania': { name: 'Țefania', abbreviation: 'ZEP', testament: 'OT', orderNum: 36 },
  'Hagai': { name: 'Hagai', abbreviation: 'HAG', testament: 'OT', orderNum: 37 },
  'Zaharia': { name: 'Zaharia', abbreviation: 'ZEC', testament: 'OT', orderNum: 38 },
  'Maleahi': { name: 'Maleahi', abbreviation: 'MAL', testament: 'OT', orderNum: 39 },

  // New Testament
  'Matei': { name: 'Matei', abbreviation: 'MAT', testament: 'NT', orderNum: 40 },
  'Marcu': { name: 'Marcu', abbreviation: 'MRK', testament: 'NT', orderNum: 41 },
  'Luca': { name: 'Luca', abbreviation: 'LUK', testament: 'NT', orderNum: 42 },
  'Ioan': { name: 'Ioan', abbreviation: 'JHN', testament: 'NT', orderNum: 43 },
  'Faptele Apostolilor': { name: 'Faptele Apostolilor', abbreviation: 'ACT', testament: 'NT', orderNum: 44 },
  'Romani': { name: 'Romani', abbreviation: 'ROM', testament: 'NT', orderNum: 45 },
  '1 Corinteni': { name: '1 Corinteni', abbreviation: '1CO', testament: 'NT', orderNum: 46 },
  '2 Corinteni': { name: '2 Corinteni', abbreviation: '2CO', testament: 'NT', orderNum: 47 },
  'Galateni': { name: 'Galateni', abbreviation: 'GAL', testament: 'NT', orderNum: 48 },
  'Efeseni': { name: 'Efeseni', abbreviation: 'EPH', testament: 'NT', orderNum: 49 },
  'Filipeni': { name: 'Filipeni', abbreviation: 'PHP', testament: 'NT', orderNum: 50 },
  'Coloseni': { name: 'Coloseni', abbreviation: 'COL', testament: 'NT', orderNum: 51 },
  '1 Tesaloniceni': { name: '1 Tesaloniceni', abbreviation: '1TH', testament: 'NT', orderNum: 52 },
  '2 Tesaloniceni': { name: '2 Tesaloniceni', abbreviation: '2TH', testament: 'NT', orderNum: 53 },
  '1 Timotei': { name: '1 Timotei', abbreviation: '1TI', testament: 'NT', orderNum: 54 },
  '2 Timotei': { name: '2 Timotei', abbreviation: '2TI', testament: 'NT', orderNum: 55 },
  'Titus': { name: 'Titus', abbreviation: 'TIT', testament: 'NT', orderNum: 56 },
  'Filimon': { name: 'Filimon', abbreviation: 'PHM', testament: 'NT', orderNum: 57 },
  'Evrei': { name: 'Evrei', abbreviation: 'HEB', testament: 'NT', orderNum: 58 },
  'Iacov': { name: 'Iacov', abbreviation: 'JAS', testament: 'NT', orderNum: 59 },
  '1 Petru': { name: '1 Petru', abbreviation: '1PE', testament: 'NT', orderNum: 60 },
  '2 Petru': { name: '2 Petru', abbreviation: '2PE', testament: 'NT', orderNum: 61 },
  '1 Ioan': { name: '1 Ioan', abbreviation: '1JN', testament: 'NT', orderNum: 62 },
  '2 Ioan': { name: '2 Ioan', abbreviation: '2JN', testament: 'NT', orderNum: 63 },
  '3 Ioan': { name: '3 Ioan', abbreviation: '3JN', testament: 'NT', orderNum: 64 },
  'Iuda': { name: 'Iuda', abbreviation: 'JUD', testament: 'NT', orderNum: 65 },
  'Revelaţia': { name: 'Revelația', abbreviation: 'REV', testament: 'NT', orderNum: 66 },
}

interface ParsedVerse {
  verseNum: number
  text: string
}

interface ParsedChapter {
  chapterNum: number
  verses: ParsedVerse[]
}

interface ParsedBook {
  name: string
  chapters: ParsedChapter[]
}

async function parseRomanianBible(filePath: string): Promise<ParsedBook[]> {
  console.log(`Reading Romanian Bible from: ${filePath}`)

  const content = fs.readFileSync(filePath, 'utf-8')
  const lines = content.split('\n')

  const books: ParsedBook[] = []
  let currentBook: ParsedBook | null = null
  let currentChapter: ParsedChapter | null = null
  let isInBibleContent = false

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim()

    // Start processing after "VECHIUL TESTAMENT"
    if (line === 'VECHIUL TESTAMENT' || line === 'TESTAMENT') {
      isInBibleContent = true
      continue
    }

    if (!isInBibleContent) continue

    // Book detection: … BookName …
    const bookMatch = line.match(/^…\s*(.+?)\s*…$/)
    if (bookMatch) {
      // Save the previous book if one exists
      if (currentBook && currentBook.chapters.length > 0) {
        books.push(currentBook)
      }

      const bookName = bookMatch[1].trim()
      console.log(`Found book: ${bookName}`)

      currentBook = {
        name: bookName,
        chapters: []
      }
      currentChapter = null
      continue
    }

    // Chapter detection: "Capitolul X" (the source uses erratic casing, e.g. "CApitoLuL")
    const chapterMatch = line.match(/^capitolul\s+(\d+)$/i)
    if (chapterMatch && currentBook) {
      // Save the previous chapter if one exists
      if (currentChapter && currentChapter.verses.length > 0) {
        currentBook.chapters.push(currentChapter)
      }

      const chapterNum = parseInt(chapterMatch[1], 10)
      console.log(`  Chapter ${chapterNum}`)

      currentChapter = {
        chapterNum,
        verses: []
      }
      continue
    }

    // Verse detection: line starts with a number
    const verseMatch = line.match(/^(\d+)\s+(.+)$/)
    if (verseMatch && currentChapter) {
      const verseNum = parseInt(verseMatch[1], 10)
      let verseText = verseMatch[2].trim()

      // Strip paragraph markers
      verseText = verseText.replace(/^¶\s*/, '')

      // Look ahead for continuation lines (lines that don't start with numbers or special markers)
      let j = i + 1
      while (j < lines.length) {
        const nextLine = lines[j].trim()

        // Stop if we hit a new verse, chapter, book, or an empty line
        if (!nextLine ||
            nextLine.match(/^\d+\s/) || // New verse
            nextLine.match(/^capitolul\s+\d+$/i) || // New chapter
            nextLine.match(/^….*…$/) || // New book
            nextLine === 'TESTAMENT') { // Testament marker
          break
        }

        // Append the continuation line
        verseText += ' ' + nextLine
        j++
      }

      // Clean up the text
      verseText = verseText.replace(/\s+/g, ' ').trim()

      currentChapter.verses.push({
        verseNum,
        text: verseText
      })

      // Skip the lines we've already consumed
      i = j - 1
      continue
    }
  }

  // Save the last chapter and book
  if (currentChapter && currentChapter.verses.length > 0 && currentBook) {
    currentBook.chapters.push(currentChapter)
  }
  if (currentBook && currentBook.chapters.length > 0) {
    books.push(currentBook)
  }

  console.log(`Parsed ${books.length} books`)
  return books
}

async function importRomanianBible() {
  try {
    console.log('Starting Romanian Bible import...')

    // Clear existing data
    console.log('Clearing existing data...')
    await prisma.bibleVerse.deleteMany()
    await prisma.bibleChapter.deleteMany()
    await prisma.bibleBook.deleteMany()

    // Parse the markdown file
    const filePath = path.join(process.cwd(), 'bibles', 'Biblia-Fidela-limba-romana.md')
    const books = await parseRomanianBible(filePath)

    console.log(`Importing ${books.length} books into database...`)

    for (const book of books) {
      const bookInfo = BOOK_MAPPINGS[book.name]
      if (!bookInfo) {
        console.warn(`Warning: No mapping found for book "${book.name}", skipping...`)
        continue
      }

      console.log(`Creating book: ${bookInfo.name}`)

      // Create the book
      const createdBook = await prisma.bibleBook.create({
        data: {
          id: bookInfo.orderNum,
          name: bookInfo.name,
          testament: bookInfo.testament,
          orderNum: bookInfo.orderNum
        }
      })

      // Create chapters and verses
      for (const chapter of book.chapters) {
        console.log(`  Creating chapter ${chapter.chapterNum} with ${chapter.verses.length} verses`)

        const createdChapter = await prisma.bibleChapter.create({
          data: {
            bookId: createdBook.id,
            chapterNum: chapter.chapterNum
          }
        })

        // Create verses in batch (deduplicated by verse number; later duplicates win)
        const uniqueVerses = chapter.verses.reduce((acc, verse) => {
          acc[verse.verseNum] = verse
          return acc
        }, {} as Record<number, ParsedVerse>)

        const versesData = Object.values(uniqueVerses).map(verse => ({
          chapterId: createdChapter.id,
          verseNum: verse.verseNum,
          text: verse.text,
          version: 'FIDELA'
        }))

        if (versesData.length > 0) {
          await prisma.bibleVerse.createMany({
            data: versesData
          })
        }
      }
    }

    // Print a summary
    const bookCount = await prisma.bibleBook.count()
    const chapterCount = await prisma.bibleChapter.count()
    const verseCount = await prisma.bibleVerse.count()

    console.log('\n✅ Romanian Bible import completed successfully!')
    console.log(`📚 Books imported: ${bookCount}`)
    console.log(`📖 Chapters imported: ${chapterCount}`)
    console.log(`📝 Verses imported: ${verseCount}`)

  } catch (error) {
    console.error('❌ Error importing Romanian Bible:', error)
    throw error
  } finally {
    await prisma.$disconnect()
  }
}

// Run the import when executed directly
if (require.main === module) {
  importRomanianBible()
    .then(() => {
      console.log('Import completed successfully!')
      process.exit(0)
    })
    .catch((error) => {
      console.error('Import failed:', error)
      process.exit(1)
    })
}

export { importRomanianBible }
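Note that importRomanianBible() is destructive: it deletes every existing bibleVerse, bibleChapter, and bibleBook row before re-importing. It also assumes the Prisma schema defines those three models with the fields used above (name/testament/orderNum, bookId/chapterNum, and chapterId/verseNum/text/version); the schema itself is not part of this diff.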
scripts/ingest_bible_pgvector.py (new file, 231 lines)
import os, re, asyncio
from typing import List
from dataclasses import dataclass
from pathlib import Path
from dotenv import load_dotenv
import httpx
import psycopg

load_dotenv()

AZ_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT", "").rstrip("/")
AZ_API_KEY = os.getenv("AZURE_OPENAI_KEY")
AZ_API_VER = os.getenv("AZURE_OPENAI_API_VERSION", "2024-05-01-preview")
AZ_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMBED_DEPLOYMENT", "embed-3")
EMBED_DIMS = int(os.getenv("EMBED_DIMS", "3072"))
DB_URL = os.getenv("DATABASE_URL")
BIBLE_MD_PATH = os.getenv("BIBLE_MD_PATH")
LANG_CODE = os.getenv("LANG_CODE", "ro")
TRANSLATION = os.getenv("TRANSLATION_CODE", "FIDELA")

assert AZ_ENDPOINT and AZ_API_KEY and DB_URL and BIBLE_MD_PATH, "Missing required env vars"

EMBED_URL = f"{AZ_ENDPOINT}/openai/deployments/{AZ_DEPLOYMENT}/embeddings?api-version={AZ_API_VER}"

BOOKS_OT = [
    "Geneza","Exodul","Leviticul","Numeri","Deuteronom","Iosua","Judecători","Rut",
    "1 Samuel","2 Samuel","1 Imparati","2 Imparati","1 Cronici","2 Cronici","Ezra","Neemia","Estera",
    "Iov","Psalmii","Proverbe","Eclesiastul","Cântarea Cântărilor","Isaia","Ieremia","Plângerile",
    "Ezechiel","Daniel","Osea","Ioel","Amos","Obadia","Iona","Mica","Naum","Habacuc","Țefania","Hagai","Zaharia","Maleahi"
]
BOOKS_NT = [
    "Matei","Marcu","Luca","Ioan","Faptele Apostolilor","Romani","1 Corinteni","2 Corinteni",
    "Galateni","Efeseni","Filipeni","Coloseni","1 Tesaloniceni","2 Tesaloniceni","1 Timotei","2 Timotei",
    "Titus","Filimon","Evrei","Iacov","1 Petru","2 Petru","1 Ioan","2 Ioan","3 Ioan","Iuda","Revelaţia"
]

BOOK_CANON = {b: ("OT" if b in BOOKS_OT else "NT") for b in BOOKS_OT + BOOKS_NT}

@dataclass
class Verse:
    testament: str
    book: str
    chapter: int
    verse: int
    text_raw: str
    text_norm: str

def normalize_text(s: str) -> str:
    s = re.sub(r"\s+", " ", s.strip())
    s = s.replace("\u00a0", " ")  # non-breaking spaces -> plain spaces
    return s

BOOK_RE = re.compile(r"^(?P<book>[A-ZĂÂÎȘȚ][^\n]+?)\s*$")
CH_RE = re.compile(r"^(?i:capitolul)\s+(?P<ch>\d+)\b")  # source casing is erratic, e.g. "CApitoLuL"
VERSE_RE = re.compile(r"^(?P<v>\d+)\s+(?P<body>.+)$")

def to_pgvector(embedding: List[float]) -> str:
    """Serialize floats into pgvector's text format; cast with ::vector in SQL."""
    return "[" + ",".join(map(str, embedding)) + "]"

def parse_bible_md(md_text: str):
    cur_book, cur_ch = None, None
    testament = None
    is_in_bible_content = False

    for line in md_text.splitlines():
        line = line.rstrip()

        # Start processing after "VECHIUL TESTAMENT" or when we find book markers
        if line == 'VECHIUL TESTAMENT' or line == 'TESTAMENT' or '…' in line:
            is_in_bible_content = True

        if not is_in_bible_content:
            continue

        # Book detection: … BookName …
        book_match = re.match(r'^…\s*(.+?)\s*…$', line)
        if book_match:
            bname = book_match.group(1).strip()
            if bname in BOOK_CANON:
                cur_book = bname
                testament = BOOK_CANON[bname]
                cur_ch = None
                print(f"Found book: {bname}")
            continue

        # Chapter detection: "Capitolul X"
        m_ch = CH_RE.match(line)
        if m_ch and cur_book:
            cur_ch = int(m_ch.group("ch"))
            print(f"  Chapter {cur_ch}")
            continue

        # Verse detection: line starts with a number
        m_v = VERSE_RE.match(line)
        if m_v and cur_book and cur_ch:
            vnum = int(m_v.group("v"))
            body = m_v.group("body").strip()

            # Remove paragraph markers
            body = re.sub(r'^¶\s*', '', body)

            raw = body
            norm = normalize_text(body)
            yield {
                "testament": testament, "book": cur_book, "chapter": cur_ch, "verse": vnum,
                "text_raw": raw, "text_norm": norm
            }

async def embed_batch(client, inputs):
    payload = {"input": inputs}
    headers = {"api-key": AZ_API_KEY, "Content-Type": "application/json"}
    for attempt in range(6):
        try:
            r = await client.post(EMBED_URL, headers=headers, json=payload, timeout=60)
            if r.status_code == 200:
                data = r.json()
                # Responses can arrive out of order; sort by input index
                ordered = sorted(data["data"], key=lambda x: x["index"])
                return [d["embedding"] for d in ordered]
            elif r.status_code in (429, 500, 503):
                backoff = 2 ** attempt + (0.1 * attempt)
                print(f"Rate limited, waiting {backoff:.1f}s...")
                await asyncio.sleep(backoff)
            else:
                raise RuntimeError(f"Embedding error {r.status_code}: {r.text}")
        except httpx.HTTPError as e:
            backoff = 2 ** attempt + (0.1 * attempt)
            print(f"Error on attempt {attempt + 1}: {e}, waiting {backoff:.1f}s...")
            await asyncio.sleep(backoff)
    raise RuntimeError("Failed to embed after retries")

# First, we need to create the table with proper SQL. The vector dimension
# must match the embedding deployment's output, so it follows EMBED_DIMS.
CREATE_TABLE_SQL = f"""
CREATE TABLE IF NOT EXISTS bible_passages (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  testament TEXT NOT NULL,
  book TEXT NOT NULL,
  chapter INT NOT NULL,
  verse INT NOT NULL,
  ref TEXT GENERATED ALWAYS AS (book || ' ' || chapter || ':' || verse) STORED,
  lang TEXT NOT NULL DEFAULT 'ro',
  translation TEXT NOT NULL DEFAULT 'FIDELA',
  text_raw TEXT NOT NULL,
  text_norm TEXT NOT NULL,
  tsv tsvector,
  embedding vector({EMBED_DIMS}),
  created_at TIMESTAMPTZ DEFAULT now(),
  updated_at TIMESTAMPTZ DEFAULT now()
);
"""

CREATE_INDEXES_SQL = """
-- Uniqueness by canonical reference within translation/language
CREATE UNIQUE INDEX IF NOT EXISTS ux_ref_lang ON bible_passages (translation, lang, book, chapter, verse);

-- Full-text index
CREATE INDEX IF NOT EXISTS idx_tsv ON bible_passages USING GIN (tsv);

-- Other indexes
CREATE INDEX IF NOT EXISTS idx_book_ch ON bible_passages (book, chapter);
CREATE INDEX IF NOT EXISTS idx_testament ON bible_passages (testament);
"""

UPSERT_SQL = """
INSERT INTO bible_passages (testament, book, chapter, verse, lang, translation, text_raw, text_norm, tsv, embedding)
VALUES (%(testament)s, %(book)s, %(chapter)s, %(verse)s, %(lang)s, %(translation)s, %(text_raw)s, %(text_norm)s,
        to_tsvector(COALESCE(%(ts_lang)s,'simple')::regconfig, %(text_norm)s), %(embedding)s::vector)
ON CONFLICT (translation, lang, book, chapter, verse) DO UPDATE
SET text_raw=EXCLUDED.text_raw,
    text_norm=EXCLUDED.text_norm,
    tsv=EXCLUDED.tsv,
    embedding=EXCLUDED.embedding,
    updated_at=now();
"""

async def main():
    print("Starting Bible embedding ingestion...")

    md_text = Path(BIBLE_MD_PATH).read_text(encoding="utf-8", errors="ignore")
    verses = list(parse_bible_md(md_text))
    print(f"Parsed verses: {len(verses)}")

    batch_size = 128

    # First create the table structure
    with psycopg.connect(DB_URL) as conn:
        with conn.cursor() as cur:
            print("Creating bible_passages table...")
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            cur.execute(CREATE_TABLE_SQL)
            cur.execute(CREATE_INDEXES_SQL)
            conn.commit()
            print("Table created successfully")

    # Now generate and store the embeddings
    async with httpx.AsyncClient() as client:
        with psycopg.connect(DB_URL, autocommit=False) as conn:
            with conn.cursor() as cur:
                for i in range(0, len(verses), batch_size):
                    batch = verses[i:i+batch_size]
                    inputs = [v["text_norm"] for v in batch]

                    print(f"Generating embeddings for batch {i//batch_size + 1}/{(len(verses) + batch_size - 1)//batch_size}")
                    embs = await embed_batch(client, inputs)

                    rows = []
                    for v, e in zip(batch, embs):
                        rows.append({
                            **v,
                            "lang": LANG_CODE,
                            "translation": TRANSLATION,
                            "ts_lang": "romanian",
                            "embedding": to_pgvector(e)
                        })

                    cur.executemany(UPSERT_SQL, rows)
                    conn.commit()
                    print(f"Upserted {len(rows)} verses... {i+len(rows)}/{len(verses)}")

    # Create the IVFFLAT index after the data is loaded, so the lists are
    # trained on real data. Note: pgvector's ivfflat supports at most 2000
    # dimensions, so with EMBED_DIMS=3072 this CREATE INDEX will fail and
    # queries fall back to exact (sequential) scans.
    print("Creating IVFFLAT index...")
    with psycopg.connect(DB_URL, autocommit=True) as conn:
        with conn.cursor() as cur:
            cur.execute("VACUUM ANALYZE bible_passages;")
            cur.execute("""
                CREATE INDEX IF NOT EXISTS idx_vec_ivfflat
                ON bible_passages USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 200);
            """)

    print("✅ Bible embedding ingestion completed successfully!")

if __name__ == "__main__":
    asyncio.run(main())
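After ingestion, a couple of ad-hoc SQL checks are useful. A sketch (the verse reference 'Ioan 3:16' follows the format of the generated ref column; substitute any verse known to exist):

-- Row counts per translation and language
SELECT translation, lang, COUNT(*) FROM bible_passages GROUP BY translation, lang;

-- Nearest neighbours of a known verse via the cosine-distance operator
SELECT ref, LEFT(text_raw, 80) AS preview
FROM bible_passages
ORDER BY embedding <=> (SELECT embedding FROM bible_passages WHERE ref = 'Ioan 3:16' LIMIT 1)
LIMIT 5;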