Vector schema per version: add ai_bible schema with per-version tables (bv_<lang>_<abbr>) in Python ingest; dynamic table resolution in vector-search with fallback to legacy table; sample pgAdmin queries printed.

This commit is contained in:
andupetcu
2025-09-20 19:08:11 +03:00
parent 8b26d72c1c
commit 5ddf62e5cf
2 changed files with 168 additions and 90 deletions

View File

@@ -4,6 +4,44 @@ const pool = new Pool({
connectionString: process.env.DATABASE_URL,
})
const VECTOR_SCHEMA = process.env.VECTOR_SCHEMA || 'ai_bible'
function safeIdent(s: string): string {
return s.toLowerCase().replace(/[^a-z0-9_]+/g, '_').replace(/^_+|_+$/g, '')
}
// Resolve per-language default version and corresponding vector table name
// e.g. ai_bible.bv_ro_cornilescu
async function resolveVectorTable(language: string): Promise<{ table: string; exists: boolean }> {
const lang = safeIdent(language || 'ro')
const client = await pool.connect()
try {
// Get default version abbreviation from "BibleVersion"
const res = await client.query(
`SELECT "abbreviation" FROM "BibleVersion"
WHERE lower(language) = lower($1)
ORDER BY "isDefault" DESC, "createdAt" ASC
LIMIT 1`,
[language]
)
const abbr = res.rows?.[0]?.abbreviation || 'default'
const ab = safeIdent(abbr)
const table = `${VECTOR_SCHEMA}.bv_${lang}_${ab}`
// Check if table exists
const check = await client.query(
`SELECT EXISTS (
SELECT 1 FROM information_schema.tables
WHERE table_schema = $1 AND table_name = $2
) AS exists`,
[VECTOR_SCHEMA, `bv_${lang}_${ab}`]
)
return { table, exists: Boolean(check.rows?.[0]?.exists) }
} finally {
client.release()
}
}
export interface BibleVerse {
id: string
ref: string
@@ -44,21 +82,29 @@ export async function searchBibleSemantic(
limit: number = 10
): Promise<BibleVerse[]> {
try {
const { table, exists } = await resolveVectorTable(language)
const queryEmbedding = await getEmbedding(query)
const client = await pool.connect()
try {
const result = await client.query(
`
SELECT ref, book, chapter, verse, text_raw,
1 - (embedding <=> $1) AS similarity
FROM bible_passages
WHERE embedding IS NOT NULL AND lang = $3
ORDER BY embedding <=> $1
LIMIT $2
`,
[JSON.stringify(queryEmbedding), limit, language]
)
const sql = exists
? `SELECT ref, book, chapter, verse, text_raw,
1 - (embedding <=> $1) AS similarity
FROM ${table}
WHERE embedding IS NOT NULL
ORDER BY embedding <=> $1
LIMIT $2`
: `SELECT ref, book, chapter, verse, text_raw,
1 - (embedding <=> $1) AS similarity
FROM bible_passages
WHERE embedding IS NOT NULL AND lang = $3
ORDER BY embedding <=> $1
LIMIT $2`
const params = exists
? [JSON.stringify(queryEmbedding), limit]
: [JSON.stringify(queryEmbedding), limit, language]
const result = await client.query(sql, params)
return result.rows
} finally {
@@ -76,6 +122,7 @@ export async function searchBibleHybrid(
limit: number = 10
): Promise<BibleVerse[]> {
try {
const { table, exists } = await resolveVectorTable(language)
const queryEmbedding = await getEmbedding(query)
// Use appropriate text search configuration based on language
@@ -83,31 +130,53 @@ export async function searchBibleHybrid(
const client = await pool.connect()
try {
const result = await client.query(
`
WITH vector_search AS (
SELECT id, 1 - (embedding <=> $1) AS vector_sim
FROM bible_passages
WHERE embedding IS NOT NULL AND lang = $4
ORDER BY embedding <=> $1
LIMIT 100
),
text_search AS (
SELECT id, ts_rank(tsv, plainto_tsquery($5, $3)) AS text_rank
FROM bible_passages
WHERE tsv @@ plainto_tsquery($5, $3) AND lang = $4
)
SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
FROM bible_passages bp
LEFT JOIN vector_search vs ON vs.id = bp.id
LEFT JOIN text_search ts ON ts.id = bp.id
WHERE (vs.id IS NOT NULL OR ts.id IS NOT NULL) AND bp.lang = $4
ORDER BY combined_score DESC
LIMIT $2
`,
[JSON.stringify(queryEmbedding), limit, query, language, textConfig]
)
const sql = exists
? `WITH vector_search AS (
SELECT id, 1 - (embedding <=> $1) AS vector_sim
FROM ${table}
WHERE embedding IS NOT NULL
ORDER BY embedding <=> $1
LIMIT 100
),
text_search AS (
SELECT id, ts_rank(tsv, plainto_tsquery($4, $3)) AS text_rank
FROM ${table}
WHERE tsv @@ plainto_tsquery($4, $3)
)
SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
FROM ${table} bp
LEFT JOIN vector_search vs ON vs.id = bp.id
LEFT JOIN text_search ts ON ts.id = bp.id
WHERE (vs.id IS NOT NULL OR ts.id IS NOT NULL)
ORDER BY combined_score DESC
LIMIT $2`
: `WITH vector_search AS (
SELECT id, 1 - (embedding <=> $1) AS vector_sim
FROM bible_passages
WHERE embedding IS NOT NULL AND lang = $4
ORDER BY embedding <=> $1
LIMIT 100
),
text_search AS (
SELECT id, ts_rank(tsv, plainto_tsquery($5, $3)) AS text_rank
FROM bible_passages
WHERE tsv @@ plainto_tsquery($5, $3) AND lang = $4
)
SELECT bp.ref, bp.book, bp.chapter, bp.verse, bp.text_raw,
COALESCE(vs.vector_sim, 0) * 0.7 + COALESCE(ts.text_rank, 0) * 0.3 AS combined_score
FROM bible_passages bp
LEFT JOIN vector_search vs ON vs.id = bp.id
LEFT JOIN text_search ts ON ts.id = bp.id
WHERE (vs.id IS NOT NULL OR ts.id IS NOT NULL) AND bp.lang = $4
ORDER BY combined_score DESC
LIMIT $2`
const params = exists
? [JSON.stringify(queryEmbedding), limit, query, textConfig]
: [JSON.stringify(queryEmbedding), limit, query, language, textConfig]
const result = await client.query(sql, params)
return result.rows
} finally {
@@ -125,6 +194,8 @@ export async function getContextVerses(
verse: number,
contextSize: number = 2
): Promise<BibleVerse[]> {
// For context, we can't infer language here; callers should use the main hybrid result to decide.
// For now, fallback to legacy table for context retrieval; can be extended to use per-language table.
const client = await pool.connect()
try {
const result = await client.query(
@@ -142,4 +213,4 @@ export async function getContextVerses(
} finally {
client.release()
}
}
}