Vector schema per version: add ai_bible schema with per-version tables (bv_<lang>_<abbr>) in Python ingest; dynamic table resolution in vector-search with fallback to legacy table; sample pgAdmin queries printed.
This commit is contained in:
@@ -18,6 +18,7 @@ DB_URL = os.getenv("DATABASE_URL")
|
||||
BIBLE_MD_PATH = os.getenv("BIBLE_MD_PATH")
|
||||
LANG_CODE = os.getenv("LANG_CODE", "ro")
|
||||
TRANSLATION = os.getenv("TRANSLATION_CODE", "FIDELA")
|
||||
VECTOR_SCHEMA = os.getenv("VECTOR_SCHEMA", "ai_bible")
|
||||
|
||||
assert AZ_ENDPOINT and AZ_API_KEY and DB_URL and BIBLE_MD_PATH, "Missing required env vars"
|
||||
|
||||
@@ -126,49 +127,51 @@ async def embed_batch(client, inputs):
|
||||
await asyncio.sleep(backoff)
|
||||
raise RuntimeError("Failed to embed after retries")
|
||||
|
||||
# First, we need to create the table with proper SQL
|
||||
CREATE_TABLE_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS bible_passages (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
testament TEXT NOT NULL,
|
||||
book TEXT NOT NULL,
|
||||
chapter INT NOT NULL,
|
||||
verse INT NOT NULL,
|
||||
ref TEXT GENERATED ALWAYS AS (book || ' ' || chapter || ':' || verse) STORED,
|
||||
lang TEXT NOT NULL DEFAULT 'ro',
|
||||
translation TEXT NOT NULL DEFAULT 'FIDELA',
|
||||
text_raw TEXT NOT NULL,
|
||||
text_norm TEXT NOT NULL,
|
||||
tsv tsvector,
|
||||
embedding vector(1536),
|
||||
created_at TIMESTAMPTZ DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ DEFAULT now()
|
||||
);
|
||||
"""
|
||||
def safe_ident(s: str) -> str:
|
||||
return re.sub(r"[^a-z0-9_]+", "_", s.lower()).strip("_")
|
||||
|
||||
CREATE_INDEXES_SQL = """
|
||||
-- Uniqueness by canonical reference within translation/language
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS ux_ref_lang ON bible_passages (translation, lang, book, chapter, verse);
|
||||
TABLE_BASENAME = f"bv_{safe_ident(LANG_CODE)}_{safe_ident(TRANSLATION)}"
|
||||
TABLE_FQN = f'"{VECTOR_SCHEMA}"."{TABLE_BASENAME}"'
|
||||
|
||||
-- Full-text index
|
||||
CREATE INDEX IF NOT EXISTS idx_tsv ON bible_passages USING GIN (tsv);
|
||||
def create_table_sql() -> str:
|
||||
return f"""
|
||||
CREATE SCHEMA IF NOT EXISTS "{VECTOR_SCHEMA}";
|
||||
CREATE TABLE IF NOT EXISTS {TABLE_FQN} (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
testament TEXT NOT NULL,
|
||||
book TEXT NOT NULL,
|
||||
chapter INT NOT NULL,
|
||||
verse INT NOT NULL,
|
||||
ref TEXT GENERATED ALWAYS AS (book || ' ' || chapter || ':' || verse) STORED,
|
||||
text_raw TEXT NOT NULL,
|
||||
text_norm TEXT NOT NULL,
|
||||
tsv tsvector,
|
||||
embedding vector({EMBED_DIMS}),
|
||||
created_at TIMESTAMPTZ DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ DEFAULT now()
|
||||
);
|
||||
"""
|
||||
|
||||
-- Other indexes
|
||||
CREATE INDEX IF NOT EXISTS idx_book_ch ON bible_passages (book, chapter);
|
||||
CREATE INDEX IF NOT EXISTS idx_testament ON bible_passages (testament);
|
||||
"""
|
||||
def create_indexes_sql() -> str:
|
||||
return f"""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS ux_ref_{TABLE_BASENAME} ON {TABLE_FQN} (book, chapter, verse);
|
||||
CREATE INDEX IF NOT EXISTS idx_tsv_{TABLE_BASENAME} ON {TABLE_FQN} USING GIN (tsv);
|
||||
CREATE INDEX IF NOT EXISTS idx_book_ch_{TABLE_BASENAME} ON {TABLE_FQN} (book, chapter);
|
||||
CREATE INDEX IF NOT EXISTS idx_testament_{TABLE_BASENAME} ON {TABLE_FQN} (testament);
|
||||
"""
|
||||
|
||||
UPSERT_SQL = """
|
||||
INSERT INTO bible_passages (testament, book, chapter, verse, lang, translation, text_raw, text_norm, tsv, embedding)
|
||||
VALUES (%(testament)s, %(book)s, %(chapter)s, %(verse)s, %(lang)s, %(translation)s, %(text_raw)s, %(text_norm)s,
|
||||
to_tsvector(COALESCE(%(ts_lang)s,'simple')::regconfig, %(text_norm)s), %(embedding)s)
|
||||
ON CONFLICT (translation, lang, book, chapter, verse) DO UPDATE
|
||||
SET text_raw=EXCLUDED.text_raw,
|
||||
text_norm=EXCLUDED.text_norm,
|
||||
tsv=EXCLUDED.tsv,
|
||||
embedding=EXCLUDED.embedding,
|
||||
updated_at=now();
|
||||
"""
|
||||
def upsert_sql() -> str:
|
||||
return f"""
|
||||
INSERT INTO {TABLE_FQN} (testament, book, chapter, verse, text_raw, text_norm, tsv, embedding)
|
||||
VALUES (%(testament)s, %(book)s, %(chapter)s, %(verse)s, %(text_raw)s, %(text_norm)s,
|
||||
to_tsvector(COALESCE(%(ts_lang)s,'simple')::regconfig, %(text_norm)s), %(embedding)s)
|
||||
ON CONFLICT (book, chapter, verse) DO UPDATE
|
||||
SET text_raw=EXCLUDED.text_raw,
|
||||
text_norm=EXCLUDED.text_norm,
|
||||
tsv=EXCLUDED.tsv,
|
||||
embedding=EXCLUDED.embedding,
|
||||
updated_at=now();
|
||||
"""
|
||||
|
||||
async def main():
|
||||
print("Starting Bible embedding ingestion...")
|
||||
@@ -179,15 +182,15 @@ async def main():
|
||||
|
||||
batch_size = 128
|
||||
|
||||
# First create the table structure
|
||||
# First create the schema + table structure for this language/version
|
||||
with psycopg.connect(DB_URL) as conn:
|
||||
with conn.cursor() as cur:
|
||||
print("Creating bible_passages table...")
|
||||
print(f"Creating schema '{VECTOR_SCHEMA}' and table {TABLE_FQN} ...")
|
||||
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
|
||||
cur.execute(CREATE_TABLE_SQL)
|
||||
cur.execute(CREATE_INDEXES_SQL)
|
||||
cur.execute(create_table_sql())
|
||||
cur.execute(create_indexes_sql())
|
||||
conn.commit()
|
||||
print("Table created successfully")
|
||||
print("Schema/table ready")
|
||||
|
||||
# Now process embeddings
|
||||
async with httpx.AsyncClient() as client:
|
||||
@@ -204,13 +207,11 @@ async def main():
|
||||
for v, e in zip(batch, embs):
|
||||
rows.append({
|
||||
**v,
|
||||
"lang": LANG_CODE,
|
||||
"translation": TRANSLATION,
|
||||
"ts_lang": "romanian",
|
||||
"ts_lang": "romanian" if LANG_CODE.lower().startswith("ro") else ("english" if LANG_CODE.lower().startswith("en") else "simple"),
|
||||
"embedding": e
|
||||
})
|
||||
|
||||
cur.executemany(UPSERT_SQL, rows)
|
||||
cur.executemany(upsert_sql(), rows)
|
||||
conn.commit()
|
||||
print(f"Upserted {len(rows)} verses... {i+len(rows)}/{len(verses)}")
|
||||
|
||||
@@ -218,14 +219,20 @@ async def main():
|
||||
print("Creating IVFFLAT index...")
|
||||
with psycopg.connect(DB_URL, autocommit=True) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("VACUUM ANALYZE bible_passages;")
|
||||
cur.execute("""
|
||||
CREATE INDEX IF NOT EXISTS idx_vec_ivfflat
|
||||
ON bible_passages USING ivfflat (embedding vector_cosine_ops)
|
||||
cur.execute(f"VACUUM ANALYZE {TABLE_FQN};")
|
||||
cur.execute(f"""
|
||||
CREATE INDEX IF NOT EXISTS idx_vec_ivfflat_{TABLE_BASENAME}
|
||||
ON {TABLE_FQN} USING ivfflat (embedding vector_cosine_ops)
|
||||
WITH (lists = 200);
|
||||
""")
|
||||
|
||||
print("✅ Bible embedding ingestion completed successfully!")
|
||||
|
||||
# Helpful pgAdmin queries:
|
||||
print("\nRun these sample queries in pgAdmin:")
|
||||
print(f"SELECT count(*) FROM {TABLE_FQN};")
|
||||
print(f"SELECT book, chapter, verse, left(text_raw, 80) AS preview FROM {TABLE_FQN} ORDER BY book, chapter, verse LIMIT 10;")
|
||||
print(f"SELECT * FROM {TABLE_FQN} WHERE book='Geneza' AND chapter=1 AND verse=1;")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
Reference in New Issue
Block a user