biblical-guide.com/scripts/usfm-to-json.ts

#!/usr/bin/env tsx
import fs from 'fs'
import path from 'path'

/*
  Convert a directory of USFM files (e.g., WEB/KJV) into our OT/NT JSON format.

  Env:
    - INPUT_USFM_DIR: path to folder with *.usfm files (unzipped)
    - EN_ABBR (optional): English version abbreviation for the output folder (e.g., WEB or KJV); upper-cased, defaults to WEB
    - OUTPUT_DIR (optional): defaults to data/en_bible/<EN_ABBR>

  Output:
    - <OUTPUT_DIR>/old_testament.json
    - <OUTPUT_DIR>/new_testament.json

  USFM markers parsed:
    - \id <BOOKID>
    - \h  <Header/Book name> (optional)
    - \c  <chapter number>
    - \v  <verse number> <text>
*/

// Runtime configuration, read once from the environment.
// `||` (not `??`) is deliberate: an empty-string variable falls back to the default too.
const INPUT = process.env.INPUT_USFM_DIR || ''
const ABBR = (process.env.EN_ABBR || 'WEB').toUpperCase()
const OUTPUT_DIR = process.env.OUTPUT_DIR || path.join('data','en_bible', ABBR)

// Fail fast with a readable message. existsSync alone would also accept a regular
// file, which would only blow up later (ENOTDIR) when the folder is listed.
const inputIsDirectory = INPUT !== '' && fs.existsSync(INPUT) && fs.statSync(INPUT).isDirectory()
if (!inputIsDirectory) {
  console.error('Missing or invalid INPUT_USFM_DIR. Set INPUT_USFM_DIR to a folder containing *.usfm files (unzipped).')
  process.exit(1)
}

/** Creates directory `p` together with any missing parents; a no-op when it already exists. */
function ensureDir(p: string): void {
  fs.mkdirSync(p, { recursive: true })
}

/**
 * Serializes `obj` as 2-space-indented JSON and writes it to `file` (UTF-8),
 * creating the parent directory first if needed.
 *
 * @param file Destination path of the JSON file.
 * @param obj  Value to serialize; must be representable as JSON at the top level.
 * @throws TypeError when `obj` has no JSON representation (e.g. `undefined` or a function).
 */
function writeJson(file: string, obj: unknown): void {
  // The lib typings claim `string`, but JSON.stringify yields undefined for
  // undefined/functions/symbols; check before touching the file system.
  const json = JSON.stringify(obj, null, 2) as string | undefined
  if (json === undefined) {
    throw new TypeError(`Cannot serialize value for ${file}: it has no JSON representation`)
  }
  ensureDir(path.dirname(file))
  fs.writeFileSync(file, json, 'utf-8')
}

// The 66 books of the standard Protestant canon, keyed by USFM book code.
// Array position is the canonical reading order; each entry also records the
// English display name and the testament the book belongs to.
type CanonEntry = { code: string; name: string; testament: 'OT'|'NT' }

// [USFM code, English name] pairs, Old Testament, in canonical order.
const OLD_TESTAMENT_BOOKS: ReadonlyArray<readonly [string, string]> = [
  ['GEN', 'Genesis'],
  ['EXO', 'Exodus'],
  ['LEV', 'Leviticus'],
  ['NUM', 'Numbers'],
  ['DEU', 'Deuteronomy'],
  ['JOS', 'Joshua'],
  ['JDG', 'Judges'],
  ['RUT', 'Ruth'],
  ['1SA', '1 Samuel'],
  ['2SA', '2 Samuel'],
  ['1KI', '1 Kings'],
  ['2KI', '2 Kings'],
  ['1CH', '1 Chronicles'],
  ['2CH', '2 Chronicles'],
  ['EZR', 'Ezra'],
  ['NEH', 'Nehemiah'],
  ['EST', 'Esther'],
  ['JOB', 'Job'],
  ['PSA', 'Psalms'],
  ['PRO', 'Proverbs'],
  ['ECC', 'Ecclesiastes'],
  ['SNG', 'Song of Songs'],
  ['ISA', 'Isaiah'],
  ['JER', 'Jeremiah'],
  ['LAM', 'Lamentations'],
  ['EZK', 'Ezekiel'],
  ['DAN', 'Daniel'],
  ['HOS', 'Hosea'],
  ['JOL', 'Joel'],
  ['AMO', 'Amos'],
  ['OBA', 'Obadiah'],
  ['JON', 'Jonah'],
  ['MIC', 'Micah'],
  ['NAM', 'Nahum'],
  ['HAB', 'Habakkuk'],
  ['ZEP', 'Zephaniah'],
  ['HAG', 'Haggai'],
  ['ZEC', 'Zechariah'],
  ['MAL', 'Malachi'],
]

// [USFM code, English name] pairs, New Testament, in canonical order.
const NEW_TESTAMENT_BOOKS: ReadonlyArray<readonly [string, string]> = [
  ['MAT', 'Matthew'],
  ['MRK', 'Mark'],
  ['LUK', 'Luke'],
  ['JHN', 'John'],
  ['ACT', 'Acts'],
  ['ROM', 'Romans'],
  ['1CO', '1 Corinthians'],
  ['2CO', '2 Corinthians'],
  ['GAL', 'Galatians'],
  ['EPH', 'Ephesians'],
  ['PHP', 'Philippians'],
  ['COL', 'Colossians'],
  ['1TH', '1 Thessalonians'],
  ['2TH', '2 Thessalonians'],
  ['1TI', '1 Timothy'],
  ['2TI', '2 Timothy'],
  ['TIT', 'Titus'],
  ['PHM', 'Philemon'],
  ['HEB', 'Hebrews'],
  ['JAS', 'James'],
  ['1PE', '1 Peter'],
  ['2PE', '2 Peter'],
  ['1JN', '1 John'],
  ['2JN', '2 John'],
  ['3JN', '3 John'],
  ['JUD', 'Jude'],
  ['REV', 'Revelation'],
]

const CANON: CanonEntry[] = [
  ...OLD_TESTAMENT_BOOKS.map(([code, name]): CanonEntry => ({ code, name, testament: 'OT' })),
  ...NEW_TESTAMENT_BOOKS.map(([code, name]): CanonEntry => ({ code, name, testament: 'NT' })),
]

// Lookup by USFM code; `order` is the 1-based position of the book in CANON.
const CODE_TO_META = new Map<string, CanonEntry & { order: number }>()
CANON.forEach((entry, index) => {
  CODE_TO_META.set(entry.code, { ...entry, order: index + 1 })
})

/** One verse: its number within the chapter and its plain text. */
type Verse = {
  verseNum: number
  text: string
}

/** One chapter: its number within the book and its verses. */
type Chapter = {
  chapterNum: number
  verses: Verse[]
}

/** One parsed book: display name, USFM book code, testament and chapters. */
type Book = {
  name: string
  code: string
  testament: 'OT' | 'NT'
  chapters: Chapter[]
}

/** A verse while it is being accumulated; raw USFM fragments are cleaned once the chapter ends. */
type PendingVerse = { verseNum: number; parts: string[] }

// Paragraph, poetry and list markers (\p, \m, \pi1, \q2, \li1, ...) that may carry
// verse text on the same line, e.g. `\q2 nor stand on the path of sinners,`.
// Heading/title markers (\s, \ms, \mt, \r, \d, ...) are intentionally not listed.
const USFM_TEXT_LINE = /^\\(?:p|m|po|pr|pc|pm|pmo|pmc|pmr|pi\d*|mi|nb|cls|ph\d*|q\d*|qr|qc|qm\d*|li\d*|lim\d*)\s+(.*)$/

// Character-level markers that can open a wrapped continuation line of verse text.
const USFM_INLINE_START = /^\\\+?(?:w|wj|add|nd|qs|tl|it|bd|em|sc|pn|qt)\s/

/** Strips USFM inline markup from raw verse text, keeping only the readable words. */
function cleanVerseText(raw: string): string {
  return raw
    // Footnotes and cross-references go away entirely, content included.
    .replace(/\\f\s+.*?\\f\*/gis, ' ')
    .replace(/\\x\s+.*?\\x\*/gis, ' ')
    // Word-level attributes: everything from "|" up to the closing marker
    // (e.g. `|strong="H7225"` in `\w In|strong="H7225"\w*`).
    .replace(/\|[^|\\]*(?=\\\+?[a-z0-9-]*\*)/gi, '')
    .replace(/\|strong="[^"]*"/gi, '')
    // Closing markers (\nd*, \+w*, \wj*, bare \*) abut the text, so drop them
    // without inserting a space; this also removes the "*" itself.
    .replace(/\\\+?[a-z0-9-]*\*/gi, '')
    // Opening markers (\add, \nd, \+nd, \qs, ...) and the whitespace that delimits them.
    .replace(/\\\+?[a-z0-9-]+\s*/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim()
}

/**
 * Distributes one line of content over the verses of the current chapter:
 * text before the first \v marker continues the last verse, and every \v
 * marker starts a new verse. Text that precedes the first verse is dropped.
 */
function appendVerseContent(verses: PendingVerse[], content: string): void {
  // \v <number>[suffix] — the suffix covers bridges ("3-4") and segments ("1a");
  // the verse is recorded under its first number.
  const marker = /\\v\s+(\d+)[^\s\\]*\s*/g
  const continueLast = (text: string) => {
    if (verses.length > 0 && text.trim() !== '') verses[verses.length - 1].parts.push(text)
  }
  let textStart = 0
  let m: RegExpExecArray | null
  while ((m = marker.exec(content)) !== null) {
    continueLast(content.slice(textStart, m.index))
    verses.push({ verseNum: parseInt(m[1], 10), parts: [] })
    textStart = m.index + m[0].length
  }
  continueLast(content.slice(textStart))
}

/**
 * Parses a single USFM file into a Book.
 *
 * Recognized line markers: \id (book code), \h (display name), \c (chapter) and
 * \v (verse). Verse text may continue on following plain lines, on lines opened
 * by a paragraph/poetry/list marker, or on lines opened by a character marker.
 * Inline markup, footnotes and cross-references are stripped from the text.
 *
 * @param file Path of the USFM file to read (UTF-8).
 * @returns The parsed book (chapters sorted by number, empty chapters omitted),
 *          or null when the \id code is missing or not present in CODE_TO_META.
 */
function parseUsfmFile(file: string): Book | null {
  const lines = fs.readFileSync(file,'utf-8').split(/\r?\n/)
  let code = ''
  let name = ''
  let currentChapter = 0
  let pending: PendingVerse[] = []
  const chapters = new Map<number, Verse[]>()

  // Verses are cleaned only here, so markup that spans several physical lines
  // of one verse (e.g. a wrapped footnote) is still removed as a unit.
  const flushChapter = () => {
    if (currentChapter > 0) {
      chapters.set(
        currentChapter,
        pending.map(v => ({ verseNum: v.verseNum, text: cleanVerseText(v.parts.join(' ')) }))
      )
    }
  }

  for (const raw of lines) {
    const line = raw.trim()
    if (/^\\id\s+/.test(line)) {
      const m = line.match(/^\\id\s+(\S+)/)
      if (m) code = m[1].toUpperCase()
      continue
    }
    if (/^\\h\s+/.test(line)) {
      // \h Genesis
      name = line.replace(/^\\h\s+/, '').trim()
      continue
    }
    if (/^\\c\s+/.test(line)) {
      // New chapter: store what was collected for the previous one.
      flushChapter()
      currentChapter = parseInt(line.slice(3).trim(), 10)
      pending = []
      continue
    }

    // Decide whether this line carries verse content, and which part of it.
    let content = ''
    const textLine = line.match(USFM_TEXT_LINE)
    if (/^\\v\s/.test(line) || USFM_INLINE_START.test(line)) {
      content = line
    } else if (textLine) {
      content = textLine[1]
    } else if (!line.startsWith('\\')) {
      // Plain wrapped text belonging to the verse above.
      content = line
    }
    if (content) appendVerseContent(pending, content)
  }
  flushChapter()

  // Resolve name/code/testament
  const meta = CODE_TO_META.get(code)
  if (!meta) return null
  const finalName = name || meta.name
  const book: Book = { name: finalName, code, testament: meta.testament, chapters: [] }
  for (const [ch, verses] of Array.from(chapters.entries()).sort((a,b)=>a[0]-b[0])) {
    if (verses.length > 0) book.chapters.push({ chapterNum: ch, verses })
  }
  return book
}

/**
 * Script entry point: parses every *.usfm file in INPUT, splits the resulting
 * books into Old and New Testament in canonical order, and writes
 * old_testament.json and new_testament.json into OUTPUT_DIR.
 *
 * Exits with status 1 when INPUT contains no .usfm files. Files for which
 * parseUsfmFile returns null or a book without chapters are skipped and
 * listed in a single warning.
 */
function main() {
  const files = fs.readdirSync(INPUT).filter(f=>f.toLowerCase().endsWith('.usfm'))
  console.log('USFM files found:', files.length)
  if (files.length === 0) {
    console.error('No .usfm files found in', INPUT)
    process.exit(1)
  }

  const books: Book[] = []
  const skipped: string[] = []
  for (const f of files) {
    const b = parseUsfmFile(path.join(INPUT, f))
    if (b && b.chapters.length > 0) {
      books.push(b)
    } else {
      skipped.push(f)
    }
  }
  // Make dropped input visible instead of silently producing an incomplete Bible.
  if (skipped.length > 0) {
    console.warn('Skipped files (unrecognized book code or no verses parsed):', skipped.join(', '))
  }

  // Partition by testament, each part in canonical order. Every parsed book has a
  // known code, so the fallback only guards against the lookup ever missing.
  const canonOrder = (b: Book): number => CODE_TO_META.get(b.code)?.order ?? Number.MAX_SAFE_INTEGER
  const inCanonOrder = (testament: Book['testament']): Book[] =>
    books.filter(b => b.testament === testament).sort((a, b) => canonOrder(a) - canonOrder(b))
  const otBooks = inCanonOrder('OT')
  const ntBooks = inCanonOrder('NT')

  const ot = { testament: 'Old Testament', books: otBooks.map(b=>({ name:b.name, chapters:b.chapters })) }
  const nt = { testament: 'New Testament', books: ntBooks.map(b=>({ name:b.name, chapters:b.chapters })) }

  const otFile = path.join(OUTPUT_DIR, 'old_testament.json')
  const ntFile = path.join(OUTPUT_DIR, 'new_testament.json')
  writeJson(otFile, ot)
  writeJson(ntFile, nt)
  console.log('Wrote:', otFile)
  console.log('Wrote:', ntFile)
  console.log('Books:', books.length, 'OT:', otBooks.length, 'NT:', ntBooks.length)
}

main()