Add embedded KJV support

2026-06-14 07:46:21 -05:00
parent 2d78d8e5c0
commit 544062902b
8 changed files with 31857 additions and 3 deletions
@@ -0,0 +1,144 @@
+import std/[os, strutils, tables]
+
+# Source archive: https://ebible.org/Scriptures/eng-kjv_usfm.zip
+
+# The 66 three-letter book codes this generator accepts, listed from "GEN"
+# through "REV". `findCanonFiles` keeps only files whose embedded code appears
+# here, and `generate` walks this array in order, so the order below is also
+# the order of the books in the generated output.
+const canonBookCodes = [
+  "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT",
+  "1SA", "2SA", "1KI", "2KI", "1CH", "2CH", "EZR", "NEH",
+  "EST", "JOB", "PSA", "PRO", "ECC", "SNG", "ISA", "JER",
+  "LAM", "EZK", "DAN", "HOS", "JOL", "AMO", "OBA", "JON",
+  "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL", "MAT",
+  "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL",
+  "EPH", "PHP", "COL", "1TH", "2TH", "1TI", "2TI", "TIT",
+  "PHM", "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN",
+  "JUD", "REV"
+]
+
+func normalizeWhitespace(s: string): string =
+  ## Collapses every run of ASCII whitespace in `s` into a single space and
+  ## removes leading and trailing whitespace.
+  ##
+  ## An empty or all-whitespace input yields the empty string.
+  # `splitWhitespace` splits on the same character set that `isSpaceAscii`
+  # accepts and never yields empty fields, so re-joining the fields with one
+  # space is all that is needed.
+  s.splitWhitespace.join(" ")
+
+proc removeFootnotes(s: string): string =
+  ## Returns `s` with every `\f ...\f*` footnote span removed.
+  ##
+  ## Text outside footnotes is copied unchanged, including the whitespace
+  ## around a removed span. If a footnote is opened but never closed,
+  ## everything from its opening marker to the end of `s` is discarded.
+  const
+    opener = "\\f "
+    closer = "\\f*"
+  var pos = 0
+  while pos < s.len:
+    let start = s.find(opener, pos)
+    if start < 0:
+      # No further footnote: keep the remaining tail and finish.
+      result.add(s[pos .. ^1])
+      break
+    result.add(s[pos ..< start])
+    let closeIdx = s.find(closer, start + 2)
+    if closeIdx < 0:
+      # Unterminated footnote: drop the rest of the input.
+      break
+    pos = closeIdx + closer.len
+
+proc stripUsfmMarkup(s: string): string =
+  ## Returns the text of the USFM fragment `s` with its markup removed.
+  ##
+  ## Footnotes are dropped first via `removeFootnotes`. Each remaining
+  ## backslash marker is then removed, and everything from a `|` up to (but
+  ## not including) the next backslash is skipped. The collected text is
+  ## finally passed through `normalizeWhitespace`.
+  const markerChars = Letters + Digits + {'-'}
+  let src = removeFootnotes(s)
+  var pos = 0
+
+  while pos < src.len:
+    let ch = src[pos]
+    if ch == '\\':
+      # Step over the backslash, an optional '+', and the marker name.
+      inc pos
+      if pos < src.len and src[pos] == '+':
+        inc pos
+      while pos < src.len and src[pos] in markerChars:
+        inc pos
+
+      if pos < src.len and src[pos] == '*':
+        # Closing marker: consume the '*' but keep any following whitespace.
+        inc pos
+      else:
+        # Opening marker: the whitespace after it belongs to the marker.
+        while pos < src.len and src[pos] in Whitespace:
+          inc pos
+    elif ch == '|':
+      # Skip everything up to the next marker (or the end of the input).
+      while pos < src.len and src[pos] != '\\':
+        inc pos
+    elif ch == '\t':
+      result.add(' ')
+      inc pos
+    else:
+      result.add(ch)
+      inc pos
+
+  result = normalizeWhitespace(result)
+
+proc parseVerseLine(line: string): tuple[verse: int, text: string] =
+  ## Splits a verse line into its verse number and cleaned verse text.
+  ##
+  ## The first three characters of `line` are skipped (callers pass lines
+  ## that start with the verse marker and a space). What remains is split at
+  ## its first space: the left part is parsed as the verse number, the right
+  ## part is cleaned with `stripUsfmMarkup`.
+  ##
+  ## Raises `ValueError` when no space follows the verse number, or when the
+  ## number is not a valid integer.
+  let fields = line[3 .. ^1].strip.split(' ', maxsplit = 1)
+  if fields.len < 2:
+    raise newException(ValueError, "verse marker without text: " & line)
+
+  (verse: parseInt(fields[0]), text: stripUsfmMarkup(fields[1]))
+
+proc findCanonFiles(inputDir: string): Table[string, string] =
+  ## Maps each book code found in `inputDir` to the path of its USFM file.
+  ##
+  ## Only files matching `*eng-kjv.usfm` are considered. The book code is the
+  ## part of the file name between its first '-' and the `eng-kjv.usfm`
+  ## suffix; files whose code is not listed in `canonBookCodes` are ignored.
+  for usfmPath in walkFiles(inputDir / "*eng-kjv.usfm"):
+    let
+      fileName = usfmPath.extractFilename
+      dashPos = fileName.find('-')
+      suffixPos = fileName.find("eng-kjv.usfm")
+    if dashPos < 0 or suffixPos <= dashPos:
+      # The name does not have the "<prefix>-<CODE>eng-kjv.usfm" shape.
+      continue
+    let code = fileName[dashPos + 1 ..< suffixPos]
+    if code in canonBookCodes:
+      result[code] = usfmPath
+
+# USFM title, heading and remark markers (level digits omitted). Lines that
+# open with one of these carry editorial text, not verse text.
+const headingMarkers = [
+  "mt", "mte", "ms", "mr", "s", "sr", "r", "d", "sp", "sd", "qa", "rem"
+]
+
+proc isHeadingLine(line: string): bool =
+  ## Reports whether `line` opens with one of `headingMarkers`, optionally
+  ## followed by a level number (as in `\s1`) and then whitespace or the end
+  ## of the line.
+  if line.len < 2 or line[0] != '\\':
+    return false
+
+  var stop = 1
+  while stop < line.len and line[stop] in Letters:
+    inc stop
+  let name = line[1 ..< stop]
+
+  # Skip an optional level number such as the `1` in `\s1`.
+  while stop < line.len and line[stop] in Digits:
+    inc stop
+
+  # Anything but whitespace here means this is some other marker form
+  # (for example a closing `\sc*`), so it is not a heading line.
+  if stop < line.len and line[stop] notin Whitespace:
+    return false
+
+  name in headingMarkers
+
+proc generate(inputDir, outputPath: string) =
+  ## Reads the USFM files in `inputDir` and writes one tab-separated row per
+  ## verse (`code`, `chapter`, `verse`, `text`) to `outputPath`.
+  ##
+  ## Books are emitted in `canonBookCodes` order. A verse starts at a `\v`
+  ## line and absorbs the following lines until the next `\v` or `\c` line.
+  ## Heading and title lines (see `isHeadingLine`) are skipped so that text
+  ## such as a section heading placed between two verses does not end up
+  ## appended to the preceding verse.
+  ##
+  ## The parent directory of `outputPath` is created if needed.
+  ##
+  ## Raises `ValueError` when a book in `canonBookCodes` has no file, or when
+  ## a chapter or verse line cannot be parsed.
+  let canonFiles = findCanonFiles(inputDir)
+  var rows: seq[string] = @[]
+
+  for code in canonBookCodes:
+    if not canonFiles.hasKey(code):
+      raise newException(ValueError, "missing USFM file for " & code)
+
+    var chapter = 0
+    var verse = 0
+    var verseText = ""
+
+    proc flushVerse() =
+      # Emits the pending verse, if there is one, then resets the verse state
+      # so that lines before the next `\v` are not attributed to any verse.
+      if chapter > 0 and verse > 0:
+        # normalizeWhitespace turns tabs into spaces as well, so the text can
+        # never contain the column separator.
+        let text = normalizeWhitespace(verseText)
+        if text.len > 0:
+          rows.add([code, $chapter, $verse, text].join("\t"))
+      verse = 0
+      verseText = ""
+
+    for rawLine in canonFiles[code].lines:
+      let line = rawLine.strip
+
+      if line.startsWith("\\c "):
+        flushVerse()
+        chapter = parseInt(line[3..^1].strip)
+      elif line.startsWith("\\v "):
+        flushVerse()
+        let parsed = parseVerseLine(line)
+        verse = parsed.verse
+        verseText = parsed.text
+      elif verse > 0 and not isHeadingLine(line):
+        # Continuation of the current verse (poetry lines, paragraphs, ...).
+        let continued = stripUsfmMarkup(line)
+        if continued.len > 0:
+          if verseText.len > 0:
+            verseText.add(' ')
+          verseText.add(continued)
+
+    # The last verse of the book has no following marker to trigger a flush.
+    flushVerse()
+
+  createDir(outputPath.parentDir)
+  writeFile(outputPath, rows.join("\n") & "\n")
+
+when isMainModule:
+  # Command-line entry point: expects exactly two arguments, the directory
+  # holding the USFM files and the path of the TSV file to write.
+  let args = commandLineParams()
+  if args.len != 2:
+    quit("Usage: generate_kjv_data <usfm-dir> <output-tsv>", QuitFailure)
+
+  generate(args[0], args[1])