Add embedded KJV support

This commit is contained in:
2026-06-14 07:46:21 -05:00
parent 2d78d8e5c0
commit 544062902b
8 changed files with 31857 additions and 3 deletions
+144
View File
@@ -0,0 +1,144 @@
import std/[os, strutils, tables]
# Source archive: https://ebible.org/Scriptures/eng-kjv_usfm.zip
# Three-letter book codes for the 66 books this generator processes, listed
# Genesis through Revelation. The array order is the order in which books
# are read and written.
const canonBookCodes = [
  # Old Testament (39 books)
  "GEN", "EXO", "LEV", "NUM", "DEU",
  "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI", "1CH", "2CH",
  "EZR", "NEH", "EST",
  "JOB", "PSA", "PRO", "ECC", "SNG",
  "ISA", "JER", "LAM", "EZK", "DAN",
  "HOS", "JOL", "AMO", "OBA", "JON", "MIC",
  "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
  # New Testament (27 books)
  "MAT", "MRK", "LUK", "JHN", "ACT",
  "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL",
  "1TH", "2TH", "1TI", "2TI", "TIT", "PHM",
  "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
  "REV"
]
func normalizeWhitespace(s: string): string =
  ## Collapses every run of ASCII whitespace in `s` into a single space and
  ## removes leading and trailing whitespace.
  ##
  ## Returns an empty string when `s` is empty or whitespace-only.
  # `splitWhitespace` splits on the same character set that `isSpaceAscii`
  # tests and never yields empty fields, so joining the fields with one
  # space reproduces "collapse runs, then trim" without a manual loop.
  s.splitWhitespace.join(" ")
proc removeFootnotes(s: string): string =
  ## Returns `s` with every span from a `\f ` opener through the next `\f*`
  ## closer removed (both delimiters included).
  ##
  ## If an opener has no closer, everything from that opener to the end of
  ## `s` is dropped.
  const
    opener = "\\f "
    closer = "\\f*"
  var pos = 0
  while pos < s.len:
    let start = s.find(opener, pos)
    if start < 0:
      # No further footnotes: keep the remainder verbatim.
      result.add(s[pos .. ^1])
      return
    # Keep the text that precedes the footnote.
    result.add(s[pos ..< start])
    # The closer is searched from just past the opener's backslash and 'f'.
    let stop = s.find(closer, start + 2)
    if stop < 0:
      # Unterminated footnote: the rest of the input is discarded.
      return
    pos = stop + closer.len
proc stripUsfmMarkup(s: string): string =
  ## Reduces a fragment of USFM text to plain text.
  ##
  ## Steps, in order:
  ## * footnotes are removed with `removeFootnotes`;
  ## * each backslash marker is dropped: the backslash, an optional `+`,
  ##   a name made of ASCII letters, digits and `-`, and an optional
  ##   trailing `*`;
  ## * whitespace directly after a marker without `*` is dropped, while
  ##   whitespace after a marker ending in `*` is kept;
  ## * everything from a `|` up to (not including) the next backslash is
  ##   dropped;
  ## * the result is passed through `normalizeWhitespace`.
  ##
  ## For example, `\add the\add* earth` yields `the earth`.
  const markerNameChars = Letters + Digits + {'-'}
  let text = removeFootnotes(s)
  var pos = 0
  while pos < text.len:
    let ch = text[pos]
    if ch == '\\':
      # Skip the backslash, the optional '+' prefix and the marker name.
      inc pos
      if pos < text.len and text[pos] == '+':
        inc pos
      while pos < text.len and text[pos] in markerNameChars:
        inc pos
      if pos < text.len and text[pos] == '*':
        # Marker ends in '*': consume it but keep the whitespace that
        # follows, so the surrounding words stay separated.
        inc pos
      else:
        # No '*': the whitespace after the marker name is consumed too.
        while pos < text.len and text[pos].isSpaceAscii:
          inc pos
    elif ch == '|':
      # Presumably a USFM attribute list (e.g. after a `\w` word); it is
      # discarded up to the next marker or the end of the text.
      while pos < text.len and text[pos] != '\\':
        inc pos
    else:
      # Ordinary character; tabs are emitted as plain spaces.
      result.add(if ch == '\t': ' ' else: ch)
      inc pos
  result = normalizeWhitespace(result)
proc parseVerseLine(line: string): tuple[verse: int, text: string] =
  ## Splits a verse line into its verse number and plain text.
  ##
  ## The first three characters of `line` are discarded without inspection
  ## (the caller in this file only passes lines that start with `\v `).
  ## The remainder is trimmed; everything before its first space is parsed
  ## as the verse number and everything after it is run through
  ## `stripUsfmMarkup`.
  ##
  ## Raises `ValueError` when no space follows the verse number, or when
  ## the number is not a valid integer (from `parseInt`).
  let payload = strip(line[3 .. ^1])
  let gap = payload.find(' ')
  if gap < 0:
    raise newException(ValueError, "verse marker without text: " & line)
  result = (
    verse: parseInt(payload[0 ..< gap]),
    text: stripUsfmMarkup(payload[gap + 1 .. ^1])
  )
proc findCanonFiles(inputDir: string): Table[string, string] =
  ## Maps each book code found in `inputDir` to the path of its USFM file.
  ##
  ## Only files matching `*eng-kjv.usfm` are considered. The book code is
  ## the text between the first `-` in the file name and the
  ## `eng-kjv.usfm` suffix (e.g. `GEN` in `02-GENeng-kjv.usfm`). Files
  ## whose code is not listed in `canonBookCodes` are ignored. If several
  ## files yield the same code, the one visited last wins.
  const suffix = "eng-kjv.usfm"
  for usfmPath in walkFiles(inputDir / ("*" & suffix)):
    let
      fileName = extractFilename(usfmPath)
      dashAt = fileName.find('-')
      suffixAt = fileName.find(suffix)
    # Skip names that have no '-' ahead of the suffix.
    if dashAt < 0 or suffixAt <= dashAt:
      continue
    let bookCode = fileName[dashAt + 1 ..< suffixAt]
    if bookCode in canonBookCodes:
      result[bookCode] = usfmPath
proc generate(inputDir, outputPath: string) =
  ## Converts the USFM files in `inputDir` into one TSV file at
  ## `outputPath`.
  ##
  ## Books are processed in `canonBookCodes` order. Each non-empty verse
  ## becomes one row, `code<TAB>chapter<TAB>verse<TAB>text`, and the file
  ## ends with a trailing newline. The parent directory of `outputPath` is
  ## created if needed.
  ##
  ## Raises `ValueError` when a book listed in `canonBookCodes` has no
  ## matching file, or when a chapter or verse line cannot be parsed.
  let bookPaths = findCanonFiles(inputDir)
  var records: seq[string]

  for code in canonBookCodes:
    if code notin bookPaths:
      raise newException(ValueError, "missing USFM file for " & code)

    # Parser state for the book currently being read.
    var
      chapter = 0
      verse = 0
      pending = ""

    proc emitPending() =
      # Writes the verse collected so far (if any) as a row, then resets
      # the verse state so stray lines are not attached to it.
      if chapter > 0 and verse > 0:
        let cleaned = normalizeWhitespace(pending).replace("\t", " ")
        if cleaned.len > 0:
          records.add(code & "\t" & $chapter & "\t" & $verse & "\t" & cleaned)
      verse = 0
      pending = ""

    for rawLine in lines(bookPaths[code]):
      let line = strip(rawLine)
      if line.startsWith("\\c "):
        # New chapter: close the open verse before switching.
        emitPending()
        chapter = parseInt(strip(line[3 .. ^1]))
      elif line.startsWith("\\v "):
        # New verse: close the previous one, then start collecting.
        emitPending()
        let (number, firstText) = parseVerseLine(line)
        verse = number
        pending = firstText
      elif verse > 0:
        # Any other line following a verse is treated as a continuation
        # of that verse.
        # NOTE(review): this also applies to non-verse lines such as
        # headings appearing mid-chapter; confirm against the input data.
        let extra = stripUsfmMarkup(line)
        if extra.len > 0:
          if pending.len > 0:
            pending.add(' ')
          pending.add(extra)

    # Flush the last verse of the book.
    emitPending()

  createDir(parentDir(outputPath))
  writeFile(outputPath, records.join("\n") & "\n")
when isMainModule:
  # Command-line entry point: expects exactly two arguments, the directory
  # holding the USFM files and the path of the TSV file to write.
  if paramCount() == 2:
    generate(paramStr(1), paramStr(2))
  else:
    quit("Usage: generate_kjv_data <usfm-dir> <output-tsv>", QuitFailure)