Add embedded KJV support
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
import std/[os, strutils, tables]
|
||||
|
||||
# Source archive: https://ebible.org/Scriptures/eng-kjv_usfm.zip
|
||||
|
||||
# Three-letter USFM book codes accepted by this generator. The order of this
# list is the order in which books are written to the output, and files whose
# code is not listed here are ignored.
const canonBookCodes = [
  # Old Testament (39 books)
  "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", "1SA", "2SA",
  "1KI", "2KI", "1CH", "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO",
  "ECC", "SNG", "ISA", "JER", "LAM", "EZK", "DAN", "HOS", "JOL", "AMO",
  "OBA", "JON", "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
  # New Testament (27 books)
  "MAT", "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH",
  "PHP", "COL", "1TH", "2TH", "1TI", "2TI", "TIT", "PHM", "HEB", "JAS",
  "1PE", "2PE", "1JN", "2JN", "3JN", "JUD", "REV"
]
|
||||
|
||||
proc normalizeWhitespace(s: string): string =
  ## Collapse every run of ASCII whitespace in `s` (spaces, tabs, newlines,
  ## ...) into a single space and drop leading/trailing whitespace.
  ##
  ## Returns an empty string when `s` is empty or contains only whitespace.
  # `splitWhitespace` already skips leading/trailing blanks and treats any
  # run of whitespace as one separator, so re-joining with a single space
  # gives the normalized form without a hand-written state machine.
  result = s.splitWhitespace.join(" ")
|
||||
|
||||
proc removeFootnotes(s: string): string =
  ## Return `s` with every USFM footnote span removed.
  ##
  ## A span starts at `\f` followed by a space and runs through the next
  ## `\f*`. Text outside such spans is copied unchanged. If a footnote is
  ## opened but never closed, everything from its opening marker to the end
  ## of `s` is dropped.
  const
    opener = "\\f "   # also matches "\f +", "\f -", ... (caller-style prefix)
    closer = "\\f*"
  var pos = 0
  while pos < s.len:
    let openIdx = s.find(opener, pos)
    if openIdx < 0:
      # No further footnotes: keep the remainder verbatim.
      result.add(s[pos .. ^1])
      break
    result.add(s[pos ..< openIdx])
    # Start the search just past the "\f" so the opener itself is skipped.
    let closeIdx = s.find(closer, openIdx + 2)
    if closeIdx < 0:
      # Unterminated footnote: discard the rest of the input.
      break
    pos = closeIdx + closer.len
|
||||
|
||||
proc stripUsfmMarkup(s: string): string =
  ## Reduce a fragment of USFM to plain text.
  ##
  ## Footnotes are removed first (via `removeFootnotes`). Then:
  ## * a backslash marker (`\name`, `\+name`, optionally ending in `*`) is
  ##   dropped; after a non-closing marker the following whitespace is
  ##   dropped as well;
  ## * from a `|` everything up to (not including) the next backslash is
  ##   dropped;
  ## * every other character is kept, with tabs turned into spaces.
  ## The result is finally passed through `normalizeWhitespace`.
  const markerNameChars = Letters + Digits + {'-'}
  let src = removeFootnotes(s)
  var pos = 0

  while pos < src.len:
    let ch = src[pos]
    if ch == '\\':
      inc pos
      # Optional '+' prefix used by nested markers.
      if pos < src.len and src[pos] == '+':
        inc pos
      while pos < src.len and src[pos] in markerNameChars:
        inc pos
      if pos < src.len and src[pos] == '*':
        # Closing marker: consume the '*' but keep whatever follows it.
        inc pos
      else:
        # Opening marker: swallow the whitespace separating it from its
        # content.
        while pos < src.len and src[pos].isSpaceAscii:
          inc pos
    elif ch == '|':
      # Skip to the next marker (or the end of the input).
      while pos < src.len and src[pos] != '\\':
        inc pos
    else:
      result.add(if ch == '\t': ' ' else: ch)
      inc pos

  result = normalizeWhitespace(result)
|
||||
|
||||
proc parseVerseLine(line: string): tuple[verse: int, text: string] =
  ## Split a verse line into its verse number and its plain text.
  ##
  ## The first three characters of `line` are skipped unconditionally (the
  ## caller passes lines that start with `\v `). What remains is trimmed and
  ## split at its first space: the left part is parsed as the verse number,
  ## the right part is cleaned with `stripUsfmMarkup`.
  ##
  ## Raises `ValueError` when there is no space after the verse number, or
  ## when the number is not a valid integer.
  let body = line[3 .. ^1].strip
  let sepIdx = body.find(' ')
  if sepIdx < 0:
    raise newException(ValueError, "verse marker without text: " & line)

  (
    verse: parseInt(body[0 ..< sepIdx]),
    text: stripUsfmMarkup(body[sepIdx + 1 .. ^1])
  )
|
||||
|
||||
proc findCanonFiles(inputDir: string): Table[string, string] =
  ## Map book code -> file path for the USFM files found in `inputDir`.
  ##
  ## Only files matching `*eng-kjv.usfm` are considered. The book code is the
  ## text between the first `-` in the file name and the `eng-kjv.usfm`
  ## suffix; files whose code is not in `canonBookCodes` are skipped.
  const suffix = "eng-kjv.usfm"
  for usfmPath in walkFiles(inputDir / "*eng-kjv.usfm"):
    let
      fileName = usfmPath.extractFilename
      dashPos = fileName.find('-')
      suffixPos = fileName.find(suffix)
    # Need a dash that sits before the suffix to delimit the code.
    if dashPos < 0 or suffixPos <= dashPos:
      continue
    let bookCode = fileName[dashPos + 1 ..< suffixPos]
    if bookCode in canonBookCodes:
      result[bookCode] = usfmPath
|
||||
|
||||
proc generate(inputDir, outputPath: string) =
  ## Convert the per-book USFM files in `inputDir` into one TSV file.
  ##
  ## Each output row is `book<TAB>chapter<TAB>verse<TAB>text`. Books are
  ## written in `canonBookCodes` order, verses in the order they appear in
  ## each file. Rows are joined with newlines and the file ends with a
  ## newline. The parent directory of `outputPath` is created before writing.
  ##
  ## Raises `ValueError` when a book listed in `canonBookCodes` has no
  ## matching file, or when a chapter/verse number cannot be parsed.
  let canonFiles = findCanonFiles(inputDir)
  var rows: seq[string] = @[]

  for code in canonBookCodes:
    if not canonFiles.hasKey(code):
      raise newException(ValueError, "missing USFM file for " & code)

    var chapter = 0
    var verse = 0
    var verseText = ""

    proc flushVerse() =
      ## Emit the pending verse, if there is one, then clear it.
      if chapter > 0 and verse > 0:
        # normalizeWhitespace already turns tabs into single spaces, so the
        # text cannot introduce extra TSV columns.
        let text = normalizeWhitespace(verseText)
        if text.len > 0:
          rows.add([code, $chapter, $verse, text].join("\t"))
      # Clear unconditionally: a verse seen before the first `\c` must not
      # linger and later be written under the chapter that follows it.
      verse = 0
      verseText = ""

    for rawLine in canonFiles[code].lines:
      let line = rawLine.strip

      if line.startsWith("\\c "):
        flushVerse()
        chapter = parseInt(line[3 .. ^1].strip)
      elif line.startsWith("\\v "):
        flushVerse()
        let parsed = parseVerseLine(line)
        verse = parsed.verse
        verseText = parsed.text
      elif verse > 0:
        # Any other line while a verse is open is treated as a continuation
        # of that verse's text.
        let continued = stripUsfmMarkup(line)
        if continued.len > 0:
          if verseText.len > 0:
            verseText.add(' ')
          verseText.add(continued)

    # The last verse of the book has no following marker to trigger a flush.
    flushVerse()

  createDir(outputPath.parentDir)
  writeFile(outputPath, rows.join("\n") & "\n")
|
||||
|
||||
when isMainModule:
  # Command-line entry point: exactly two arguments are required, the
  # directory holding the USFM files and the path of the TSV to write.
  let argCount = paramCount()
  if argCount == 2:
    generate(paramStr(1), paramStr(2))
  else:
    quit("Usage: generate_kjv_data <usfm-dir> <output-tsv>", QuitFailure)
|
||||
Reference in New Issue
Block a user