145 lines
4.1 KiB
Nim
145 lines
4.1 KiB
Nim
import std/[os, strutils, tables]
|
|
|
|
# Source archive: https://ebible.org/Scriptures/eng-kjv_usfm.zip
|
|
|
|
# Three-letter book codes accepted by `findCanonFiles` and iterated, in this
# exact order, by `generate` when emitting rows.
const canonBookCodes = [
  # Old Testament (39 books)
  "GEN", "EXO", "LEV", "NUM", "DEU",
  "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI",
  "1CH", "2CH", "EZR", "NEH", "EST",
  "JOB", "PSA", "PRO", "ECC", "SNG",
  "ISA", "JER", "LAM", "EZK", "DAN",
  "HOS", "JOL", "AMO", "OBA", "JON", "MIC",
  "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
  # New Testament (27 books)
  "MAT", "MRK", "LUK", "JHN", "ACT",
  "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL",
  "1TH", "2TH", "1TI", "2TI", "TIT", "PHM",
  "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
  "REV"
]
|
|
|
|
proc normalizeWhitespace(s: string): string =
  ## Returns `s` with every run of ASCII whitespace collapsed into a single
  ## space and with leading/trailing whitespace removed.
  # `splitWhitespace` yields only the non-empty, whitespace-delimited fields,
  # so re-joining them with one space both collapses runs and trims the ends.
  result = s.splitWhitespace.join(" ")
|
|
|
|
proc removeFootnotes(s: string): string =
  ## Returns `s` with every `\f ...\f*` span removed.
  ##
  ## If an opening `\f ` has no matching `\f*`, the opener and everything
  ## after it are dropped.
  const
    # Any text starting with `\f +` also starts with `\f `, so matching this
    # one prefix covers both spellings of the opener.
    opener = "\\f "
    closer = "\\f*"
  var cursor = 0
  while cursor < s.len:
    let openAt = s.find(opener, cursor)
    if openAt < 0:
      # No further footnotes: keep the remainder verbatim.
      result.add(s[cursor .. ^1])
      return
    result.add(s[cursor ..< openAt])
    let closeAt = s.find(closer, openAt + 2)
    if closeAt < 0:
      # Unterminated footnote: discard the rest of the input.
      return
    cursor = closeAt + closer.len
|
|
|
|
proc stripUsfmMarkup(s: string): string =
  ## Reduces a fragment of USFM text to plain text.
  ##
  ## Steps, in order:
  ## * footnote spans are removed via `removeFootnotes`;
  ## * each backslash marker (`\name`, `\+name`, `\name*`) is dropped, and
  ##   whitespace directly after a non-closing marker is dropped with it;
  ## * everything from a `|` up to (not including) the next backslash is
  ##   dropped (presumably marker attributes -- confirm against the input);
  ## * tabs become spaces;
  ## * the result is passed through `normalizeWhitespace`.
  const markerNameChars = Letters + Digits + {'-'}
  let text = removeFootnotes(s)
  var
    pos = 0
    plain = ""

  while pos < text.len:
    let ch = text[pos]
    if ch == '\\':
      inc pos
      # Optional '+' prefix directly after the backslash.
      if pos < text.len and text[pos] == '+':
        inc pos
      while pos < text.len and text[pos] in markerNameChars:
        inc pos
      if pos < text.len and text[pos] == '*':
        # Closing marker: consume the '*' but keep any following whitespace.
        inc pos
      else:
        # Opening marker: the whitespace separating it from its content is
        # part of the markup, not of the text.
        while pos < text.len and text[pos] in Whitespace:
          inc pos
    elif ch == '|':
      while pos < text.len and text[pos] != '\\':
        inc pos
    else:
      plain.add(if ch == '\t': ' ' else: ch)
      inc pos

  result = normalizeWhitespace(plain)
|
|
|
|
proc parseVerseLine(line: string): tuple[verse: int, text: string] =
  ## Splits a verse line into its verse number and its plain text.
  ##
  ## The first three characters of `line` are skipped unconditionally (callers
  ## pass lines that start with `\v `). The remainder must be a number, a
  ## space, and the verse text, which is cleaned with `stripUsfmMarkup`.
  ##
  ## Raises `ValueError` when no space follows the number, or when the number
  ## cannot be parsed by `parseInt`.
  let payload = strip(line[3 .. ^1])
  let sep = payload.find(' ')
  if sep < 0:
    raise newException(ValueError, "verse marker without text: " & line)

  (verse: parseInt(payload[0 ..< sep]),
   text: stripUsfmMarkup(payload[sep + 1 .. ^1]))
|
|
|
|
proc findCanonFiles(inputDir: string): Table[string, string] =
  ## Maps book code -> file path for the files in `inputDir` whose name ends
  ## in `eng-kjv.usfm`.
  ##
  ## The code is the text between the first `-` in the file name and the
  ## first occurrence of `eng-kjv.usfm`; files whose code is not listed in
  ## `canonBookCodes` are ignored.
  const suffix = "eng-kjv.usfm"
  for usfmPath in walkFiles(inputDir / ("*" & suffix)):
    let fileName = extractFilename(usfmPath)
    let dashPos = fileName.find('-')
    let suffixPos = fileName.find(suffix)
    # Skip names with no '-' before the suffix (nothing to extract).
    if dashPos < 0 or suffixPos <= dashPos:
      continue
    let bookCode = fileName[dashPos + 1 ..< suffixPos]
    if bookCode in canonBookCodes:
      result[bookCode] = usfmPath
|
|
|
|
proc generate(inputDir, outputPath: string) =
  ## Converts the USFM files found in `inputDir` into a tab-separated file at
  ## `outputPath`, one row per verse: book code, chapter, verse, text.
  ##
  ## Books are processed in `canonBookCodes` order. Raises `ValueError` when
  ## a book has no matching file, or when a chapter/verse number cannot be
  ## parsed. The parent directory of `outputPath` is created if needed.
  let usfmByBook = findCanonFiles(inputDir)
  var tsvLines: seq[string]

  for code in canonBookCodes:
    if code notin usfmByBook:
      raise newException(ValueError, "missing USFM file for " & code)

    var
      chapterNo = 0
      verseNo = 0
      pending = ""

    proc emitPending() =
      ## Appends the buffered verse as a row when there is one with text,
      ## then clears the buffer so the next verse starts fresh.
      if chapterNo > 0 and verseNo > 0:
        # Tabs would corrupt the column layout, so none may survive here.
        let cleaned = normalizeWhitespace(pending).replace("\t", " ")
        if cleaned.len > 0:
          tsvLines.add(code & "\t" & $chapterNo & "\t" & $verseNo & "\t" &
                       cleaned)
      verseNo = 0
      pending = ""

    for rawLine in lines(usfmByBook[code]):
      let trimmed = strip(rawLine)

      if trimmed.startsWith("\\c "):
        # New chapter: finish the verse in progress before renumbering.
        emitPending()
        chapterNo = parseInt(strip(trimmed[3 .. ^1]))
      elif trimmed.startsWith("\\v "):
        emitPending()
        let parsed = parseVerseLine(trimmed)
        verseNo = parsed.verse
        pending = parsed.text
      elif verseNo > 0:
        # Any other line while a verse is open continues that verse.
        let extra = stripUsfmMarkup(trimmed)
        if extra.len > 0:
          if pending.len > 0:
            pending.add(' ')
          pending.add(extra)

    # The last verse of the book has no following marker to trigger a flush.
    emitPending()

  createDir(parentDir(outputPath))
  writeFile(outputPath, tsvLines.join("\n") & "\n")
|
|
|
|
when isMainModule:
  # Exactly two positional arguments are required: the directory holding the
  # USFM files and the path of the TSV file to write.
  let cliArgs = commandLineParams()
  if cliArgs.len != 2:
    quit("Usage: generate_kjv_data <usfm-dir> <output-tsv>", QuitFailure)

  generate(cliArgs[0], cliArgs[1])
|