import std/[os, strutils, tables]

# Source archive: https://ebible.org/Scriptures/eng-kjv_usfm.zip

# Book codes to export, in output order. A USFM file whose code is not
# listed here is ignored; a listed code without a file is an error.
const canonBookCodes = [
  "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", "1SA", "2SA",
  "1KI", "2KI", "1CH", "2CH", "EZR", "NEH", "EST", "JOB", "PSA", "PRO",
  "ECC", "SNG", "ISA", "JER", "LAM", "EZK", "DAN", "HOS", "JOL", "AMO",
  "OBA", "JON", "MIC", "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL", "MAT",
  "MRK", "LUK", "JHN", "ACT", "ROM", "1CO", "2CO", "GAL", "EPH", "PHP",
  "COL", "1TH", "2TH", "1TI", "2TI", "TIT", "PHM", "HEB", "JAS", "1PE",
  "2PE", "1JN", "2JN", "3JN", "JUD", "REV"
]

func normalizeWhitespace(s: string): string =
  ## Collapses every run of ASCII whitespace in `s` into a single space and
  ## removes leading and trailing whitespace.
  s.splitWhitespace.join(" ")

func removeFootnotes(s: string): string =
  ## Returns `s` with every footnote span removed, i.e. everything from a
  ## `\f ` opener through the next `\f*` closer (both inclusive).
  ##
  ## If an opener has no closer, the opener and the rest of `s` are dropped.
  const
    opener = "\\f "
    closer = "\\f*"
  var i = 0
  while i < s.len:
    if s.continuesWith(opener, i):
      # Start the search after the `\f` so the opener itself cannot match.
      let closeIdx = s.find(closer, i + 2)
      if closeIdx < 0:
        break
      i = closeIdx + closer.len
    else:
      result.add(s[i])
      inc i

func stripUsfmMarkup(s: string): string =
  ## Returns the plain text of the USFM fragment `s`.
  ##
  ## Footnotes are removed first. Then every backslash marker (optionally
  ## prefixed with `+`, optionally closed with `*`) is dropped, and so is
  ## everything from a `|` up to the next backslash. The result is
  ## whitespace-normalized.
  let src = removeFootnotes(s)
  var i = 0
  while i < src.len:
    case src[i]
    of '\\':
      inc i
      if i < src.len and src[i] == '+':
        inc i
      # Marker name: letters, digits and '-'.
      while i < src.len and
          (src[i].isAlphaAscii or src[i].isDigit or src[i] == '-'):
        inc i
      if i < src.len and src[i] == '*':
        # Closing marker: whitespace after it belongs to the text.
        inc i
      else:
        # Opening marker: the whitespace after it is only a separator.
        while i < src.len and src[i].isSpaceAscii:
          inc i
    of '|':
      # Skip from '|' to the next marker (or the end of the string).
      while i < src.len and src[i] != '\\':
        inc i
    else:
      # Tabs need no special case: normalization below maps them to spaces.
      result.add(src[i])
      inc i
  result = normalizeWhitespace(result)

func parseVerseLine(line: string): tuple[verse: int, text: string] =
  ## Splits a verse line into its verse number and its markup-free text.
  ##
  ## The first three characters of `line` are skipped without being checked;
  ## the caller is expected to pass a line that starts with `\v `.
  ##
  ## Raises `ValueError` when nothing follows the verse number or when the
  ## number is not an integer.
  let rest = line[3..^1].strip
  let numberEnd = rest.find(' ')
  if numberEnd < 0:
    raise newException(ValueError, "verse marker without text: " & line)
  result.verse = parseInt(rest[0 ..< numberEnd])
  result.text = stripUsfmMarkup(rest[numberEnd + 1 .. ^1])

proc findCanonFiles(inputDir: string): Table[string, string] =
  ## Maps each known book code to the path of its USFM file in `inputDir`.
  ##
  ## Only files matching `*eng-kjv.usfm` are considered. The code is the
  ## part of the file name between the first '-' and `eng-kjv.usfm`; files
  ## whose code is not in `canonBookCodes` are skipped.
  const suffix = "eng-kjv.usfm"
  for path in walkFiles(inputDir / ("*" & suffix)):
    let name = path.extractFilename
    let dashIdx = name.find('-')
    let suffixIdx = name.find(suffix)
    if dashIdx >= 0 and suffixIdx > dashIdx:
      let code = name[dashIdx + 1 ..< suffixIdx]
      if code in canonBookCodes:
        result[code] = path

proc generate(inputDir, outputPath: string) =
  ## Reads the USFM files in `inputDir` and writes one line per verse to
  ## `outputPath` as `code<TAB>chapter<TAB>verse<TAB>text`, with books in
  ## `canonBookCodes` order. The parent directory of `outputPath` is created
  ## if needed.
  ##
  ## Raises `ValueError` when the file for a book is missing or a chapter or
  ## verse number cannot be parsed.
  let canonFiles = findCanonFiles(inputDir)
  var rows: seq[string] = @[]
  for code in canonBookCodes:
    if code notin canonFiles:
      raise newException(ValueError, "missing USFM file for " & code)
    var chapter = 0
    var verse = 0
    var verseText = ""

    proc flushVerse() =
      ## Emits the verse collected so far (if any) and resets the verse
      ## state. Verses whose text is empty produce no row.
      if chapter > 0 and verse > 0:
        # Normalization also turns tabs into spaces, so the text can never
        # contain the column separator.
        let text = normalizeWhitespace(verseText)
        if text.len > 0:
          rows.add([code, $chapter, $verse, text].join("\t"))
      verse = 0
      verseText = ""

    for rawLine in canonFiles[code].lines:
      let line = rawLine.strip
      if line.startsWith("\\c "):
        flushVerse()
        chapter = parseInt(line[3..^1].strip)
      elif line.startsWith("\\v "):
        flushVerse()
        let parsed = parseVerseLine(line)
        verse = parsed.verse
        verseText = parsed.text
      elif verse > 0:
        # Any other line that follows a verse is treated as its
        # continuation; lines before the first verse of a chapter are
        # ignored because flushVerse resets `verse` to 0.
        let continued = stripUsfmMarkup(line)
        if continued.len > 0:
          if verseText.len > 0:
            verseText.add(' ')
          verseText.add(continued)
    flushVerse()
  createDir(outputPath.parentDir)
  writeFile(outputPath, rows.join("\n") & "\n")

when isMainModule:
  if paramCount() != 2:
    quit("Usage: generate_kjv_data <usfm-dir> <output-tsv>", QuitFailure)
  generate(paramStr(1), paramStr(2))