# bibleref/tools/generate_kjv_data.nim

import std/[os, strutils, tables]

# Source archive: https://ebible.org/Scriptures/eng-kjv_usfm.zip

# Three-letter book codes of the 66-book canon, in canonical order.
# Output rows are emitted in this order, and a USFM file is only
# picked up when the code embedded in its file name appears here.
const canonBookCodes = [
  # Old Testament (39 books)
  "GEN", "EXO", "LEV", "NUM", "DEU",
  "JOS", "JDG", "RUT", "1SA", "2SA", "1KI", "2KI", "1CH", "2CH",
  "EZR", "NEH", "EST",
  "JOB", "PSA", "PRO", "ECC", "SNG",
  "ISA", "JER", "LAM", "EZK", "DAN",
  "HOS", "JOL", "AMO", "OBA", "JON", "MIC",
  "NAM", "HAB", "ZEP", "HAG", "ZEC", "MAL",
  # New Testament (27 books)
  "MAT", "MRK", "LUK", "JHN", "ACT",
  "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL",
  "1TH", "2TH", "1TI", "2TI", "TIT", "PHM",
  "HEB", "JAS", "1PE", "2PE", "1JN", "2JN", "3JN", "JUD",
  "REV"
]

proc normalizeWhitespace(s: string): string =
  ## Collapses every run of ASCII whitespace in `s` to a single space and
  ## removes leading and trailing whitespace.
  ##
  ## An empty or all-whitespace input yields the empty string.
  # Splitting on whitespace already discards empty pieces at both ends,
  # so re-joining with one space gives the trimmed, collapsed form.
  s.splitWhitespace.join(" ")

proc removeFootnotes(s: string): string =
  ## Returns `s` with every `\f ... \f*` footnote span removed.
  ##
  ## Text outside footnotes is copied unchanged. If a footnote is opened
  ## but never closed, everything from its opening marker to the end of
  ## `s` is dropped.
  var pos = 0
  while pos < s.len:
    let openIdx = s.find("\\f ", pos)
    if openIdx < 0:
      # No further footnote: keep the remaining tail verbatim.
      result.add(s[pos .. ^1])
      break

    # Keep the text in front of the footnote, then jump past its end.
    result.add(s[pos ..< openIdx])
    let closeIdx = s.find("\\f*", openIdx + 2)
    if closeIdx < 0:
      break
    pos = closeIdx + 3

proc stripUsfmMarkup(s: string): string =
  ## Reduces a fragment of USFM to plain text.
  ##
  ## Footnotes are removed first. After that every backslash marker
  ## (opening, closing, or `\+` nested) is dropped, a `|` and everything
  ## after it up to the next backslash is dropped, and the remaining text
  ## is whitespace-normalised.
  const markerNameChars = Letters + Digits + {'-'}
  let
    text = removeFootnotes(s)
    n = text.len
  var pos = 0

  while pos < n:
    let ch = text[pos]
    if ch == '\\':
      # Marker: backslash, optional '+', then the marker name.
      inc pos
      if pos < n and text[pos] == '+':
        inc pos
      while pos < n and text[pos] in markerNameChars:
        inc pos

      if pos < n and text[pos] == '*':
        # Closing marker: consume the '*' but keep what follows it.
        inc pos
      else:
        # Opening marker: the whitespace after it only delimits the
        # marker, so it is skipped rather than copied.
        while pos < n and text[pos] in Whitespace:
          inc pos
    elif ch == '|':
      # Attribute run: discard up to the next marker (or the end).
      while pos < n and text[pos] != '\\':
        inc pos
    else:
      result.add(if ch == '\t': ' ' else: ch)
      inc pos

  result = normalizeWhitespace(result)

proc parseVerseLine(line: string): tuple[verse: int, text: string] =
  ## Parses a verse line of the form `\v <number> <text>`.
  ##
  ## The first three characters of `line` are skipped as the marker. The
  ## token up to the first space is parsed as the verse number, and the
  ## rest is returned with USFM markup stripped.
  ##
  ## Raises `ValueError` when no space follows the number (a verse marker
  ## with no text) or when the number is not a valid integer.
  let
    body = line[3 .. ^1].strip
    parts = body.split(' ', maxsplit = 1)
  if parts.len < 2:
    raise newException(ValueError, "verse marker without text: " & line)

  (verse: parseInt(parts[0]), text: stripUsfmMarkup(parts[1]))

proc findCanonFiles(inputDir: string): Table[string, string] =
  ## Maps each canonical book code to the path of its USFM file.
  ##
  ## Scans `inputDir` for files matching `*eng-kjv.usfm`. The book code is
  ## the part of the file name between the first `-` and the
  ## `eng-kjv.usfm` suffix. Files whose code is not in `canonBookCodes`
  ## are ignored.
  for path in walkFiles(inputDir / "*eng-kjv.usfm"):
    let
      fileName = extractFilename(path)
      dashAt = fileName.find('-')
      suffixAt = fileName.find("eng-kjv.usfm")

    # Skip names with no '-' in front of the suffix.
    if dashAt < 0 or suffixAt <= dashAt:
      continue

    let code = fileName[dashAt + 1 ..< suffixAt]
    if code in canonBookCodes:
      result[code] = path

proc generate(inputDir, outputPath: string) =
  ## Converts the USFM files in `inputDir` into a tab-separated file at
  ## `outputPath`.
  ##
  ## One row is written per non-empty verse, in `canonBookCodes` order:
  ## `code<TAB>chapter<TAB>verse<TAB>text`. Rows are newline-separated and
  ## the file ends with a newline. The parent directory of `outputPath`
  ## is created if needed.
  ##
  ## Raises `ValueError` when a canonical book has no USFM file, or when a
  ## chapter or verse marker cannot be parsed.
  let bookPaths = findCanonFiles(inputDir)
  var rows: seq[string] = @[]

  for code in canonBookCodes:
    if code notin bookPaths:
      raise newException(ValueError, "missing USFM file for " & code)

    var
      chapter = 0
      verse = 0
      verseText = ""

    proc flushVerse() =
      # Emits the verse collected so far (if any) and resets the
      # per-verse state. The chapter number is left untouched.
      if chapter > 0 and verse > 0:
        # Normalised text contains no tabs, so it is safe as the last
        # TSV column without further escaping.
        let text = normalizeWhitespace(verseText)
        if text.len > 0:
          rows.add(code & "\t" & $chapter & "\t" & $verse & "\t" & text)
      verse = 0
      verseText = ""

    for rawLine in lines(bookPaths[code]):
      let line = rawLine.strip

      if line.startsWith("\\c "):
        # New chapter: close the pending verse first.
        flushVerse()
        chapter = parseInt(line[3 .. ^1].strip)
      elif line.startsWith("\\v "):
        flushVerse()
        let (verseNo, firstText) = parseVerseLine(line)
        verse = verseNo
        verseText = firstText
      elif verse > 0:
        # Any other line after a verse marker is treated as a
        # continuation of that verse.
        # NOTE(review): this also applies to heading-type lines (e.g.
        # `\s`, `\d`) if the input has them mid-chapter - confirm
        # against the source files.
        let extra = stripUsfmMarkup(line)
        if extra.len > 0:
          if verseText.len > 0:
            verseText.add(' ')
          verseText.add(extra)

    # The last verse of the book has no following marker to flush it.
    flushVerse()

  createDir(parentDir(outputPath))
  writeFile(outputPath, rows.join("\n") & "\n")

when isMainModule:
  # Command-line entry point: expects exactly the USFM input directory
  # and the output TSV path; anything else prints usage and exits with
  # a failure status.
  if paramCount() == 2:
    generate(paramStr(1), paramStr(2))
  else:
    quit("Usage: generate_kjv_data <usfm-dir> <output-tsv>", QuitFailure)