Add private MEV embedded support

2026-06-14 08:14:27 -05:00
parent 544062902b
commit 42d2587704
6 changed files with 473 additions and 113 deletions
@@ -9,6 +9,7 @@ import cliutils, docopt, zero_functional
 import ./api_bible
 import ./esv
 import ./kjv
+import ./mev

 proc formatMarkdown(raw, translation: string): string =
  var reference = ""
@@ -80,6 +81,8 @@ proc fetchPassages(reference, translation: string, cfg: CombinedConfig): seq[str
      cfg.getVal("esv-api-root", "https://api.esv.org"))
  of "akjv", "kjv":
    kjv.fetchPassages(reference)
+  of "mev":
+    mev.fetchPassages(reference)
  of "amp", "nkjv", "niv":
    api_bible.fetchPassages(
      reference,
@@ -92,7 +95,7 @@ proc fetchPassages(reference, translation: string, cfg: CombinedConfig): seq[str
  else:
    raise newException(ValueError,
      "unsupported translation '" & translation &
-        "'; supported translations: akjv, amp, esv, kjv, nkjv, niv")
+        "'; supported translations: akjv, amp, esv, kjv, mev, nkjv, niv")

 when isMainModule:
  const USAGE = """Usage:
@@ -110,8 +113,8 @@ Options:

  -t, --translation <translation>
                                Select a specific translation. Supported values
-                                are 'akjv', 'amp', 'esv', 'kjv', 'nkjv', and
-                                'niv'. Defaults to 'esv'.
+                                are 'akjv', 'amp', 'esv', 'kjv', 'mev',
+                                'nkjv', and 'niv'. Defaults to 'esv'.

  --esv-api-token <token>       Provide the API token on the command line. By
                                default this will be read either from the
@@ -0,0 +1,112 @@
+import std/[strutils, tables]
+
+import ./reference_parser
+
+type BibleIndex = object
+  ## Lookup tables filled by `loadBibleIndex` from tab-separated verse rows.
+  verses: Table[string, string] ## verse text, keyed by `verseKey`
+  lastVerseByChapter: Table[string, int] ## highest verse seen, by `chapterKey`
+  lastChapterByBook: Table[string, int] ## highest chapter seen, by book code
+  translationName: string ## label interpolated into error messages
+
+proc verseKey(code: string, chapter, verse: int): string =
+  ## Builds the tab-delimited `code<TAB>chapter<TAB>verse` lookup key.
+  [code, $chapter, $verse].join("\t")
+
+proc chapterKey(code: string, chapter: int): string =
+  ## Builds the tab-delimited `code<TAB>chapter` lookup key.
+  result = code
+  result.add('\t')
+  result.add($chapter)
+
+proc loadBibleIndex(rows, translationName: string): BibleIndex =
+  ## Parses `rows` (one `code<TAB>chapter<TAB>verse<TAB>text` record per
+  ## line) into a `BibleIndex`. Blank or whitespace-only lines are ignored.
+  ##
+  ## Raises `ValueError` when a line does not split into four tab-separated
+  ## fields, or (from `parseInt`) when chapter or verse is not an integer.
+  result.translationName = translationName
+
+  for row in rows.splitLines:
+    if row.isEmptyOrWhitespace:
+      continue
+
+    # maxsplit = 3 leaves any further tabs inside the fourth (text) field.
+    let fields = row.split('\t', maxsplit = 3)
+    if fields.len != 4:
+      raise newException(ValueError,
+        "invalid embedded " & translationName & " row: " & row)
+
+    let
+      code = fields[0]
+      chapter = parseInt(fields[1])
+      verse = parseInt(fields[2])
+      cKey = chapterKey(code, chapter)
+
+    result.verses[verseKey(code, chapter, verse)] = fields[3]
+
+    # Remember the highest verse per chapter and highest chapter per book.
+    if cKey notin result.lastVerseByChapter or
+        result.lastVerseByChapter[cKey] < verse:
+      result.lastVerseByChapter[cKey] = verse
+
+    if code notin result.lastChapterByBook or
+        result.lastChapterByBook[code] < chapter:
+      result.lastChapterByBook[code] = chapter
+
+proc requireLastChapter(index: BibleIndex, code: string): int =
+  ## Returns the highest chapter number indexed for book `code`.
+  ## Raises `ValueError` when the index holds no rows for that book.
+  if code in index.lastChapterByBook:
+    return index.lastChapterByBook[code]
+
+  raise newException(ValueError,
+    "no embedded " & index.translationName & " data for " & code)
+
+proc requireLastVerse(index: BibleIndex, code: string, chapter: int): int =
+  ## Returns the highest verse number indexed for `chapter` of book `code`.
+  ## Raises `ValueError` when that chapter has no rows in the index.
+  let key = chapterKey(code, chapter)
+  if key in index.lastVerseByChapter:
+    return index.lastVerseByChapter[key]
+
+  raise newException(ValueError,
+    "no embedded " & index.translationName & " data for " &
+      bookInfo(code).name & " " & $chapter)
+
+proc requireVerse(index: BibleIndex, code: string, chapter, verse: int): string =
+  ## Returns the text stored for `code` `chapter`:`verse`.
+  ## Raises `ValueError` when that verse is not in the index.
+  let key = verseKey(code, chapter, verse)
+  if key in index.verses:
+    return index.verses[key]
+
+  raise newException(ValueError,
+    "no embedded " & index.translationName & " data for " &
+      bookInfo(code).name & " " & $chapter & ":" & $verse)
+
+proc addVerseLines(
+    lines: var seq[string],
+    index: BibleIndex,
+    reference: PassageReference,
+    range: RefRange) =
+  ## Appends one `  [verse] text` line to `lines` for every verse that
+  ## `range` covers in the book named by `reference`.
+  ##
+  ## A start verse that is not positive means "from verse 1"; a finish verse
+  ## that is not positive means "through the chapter's last indexed verse".
+  ## Chapters strictly between start and finish are emitted whole.
+  ##
+  ## Raises `ValueError` when the book, a chapter, or a verse is missing
+  ## from the index, or when a chapter's start verse exceeds its end verse.
+  let bookCode = reference.book.code
+  # Fail up front with the book-level message if nothing is indexed.
+  discard index.requireLastChapter(bookCode)
+
+  for chapter in range.start.chapter .. range.finish.chapter:
+    var fromVerse = 1
+    if chapter == range.start.chapter and range.start.verse > 0:
+      fromVerse = range.start.verse
+
+    var toVerse = 0
+    if chapter == range.finish.chapter and range.finish.verse > 0:
+      toVerse = range.finish.verse
+    else:
+      toVerse = index.requireLastVerse(bookCode, chapter)
+
+    if fromVerse > toVerse:
+      raise newException(ValueError, "reference range starts after it ends")
+
+    for verse in fromVerse .. toVerse:
+      lines.add("  [" & $verse & "] " & index.requireVerse(bookCode, chapter, verse))
+
+proc fetchReference(index: BibleIndex, reference: PassageReference): string =
+  ## Renders `reference` as a header line (`$reference`) followed by one
+  ## `  [verse] text` line per verse, joined with newlines.
+  ##
+  ## A reference without ranges selects every indexed chapter and verse of
+  ## its book; otherwise each range is expanded by `addVerseLines`.
+  var rendered = @[$reference]
+
+  if reference.ranges.len > 0:
+    for span in reference.ranges:
+      rendered.addVerseLines(index, reference, span)
+  else:
+    let bookCode = reference.book.code
+    for chapter in 1 .. index.requireLastChapter(bookCode):
+      for verse in 1 .. index.requireLastVerse(bookCode, chapter):
+        rendered.add("  [" & $verse & "] " & index.requireVerse(bookCode, chapter, verse))
+
+  result = rendered.join("\n")
+
+proc fetchPassages*(rows, reference, translationName: string): seq[string] =
+  ## Builds an index from `rows`, parses `reference` with `parseReferences`,
+  ## and returns one rendered passage per parsed reference.
+  ## `translationName` is used only to label error messages.
+  let bible = loadBibleIndex(rows, translationName)
+  for parsed in parseReferences(reference):
+    result.add(bible.fetchReference(parsed))
@@ -1,109 +1,7 @@
-import std/[strutils, tables]
-
+import ./embedded_bible
 import ./offline_data
-import ./reference_parser

 const kjvRows = embeddedTranslationData("kjv")

-type BibleIndex = object
-  verses: Table[string, string]
-  lastVerseByChapter: Table[string, int]
-  lastChapterByBook: Table[string, int]
-
-proc verseKey(code: string, chapter, verse: int): string =
-  code & "\t" & $chapter & "\t" & $verse
-
-proc chapterKey(code: string, chapter: int): string =
-  code & "\t" & $chapter
-
-proc loadBibleIndex(): BibleIndex =
-  for line in kjvRows.splitLines:
-    if line.strip.len == 0:
-      continue
-
-    let parts = line.split('\t', maxsplit = 3)
-    if parts.len != 4:
-      raise newException(ValueError, "invalid embedded KJV row: " & line)
-
-    let code = parts[0]
-    let chapter = parseInt(parts[1])
-    let verse = parseInt(parts[2])
-    let text = parts[3]
-
-    result.verses[verseKey(code, chapter, verse)] = text
-
-    let cKey = chapterKey(code, chapter)
-    if not result.lastVerseByChapter.hasKey(cKey) or
-        verse > result.lastVerseByChapter[cKey]:
-      result.lastVerseByChapter[cKey] = verse
-
-    if not result.lastChapterByBook.hasKey(code) or
-        chapter > result.lastChapterByBook[code]:
-      result.lastChapterByBook[code] = chapter
-
-proc requireLastChapter(index: BibleIndex, code: string): int =
-  if not index.lastChapterByBook.hasKey(code):
-    raise newException(ValueError, "no embedded KJV data for " & code)
-  index.lastChapterByBook[code]
-
-proc requireLastVerse(index: BibleIndex, code: string, chapter: int): int =
-  let cKey = chapterKey(code, chapter)
-  if not index.lastVerseByChapter.hasKey(cKey):
-    raise newException(ValueError,
-      "no embedded KJV data for " & bookInfo(code).name & " " & $chapter)
-  index.lastVerseByChapter[cKey]
-
-proc requireVerse(index: BibleIndex, code: string, chapter, verse: int): string =
-  let vKey = verseKey(code, chapter, verse)
-  if not index.verses.hasKey(vKey):
-    raise newException(ValueError,
-      "no embedded KJV data for " & bookInfo(code).name & " " &
-        $chapter & ":" & $verse)
-  index.verses[vKey]
-
-proc addVerseLines(
-    lines: var seq[string],
-    index: BibleIndex,
-    reference: PassageReference,
-    range: RefRange) =
-
-  let code = reference.book.code
-  discard index.requireLastChapter(code)
-
-  for chapter in range.start.chapter .. range.finish.chapter:
-    let startVerse =
-      if chapter == range.start.chapter and range.start.verse > 0:
-        range.start.verse
-      else:
-        1
-
-    let endVerse =
-      if chapter == range.finish.chapter and range.finish.verse > 0:
-        range.finish.verse
-      else:
-        index.requireLastVerse(code, chapter)
-
-    if startVerse > endVerse:
-      raise newException(ValueError, "reference range starts after it ends")
-
-    for verse in startVerse .. endVerse:
-      lines.add("  [" & $verse & "] " & index.requireVerse(code, chapter, verse))
-
-proc fetchReference(index: BibleIndex, reference: PassageReference): string =
-  var lines = @[$reference]
-  let code = reference.book.code
-
-  if reference.ranges.len == 0:
-    for chapter in 1 .. index.requireLastChapter(code):
-      for verse in 1 .. index.requireLastVerse(code, chapter):
-        lines.add("  [" & $verse & "] " & index.requireVerse(code, chapter, verse))
-  else:
-    for range in reference.ranges:
-      lines.addVerseLines(index, reference, range)
-
-  lines.join("\n")
-
 proc fetchPassages*(reference: string): seq[string] =
-  let index = loadBibleIndex()
-  for parsedReference in parseReferences(reference):
-    result.add(fetchReference(index, parsedReference))
+  embedded_bible.fetchPassages(kjvRows, reference, "KJV")
@@ -0,0 +1,13 @@
+import ./offline_data
+
+when hasEmbeddedTranslationData("mev"):
+  import ./embedded_bible
+
+  const mevRows = embeddedTranslationData("mev")
+
+proc fetchPassages*(reference: string): seq[string] =
+  ## Looks up `reference` in the embedded MEV rows.
+  ##
+  ## The branch is chosen at compile time: when no MEV data file was found,
+  ## the proc instead raises `ValueError` explaining how to add one.
+  when not hasEmbeddedTranslationData("mev"):
+    raise newException(ValueError,
+      "MEV data is not embedded; generate data/private/mev.tsv and rebuild")
+  else:
+    embedded_bible.fetchPassages(mevRows, reference, "MEV")
@@ -1,11 +1,15 @@
 import std/os

-template embeddedTranslationData*(name: static[string]): string =
+template translationDataPath(name: static[string], visibility: static[string]): string =
  const dataRoot = currentSourcePath().parentDir.parentDir / "data"
-  const privatePath = dataRoot / "private" / (name & ".tsv")
-  const publicPath = dataRoot / "public" / (name & ".tsv")
+  dataRoot / visibility / (name & ".tsv")

-  when fileExists(privatePath):
-    staticRead(privatePath)
+template hasEmbeddedTranslationData*(name: static[string]): bool =
+  fileExists(translationDataPath(name, "private")) or
+    fileExists(translationDataPath(name, "public"))
+
+template embeddedTranslationData*(name: static[string]): string =
+  when fileExists(translationDataPath(name, "private")):
+    staticRead(translationDataPath(name, "private"))
  else:
-    staticRead(publicPath)
+    staticRead(translationDataPath(name, "public"))
@@ -0,0 +1,330 @@
+import std/[
+  htmlparser,
+  os,
+  osproc,
+  streams,
+  strutils,
+  xmlparser,
+  xmltree
+]
+
+import ../src/reference_parser
+
+type
+  TocEntry = object
+    ## One `navPoint` from the EPUB `toc.ncx` that points at a split file.
+    label: string ## navPoint label text
+    code: string ## result of `bookCodeForLabel`; empty when no book matched
+    fileIndex: int ## number parsed from the `index_split_NNN.html` name
+
+  BookSource = object
+    ## Inclusive range of split-file indices that `parseBook` reads for a book.
+    code: string
+    startIndex: int
+    endIndex: int
+
+  ParseState = object
+    ## Mutable cursor threaded through `walkPassageText` while parsing a book.
+    code: string ## book code written into every emitted row
+    chapter: int ## current chapter; 0 until one is known
+    verse: int ## current verse; 0 until one is known
+    verseText: string ## text gathered so far for the current verse
+    rows: seq[string] ## finished tab-separated rows
+
+proc normalizeWhitespace(s: string): string =
+  ## Turns UTF-8 no-break spaces into plain spaces, collapses every run of
+  ## ASCII whitespace into a single space, and trims both ends.
+  s.replace("\xC2\xA0", " ").splitWhitespace.join(" ")
+
+proc markerText(s: string): string =
+  ## Returns `s` with all whitespace removed; no-break spaces count as
+  ## whitespace because the text goes through `normalizeWhitespace` first.
+  for ch in normalizeWhitespace(s):
+    if ch != ' ':
+      result.add(ch)
+
+proc numberAfterPrefix(s, prefix: string): int =
+  ## Returns the number that directly follows `prefix` in `s`, or 0 when
+  ## `s` does not start with `prefix` or no digits follow it.
+  ##
+  ## `s` is whitespace-normalized and upper-cased before matching, so
+  ## `prefix` is compared against upper-case text.
+  let normalized = normalizeWhitespace(s).toUpperAscii
+  if normalized.startsWith(prefix):
+    # After strip the remainder has no leading whitespace, so the number
+    # (if any) is simply its leading run of digits.
+    let tail = normalized[prefix.len .. ^1].strip
+    var digitCount = 0
+    while digitCount < tail.len and tail[digitCount].isDigit:
+      inc digitCount
+
+    if digitCount > 0:
+      result = parseInt(tail[0 ..< digitCount])
+
+proc isPositiveIntText(s: string): bool =
+  ## True when `s`, with whitespace removed, consists only of decimal
+  ## digits and parses to a value greater than zero.
+  let digits = markerText(s)
+  if digits.len == 0 or not digits.allCharsInSet(Digits):
+    result = false
+  else:
+    result = parseInt(digits) > 0
+
+proc readEpubEntry(epubPath, entryPath: string): string =
+  ## Returns the contents of `entryPath` inside the archive `epubPath`, as
+  ## printed by `unzip -p` (the `unzip` executable is looked up on PATH).
+  ##
+  ## stderr is merged into the captured output, so when unzip exits with a
+  ## non-zero status the raised `IOError` message carries its diagnostics.
+  let process = startProcess(
+    "unzip",
+    args = ["-p", epubPath, entryPath],
+    options = {poUsePath, poStdErrToStdOut})
+
+  var exitCode = 0
+  try:
+    # Read all output before waiting, so a large entry cannot stall the
+    # child on a full pipe.
+    result = process.outputStream.readAll()
+    exitCode = process.waitForExit()
+  finally:
+    # Release the process handle and its pipes even if reading fails.
+    process.close()
+
+  if exitCode != 0:
+    raise newException(IOError,
+      "could not read " & entryPath & " from " & epubPath & ": " & result)
+
+proc textContent(node: XmlNode): string =
+  ## Concatenates the text of every `xnText` node at or below `node`, in
+  ## document order. Node kinds other than text and element add nothing.
+  if node.kind == xnText:
+    result = node.text
+  elif node.kind == xnElement:
+    for child in node:
+      result.add(child.textContent)
+
+proc firstDescendant(node: XmlNode, tag: string): XmlNode =
+  ## Depth-first, pre-order search for the first element named `tag`,
+  ## starting with `node` itself. Returns nil when nothing matches.
+  if node.kind != xnElement:
+    return nil
+  if node.tag == tag:
+    return node
+
+  for child in node:
+    result = firstDescendant(child, tag)
+    if result != nil:
+      return
+
+proc descendantText(node: XmlNode, tag: string): string =
+  ## Whitespace-normalized text of the first `tag` element at or below
+  ## `node`, or "" when there is no such element.
+  let match = firstDescendant(node, tag)
+  if match != nil:
+    result = normalizeWhitespace(textContent(match))
+
+proc descendantAttr(node: XmlNode, tag, attrName: string): string =
+  ## Value of `attrName` on the first `tag` element at or below `node`, or
+  ## "" when there is no such element.
+  let match = firstDescendant(node, tag)
+  if match != nil:
+    result = match.attr(attrName)
+
+proc bookCodeForLabel(label: string): string =
+  ## Maps a TOC label to a book code, or "" when nothing matches.
+  ##
+  ## Only the trimmed text before the first "(" is compared, and it must
+  ## equal a `CanonBooks` name exactly; "Solomon" is mapped to "SNG".
+  let name = label.split("(", maxsplit = 1)[0].strip
+  if name == "Solomon":
+    result = "SNG"
+  else:
+    for entry in CanonBooks:
+      if entry.name == name:
+        result = entry.code
+        break
+
+proc indexFromSplitFile(path: string): int =
+  ## Extracts NNN from a path whose file name is `index_split_NNN.html`;
+  ## any `#fragment` suffix is ignored. Returns 0 for other file names.
+  ## `parseInt` raises `ValueError` if the NNN part is not an integer.
+  const
+    prefix = "index_split_"
+    suffix = ".html"
+
+  let filename = extractFilename(path.split('#', maxsplit = 1)[0])
+  if filename.startsWith(prefix) and filename.endsWith(suffix):
+    result = parseInt(filename[prefix.len ..< filename.len - suffix.len])
+
+proc parseTocEntries(epubPath: string): seq[TocEntry] =
+  ## Reads `toc.ncx` from the EPUB and returns, in document order, one entry
+  ## for each `navPoint` whose content `src` names an
+  ## `index_split_NNN.html` file with a positive index. Nested navPoints
+  ## are visited as well.
+  var collected: seq[TocEntry] = @[]
+
+  proc visit(node: XmlNode) =
+    if node.kind != xnElement:
+      return
+
+    if node.tag == "navPoint":
+      let
+        label = node.descendantText("text")
+        fileIndex = indexFromSplitFile(node.descendantAttr("content", "src"))
+      if fileIndex > 0:
+        collected.add(TocEntry(
+          label: label,
+          code: bookCodeForLabel(label),
+          fileIndex: fileIndex))
+
+    for child in node:
+      visit(child)
+
+  visit(parseXml(newStringStream(readEpubEntry(epubPath, "toc.ncx"))))
+  collected
+
+proc bookSources(entries: seq[TocEntry]): seq[BookSource] =
+  ## Turns the TOC entries that carry a book code into per-book file ranges.
+  ##
+  ## Raises `ValueError` unless the books found match `CanonBooks` exactly,
+  ## in count and in order.
+  for idx, entry in entries:
+    if entry.code.len > 0:
+      # A book runs up to the file just before the next TOC entry, whether
+      # or not that entry is a book; the final entry covers only its own
+      # file.
+      var lastFile = entry.fileIndex
+      if idx + 1 < entries.len:
+        lastFile = entries[idx + 1].fileIndex - 1
+
+      result.add(BookSource(
+        code: entry.code,
+        startIndex: entry.fileIndex,
+        endIndex: lastFile))
+
+  if result.len != CanonBooks.len:
+    raise newException(ValueError,
+      "expected " & $CanonBooks.len & " canonical books in EPUB TOC, found " &
+        $result.len)
+
+  for idx, book in CanonBooks:
+    let found = result[idx].code
+    if found != book.code:
+      raise newException(ValueError,
+        "expected " & book.code & " at position " & $idx & ", found " &
+          found)
+
+proc hasClass(node: XmlNode, className: string): bool =
+  ## True when `node` is an element whose whitespace-separated `class`
+  ## attribute contains `className` as a whole token.
+  node.kind == xnElement and
+    className in node.attr("class").splitWhitespace
+
+proc shouldSkipElement(node: XmlNode): bool =
+  ## True for elements carrying one of the classes that `walkPassageText`
+  ## leaves out of verse text.
+  const skippedClasses = [
+    "calibre_29", # section headings
+    "calibre_6", # parallel/cross-reference paragraphs
+    "calibre_26" # Psalm superscriptions/cross-references
+  ]
+
+  for className in skippedClasses:
+    if node.hasClass(className):
+      return true
+
+proc hasHref(node: XmlNode): bool =
+  ## True when `node`, or any element below it, has a non-empty `href`.
+  if node.kind != xnElement:
+    return false
+  if node.attr("href").len > 0:
+    return true
+
+  for child in node:
+    if child.hasHref:
+      return true
+
+proc isBlockElement(node: XmlNode): bool =
+  ## True when `node` is an element with one of the tags after which
+  ## `walkPassageText` appends a separating space.
+  const blockTags = ["blockquote", "br", "div", "h1", "h2", "h3", "li", "p"]
+  if node.kind == xnElement:
+    result = node.tag in blockTags
+
+proc chapterMarker(node: XmlNode): int =
+  ## Returns the number shown by a `<span>` with class `calibre1` whose
+  ## text is a positive integer; 0 for anything else.
+  if node.kind != xnElement or node.tag != "span" or
+      not node.hasClass("calibre1"):
+    return 0
+
+  let digits = markerText(textContent(node))
+  if digits.isPositiveIntText:
+    result = parseInt(digits)
+
+proc headingChapterMarker(node: XmlNode, code: string): int =
+  ## Returns the chapter number from a `<p>` whose text starts with
+  ## "CHAPTER " (any letter case), or 0 when `node` is not such a heading.
+  ## For book code "PSA", a "PSALM " prefix is tried when "CHAPTER " did
+  ## not yield a positive number.
+  if node.kind == xnElement and node.tag == "p":
+    let heading = textContent(node)
+    result = numberAfterPrefix(heading, "CHAPTER ")
+    if result <= 0 and code == "PSA":
+      result = numberAfterPrefix(heading, "PSALM ")
+
+proc verseMarker(node: XmlNode): int =
+  ## Returns the number shown by a `<sup>` that contains no `href` and whose
+  ## text is a positive integer; 0 for anything else.
+  if node.kind != xnElement or node.tag != "sup" or node.hasHref:
+    return 0
+
+  let digits = markerText(textContent(node))
+  if digits.isPositiveIntText:
+    result = parseInt(digits)
+
+proc leadingVerseText(s: string): tuple[verse: int, rest: string] =
+  ## Splits text that opens with a number into that number and the text
+  ## after it. Whitespace (including no-break spaces) before the number and
+  ## between the number and the rest is dropped.
+  ##
+  ## Returns `(0, "")` when the first non-space character is not a digit.
+  let text = s.replace("\xC2\xA0", " ")
+
+  var digitStart = 0
+  while digitStart < text.len and text[digitStart].isSpaceAscii:
+    inc digitStart
+
+  var digitEnd = digitStart
+  while digitEnd < text.len and text[digitEnd].isDigit:
+    inc digitEnd
+
+  if digitEnd > digitStart:
+    var restStart = digitEnd
+    while restStart < text.len and text[restStart].isSpaceAscii:
+      inc restStart
+
+    result.verse = parseInt(text[digitStart ..< digitEnd])
+    if restStart < text.len:
+      result.rest = text[restStart .. ^1]
+
+proc flushVerse(state: var ParseState) =
+  ## Emits the pending verse as a `code<TAB>chapter<TAB>verse<TAB>text` row
+  ## when both chapter and verse are set and the text is non-empty, then
+  ## clears the text buffer. Chapter and verse numbers are left as they are.
+  let haveVerse = state.chapter > 0 and state.verse > 0
+  if haveVerse:
+    # Tab is the field separator, so none may remain inside the text.
+    let cleaned = normalizeWhitespace(state.verseText).replace("\t", " ")
+    if cleaned.len > 0:
+      state.rows.add(
+        state.code & "\t" & $state.chapter & "\t" & $state.verse & "\t" &
+          cleaned)
+
+  state.verseText = ""
+
+proc walkPassageText(node: XmlNode, state: var ParseState) =
+  ## Walks `node` in document order, updating `state` as chapter and verse
+  ## markers are met and appending text to the current verse. Finished
+  ## verses are pushed to `state.rows` through `flushVerse`.
+  ##
+  ## The order of the element checks below matters: each recognised marker
+  ## returns early, so its own text never reaches the verse buffer.
+  case node.kind
+  of xnText:
+    # Text before any chapter is known is ignored.
+    if state.chapter > 0:
+      if state.verse == 0:
+        # No verse yet: accept the text only if it opens with a number,
+        # which then becomes the current verse.
+        let leading = leadingVerseText(node.text)
+        if leading.verse > 0:
+          state.verse = leading.verse
+          state.verseText.add(leading.rest)
+      elif state.verse > 0:
+        state.verseText.add(node.text)
+  of xnElement:
+    # "CHAPTER n" (or "PSALM n" for PSA) paragraph: start a new chapter
+    # with no current verse. Checked before the skip classes.
+    let headingChapter = headingChapterMarker(node, state.code)
+    if headingChapter > 0:
+      state.flushVerse()
+      state.chapter = headingChapter
+      state.verse = 0
+      return
+
+    # Headings and cross-reference blocks contribute no verse text.
+    if node.shouldSkipElement:
+      return
+
+    # Inline chapter-number span: start the chapter at verse 1.
+    # NOTE(review): presumably the span stands in for the verse-1 marker;
+    # confirm against the EPUB markup.
+    let chapter = chapterMarker(node)
+    if chapter > 0:
+      state.flushVerse()
+      state.chapter = chapter
+      state.verse = 1
+      return
+
+    # Numeric <sup> without a link: start the next verse.
+    let verse = verseMarker(node)
+    if verse > 0:
+      state.flushVerse()
+      state.verse = verse
+      return
+
+    # Any other <sup> is dropped together with its content.
+    # NOTE(review): presumably footnote references; confirm.
+    if node.tag == "sup":
+      return
+
+    for child in node.items:
+      walkPassageText(child, state)
+
+    # Separate the text of adjacent block elements with a space;
+    # `flushVerse` later collapses repeated whitespace.
+    if node.isBlockElement and state.chapter > 0 and state.verse > 0:
+      state.verseText.add(' ')
+  else:
+    discard
+
+proc indexSplitFile(index: int): string =
+  ## Builds the `index_split_NNN.html` entry name for `index`, left-padding
+  ## the number with zeros to at least three characters.
+  let digits = $index
+  "index_split_" & "0".repeat(max(0, 3 - digits.len)) & digits & ".html"
+
+proc parseBook(epubPath: string, source: BookSource): seq[string] =
+  ## Parses every split file in `source.startIndex .. source.endIndex` and
+  ## returns the book's verse rows in the order they were read.
+  ##
+  ## Books flagged `singleChapter` by `bookInfo` start out in chapter 1;
+  ## all others start with no chapter until a marker is met.
+  var cursor = ParseState(
+    code: source.code,
+    chapter: (if bookInfo(source.code).singleChapter: 1 else: 0))
+
+  for fileIndex in source.startIndex .. source.endIndex:
+    let markup = readEpubEntry(epubPath, indexSplitFile(fileIndex))
+    walkPassageText(parseHtml(newStringStream(markup)), cursor)
+
+  # Emit whatever verse was still being collected at the end of the book.
+  cursor.flushVerse()
+  result = cursor.rows
+
+proc generate(epubPath, outputPath: string) =
+  ## Parses every canonical book from the EPUB at `epubPath` and writes the
+  ## rows to `outputPath`, one per line with a trailing newline. The
+  ## output's parent directory is created first.
+  var allRows: seq[string] = @[]
+  for source in bookSources(parseTocEntries(epubPath)):
+    allRows.add(parseBook(epubPath, source))
+
+  outputPath.parentDir.createDir
+  outputPath.writeFile(allRows.join("\n") & "\n")
+
+when isMainModule:
+  # Expects exactly two arguments: the source EPUB and the TSV to write.
+  if paramCount() == 2:
+    generate(paramStr(1), paramStr(2))
+  else:
+    quit("Usage: generate_mev_data <mev-epub> <output-tsv>", QuitFailure)