# bibleref/tools/generate_mev_data.nim

import std/[
  htmlparser,
  os,
  osproc,
  streams,
  strutils,
  xmlparser,
  xmltree
]

import ../src/reference_parser

type
  # One `navPoint` from the EPUB's `toc.ncx` whose target is a split file.
  TocEntry = object
    # Normalized text of the navPoint's first `text` descendant.
    label: string
    # Value returned by `bookCodeForLabel(label)`; empty when no book matched.
    code: string
    # Number parsed from the `index_split_NNN.html` target by
    # `indexFromSplitFile`.
    fileIndex: int

  # Inclusive range of split-file indexes read for one book.
  BookSource = object
    # Book code copied from the TOC entry.
    code: string
    # File index of the book's own TOC entry.
    startIndex: int
    # One less than the next TOC entry's file index, or `startIndex` when the
    # book's entry is the last one in the TOC.
    endIndex: int

  # Mutable state threaded through `walkPassageText` while parsing one book.
  ParseState = object
    # Book code written into every emitted row.
    code: string
    # Current chapter number; 0 until a chapter has been established.
    chapter: int
    # Current verse number; 0 while no verse is open.
    verse: int
    # Raw text accumulated for the current verse.
    verseText: string
    # Finished rows, each "code<TAB>chapter<TAB>verse<TAB>text".
    rows: seq[string]

proc normalizeWhitespace(s: string): string =
  ## Returns `s` with every UTF-8 no-break space turned into a plain space,
  ## each run of ASCII whitespace collapsed to a single space, and leading and
  ## trailing whitespace removed.
  let words = s.replace("\xC2\xA0", " ").splitWhitespace
  words.join(" ")

proc markerText(s: string): string =
  ## Returns `s` with all whitespace removed: the input is normalized first,
  ## then every remaining space is dropped.
  for ch in normalizeWhitespace(s):
    if ch != ' ':
      result.add(ch)

proc numberAfterPrefix(s, prefix: string): int =
  ## Returns the number formed by the digits that directly follow `prefix` in
  ## the whitespace-normalized, upper-cased form of `s`.
  ##
  ## Returns 0 when the text does not start with `prefix` or when the first
  ## non-space character after it is not a digit.
  let normalized = normalizeWhitespace(s).toUpperAscii
  if normalized.startsWith(prefix):
    let tail = normalized.substr(prefix.len).strip

    # Measure the leading run of decimal digits.
    var stop = 0
    while stop < tail.len and tail[stop] in Digits:
      inc stop

    if stop > 0:
      result = parseInt(tail[0 ..< stop])

proc isPositiveIntText(s: string): bool =
  ## True when `s`, with all whitespace removed, consists only of decimal
  ## digits and denotes a value greater than zero.
  let digits = markerText(s)
  if digits.len == 0:
    return false

  for ch in digits:
    if ch notin Digits:
      return false

  parseInt(digits) > 0

proc readEpubEntry(epubPath, entryPath: string): string =
  ## Returns the contents of `entryPath` inside the archive `epubPath`, as
  ## written to standard output by `unzip -p`.
  ##
  ## Raises `IOError` when `unzip` exits with a non-zero status; the message
  ## includes everything the process printed.
  let process = startProcess(
    "unzip",
    args = ["-p", epubPath, entryPath],
    options = {poUsePath, poStdErrToStdOut})
  # Release the process handle and its pipes even when reading or waiting
  # raises.
  defer: process.close()

  # Read all output before waiting: waiting on a child whose output pipe is
  # not being drained can block once the pipe buffer fills up.
  result = process.outputStream.readAll()

  # stderr is merged into stdout, so on failure `result` carries the
  # diagnostic text that is appended to the error message.
  if process.waitForExit() != 0:
    raise newException(IOError,
      "could not read " & entryPath & " from " & epubPath & ": " & result)

proc textContent(node: XmlNode): string =
  ## Concatenates the text of `node` and all of its descendants in document
  ## order. Only text nodes contribute; every other non-element node kind
  ## yields nothing.
  if node.kind == xnText:
    return node.text

  if node.kind == xnElement:
    for i in 0 ..< node.len:
      result.add(textContent(node[i]))

proc firstDescendant(node: XmlNode, tag: string): XmlNode =
  ## Depth-first search for the first element named `tag`, starting with
  ## `node` itself. Returns `nil` when nothing matches or when `node` is not
  ## an element.
  if node.kind != xnElement:
    return nil
  if node.tag == tag:
    return node

  for child in node.items:
    result = firstDescendant(child, tag)
    if not result.isNil:
      return

proc descendantText(node: XmlNode, tag: string): string =
  ## Whitespace-normalized text of the first `tag` element found at or below
  ## `node`, or "" when there is no such element.
  let match = firstDescendant(node, tag)
  if not match.isNil:
    result = normalizeWhitespace(textContent(match))

proc descendantAttr(node: XmlNode, tag, attrName: string): string =
  ## Value of attribute `attrName` on the first `tag` element found at or
  ## below `node`, or "" when there is no such element.
  let match = firstDescendant(node, tag)
  if not match.isNil:
    result = match.attr(attrName)

proc bookCodeForLabel(label: string): string =
  ## Maps a TOC label to a book code.
  ##
  ## Only the text before the first "(" is considered, with surrounding
  ## whitespace removed. "Solomon" maps to "SNG"; any other name is looked up
  ## by exact match in `CanonBooks`. Returns "" when nothing matches.
  let parenAt = label.find('(')
  let bookName =
    if parenAt < 0: label.strip
    else: label[0 ..< parenAt].strip

  if bookName == "Solomon":
    result = "SNG"
  else:
    for book in CanonBooks:
      if book.name == bookName:
        result = book.code
        break

proc indexFromSplitFile(path: string): int =
  ## Extracts the numeric index from a reference such as
  ## `text/index_split_012.html#anchor`.
  ##
  ## Any `#fragment` and any leading directories are ignored. Returns 0 when
  ## the file name is not `index_split_<digits>.html`, including when the part
  ## between prefix and suffix is empty, is not purely decimal digits, or does
  ## not fit in an `int`.
  const
    prefix = "index_split_"
    suffix = ".html"

  let filename = path.split('#', maxsplit = 1)[0].extractFilename
  if not filename.startsWith(prefix) or not filename.endsWith(suffix):
    return 0

  let digits = filename[prefix.len ..< filename.len - suffix.len]
  # `parseInt` raises on empty input and accepts a leading sign, so only a
  # plain run of decimal digits is handed to it.
  if digits.len == 0 or not digits.allCharsInSet(Digits):
    return 0

  try:
    result = parseInt(digits)
  except ValueError:
    # The digit run overflows `int`; treat it like any other unrecognized
    # file name.
    result = 0

proc parseTocEntries(epubPath: string): seq[TocEntry] =
  ## Reads `toc.ncx` from the EPUB and returns one `TocEntry` per `navPoint`
  ## element, in document order, whose `content` `src` resolves to a positive
  ## split-file index. Nested `navPoint` elements are visited as well.
  let ncx = readEpubEntry(epubPath, "toc.ncx")

  # Explicit stack for a pre-order traversal; children are pushed in reverse
  # so they are popped in document order.
  var pending = @[parseXml(newStringStream(ncx))]
  while pending.len > 0:
    let node = pending.pop()
    if node.kind != xnElement:
      continue

    if node.tag == "navPoint":
      let
        label = node.descendantText("text")
        target = node.descendantAttr("content", "src")
        fileIndex = indexFromSplitFile(target)
      if fileIndex > 0:
        result.add(TocEntry(
          label: label,
          code: bookCodeForLabel(label),
          fileIndex: fileIndex))

    for i in countdown(node.len - 1, 0):
      pending.add(node[i])

proc bookSources(entries: seq[TocEntry]): seq[BookSource] =
  ## Converts TOC entries into per-book file ranges.
  ##
  ## Entries with an empty `code` are not emitted, but they still bound the
  ## preceding entry: a book ends one file before the next TOC entry of any
  ## kind. When the book's entry is the last one, its range is a single file.
  ##
  ## Raises `ValueError` when the number or order of the resulting books does
  ## not match `CanonBooks`.
  for idx in 0 ..< entries.len:
    let current = entries[idx]
    if current.code.len > 0:
      var source = BookSource(
        code: current.code,
        startIndex: current.fileIndex,
        endIndex: current.fileIndex)
      if idx + 1 < entries.len:
        source.endIndex = entries[idx + 1].fileIndex - 1
      result.add(source)

  let found = result.len
  if found != CanonBooks.len:
    raise newException(ValueError,
      "expected " & $CanonBooks.len & " canonical books in EPUB TOC, found " &
        $found)

  for idx, book in CanonBooks:
    let actual = result[idx].code
    if actual != book.code:
      raise newException(ValueError,
        "expected " & book.code & " at position " & $idx & ", found " &
          actual)

proc hasClass(node: XmlNode, className: string): bool =
  ## True when `node` is an element whose whitespace-separated `class`
  ## attribute contains `className` as a whole token.
  node.kind == xnElement and
    className in node.attr("class").splitWhitespace

proc shouldSkipElement(node: XmlNode): bool =
  ## True when `node` carries one of the CSS classes whose content is left
  ## out of the verse text.
  const skippedClasses = [
    "calibre_29", # section headings
    "calibre_6",  # parallel/cross-reference paragraphs
    "calibre_26"  # Psalm superscriptions/cross-references
  ]

  for className in skippedClasses:
    if node.hasClass(className):
      return true

proc hasHref(node: XmlNode): bool =
  ## True when `node` or any element below it has a non-empty `href`
  ## attribute. Non-element nodes never match.
  if node.kind != xnElement:
    return false
  if node.attr("href") != "":
    return true

  for i in 0 ..< node.len:
    if hasHref(node[i]):
      return true

proc isBlockElement(node: XmlNode): bool =
  ## True when `node` is one of the elements treated as a block boundary.
  if node.kind == xnElement:
    case node.tag
    of "blockquote", "br", "div", "h1", "h2", "h3", "li", "p":
      result = true
    else:
      result = false

proc chapterMarker(node: XmlNode): int =
  ## Returns the chapter number held by a `span` element of class `calibre1`
  ## whose text is a positive integer, or 0 for any other node.
  if node.kind != xnElement or node.tag != "span":
    return 0
  if not node.hasClass("calibre1"):
    return 0

  let digits = markerText(textContent(node))
  if digits.isPositiveIntText:
    result = parseInt(digits)

proc headingChapterMarker(node: XmlNode, code: string): int =
  ## Returns the number from a `p` element whose text starts with
  ## "CHAPTER <n>", or with "PSALM <n>" when `code` is "PSA". Returns 0 for
  ## anything else.
  if node.kind == xnElement and node.tag == "p":
    let heading = textContent(node)
    result = numberAfterPrefix(heading, "CHAPTER ")
    # The Psalm form is only tried when the generic form found nothing.
    if result <= 0 and code == "PSA":
      result = numberAfterPrefix(heading, "PSALM ")

proc verseMarker(node: XmlNode): int =
  ## Returns the verse number held by a `sup` element that contains no `href`
  ## anywhere inside it and whose text is a positive integer; 0 otherwise.
  if node.kind != xnElement or node.tag != "sup":
    return 0
  if node.hasHref:
    return 0

  let digits = markerText(textContent(node))
  if digits.isPositiveIntText:
    result = parseInt(digits)

proc leadingVerseText(s: string): tuple[verse: int, rest: string] =
  ## Splits text of the form "<spaces><digits><spaces><rest>".
  ##
  ## UTF-8 no-break spaces count as spaces. `verse` is the parsed digit run and
  ## `rest` is everything after the whitespace that follows it, with trailing
  ## text left untouched. When the text does not begin with digits, the result
  ## is `(0, "")`.
  let trimmed = s.replace("\xC2\xA0", " ").strip(trailing = false)

  var digitCount = 0
  while digitCount < trimmed.len and trimmed[digitCount] in Digits:
    inc digitCount

  if digitCount > 0:
    result.verse = parseInt(trimmed[0 ..< digitCount])
    result.rest = trimmed.substr(digitCount).strip(trailing = false)

proc flushVerse(state: var ParseState) =
  ## Emits the accumulated verse text as a tab-separated row
  ## (code, chapter, verse, text) and clears the buffer.
  ##
  ## Nothing is emitted when no chapter or verse is active or when the
  ## normalized text is empty; the buffer is cleared in every case.
  let pending = state.verseText
  state.verseText = ""

  if state.chapter <= 0 or state.verse <= 0:
    return

  # Tabs separate the output columns, so none may remain inside the text.
  let text = normalizeWhitespace(pending).replace("\t", " ")
  if text.len == 0:
    return

  state.rows.add(
    state.code & "\t" & $state.chapter & "\t" & $state.verse & "\t" & text)

proc walkPassageText(node: XmlNode, state: var ParseState) =
  ## Walks `node` depth-first, updating `state` with chapter and verse
  ## markers and appending text to the verse that is currently open.
  ##
  ## Finished verses are pushed to `state.rows` through `flushVerse` each time
  ## a new chapter or verse marker is encountered.
  case node.kind
  of xnText:
    # Text seen before any chapter has been established is ignored.
    if state.chapter > 0:
      if state.verse == 0:
        # No verse is open: the text is only accepted when it starts with a
        # verse number, which then opens that verse.
        let leading = leadingVerseText(node.text)
        if leading.verse > 0:
          state.verse = leading.verse
          state.verseText.add(leading.rest)
      elif state.verse > 0:
        state.verseText.add(node.text)
  of xnElement:
    # A heading paragraph starts a new chapter with no open verse; its own
    # content is not added to any verse.
    let headingChapter = headingChapterMarker(node, state.code)
    if headingChapter > 0:
      state.flushVerse()
      state.chapter = headingChapter
      state.verse = 0
      return

    # Elements with a skipped class are dropped together with their subtree.
    if node.shouldSkipElement:
      return

    # A chapter-number span starts a new chapter and opens verse 1 directly.
    let chapter = chapterMarker(node)
    if chapter > 0:
      state.flushVerse()
      state.chapter = chapter
      state.verse = 1
      return

    # A numeric superscript closes the current verse and opens the next one.
    let verse = verseMarker(node)
    if verse > 0:
      state.flushVerse()
      state.verse = verse
      return

    # Any other superscript is dropped entirely.
    # NOTE(review): presumably footnote/cross-reference markers — confirm.
    if node.tag == "sup":
      return

    for child in node.items:
      walkPassageText(child, state)

    # Separate the text of consecutive block elements with a space so words
    # from adjacent blocks do not run together.
    if node.isBlockElement and state.chapter > 0 and state.verse > 0:
      state.verseText.add(' ')
  else:
    discard

proc indexSplitFile(index: int): string =
  ## Builds the split-file name for `index`, left-padding the number with
  ## zeros to at least three characters (e.g. 7 -> "index_split_007.html").
  let digits = $index
  result = "index_split_"
  for _ in digits.len ..< 3:
    result.add('0')
  result.add(digits)
  result.add(".html")

proc parseBook(epubPath: string, source: BookSource): seq[string] =
  ## Parses every split file in `source.startIndex .. source.endIndex` and
  ## returns the book's rows as produced by `flushVerse`.
  var state = ParseState(code: source.code)
  # Single-chapter books start in chapter 1 without needing a chapter marker.
  if bookInfo(source.code).singleChapter:
    state.chapter = 1

  var index = source.startIndex
  while index <= source.endIndex:
    let entryName = indexSplitFile(index)
    let doc = parseHtml(newStringStream(readEpubEntry(epubPath, entryName)))
    walkPassageText(doc, state)
    inc index

  # Emit the verse that is still open after the last file.
  state.flushVerse()
  result = state.rows

proc generate(epubPath, outputPath: string) =
  ## Extracts every book listed in the EPUB's TOC and writes all rows to
  ## `outputPath`, one per line with a trailing newline. The output file's
  ## parent directory is created first.
  var rows: seq[string]
  for source in bookSources(parseTocEntries(epubPath)):
    for row in parseBook(epubPath, source):
      rows.add(row)

  createDir(outputPath.parentDir)

  var content = rows.join("\n")
  content.add('\n')
  writeFile(outputPath, content)

when isMainModule:
  # Exactly two arguments are expected: the EPUB to read and the TSV to write.
  let args = commandLineParams()
  if args.len != 2:
    quit("Usage: generate_mev_data <mev-epub> <output-tsv>", QuitFailure)

  generate(args[0], args[1])