import std/[
  htmlparser,
  os,
  osproc,
  streams,
  strutils,
  xmlparser,
  xmltree
]

import ../src/reference_parser

type
  # One `navPoint` found in the EPUB's `toc.ncx`.
  TocEntry = object
    label: string   ## Normalized text of the entry's first `text` element.
    code: string    ## Book code matched from `label`; empty when no match.
    fileIndex: int  ## Number taken from the `index_split_NNN.html` target.

  # Inclusive range of split HTML files that is parsed for one book.
  BookSource = object
    code: string
    startIndex: int
    endIndex: int

  # Mutable cursor threaded through the HTML walk of a single book.
  ParseState = object
    code: string       ## Book code written into every emitted row.
    chapter: int       ## Current chapter; 0 until a chapter is known.
    verse: int         ## Current verse; 0 until a verse is known.
    verseText: string  ## Raw text collected for the current verse.
    rows: seq[string]  ## Finished tab-separated rows.

proc normalizeWhitespace(s: string): string =
  ## Returns `s` with every UTF-8 no-break space replaced by a plain space,
  ## each run of ASCII whitespace collapsed into a single space, and leading
  ## and trailing whitespace stripped.
  var lastWasSpace = false
  for ch in s.replace("\xC2\xA0", " "):
    if ch.isSpaceAscii:
      if not lastWasSpace:
        result.add(' ')
      lastWasSpace = true
    else:
      result.add(ch)
      lastWasSpace = false
  result = result.strip

proc markerText(s: string): string =
  ## Returns `s` with all whitespace (including no-break spaces) removed.
  normalizeWhitespace(s).replace(" ", "")

proc numberAfterPrefix(s, prefix: string): int =
  ## Returns the run of digits that directly follows `prefix` in the
  ## whitespace-normalized, upper-cased form of `s`, or 0 when `s` does not
  ## start with `prefix` or no digits follow it.
  ##
  ## `s` is upper-cased before the comparison but `prefix` is not, so
  ## `prefix` has to be given in upper case to match.
  let text = normalizeWhitespace(s).toUpperAscii
  if not text.startsWith(prefix):
    return 0
  var digits = ""
  for ch in text[prefix.len .. ^1].strip:
    if ch.isDigit:
      digits.add(ch)
    elif digits.len > 0:
      # First non-digit after the number ends the scan.
      break
    elif not ch.isSpaceAscii:
      # Something other than a number follows the prefix.
      break
  if digits.len > 0:
    result = parseInt(digits)

proc isPositiveIntText(s: string): bool =
  ## True when `s`, with all whitespace removed, consists only of decimal
  ## digits and denotes a value greater than zero.
  let text = markerText(s)
  text.len > 0 and text.allCharsInSet({'0'..'9'}) and parseInt(text) > 0

proc readEpubEntry(epubPath, entryPath: string): string =
  ## Returns the output of `unzip -p epubPath entryPath`.
  ##
  ## stderr is merged into stdout, so on failure the returned text is the
  ## tool's diagnostic and is included in the raised error.
  ##
  ## Raises `IOError` when `unzip` exits with a non-zero status.
  let process = startProcess(
    "unzip",
    args = ["-p", epubPath, entryPath],
    options = {poUsePath, poStdErrToStdOut})
  # Release the process handle and its pipes on every exit path, including
  # when reading the stream or waiting for the child raises.
  defer: process.close()
  result = process.outputStream.readAll()
  let exitCode = process.waitForExit()
  if exitCode != 0:
    raise newException(IOError,
      "could not read " & entryPath & " from " & epubPath & ": " & result)

proc textContent(node: XmlNode): string =
  ## Concatenates the text nodes below `node` in document order. Node kinds
  ## other than text and element contribute nothing.
  case node.kind
  of xnText:
    result = node.text
  of xnElement:
    for child in node.items:
      result.add(textContent(child))
  else:
    discard

proc firstDescendant(node: XmlNode, tag: string): XmlNode =
  ## Depth-first search for the first element named `tag`, starting with
  ## `node` itself. Returns `nil` when there is none.
  if node.kind == xnElement:
    if node.tag == tag:
      return node
    for child in node.items:
      let found = firstDescendant(child, tag)
      if not found.isNil:
        return found

proc descendantText(node: XmlNode, tag: string): string =
  ## Normalized text of the first `tag` element at or below `node`, or ""
  ## when no such element exists.
  let found = firstDescendant(node, tag)
  if found.isNil: ""
  else: normalizeWhitespace(textContent(found))

proc descendantAttr(node: XmlNode, tag, attrName: string): string =
  ## Value of `attrName` on the first `tag` element at or below `node`, or
  ## "" when no such element exists.
  let found = firstDescendant(node, tag)
  if found.isNil: ""
  else: found.attr(attrName)

proc bookCodeForLabel(label: string): string =
  ## Maps a TOC label to a book code. Only the part of the label before the
  ## first "(" is compared, after stripping. Returns "" when nothing matches.
  let bookName = label.split("(", maxsplit = 1)[0].strip
  # Special case: the bare label "Solomon" maps to "SNG".
  if bookName == "Solomon":
    return "SNG"
  for book in CanonBooks:
    if book.name == bookName:
      return book.code

proc indexFromSplitFile(path: string): int =
  ## Extracts NNN from a path whose file name is `index_split_NNN.html`; a
  ## trailing `#fragment` is ignored.
  ##
  ## Returns 0 when the file name does not have that shape, including when
  ## the part between prefix and suffix is empty or not purely decimal
  ## digits (instead of letting `parseInt` raise `ValueError`).
  const
    prefix = "index_split_"
    suffix = ".html"
  let filename = path.split('#', maxsplit = 1)[0].extractFilename
  if not filename.startsWith(prefix) or not filename.endsWith(suffix):
    return 0
  let digits = filename[prefix.len ..< filename.len - suffix.len]
  if digits.len == 0 or not digits.allCharsInSet(Digits):
    return 0
  parseInt(digits)

proc parseTocEntries(epubPath: string): seq[TocEntry] =
  ## Reads `toc.ncx` from the EPUB and returns one entry, in document order,
  ## for every `navPoint` whose target resolves to a split-file index
  ## greater than zero. Nested `navPoint` elements are visited as well.
  let toc = parseXml(newStringStream(readEpubEntry(epubPath, "toc.ncx")))
  var entries: seq[TocEntry] = @[]

  proc walk(node: XmlNode) =
    if node.kind == xnElement and node.tag == "navPoint":
      let label = node.descendantText("text")
      let src = node.descendantAttr("content", "src")
      let fileIndex = indexFromSplitFile(src)
      if fileIndex > 0:
        entries.add(TocEntry(
          label: label,
          code: bookCodeForLabel(label),
          fileIndex: fileIndex))
    if node.kind == xnElement:
      for child in node.items:
        walk(child)

  walk(toc)
  entries

proc bookSources(entries: seq[TocEntry]): seq[BookSource] =
  ## Turns TOC entries into per-book file ranges.
  ##
  ## Entries without a book code are skipped, but they still bound the
  ## preceding book: a book ends one file before the next TOC entry of any
  ## kind. The final entry covers only its own file.
  ##
  ## Raises `ValueError` unless the resulting books match `CanonBooks` in
  ## both count and order.
  for idx, entry in entries:
    if entry.code.len == 0:
      continue
    let endIndex =
      if idx + 1 < entries.len: entries[idx + 1].fileIndex - 1
      else: entry.fileIndex
    result.add(BookSource(
      code: entry.code,
      startIndex: entry.fileIndex,
      endIndex: endIndex))

  if result.len != CanonBooks.len:
    raise newException(ValueError,
      "expected " & $CanonBooks.len &
      " canonical books in EPUB TOC, found " & $result.len)

  for idx, book in CanonBooks:
    if result[idx].code != book.code:
      raise newException(ValueError,
        "expected " & book.code & " at position " & $idx &
        ", found " & result[idx].code)

proc hasClass(node: XmlNode, className: string): bool =
  ## True when `node` is an element whose whitespace-separated `class`
  ## attribute contains `className`.
  if node.kind != xnElement:
    return false
  for value in node.attr("class").splitWhitespace:
    if value == className:
      return true

proc shouldSkipElement(node: XmlNode): bool =
  ## True for elements whose entire subtree is excluded from verse text.
  node.hasClass("calibre_29") or # section headings
  node.hasClass("calibre_6") or  # parallel/cross-reference paragraphs
  node.hasClass("calibre_26")    # Psalm superscriptions/cross-references

proc hasHref(node: XmlNode): bool =
  ## True when `node` or any element below it has a non-empty `href`.
  if node.kind == xnElement:
    if node.attr("href").len > 0:
      return true
    for child in node.items:
      if hasHref(child):
        return true

proc isBlockElement(node: XmlNode): bool =
  ## True for the element tags after which a separating space is inserted
  ## into the verse text.
  node.kind == xnElement and
    node.tag in ["blockquote", "br", "div", "h1", "h2", "h3", "li", "p"]

proc chapterMarker(node: XmlNode): int =
  ## Chapter number carried by a `span` with class `calibre1` whose text is
  ## a positive integer; 0 for any other node.
  if node.kind == xnElement and node.tag == "span" and
      node.hasClass("calibre1"):
    let text = markerText(textContent(node))
    if text.isPositiveIntText:
      return parseInt(text)

proc headingChapterMarker(node: XmlNode, code: string): int =
  ## Chapter number from a `p` element whose text starts with "CHAPTER n",
  ## or with "PSALM n" when `code` is "PSA"; 0 otherwise.
  if node.kind != xnElement or node.tag != "p":
    return 0
  let text = textContent(node)
  result = numberAfterPrefix(text, "CHAPTER ")
  if result > 0:
    return
  if code == "PSA":
    result = numberAfterPrefix(text, "PSALM ")

proc verseMarker(node: XmlNode): int =
  ## Verse number carried by a `sup` element that contains no `href` and
  ## whose text is a positive integer; 0 for any other node.
  if node.kind == xnElement and node.tag == "sup" and not node.hasHref:
    let text = markerText(textContent(node))
    if text.isPositiveIntText:
      return parseInt(text)

proc leadingVerseText(s: string): tuple[verse: int, rest: string] =
  ## Splits text of the form "<spaces><digits><spaces><rest>" (no-break
  ## spaces count as spaces) into the number and the remainder. Returns
  ## `(0, "")` when the text does not start with digits.
  let text = s.replace("\xC2\xA0", " ")
  var idx = 0
  while idx < text.len and text[idx].isSpaceAscii:
    inc idx
  let digitStart = idx
  while idx < text.len and text[idx].isDigit:
    inc idx
  if idx == digitStart:
    return
  let numberText = text[digitStart ..< idx]
  while idx < text.len and text[idx].isSpaceAscii:
    inc idx
  result.verse = parseInt(numberText)
  if idx < text.len:
    result.rest = text[idx .. ^1]

proc flushVerse(state: var ParseState) =
  ## Appends the collected verse as a `code<TAB>chapter<TAB>verse<TAB>text`
  ## row when a chapter and verse are set and the normalized text is not
  ## empty, then clears the text buffer.
  if state.chapter > 0 and state.verse > 0:
    # normalizeWhitespace already maps tabs to spaces; the replace is kept
    # as an explicit guard for the tab-separated row format.
    let text = normalizeWhitespace(state.verseText).replace("\t", " ")
    if text.len > 0:
      state.rows.add(
        [state.code, $state.chapter, $state.verse, text].join("\t"))
  state.verseText = ""

proc walkPassageText(node: XmlNode, state: var ParseState) =
  ## Recursively walks `node`, tracking chapter and verse markers and
  ## collecting verse text into `state`.
  case node.kind
  of xnText:
    if state.chapter > 0:
      if state.verse == 0:
        # No verse yet in this chapter: text only counts when it begins
        # with a number, which is then taken as the verse number.
        let leading = leadingVerseText(node.text)
        if leading.verse > 0:
          state.verse = leading.verse
          state.verseText.add(leading.rest)
      elif state.verse > 0:
        state.verseText.add(node.text)
  of xnElement:
    # A "CHAPTER n" / "PSALM n" heading starts a chapter with no verse yet.
    let headingChapter = headingChapterMarker(node, state.code)
    if headingChapter > 0:
      state.flushVerse()
      state.chapter = headingChapter
      state.verse = 0
      return
    if node.shouldSkipElement:
      return
    # An inline chapter-number span starts the chapter at verse 1.
    let chapter = chapterMarker(node)
    if chapter > 0:
      state.flushVerse()
      state.chapter = chapter
      state.verse = 1
      return
    let verse = verseMarker(node)
    if verse > 0:
      state.flushVerse()
      state.verse = verse
      return
    # Any other `sup` is not a verse marker; its content is dropped.
    if node.tag == "sup":
      return
    for child in node.items:
      walkPassageText(child, state)
    # Keep text from adjacent block elements from running together.
    if node.isBlockElement and state.chapter > 0 and state.verse > 0:
      state.verseText.add(' ')
  else:
    discard

proc indexSplitFile(index: int): string =
  ## File name of split file `index`, zero-padded to at least three digits.
  "index_split_" & align($index, 3, '0') & ".html"

proc parseBook(epubPath: string, source: BookSource): seq[string] =
  ## Parses every split file in `source`'s range and returns the verse rows
  ## for that book. Books that `bookInfo` reports as single-chapter start
  ## in chapter 1, so they need no chapter marker.
  var state = ParseState(code: source.code)
  if bookInfo(source.code).singleChapter:
    state.chapter = 1
  for index in source.startIndex .. source.endIndex:
    let html = readEpubEntry(epubPath, indexSplitFile(index))
    let doc = parseHtml(newStringStream(html))
    walkPassageText(doc, state)
  # Emit the verse that was still open at the end of the last file.
  state.flushVerse()
  state.rows

proc generate(epubPath, outputPath: string) =
  ## Parses all books of the EPUB at `epubPath` and writes the rows to
  ## `outputPath`, one per line, creating the parent directory if needed.
  let sources = bookSources(parseTocEntries(epubPath))
  var rows: seq[string] = @[]
  for source in sources:
    rows.add(parseBook(epubPath, source))
  createDir(outputPath.parentDir)
  writeFile(outputPath, rows.join("\n") & "\n")

when isMainModule:
  if paramCount() != 2:
    quit("Usage: generate_mev_data <epub-path> <output-path>", QuitFailure)
  generate(paramStr(1), paramStr(2))