Add private MEV embedded support
This commit is contained in:
@@ -0,0 +1,330 @@
|
||||
import std/[
|
||||
htmlparser,
|
||||
os,
|
||||
osproc,
|
||||
streams,
|
||||
strutils,
|
||||
xmlparser,
|
||||
xmltree
|
||||
]
|
||||
|
||||
import ../src/reference_parser
|
||||
|
||||
type
  TocEntry = object
    ## One `navPoint` collected from the EPUB's `toc.ncx`.
    label: string   ## Normalized text of the navigation label.
    code: string    ## Result of `bookCodeForLabel(label)`; may be empty.
    fileIndex: int  ## Number taken from the `index_split_NNN.html` target.

  BookSource = object
    ## Inclusive range of split-file indices to read for one book.
    code: string
    startIndex, endIndex: int

  ParseState = object
    ## Mutable cursor threaded through the HTML walk of a single book.
    code: string         ## Book code written as the first column of each row.
    chapter, verse: int  ## Current position; 0 means "not seen yet".
    verseText: string    ## Text gathered for the verse that is still open.
    rows: seq[string]    ## Finished tab-separated rows, in document order.
|
||||
|
||||
proc normalizeWhitespace(s: string): string =
  ## Returns `s` with every UTF-8 no-break space turned into a plain space,
  ## each run of ASCII whitespace collapsed to a single space, and leading
  ## and trailing whitespace removed.
  let spaced = s.replace("\xC2\xA0", " ")
  # Splitting on whitespace drops empty fields, so re-joining with one space
  # both collapses inner runs and trims the ends.
  spaced.splitWhitespace.join(" ")
|
||||
|
||||
proc markerText(s: string): string =
  ## Returns `s` normalized by `normalizeWhitespace` with every remaining
  ## space removed, so a marker such as "1 2" compares as "12".
  for ch in normalizeWhitespace(s):
    if ch != ' ':
      result.add(ch)
|
||||
|
||||
proc numberAfterPrefix(s, prefix: string): int =
  ## Returns the number that directly follows `prefix` in `s`, or 0.
  ##
  ## `s` is whitespace-normalized and upper-cased before matching, so
  ## `prefix` is expected in upper case. The result is 0 when `s` does not
  ## start with `prefix` or when no digit follows it.
  let upper = normalizeWhitespace(s).toUpperAscii
  if upper.startsWith(prefix):
    # After stripping, the remainder cannot begin with whitespace, so the
    # number (if any) is simply the leading run of digits.
    let remainder = upper[prefix.len .. ^1].strip
    var stop = 0
    while stop < remainder.len and remainder[stop] in Digits:
      inc stop
    if stop > 0:
      result = parseInt(remainder[0 ..< stop])
|
||||
|
||||
proc isPositiveIntText(s: string): bool =
  ## True when `s`, with all whitespace removed by `markerText`, consists
  ## only of decimal digits and parses to a value greater than zero.
  let compact = markerText(s)
  if compact.len == 0 or not compact.allCharsInSet({'0'..'9'}):
    return false
  result = parseInt(compact) > 0
|
||||
|
||||
proc readEpubEntry(epubPath, entryPath: string): string =
  ## Returns the contents of `entryPath` inside the archive `epubPath`, as
  ## printed by `unzip -p`.
  ##
  ## `unzip` is looked up on `PATH`. Its stderr is merged into stdout, so on
  ## failure the captured output is used as the diagnostic text.
  ##
  ## Raises `IOError` when `unzip` exits with a non-zero status.
  let process = startProcess(
    "unzip",
    args = ["-p", epubPath, entryPath],
    options = {poUsePath, poStdErrToStdOut})

  var exitCode = 0
  try:
    # Read all output before waiting: osproc warns that waiting on a child
    # whose output buffer is full can deadlock.
    result = process.outputStream.readAll()
    exitCode = process.waitForExit()
  finally:
    # Always release the process handle and its pipes, even if reading or
    # waiting raised.
    process.close()

  if exitCode != 0:
    raise newException(IOError,
      "could not read " & entryPath & " from " & epubPath & ": " & result)
|
||||
|
||||
proc textContent(node: XmlNode): string =
  ## Concatenates the text of `node` and all of its descendants in document
  ## order. Only `xnText` nodes contribute; every other non-element kind
  ## yields the empty string.
  if node.kind == xnText:
    return node.text
  if node.kind == xnElement:
    for idx in 0 ..< node.len:
      result &= textContent(node[idx])
|
||||
|
||||
proc firstDescendant(node: XmlNode, tag: string): XmlNode =
  ## Depth-first, pre-order search for the first element named `tag`.
  ## `node` itself is a candidate. Returns `nil` when nothing matches or
  ## when `node` is not an element.
  if node.kind != xnElement:
    return nil
  if node.tag == tag:
    return node
  for idx in 0 ..< node.len:
    let hit = firstDescendant(node[idx], tag)
    if hit != nil:
      return hit
|
||||
|
||||
proc descendantText(node: XmlNode, tag: string): string =
  ## Whitespace-normalized text of the first `tag` element found by
  ## `firstDescendant`, or the empty string when there is none.
  let match = firstDescendant(node, tag)
  if match != nil:
    result = normalizeWhitespace(textContent(match))
|
||||
|
||||
proc descendantAttr(node: XmlNode, tag, attrName: string): string =
  ## Value of attribute `attrName` on the first `tag` element found by
  ## `firstDescendant`, or the empty string when there is no such element.
  let match = firstDescendant(node, tag)
  if match != nil:
    result = match.attr(attrName)
|
||||
|
||||
proc bookCodeForLabel(label: string): string =
  ## Maps a TOC label to a book code, or returns "" when no book matches.
  ##
  ## Only the part of `label` before the first "(" is compared, after
  ## trimming. It must equal a `CanonBooks` entry's `name` exactly.
  let parenAt = label.find('(')
  let title = (if parenAt >= 0: label[0 ..< parenAt] else: label).strip

  # NOTE(review): presumably the EPUB titles Song of Solomon simply as
  # "Solomon"; confirm against the source EPUB's TOC.
  if title == "Solomon":
    result = "SNG"
  else:
    for entry in CanonBooks:
      if entry.name == title:
        result = entry.code
        break
|
||||
|
||||
proc indexFromSplitFile(path: string): int =
  ## Extracts `NNN` from a path whose file name is `index_split_NNN.html`.
  ##
  ## Any `#fragment` and leading directories are ignored. Returns 0 when the
  ## file name does not have that shape, including when the part between the
  ## prefix and the extension is empty, is not purely decimal digits, or
  ## does not fit in an `int`. Never raises, so an unrelated TOC target
  ## cannot abort the walk.
  const
    prefix = "index_split_"
    suffix = ".html"

  let filename = path.split('#', maxsplit = 1)[0].extractFilename
  if not filename.startsWith(prefix) or not filename.endsWith(suffix):
    return 0

  let digits = filename[prefix.len ..< filename.len - suffix.len]
  if digits.len == 0 or not digits.allCharsInSet(Digits):
    return 0

  try:
    result = parseInt(digits)
  except ValueError:
    # Too many digits to fit in an int: treat as "not a split file".
    result = 0
|
||||
|
||||
proc parseTocEntries(epubPath: string): seq[TocEntry] =
  ## Reads `toc.ncx` from the EPUB and returns one `TocEntry` per `navPoint`
  ## whose `content` `src` points at an `index_split_NNN.html` file with a
  ## positive index, in document order. Nested `navPoint`s are visited too.
  var collected: seq[TocEntry] = @[]

  proc visit(node: XmlNode) =
    if node.kind != xnElement:
      return

    if node.tag == "navPoint":
      let title = descendantText(node, "text")
      let target = descendantAttr(node, "content", "src")
      let splitIndex = indexFromSplitFile(target)
      if splitIndex > 0:
        collected.add(TocEntry(
          label: title,
          code: bookCodeForLabel(title),
          fileIndex: splitIndex))

    # Keep descending even below a navPoint.
    for idx in 0 ..< node.len:
      visit(node[idx])

  let ncx = readEpubEntry(epubPath, "toc.ncx")
  visit(parseXml(newStringStream(ncx)))
  collected
|
||||
|
||||
proc bookSources(entries: seq[TocEntry]): seq[BookSource] =
  ## Turns TOC entries into per-book file ranges and validates them against
  ## `CanonBooks`.
  ##
  ## Entries with an empty `code` produce no book, but still delimit the
  ## preceding entry: a book ends one file before the next entry of any
  ## kind. The very last entry covers only its own file.
  ##
  ## Raises `ValueError` when the number of books differs from
  ## `CanonBooks.len` or when their order does not match `CanonBooks`.
  for pos in 0 .. entries.high:
    let current = entries[pos]
    if current.code.len > 0:
      var lastFile = current.fileIndex
      if pos < entries.high:
        lastFile = entries[pos + 1].fileIndex - 1
      result.add(BookSource(
        code: current.code,
        startIndex: current.fileIndex,
        endIndex: lastFile))

  let found = result.len
  if found != CanonBooks.len:
    raise newException(ValueError,
      "expected " & $CanonBooks.len & " canonical books in EPUB TOC, found " &
      $found)

  for idx, book in CanonBooks:
    let actual = result[idx].code
    if actual != book.code:
      raise newException(ValueError,
        "expected " & book.code & " at position " & $idx & ", found " &
        actual)
|
||||
|
||||
proc hasClass(node: XmlNode, className: string): bool =
  ## True when `node` is an element whose whitespace-separated `class`
  ## attribute contains `className` as a whole token.
  node.kind == xnElement and
    className in node.attr("class").splitWhitespace
|
||||
|
||||
proc shouldSkipElement(node: XmlNode): bool =
  ## True when `node` carries one of the CSS classes whose content is left
  ## out of the verse text.
  const skippedClasses = [
    "calibre_29",  # section headings
    "calibre_6",   # parallel/cross-reference paragraphs
    "calibre_26"   # Psalm superscriptions/cross-references
  ]
  for cls in skippedClasses:
    if node.hasClass(cls):
      return true
|
||||
|
||||
proc hasHref(node: XmlNode): bool =
  ## True when `node` or any element beneath it has a non-empty `href`
  ## attribute. Non-element nodes never match.
  if node.kind != xnElement:
    return false
  if node.attr("href") != "":
    return true
  for idx in 0 ..< node.len:
    if hasHref(node[idx]):
      return true
|
||||
|
||||
proc isBlockElement(node: XmlNode): bool =
  ## True when `node` is an element whose tag is one of the names below.
  ## `walkPassageText` uses this to decide where to add a separating space.
  if node.kind == xnElement:
    result =
      case node.tag
      of "blockquote", "br", "div", "h1", "h2", "h3", "li", "p": true
      else: false
|
||||
|
||||
proc chapterMarker(node: XmlNode): int =
  ## Returns the chapter number held by a `<span class="calibre1">` whose
  ## text is a positive integer, otherwise 0.
  if node.kind != xnElement or node.tag != "span":
    return 0
  if not node.hasClass("calibre1"):
    return 0

  let digits = markerText(textContent(node))
  if isPositiveIntText(digits):
    result = parseInt(digits)
|
||||
|
||||
proc headingChapterMarker(node: XmlNode, code: string): int =
  ## Returns the chapter number announced by a `<p>` heading, otherwise 0.
  ##
  ## A heading starting with "CHAPTER " (case-insensitive) is recognised for
  ## every book; "PSALM " is recognised only when `code` is "PSA".
  if node.kind == xnElement and node.tag == "p":
    let heading = textContent(node)
    result = numberAfterPrefix(heading, "CHAPTER ")
    if result <= 0 and code == "PSA":
      result = numberAfterPrefix(heading, "PSALM ")
|
||||
|
||||
proc verseMarker(node: XmlNode): int =
  ## Returns the verse number held by a `<sup>` whose text is a positive
  ## integer, otherwise 0. A `<sup>` containing any `href` is never treated
  ## as a verse marker.
  if node.kind != xnElement or node.tag != "sup" or node.hasHref:
    return 0

  let digits = markerText(textContent(node))
  if isPositiveIntText(digits):
    result = parseInt(digits)
|
||||
|
||||
proc leadingVerseText(s: string): tuple[verse: int, rest: string] =
  ## Splits a leading verse number off `s`.
  ##
  ## UTF-8 no-break spaces are first replaced by plain spaces. After
  ## optional leading whitespace, a run of digits becomes `verse`; the
  ## whitespace after it is skipped and whatever remains becomes `rest`.
  ## Without leading digits the result is `(0, "")`.
  let cleaned = s.replace("\xC2\xA0", " ")
  var pos = 0

  proc advance(past: set[char]) =
    # Move `pos` forward over every character belonging to `past`.
    while pos < cleaned.len and cleaned[pos] in past:
      inc pos

  advance(Whitespace)
  let numStart = pos
  advance(Digits)
  let numEnd = pos

  if numEnd > numStart:
    advance(Whitespace)
    result.verse = parseInt(cleaned[numStart ..< numEnd])
    if pos < cleaned.len:
      result.rest = cleaned[pos .. ^1]
|
||||
|
||||
proc flushVerse(state: var ParseState) =
  ## Emits the open verse as a `code<TAB>chapter<TAB>verse<TAB>text` row when
  ## both chapter and verse are set and the normalized text is non-empty,
  ## then clears the text buffer. `chapter` and `verse` are left untouched.
  let inVerse = state.chapter > 0 and state.verse > 0
  if inVerse:
    # Tabs are the column separator, so none may survive inside the text.
    let cleaned = normalizeWhitespace(state.verseText).replace("\t", " ")
    if cleaned != "":
      state.rows.add(
        state.code & "\t" & $state.chapter & "\t" & $state.verse & "\t" &
        cleaned)

  state.verseText = ""
|
||||
|
||||
proc walkPassageText(node: XmlNode, state: var ParseState) =
  ## Walks `node` in document order, updating `state` with chapter and
  ## verse markers and appending verse text.
  ##
  ## Text nodes: ignored until a chapter is set. While no verse is open, a
  ## leading number in the text opens that verse; otherwise the text is
  ## appended to the open verse.
  ##
  ## Elements are tested in this order, each match ending the visit without
  ## descending: chapter heading (`headingChapterMarker`), skipped class
  ## (`shouldSkipElement`), chapter span (`chapterMarker`), verse marker
  ## (`verseMarker`), any other `<sup>`. Everything else is descended into.
  if node.kind == xnText:
    if state.chapter <= 0:
      return
    if state.verse == 0:
      let lead = leadingVerseText(node.text)
      if lead.verse > 0:
        state.verse = lead.verse
        state.verseText.add(lead.rest)
    elif state.verse > 0:
      state.verseText.add(node.text)
    return

  if node.kind != xnElement:
    return

  let heading = headingChapterMarker(node, state.code)
  if heading > 0:
    # A heading starts a chapter but no verse: the verse number is expected
    # to arrive later, as a marker or as leading digits in the text.
    flushVerse(state)
    state.chapter = heading
    state.verse = 0
    return

  if shouldSkipElement(node):
    return

  let spanChapter = chapterMarker(node)
  if spanChapter > 0:
    # A chapter span opens verse 1 of the new chapter directly.
    flushVerse(state)
    state.chapter = spanChapter
    state.verse = 1
    return

  let marker = verseMarker(node)
  if marker > 0:
    flushVerse(state)
    state.verse = marker
    return

  # Any remaining <sup> (not a verse marker) contributes no text.
  if node.tag == "sup":
    return

  for idx in 0 ..< node.len:
    walkPassageText(node[idx], state)

  # Keep words from adjacent blocks apart once a verse is open.
  if isBlockElement(node) and state.chapter > 0 and state.verse > 0:
    state.verseText.add(' ')
|
||||
|
||||
proc indexSplitFile(index: int): string =
  ## Builds the archive entry name `index_split_NNN.html` for `index`,
  ## left-padding the number with zeros to at least three characters.
  let number = $index
  result = "index_split_"
  result.add(repeat('0', max(0, 3 - number.len)))
  result.add(number)
  result.add(".html")
|
||||
|
||||
proc parseBook(epubPath: string, source: BookSource): seq[string] =
  ## Parses every split file in `source.startIndex .. source.endIndex` and
  ## returns the book's verse rows in document order.
  ##
  ## Books flagged `singleChapter` by `bookInfo` start in chapter 1, so they
  ## need no chapter marker in the markup.
  var state = ParseState(code: source.code)
  if bookInfo(source.code).singleChapter:
    state.chapter = 1

  var fileNo = source.startIndex
  while fileNo <= source.endIndex:
    let markup = readEpubEntry(epubPath, indexSplitFile(fileNo))
    walkPassageText(parseHtml(newStringStream(markup)), state)
    inc fileNo

  # The last verse has no following marker to trigger its flush.
  flushVerse(state)
  result = state.rows
|
||||
|
||||
proc generate(epubPath, outputPath: string) =
  ## Extracts every book listed in the EPUB's TOC and writes all verse rows
  ## to `outputPath`, one per line with a trailing newline. The parent
  ## directory of `outputPath` is created first.
  var lines: seq[string] = @[]
  for source in bookSources(parseTocEntries(epubPath)):
    lines.add(parseBook(epubPath, source))

  let contents = lines.join("\n") & "\n"
  createDir(parentDir(outputPath))
  writeFile(outputPath, contents)
|
||||
|
||||
when isMainModule:
  # Command-line entry point: exactly two arguments, the EPUB to read and
  # the TSV file to write.
  let argCount = paramCount()
  if argCount == 2:
    generate(paramStr(1), paramStr(2))
  else:
    quit("Usage: generate_mev_data <mev-epub> <output-tsv>", QuitFailure)
|
||||
Reference in New Issue
Block a user