Add private MEV embedded support
This commit is contained in:
+6
-3
@@ -9,6 +9,7 @@ import cliutils, docopt, zero_functional
|
|||||||
import ./api_bible
|
import ./api_bible
|
||||||
import ./esv
|
import ./esv
|
||||||
import ./kjv
|
import ./kjv
|
||||||
|
import ./mev
|
||||||
|
|
||||||
proc formatMarkdown(raw, translation: string): string =
|
proc formatMarkdown(raw, translation: string): string =
|
||||||
var reference = ""
|
var reference = ""
|
||||||
@@ -80,6 +81,8 @@ proc fetchPassages(reference, translation: string, cfg: CombinedConfig): seq[str
|
|||||||
cfg.getVal("esv-api-root", "https://api.esv.org"))
|
cfg.getVal("esv-api-root", "https://api.esv.org"))
|
||||||
of "akjv", "kjv":
|
of "akjv", "kjv":
|
||||||
kjv.fetchPassages(reference)
|
kjv.fetchPassages(reference)
|
||||||
|
of "mev":
|
||||||
|
mev.fetchPassages(reference)
|
||||||
of "amp", "nkjv", "niv":
|
of "amp", "nkjv", "niv":
|
||||||
api_bible.fetchPassages(
|
api_bible.fetchPassages(
|
||||||
reference,
|
reference,
|
||||||
@@ -92,7 +95,7 @@ proc fetchPassages(reference, translation: string, cfg: CombinedConfig): seq[str
|
|||||||
else:
|
else:
|
||||||
raise newException(ValueError,
|
raise newException(ValueError,
|
||||||
"unsupported translation '" & translation &
|
"unsupported translation '" & translation &
|
||||||
"'; supported translations: akjv, amp, esv, kjv, nkjv, niv")
|
"'; supported translations: akjv, amp, esv, kjv, mev, nkjv, niv")
|
||||||
|
|
||||||
when isMainModule:
|
when isMainModule:
|
||||||
const USAGE = """Usage:
|
const USAGE = """Usage:
|
||||||
@@ -110,8 +113,8 @@ Options:
|
|||||||
|
|
||||||
-t, --translation <translation>
|
-t, --translation <translation>
|
||||||
Select a specific translation. Supported values
|
Select a specific translation. Supported values
|
||||||
are 'akjv', 'amp', 'esv', 'kjv', 'nkjv', and
|
are 'akjv', 'amp', 'esv', 'kjv', 'mev',
|
||||||
'niv'. Defaults to 'esv'.
|
'nkjv', and 'niv'. Defaults to 'esv'.
|
||||||
|
|
||||||
--esv-api-token <token> Provide the API token on the command line. By
|
--esv-api-token <token> Provide the API token on the command line. By
|
||||||
default this will be read either from the
|
default this will be read either from the
|
||||||
|
|||||||
@@ -0,0 +1,112 @@
|
|||||||
|
import std/[strutils, tables]
|
||||||
|
|
||||||
|
import ./reference_parser
|
||||||
|
|
||||||
|
type
  BibleIndex = object
    ## Lookup tables built from one translation's embedded TSV rows.
    verses: Table[string, string]
      ## Verse text, keyed by the string produced by `verseKey`.
    lastVerseByChapter: Table[string, int]
      ## Highest verse number seen for each `chapterKey`.
    lastChapterByBook: Table[string, int]
      ## Highest chapter number seen for each book code.
    translationName: string
      ## Label interpolated into error messages.
|
||||||
|
|
||||||
|
proc verseKey(code: string, chapter, verse: int): string =
  ## Builds the table key for one verse: the book code, chapter number and
  ## verse number joined by tab characters.
  [code, $chapter, $verse].join("\t")
|
||||||
|
|
||||||
|
proc chapterKey(code: string, chapter: int): string =
  ## Builds the table key for one chapter: the book code and chapter number
  ## separated by a tab character.
  result = code
  result.add('\t')
  result.add($chapter)
|
||||||
|
|
||||||
|
proc loadBibleIndex(rows, translationName: string): BibleIndex =
  ## Parses embedded rows of the form `code<TAB>chapter<TAB>verse<TAB>text`
  ## into a `BibleIndex`.
  ##
  ## * `rows` - the full TSV payload, one verse per line; blank lines are
  ##   ignored.
  ## * `translationName` - stored on the index and used in error messages.
  ##
  ## Raises `ValueError` when a non-blank row does not have four fields or
  ## when its chapter or verse field is not an integer. In both cases the
  ## message names the translation and quotes the offending row.
  result.translationName = translationName

  for line in rows.splitLines:
    if line.strip.len == 0:
      continue

    # maxsplit = 3 keeps any additional tab inside the verse text field.
    let parts = line.split('\t', maxsplit = 3)
    if parts.len != 4:
      raise newException(ValueError,
        "invalid embedded " & translationName & " row: " & line)

    let code = parts[0]
    var chapter, verse: int
    try:
      chapter = parseInt(parts[1])
      verse = parseInt(parts[2])
    except ValueError:
      # Re-raise with row context; a bare parseInt message would not say
      # which translation or row was malformed.
      raise newException(ValueError,
        "invalid embedded " & translationName & " row: " & line)
    let text = parts[3]

    result.verses[verseKey(code, chapter, verse)] = text

    # Track the highest verse seen in this chapter.
    let cKey = chapterKey(code, chapter)
    if not result.lastVerseByChapter.hasKey(cKey) or
        verse > result.lastVerseByChapter[cKey]:
      result.lastVerseByChapter[cKey] = verse

    # Track the highest chapter seen in this book.
    if not result.lastChapterByBook.hasKey(code) or
        chapter > result.lastChapterByBook[code]:
      result.lastChapterByBook[code] = chapter
|
||||||
|
|
||||||
|
proc requireLastChapter(index: BibleIndex, code: string): int =
  ## Returns the highest chapter number indexed for book `code`.
  ##
  ## Raises `ValueError` when the index has no entry for that book code.
  if code in index.lastChapterByBook:
    return index.lastChapterByBook[code]

  raise newException(ValueError,
    "no embedded " & index.translationName & " data for " & code)
|
||||||
|
|
||||||
|
proc requireLastVerse(index: BibleIndex, code: string, chapter: int): int =
  ## Returns the highest verse number indexed for `chapter` of book `code`.
  ##
  ## Raises `ValueError` (naming the book via `bookInfo`) when the index has
  ## no entry for that chapter.
  let key = chapterKey(code, chapter)
  if key in index.lastVerseByChapter:
    return index.lastVerseByChapter[key]

  raise newException(ValueError,
    "no embedded " & index.translationName & " data for " &
    bookInfo(code).name & " " & $chapter)
|
||||||
|
|
||||||
|
proc requireVerse(index: BibleIndex, code: string, chapter, verse: int): string =
  ## Returns the stored text of one verse.
  ##
  ## Raises `ValueError` (naming the book via `bookInfo`) when the index has
  ## no entry for that book, chapter and verse.
  let key = verseKey(code, chapter, verse)
  if key in index.verses:
    return index.verses[key]

  raise newException(ValueError,
    "no embedded " & index.translationName & " data for " &
    bookInfo(code).name & " " & $chapter & ":" & $verse)
|
||||||
|
|
||||||
|
proc addVerseLines(
    lines: var seq[string],
    index: BibleIndex,
    reference: PassageReference,
    range: RefRange) =
  ## Appends one formatted line per verse covered by `range` to `lines`.
  ##
  ## A start or finish verse that is not positive means "from the first
  ## verse" or "through the chapter's last indexed verse" respectively.
  ## Chapters strictly between start and finish are emitted in full.
  ##
  ## Raises `ValueError` when the book, a chapter or a verse is missing from
  ## `index`, or when a chapter's first verse would come after its last.
  let bookCode = reference.book.code
  # Fail early, before emitting anything, if the book is not indexed at all.
  discard index.requireLastChapter(bookCode)

  for chapter in range.start.chapter .. range.finish.chapter:
    var firstVerse = 1
    if chapter == range.start.chapter and range.start.verse > 0:
      firstVerse = range.start.verse

    let lastVerse =
      if chapter != range.finish.chapter or range.finish.verse <= 0:
        index.requireLastVerse(bookCode, chapter)
      else:
        range.finish.verse

    if firstVerse > lastVerse:
      raise newException(ValueError, "reference range starts after it ends")

    for verse in firstVerse .. lastVerse:
      let text = index.requireVerse(bookCode, chapter, verse)
      lines.add(" [" & $verse & "] " & text)
|
||||||
|
|
||||||
|
proc fetchReference(index: BibleIndex, reference: PassageReference): string =
  ## Renders one parsed reference as text: the stringified reference on the
  ## first line, then one ` [n] text` line per verse, joined with newlines.
  ##
  ## A reference without ranges yields every indexed chapter and verse of the
  ## book; otherwise each range is expanded in order by `addVerseLines`.
  var lines = @[$reference]
  let bookCode = reference.book.code

  if reference.ranges.len > 0:
    for range in reference.ranges:
      addVerseLines(lines, index, reference, range)
  else:
    let chapterCount = index.requireLastChapter(bookCode)
    for chapter in 1 .. chapterCount:
      let verseCount = index.requireLastVerse(bookCode, chapter)
      for verse in 1 .. verseCount:
        let text = index.requireVerse(bookCode, chapter, verse)
        lines.add(" [" & $verse & "] " & text)

  result = lines.join("\n")
|
||||||
|
|
||||||
|
proc fetchPassages*(rows, reference, translationName: string): seq[string] =
  ## Looks up every reference parsed from `reference` in the embedded `rows`
  ## and returns one rendered passage per parsed reference, in parse order.
  ##
  ## The index is rebuilt from `rows` on each call. `translationName` is only
  ## used in error messages.
  result = @[]
  let bible = loadBibleIndex(rows, translationName)
  for parsed in parseReferences(reference):
    result.add(bible.fetchReference(parsed))
|
||||||
+2
-104
@@ -1,109 +1,7 @@
|
|||||||
import std/[strutils, tables]
|
import ./embedded_bible
|
||||||
|
|
||||||
import ./offline_data
|
import ./offline_data
|
||||||
import ./reference_parser
|
|
||||||
|
|
||||||
const kjvRows = embeddedTranslationData("kjv")
|
const kjvRows = embeddedTranslationData("kjv")
|
||||||
|
|
||||||
type BibleIndex = object
|
|
||||||
verses: Table[string, string]
|
|
||||||
lastVerseByChapter: Table[string, int]
|
|
||||||
lastChapterByBook: Table[string, int]
|
|
||||||
|
|
||||||
proc verseKey(code: string, chapter, verse: int): string =
|
|
||||||
code & "\t" & $chapter & "\t" & $verse
|
|
||||||
|
|
||||||
proc chapterKey(code: string, chapter: int): string =
|
|
||||||
code & "\t" & $chapter
|
|
||||||
|
|
||||||
proc loadBibleIndex(): BibleIndex =
|
|
||||||
for line in kjvRows.splitLines:
|
|
||||||
if line.strip.len == 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
let parts = line.split('\t', maxsplit = 3)
|
|
||||||
if parts.len != 4:
|
|
||||||
raise newException(ValueError, "invalid embedded KJV row: " & line)
|
|
||||||
|
|
||||||
let code = parts[0]
|
|
||||||
let chapter = parseInt(parts[1])
|
|
||||||
let verse = parseInt(parts[2])
|
|
||||||
let text = parts[3]
|
|
||||||
|
|
||||||
result.verses[verseKey(code, chapter, verse)] = text
|
|
||||||
|
|
||||||
let cKey = chapterKey(code, chapter)
|
|
||||||
if not result.lastVerseByChapter.hasKey(cKey) or
|
|
||||||
verse > result.lastVerseByChapter[cKey]:
|
|
||||||
result.lastVerseByChapter[cKey] = verse
|
|
||||||
|
|
||||||
if not result.lastChapterByBook.hasKey(code) or
|
|
||||||
chapter > result.lastChapterByBook[code]:
|
|
||||||
result.lastChapterByBook[code] = chapter
|
|
||||||
|
|
||||||
proc requireLastChapter(index: BibleIndex, code: string): int =
|
|
||||||
if not index.lastChapterByBook.hasKey(code):
|
|
||||||
raise newException(ValueError, "no embedded KJV data for " & code)
|
|
||||||
index.lastChapterByBook[code]
|
|
||||||
|
|
||||||
proc requireLastVerse(index: BibleIndex, code: string, chapter: int): int =
|
|
||||||
let cKey = chapterKey(code, chapter)
|
|
||||||
if not index.lastVerseByChapter.hasKey(cKey):
|
|
||||||
raise newException(ValueError,
|
|
||||||
"no embedded KJV data for " & bookInfo(code).name & " " & $chapter)
|
|
||||||
index.lastVerseByChapter[cKey]
|
|
||||||
|
|
||||||
proc requireVerse(index: BibleIndex, code: string, chapter, verse: int): string =
|
|
||||||
let vKey = verseKey(code, chapter, verse)
|
|
||||||
if not index.verses.hasKey(vKey):
|
|
||||||
raise newException(ValueError,
|
|
||||||
"no embedded KJV data for " & bookInfo(code).name & " " &
|
|
||||||
$chapter & ":" & $verse)
|
|
||||||
index.verses[vKey]
|
|
||||||
|
|
||||||
proc addVerseLines(
|
|
||||||
lines: var seq[string],
|
|
||||||
index: BibleIndex,
|
|
||||||
reference: PassageReference,
|
|
||||||
range: RefRange) =
|
|
||||||
|
|
||||||
let code = reference.book.code
|
|
||||||
discard index.requireLastChapter(code)
|
|
||||||
|
|
||||||
for chapter in range.start.chapter .. range.finish.chapter:
|
|
||||||
let startVerse =
|
|
||||||
if chapter == range.start.chapter and range.start.verse > 0:
|
|
||||||
range.start.verse
|
|
||||||
else:
|
|
||||||
1
|
|
||||||
|
|
||||||
let endVerse =
|
|
||||||
if chapter == range.finish.chapter and range.finish.verse > 0:
|
|
||||||
range.finish.verse
|
|
||||||
else:
|
|
||||||
index.requireLastVerse(code, chapter)
|
|
||||||
|
|
||||||
if startVerse > endVerse:
|
|
||||||
raise newException(ValueError, "reference range starts after it ends")
|
|
||||||
|
|
||||||
for verse in startVerse .. endVerse:
|
|
||||||
lines.add(" [" & $verse & "] " & index.requireVerse(code, chapter, verse))
|
|
||||||
|
|
||||||
proc fetchReference(index: BibleIndex, reference: PassageReference): string =
|
|
||||||
var lines = @[$reference]
|
|
||||||
let code = reference.book.code
|
|
||||||
|
|
||||||
if reference.ranges.len == 0:
|
|
||||||
for chapter in 1 .. index.requireLastChapter(code):
|
|
||||||
for verse in 1 .. index.requireLastVerse(code, chapter):
|
|
||||||
lines.add(" [" & $verse & "] " & index.requireVerse(code, chapter, verse))
|
|
||||||
else:
|
|
||||||
for range in reference.ranges:
|
|
||||||
lines.addVerseLines(index, reference, range)
|
|
||||||
|
|
||||||
lines.join("\n")
|
|
||||||
|
|
||||||
proc fetchPassages*(reference: string): seq[string] =
|
proc fetchPassages*(reference: string): seq[string] =
|
||||||
let index = loadBibleIndex()
|
embedded_bible.fetchPassages(kjvRows, reference, "KJV")
|
||||||
for parsedReference in parseReferences(reference):
|
|
||||||
result.add(fetchReference(index, parsedReference))
|
|
||||||
|
|||||||
+13
@@ -0,0 +1,13 @@
|
|||||||
|
import ./offline_data
|
||||||
|
|
||||||
|
when hasEmbeddedTranslationData("mev"):
|
||||||
|
import ./embedded_bible
|
||||||
|
|
||||||
|
const mevRows = embeddedTranslationData("mev")
|
||||||
|
|
||||||
|
proc fetchPassages*(reference: string): seq[string] =
  ## Returns the MEV passages for `reference` from the data embedded at
  ## compile time.
  ##
  ## When no MEV data file was available at compile time, the proc compiles
  ## to a stub that always raises `ValueError` telling the user how to
  ## generate the data.
  when not hasEmbeddedTranslationData("mev"):
    raise newException(ValueError,
      "MEV data is not embedded; generate data/private/mev.tsv and rebuild")
  else:
    embedded_bible.fetchPassages(mevRows, reference, "MEV")
|
||||||
+10
-6
@@ -1,11 +1,15 @@
|
|||||||
import std/os
|
import std/os
|
||||||
|
|
||||||
template embeddedTranslationData*(name: static[string]): string =
|
template translationDataPath(name: static[string], visibility: static[string]): string =
|
||||||
const dataRoot = currentSourcePath().parentDir.parentDir / "data"
|
const dataRoot = currentSourcePath().parentDir.parentDir / "data"
|
||||||
const privatePath = dataRoot / "private" / (name & ".tsv")
|
dataRoot / visibility / (name & ".tsv")
|
||||||
const publicPath = dataRoot / "public" / (name & ".tsv")
|
|
||||||
|
|
||||||
when fileExists(privatePath):
|
template hasEmbeddedTranslationData*(name: static[string]): bool =
|
||||||
staticRead(privatePath)
|
fileExists(translationDataPath(name, "private")) or
|
||||||
|
fileExists(translationDataPath(name, "public"))
|
||||||
|
|
||||||
|
template embeddedTranslationData*(name: static[string]): string =
|
||||||
|
when fileExists(translationDataPath(name, "private")):
|
||||||
|
staticRead(translationDataPath(name, "private"))
|
||||||
else:
|
else:
|
||||||
staticRead(publicPath)
|
staticRead(translationDataPath(name, "public"))
|
||||||
|
|||||||
@@ -0,0 +1,330 @@
|
|||||||
|
import std/[
|
||||||
|
htmlparser,
|
||||||
|
os,
|
||||||
|
osproc,
|
||||||
|
streams,
|
||||||
|
strutils,
|
||||||
|
xmlparser,
|
||||||
|
xmltree
|
||||||
|
]
|
||||||
|
|
||||||
|
import ../src/reference_parser
|
||||||
|
|
||||||
|
type TocEntry = object
  ## One `navPoint` from the EPUB table of contents that points at an
  ## `index_split_NNN.html` file.
  label: string   ## text of the navPoint's label
  code: string    ## book code resolved from the label; empty if not a book
  fileIndex: int  ## numeric part of the split-file name

type BookSource = object
  ## The inclusive range of split files that hold one book.
  code: string
  startIndex: int
  endIndex: int

type ParseState = object
  ## Mutable cursor carried across the HTML walk of a single book.
  code: string       ## book code written into every emitted row
  chapter: int       ## current chapter; 0 until a chapter marker is seen
  verse: int         ## current verse; 0 while waiting for a verse number
  verseText: string  ## raw text accumulated for the current verse
  rows: seq[string]  ## finished TSV rows, in document order
|
||||||
|
|
||||||
|
proc normalizeWhitespace(s: string): string =
  ## Collapses every run of ASCII whitespace (and UTF-8 non-breaking spaces,
  ## bytes C2 A0) into a single space and trims both ends.
  # Turn NBSPs into plain spaces first so they count as word separators,
  # then rebuild the string from its whitespace-delimited words.
  s.replace("\xC2\xA0", " ").splitWhitespace.join(" ")
|
||||||
|
|
||||||
|
proc markerText(s: string): string =
  ## Returns `s` with all whitespace removed: the text is normalised with
  ## `normalizeWhitespace` and the remaining single spaces are dropped.
  for ch in normalizeWhitespace(s):
    if ch != ' ':
      result.add(ch)
|
||||||
|
|
||||||
|
proc numberAfterPrefix(s, prefix: string): int =
  ## Returns the number that directly follows `prefix` at the start of `s`,
  ## or 0 when `s` does not start with `prefix` or no digit follows it.
  ##
  ## `s` is whitespace-normalised and upper-cased before matching, so
  ## `prefix` is expected in upper case. Only the leading run of digits after
  ## the prefix is read; anything after it is ignored.
  let text = normalizeWhitespace(s).toUpperAscii
  if not text.startsWith(prefix):
    return 0

  let rest = text[prefix.len .. ^1].strip
  var digitCount = 0
  while digitCount < rest.len and rest[digitCount].isDigit:
    inc digitCount

  if digitCount > 0:
    result = parseInt(rest[0 ..< digitCount])
|
||||||
|
|
||||||
|
proc isPositiveIntText(s: string): bool =
  ## True when `s`, with all whitespace removed, consists only of ASCII
  ## digits and parses to a value greater than zero.
  let digits = markerText(s)
  if digits.len == 0:
    return false
  for ch in digits:
    if ch notin {'0'..'9'}:
      return false
  result = parseInt(digits) > 0
|
||||||
|
|
||||||
|
proc readEpubEntry(epubPath, entryPath: string): string =
  ## Returns the contents of `entryPath` inside the EPUB archive at
  ## `epubPath` by running `unzip -p` (resolved via PATH).
  ##
  ## stderr is merged into stdout, so on failure the captured output doubles
  ## as the diagnostic text in the error message.
  ##
  ## Raises `IOError` when `unzip` exits with a non-zero status.
  let process = startProcess(
    "unzip",
    args = ["-p", epubPath, entryPath],
    options = {poUsePath, poStdErrToStdOut})
  # Release the process handle and pipes even if reading or waiting raises.
  defer: process.close()

  # Drain the pipe before waiting so a large entry cannot block the child.
  result = process.outputStream.readAll()
  let exitCode = process.waitForExit()

  if exitCode != 0:
    raise newException(IOError,
      "could not read " & entryPath & " from " & epubPath & ": " & result)
|
||||||
|
|
||||||
|
proc textContent(node: XmlNode): string =
  ## Concatenates the text of `node` and all of its descendants in document
  ## order. Only plain text nodes contribute; every other non-element node
  ## kind yields nothing.
  if node.kind == xnText:
    return node.text

  if node.kind == xnElement:
    for child in node.items:
      result.add(child.textContent)
|
||||||
|
|
||||||
|
proc firstDescendant(node: XmlNode, tag: string): XmlNode =
  ## Depth-first, pre-order search for the first element named `tag`,
  ## starting with `node` itself. Returns nil when nothing matches or when
  ## `node` is not an element.
  if node.kind != xnElement:
    return nil
  if node.tag == tag:
    return node

  for child in node.items:
    result = firstDescendant(child, tag)
    if not result.isNil:
      return
|
||||||
|
|
||||||
|
proc descendantText(node: XmlNode, tag: string): string =
  ## Whitespace-normalised text of the first `tag` element at or below
  ## `node`, or the empty string when no such element exists.
  let match = firstDescendant(node, tag)
  if not match.isNil:
    result = normalizeWhitespace(match.textContent)
|
||||||
|
|
||||||
|
proc descendantAttr(node: XmlNode, tag, attrName: string): string =
  ## Value of attribute `attrName` on the first `tag` element at or below
  ## `node`, or the empty string when no such element exists.
  let match = firstDescendant(node, tag)
  if not match.isNil:
    result = match.attr(attrName)
|
||||||
|
|
||||||
|
proc bookCodeForLabel(label: string): string =
  ## Maps a table-of-contents label to a canonical book code, or returns the
  ## empty string when the label is not a known book.
  ##
  ## Only the part of the label before the first "(" is considered, trimmed
  ## of surrounding whitespace.
  let parenAt = label.find('(')
  let bookName =
    if parenAt >= 0: label[0 ..< parenAt].strip
    else: label.strip

  # NOTE(review): "Solomon" is special-cased, presumably because the EPUB's
  # label for the Song of Solomon differs from the name in CanonBooks - confirm.
  if bookName == "Solomon":
    return "SNG"

  for book in CanonBooks:
    if book.name == bookName:
      return book.code
|
||||||
|
|
||||||
|
proc indexFromSplitFile(path: string): int =
  ## Extracts NNN from a path whose file name is `index_split_NNN.html`.
  ##
  ## Any `#fragment` suffix and leading directories are ignored. Returns 0
  ## when the file name does not follow that pattern, including when the part
  ## between the prefix and the extension is not a valid integer.
  const
    prefix = "index_split_"
    suffix = ".html"

  let filename = path.split('#', maxsplit = 1)[0].extractFilename
  if not filename.startsWith(prefix) or not filename.endsWith(suffix):
    return 0

  try:
    result = parseInt(filename[prefix.len ..< filename.len - suffix.len])
  except ValueError:
    # e.g. "index_split_.html" or "index_split_x.html": treat like any other
    # non-matching name instead of aborting the whole TOC walk.
    result = 0
|
||||||
|
|
||||||
|
proc parseTocEntries(epubPath: string): seq[TocEntry] =
  ## Reads `toc.ncx` from the EPUB and returns one `TocEntry` per `navPoint`
  ## whose content `src` resolves to a positive split-file index, in document
  ## order. Nested navPoints are visited too.
  let tocXml = readEpubEntry(epubPath, "toc.ncx")
  let toc = parseXml(newStringStream(tocXml))

  # Explicit stack instead of recursion; children are pushed in reverse so
  # that they are popped, and therefore visited, in document order.
  var pending = @[toc]
  while pending.len > 0:
    let node = pending.pop()
    if node.kind != xnElement:
      continue

    if node.tag == "navPoint":
      let label = node.descendantText("text")
      let src = node.descendantAttr("content", "src")
      let fileIndex = indexFromSplitFile(src)
      if fileIndex > 0:
        result.add(TocEntry(
          label: label,
          code: bookCodeForLabel(label),
          fileIndex: fileIndex))

    for i in countdown(node.len - 1, 0):
      pending.add(node[i])
|
||||||
|
|
||||||
|
proc bookSources(entries: seq[TocEntry]): seq[BookSource] =
  ## Turns table-of-contents entries into per-book split-file ranges and
  ## checks them against `CanonBooks`.
  ##
  ## Entries with an empty book code are not emitted, but they still end the
  ## range of the entry before them. A book's range runs from its own file
  ## index to one less than the next entry's file index.
  ##
  ## Raises `ValueError` when the number of books differs from
  ## `CanonBooks.len` or when a book is not at its canonical position.
  for idx in 0 ..< entries.len:
    let entry = entries[idx]
    if entry.code.len > 0:
      # NOTE(review): the final TOC entry gets a single-file range; if the
      # last book spans several split files the rest is not read - confirm
      # against the EPUB layout.
      var lastFile = entry.fileIndex
      if idx + 1 < entries.len:
        # NOTE(review): if the next entry lives in the same split file this
        # range is empty (endIndex < startIndex) - verify that cannot happen.
        lastFile = entries[idx + 1].fileIndex - 1

      result.add(BookSource(
        code: entry.code,
        startIndex: entry.fileIndex,
        endIndex: lastFile))

  let expectedCount = CanonBooks.len
  if result.len != expectedCount:
    raise newException(ValueError,
      "expected " & $expectedCount & " canonical books in EPUB TOC, found " &
      $result.len)

  for idx, book in CanonBooks:
    let found = result[idx].code
    if found != book.code:
      raise newException(ValueError,
        "expected " & book.code & " at position " & $idx & ", found " &
        found)
|
||||||
|
|
||||||
|
proc hasClass(node: XmlNode, className: string): bool =
  ## True when `node` is an element whose whitespace-separated `class`
  ## attribute contains exactly `className`.
  node.kind == xnElement and
    className in node.attr("class").splitWhitespace
|
||||||
|
|
||||||
|
proc shouldSkipElement(node: XmlNode): bool =
  ## True for elements whose CSS class marks them as non-verse content that
  ## must not be walked.
  const skippedClasses = [
    "calibre_29",  # section headings
    "calibre_6",   # parallel/cross-reference paragraphs
    "calibre_26"   # Psalm superscriptions/cross-references
  ]
  for cls in skippedClasses:
    if node.hasClass(cls):
      return true
|
||||||
|
|
||||||
|
proc hasHref(node: XmlNode): bool =
  ## True when `node` or any element below it carries a non-empty `href`
  ## attribute. Non-element nodes never match.
  if node.kind != xnElement:
    return false
  if node.attr("href").len > 0:
    return true

  for child in node.items:
    if child.hasHref:
      return true
|
||||||
|
|
||||||
|
proc isBlockElement(node: XmlNode): bool =
  ## True when `node` is one of the block-level (or line-breaking) HTML
  ## elements listed below.
  if node.kind != xnElement:
    return false

  case node.tag
  of "blockquote", "br", "div", "h1", "h2", "h3", "li", "p": true
  else: false
|
||||||
|
|
||||||
|
proc chapterMarker(node: XmlNode): int =
  ## Returns the chapter number carried by an inline chapter marker - a
  ## `span` with class `calibre1` whose text is a positive integer - or 0
  ## when `node` is not such a marker.
  if node.kind != xnElement or node.tag != "span":
    return 0
  if not node.hasClass("calibre1"):
    return 0

  let digits = markerText(node.textContent)
  if digits.isPositiveIntText:
    result = parseInt(digits)
|
||||||
|
|
||||||
|
proc headingChapterMarker(node: XmlNode, code: string): int =
  ## Returns the chapter number announced by a heading paragraph, or 0 when
  ## `node` is not one.
  ##
  ## A `p` element whose text starts with "CHAPTER <n>" always counts; one
  ## starting with "PSALM <n>" counts only while parsing book code "PSA".
  if node.kind != xnElement or node.tag != "p":
    return 0

  let heading = node.textContent
  let chapter = numberAfterPrefix(heading, "CHAPTER ")
  if chapter > 0:
    chapter
  elif code == "PSA":
    numberAfterPrefix(heading, "PSALM ")
  else:
    0
|
||||||
|
|
||||||
|
proc verseMarker(node: XmlNode): int =
  ## Returns the verse number carried by a verse marker - a `sup` element
  ## that contains no link and whose text is a positive integer - or 0 when
  ## `node` is not such a marker.
  if node.kind != xnElement or node.tag != "sup":
    return 0
  # A superscript with an href inside it is not treated as a verse number.
  if node.hasHref:
    return 0

  let digits = markerText(node.textContent)
  if digits.isPositiveIntText:
    result = parseInt(digits)
|
||||||
|
|
||||||
|
proc leadingVerseText(s: string): tuple[verse: int, rest: string] =
  ## Splits text that begins with a verse number into that number and the
  ## text after it.
  ##
  ## Non-breaking spaces are treated as plain spaces. Whitespace before the
  ## number and between the number and the text is dropped. When `s` does
  ## not start with digits (after leading whitespace) the result is
  ## `(0, "")`.
  let body = s.replace("\xC2\xA0", " ").strip(trailing = false)

  var digitCount = 0
  while digitCount < body.len and body[digitCount].isDigit:
    inc digitCount

  if digitCount == 0:
    return

  result.verse = parseInt(body[0 ..< digitCount])
  result.rest = body[digitCount .. ^1].strip(trailing = false)
|
||||||
|
|
||||||
|
proc flushVerse(state: var ParseState) =
  ## Emits the verse accumulated so far as a TSV row
  ## (`code<TAB>chapter<TAB>verse<TAB>text`) and clears the text buffer.
  ##
  ## Nothing is emitted unless both chapter and verse are positive and the
  ## normalised text is non-empty; the buffer is cleared either way.
  let haveVerse = state.chapter > 0 and state.verse > 0
  if haveVerse:
    # Tabs are replaced so the text can never add a fifth TSV column.
    let text = normalizeWhitespace(state.verseText).replace("\t", " ")
    if text.len > 0:
      state.rows.add(
        state.code & "\t" & $state.chapter & "\t" & $state.verse & "\t" & text)

  state.verseText.setLen(0)
|
||||||
|
|
||||||
|
proc walkPassageText(node: XmlNode, state: var ParseState) =
  ## Walks one HTML subtree in document order, updating `state` as chapter
  ## and verse markers are met and collecting verse text between them.
  ##
  ## Text nodes: ignored until a chapter is known. With a verse open, the
  ## text is appended to it. With the verse at 0, the text is only used if
  ## it starts with a verse number (see `leadingVerseText`).
  ##
  ## Elements, checked in this order; each match ends handling of the node:
  ## 1. heading chapter marker - flush, set chapter, reset verse to 0;
  ## 2. skipped classes - subtree ignored;
  ## 3. inline chapter marker - flush, set chapter, verse becomes 1;
  ## 4. verse marker - flush, set verse;
  ## 5. any other `sup` - subtree ignored.
  ## Otherwise the children are walked, and a block element adds a trailing
  ## space so that adjacent blocks do not run together.
  if node.kind == xnText:
    if state.chapter <= 0:
      return
    if state.verse > 0:
      state.verseText.add(node.text)
    elif state.verse == 0:
      let (verse, rest) = leadingVerseText(node.text)
      if verse > 0:
        state.verse = verse
        state.verseText.add(rest)
    return

  if node.kind != xnElement:
    return

  let headingChapter = headingChapterMarker(node, state.code)
  if headingChapter > 0:
    state.flushVerse()
    state.chapter = headingChapter
    # The verse number is expected to arrive as leading text.
    state.verse = 0
    return

  if node.shouldSkipElement:
    return

  let inlineChapter = chapterMarker(node)
  if inlineChapter > 0:
    state.flushVerse()
    state.chapter = inlineChapter
    # An inline chapter number opens verse 1 of that chapter.
    state.verse = 1
    return

  let markedVerse = verseMarker(node)
  if markedVerse > 0:
    state.flushVerse()
    state.verse = markedVerse
    return

  # Remaining superscripts are not verse numbers; drop their content.
  if node.tag == "sup":
    return

  for child in node.items:
    walkPassageText(child, state)

  let verseOpen = state.chapter > 0 and state.verse > 0
  if node.isBlockElement and verseOpen:
    state.verseText.add(' ')
|
||||||
|
|
||||||
|
proc indexSplitFile(index: int): string =
  ## Name of the EPUB split file for `index`: the number is left-padded with
  ## zeros to at least three characters, e.g. 7 -> "index_split_007.html".
  result = "index_split_"
  result.add(align($index, 3, '0'))
  result.add(".html")
|
||||||
|
|
||||||
|
proc parseBook(epubPath: string, source: BookSource): seq[string] =
  ## Parses every split file in `source`'s inclusive index range and returns
  ## the book's TSV rows in document order.
  ##
  ## Books flagged `singleChapter` by `bookInfo` start in chapter 1, so
  ## their verses are collected without a chapter marker.
  let startChapter =
    if bookInfo(source.code).singleChapter: 1
    else: 0
  var state = ParseState(code: source.code, chapter: startChapter)

  for fileIndex in source.startIndex .. source.endIndex:
    let html = readEpubEntry(epubPath, indexSplitFile(fileIndex))
    walkPassageText(parseHtml(newStringStream(html)), state)

  # The final verse has no following marker to trigger its flush.
  state.flushVerse()
  result = state.rows
|
||||||
|
|
||||||
|
proc generate(epubPath, outputPath: string) =
  ## Extracts every canonical book from the EPUB at `epubPath` and writes
  ## the verses to `outputPath` as newline-terminated TSV rows, creating the
  ## output directory if necessary.
  var rows: seq[string] = @[]
  for source in bookSources(parseTocEntries(epubPath)):
    rows.add(parseBook(epubPath, source))

  var output = rows.join("\n")
  output.add('\n')

  createDir(outputPath.parentDir)
  writeFile(outputPath, output)
|
||||||
|
|
||||||
|
when isMainModule:
  # Command-line entry point: exactly two arguments, the source EPUB and the
  # TSV file to write.
  let args = commandLineParams()
  if args.len != 2:
    quit("Usage: generate_mev_data <mev-epub> <output-tsv>", QuitFailure)

  generate(args[0], args[1])
|
||||||
Reference in New Issue
Block a user