Jonathan Bernard 935f1bae2f WIP documentation
- The documentation is cluttered enough as it is with the large number
  of procedures supporting vCard 3 and 4. Split common out into the
  publicly exposed bits and the private internals. This makes it obvious
  which common functionality a client can expect to have exposed on the
  main vcard module.

- Add documentation (WIP) on the vcard3 module.
2023-05-03 02:16:18 -05:00

519 lines
17 KiB
Nim

# vCard-specific Lexer
# © 2022-2023 Jonathan Bernard
## This module defines a lexer with functionality useful for parsing vCard
## content. Specifically:
## - it understands the vCard line-folding logic and transparently joins folded
##   lines as it reads input off its input stream.
## - it supports multiple nested bookmarks to make look-ahead decisions more
## convenient
## - it supports reading from the underlying stream byte-wise or as unicode
## runes.
##
## This lexer uses a ring buffer underneath, growing the buffer only when it
## is completely full.
import std/[streams, unicode]
type VCardLexer* = object of RootObj
  ## State of a streaming lexer: the input stream, the ring buffer holding
  ## bytes read from it, the read position, the stack of bookmarks and the
  ## line-tracking information.
  input: Stream
    ## stream the buffer is filled from
  buffer*: string ## ring buffer of bytes read from `input`
  bufStart: int ## index of the first byte still retained in the buffer
  bufEnd: int ## index just past the last byte read into the buffer
  pos*: int ## current read position (an index into `buffer`)
  bookmark*: seq[int] ## stack of saved read positions, innermost last
  bookmarkVal*: seq[string] ## per bookmark, the value read since it was set
  lineNumber*: int ## how many newlines have we seen so far
  lineStart: int ## buffer index of the start of the current line
  lineVal*: string ## value read since the start of the current line
proc skipUtf8Bom(vcl: var VCardLexer) =
  ## Advance the read position past a leading UTF-8 byte order mark
  ## (`EF BB BF`) when the buffer starts with one.
  ##
  ## A buffer shorter than three bytes cannot contain a BOM; it is left
  ## untouched instead of being indexed out of bounds.
  if vcl.buffer.len < 3: return
  if vcl.buffer[0] == '\xEF' and vcl.buffer[1] == '\xBB' and
      vcl.buffer[2] == '\xBF':
    inc(vcl.pos, 3)
template wrappedIdx(idx: untyped): int =
  ## Fold `idx` back into the bounds of the buffer (index modulo buffer
  ## length). Expects a lexer named `vcl` to be visible at the call site.
  (idx) mod len(vcl.buffer)
proc newStartIdx(vcl: VCardLexer): int =
  ## Earliest buffer index that must still be retained: the oldest bookmark
  ## when any bookmark is set, otherwise the current read position. Everything
  ## before this index has been consumed and may be overwritten.
  result = vcl.pos
  if vcl.bookmark.len > 0:
    result = vcl.bookmark[0]
func isFull(vcl: VCardLexer): bool {.inline.} =
  ## True when the slot following `bufEnd` (in ring order) is the earliest
  ## index that still has to be retained, i.e. there is no room to read more.
  let slotAfterEnd = wrappedIdx(vcl.bufEnd + 1)
  slotAfterEnd == vcl.newStartIdx
func atEnd(vcl: VCardLexer): bool {.inline.} =
  ## True when the read position has caught up with the end of the buffered
  ## data.
  vcl.bufEnd == vcl.pos
proc doubleBuffer(vcl: var VCardLexer) =
  ## Double the capacity of the buffer, copying the contents of the current
  ## ring (from `bufStart` up to, but excluding, `bufEnd`) into the beginning
  ## of the newly allocated buffer.
  ##
  ## Every index into the old ring is translated so that it refers to the same
  ## byte in the new buffer: the read position, *all* bookmarks (not just the
  ## outermost one) and the start of the current line.
  let
    oldBuf = vcl.buffer
    oldLen = oldBuf.len
    oldStart = vcl.bufStart
    oldPos = vcl.pos

  vcl.buffer = newString(oldLen * 2)

  # Unroll the ring into the front of the new buffer. The `newIdx == 0` clause
  # forces the first iteration even when `bufStart == bufEnd`.
  var newIdx = 0
  var oldIdx = oldStart
  while oldIdx != vcl.bufEnd or newIdx == 0:
    vcl.buffer[newIdx] = oldBuf[oldIdx]
    inc(newIdx)
    oldIdx = (newIdx + oldStart) mod oldLen

  # Distance between the start of the current line and the read position,
  # measured in ring order (the same arithmetic `getColNumber` uses). It is
  # captured before any index is rewritten so that it can be preserved.
  let lineOffset =
    if vcl.lineStart <= oldPos: oldPos - vcl.lineStart
    else: oldLen - vcl.lineStart + oldPos

  # The new bufStart is 0, so an old index maps to its ring distance from the
  # old bufStart. A plain subtraction is not enough: an index that had already
  # wrapped around the end of the old ring is numerically smaller than the
  # old bufStart and would come out negative.
  vcl.pos = (oldPos - oldStart + oldLen) mod oldLen
  for mark in vcl.bookmark.mitems:
    mark = (mark - oldStart + oldLen) mod oldLen

  # The line may have started before the retained data, so `lineStart` is
  # re-derived from the read position. It can legitimately become negative;
  # `pos - lineStart` still yields the right column.
  vcl.lineStart = vcl.pos - lineOffset

  # Now that all of the existing indices are updated, reset the buffer
  # boundaries to their new values.
  vcl.bufStart = 0
  vcl.bufEnd = newIdx
proc fillBuffer(vcl: var VCardLexer) =
  ## Read data into the buffer from the underlying stream until the buffer is
  ## full (or the stream has nothing more to give). If the buffer is already
  ## full, double the buffer beforehand.

  # Note that we do not *completely* fill the buffer. We always leave one index
  # of the array empty. This allows us to differentiate between an empty
  # buffer (`bufStart == bufEnd`) and a completely full buffer
  # (`bufStart == wrappedIdx(bufEnd + 1)`).
  var charsRead: int

  # check to see if we have a full buffer
  if vcl.isFull: vcl.doubleBuffer()

  # discard used portions of the buffer
  vcl.bufStart = vcl.newStartIdx

  # We have three conditions that the ring buffer may be in:
  if vcl.bufEnd < vcl.bufStart:
    # The unused portion of the buffer is all in the middle and we can just
    # read data into the space from bufEnd (e) to bufStart (s).
    #       e       s
    # 0 1 2 3 4 5 6 7 8 9
    charsRead = vcl.input.readDataStr(vcl.buffer,
      vcl.bufEnd ..< (vcl.bufStart - 1))
    vcl.bufEnd += charsRead

  elif vcl.bufStart == 0:
    # The unused portion is entirely at the end of the buffer. We can read
    # data from the bufEnd (e) to the end of our buffer capacity.
    # s           e
    # 0 1 2 3 4 5 6 7 8 9
    charsRead = vcl.input.readDataStr(vcl.buffer,
      vcl.bufEnd ..< (vcl.buffer.len - 1))
    vcl.bufEnd = wrappedIdx(vcl.bufEnd + charsRead)

  else:
    # The used portion of the buffer is in the middle, and the unused portion
    # is on either side of that. We need to read from bufEnd (e) to the end of
    # our underlying buffer, and then from the start of our underlying buffer
    # to bufStart (s).
    #       s       e
    # 0 1 2 3 4 5 6 7 8 9
    charsRead = vcl.input.readDataStr(vcl.buffer,
      vcl.bufEnd ..< vcl.buffer.len)

    if charsRead == vcl.buffer.len - vcl.bufEnd:
      # Only read into the front part of the buffer if we were able to
      # completely fill the back part. The read starts at index 0, so the
      # number of bytes read is also the new end index.
      vcl.bufEnd = vcl.input.readDataStr(vcl.buffer,
        0 ..< (vcl.bufStart - 1))
    else:
      # Short read: the back part was only partially filled. The bytes that
      # did arrive must still become visible, so advance the end past them.
      vcl.bufEnd += charsRead
proc close*(vcl: var VCardLexer) =
  ## Close this VCardLexer by closing the stream it reads from.
  close(vcl.input)
proc open*(vcl: var VCardLexer, input: Stream, bufLen = 16384) =
  ## Open the given stream and initialize the given VCardLexer to read from it.
  ##
  ## `bufLen` is the initial size of the internal buffer in bytes and must be
  ## positive; `input` must not be nil. All lexer state is reset, so a lexer
  ## that was used before (possibly with bookmarks still set) starts clean.
  ## The buffer is filled once and a leading UTF-8 byte order mark is skipped.
  assert(bufLen > 0)
  assert(input != nil)

  vcl.input = input
  vcl.pos = 0

  # The two bookmark stacks are pushed and popped in lockstep; reset both so
  # they cannot get out of sync when a lexer is re-opened.
  vcl.bookmark = @[]
  vcl.bookmarkVal = @[]

  vcl.buffer = newString(bufLen)
  vcl.bufStart = 0
  vcl.bufEnd = 0

  vcl.lineNumber = 0
  vcl.lineStart = 0
  vcl.lineVal = ""

  vcl.fillBuffer
  vcl.skipUtf8Bom
proc setBookmark*(vcl: var VCardLexer) =
  ## Remember the current read position so the lexer can later rewind to it.
  ## While a bookmark is set the lexer will not discard data from the oldest
  ## bookmark forward when it refills its buffer.
  ##
  ## Every bookmark must be released with either `returnToBookmark` or
  ## `unsetBookmark`. Otherwise the lexer stops behaving like a streaming
  ## lexer and ends up reading the entire remainder of the input stream into
  ## memory (if it can).
  ##
  ## Bookmarks nest: calling this repeatedly pushes additional bookmarks. For
  ## example, one bookmark may be set at the beginning of a line (to reset if
  ## the line fails to parse) and another midway through while attempting to
  ## parse a parameter value. Take care to release all of them to avoid the
  ## behavior described above.
  let here = vcl.pos
  vcl.bookmark.add(here)
  # Start collecting the text read from this bookmark onwards.
  vcl.bookmarkVal.add(newStringOfCap(32))
proc returnToBookmark*(vcl: var VCardLexer) =
  ## Unset the most recent bookmark, resetting the lexer's read position to
  ## the position of the bookmark. Does nothing when no bookmark is set.
  ##
  ## The text that was read since that bookmark is also removed from the end
  ## of the values recorded for the enclosing (outer) bookmarks. Line tracking
  ## (`lineNumber`, `lineVal`) is not rewound.
  if vcl.bookmark.len == 0: return

  vcl.pos = vcl.bookmark.pop()
  let rewound = vcl.bookmarkVal.pop().len

  for outer in vcl.bookmarkVal.mitems:
    # `>=` rather than `>`: an outer bookmark set at the same position as the
    # one being popped has recorded exactly the same text, and all of it has
    # to be dropped.
    if outer.len >= rewound:
      outer.setLen(outer.len - rewound)
proc unsetBookmark*(vcl: var VCardLexer) =
  ## Drop the most recent bookmark and the value recorded for it, keeping the
  ## lexer's read position where it currently is. Does nothing when no
  ## bookmark is set.
  if vcl.bookmark.len > 0:
    discard vcl.bookmark.pop()
    discard vcl.bookmarkVal.pop()
proc readSinceBookmark*(vcl: var VCardLexer): string =
  ## Return the value read since the most recent bookmark was set, or the
  ## empty string when there is no bookmark.
  result = ""
  if vcl.bookmarkVal.len > 0:
    result = vcl.bookmarkVal[^1]
proc isLineWrap(vcl: var VCardLexer, allowRefill = true): bool =
  ## Answers the question "is this a folded line"? That is: does a CR, LF and
  ## a single whitespace character (space or horizontal tab) start at the
  ## current read position. Transparently handles the case where it needs to
  ## refill the buffer to answer this question.
  ##
  ## `allowRefill` limits the refill to a single attempt; the recursive call
  ## passes `false`.
  if vcl.buffer[vcl.pos] != '\r': return false

  # Number of unread bytes, measured in ring order. Comparing the raw indices
  # numerically gives the wrong answer as soon as either the read position
  # plus three or the end of the data has wrapped around the buffer.
  let unread = (vcl.bufEnd - vcl.pos + vcl.buffer.len) mod vcl.buffer.len

  if unread < 3:
    # Only try to refill the buffer once. If we re-enter and still don't have
    # three characters, we know we were unable to fill the buffer and are
    # likely at the end.
    if not allowRefill: return false
    vcl.fillBuffer()
    return vcl.isLineWrap(false)

  # At least three characters in the buffer.
  result = vcl.buffer[wrappedIdx(vcl.pos + 1)] == '\n' and
    vcl.buffer[wrappedIdx(vcl.pos + 2)] in {' ', '\t'}
proc read*(vcl: var VCardLexer, peek = false): char =
  ## Read one byte off of the input stream. By default this will advance the
  ## lexer read position by one byte. If `peek` is set to `true`, this will
  ## leave the read position at the same logical position. The underlying
  ## buffer position may still change if, for example, the next byte is the
  ## beginning of a folded line wrap. In this case the internal buffer
  ## position will advance past that line wrap.
  if vcl.atEnd: vcl.fillBuffer()

  if vcl.isLineWrap:
    # Skip the three bytes of the fold. The position has to be wrapped: the
    # fold may straddle the end of the ring buffer.
    vcl.pos = wrappedIdx(vcl.pos + 3)
    vcl.lineNumber += 1
    vcl.lineStart = vcl.pos
    vcl.lineVal = newStringOfCap(84)
    if vcl.atEnd: vcl.fillBuffer()

  elif vcl.buffer[vcl.pos] == '\n' and not peek:
    # A newline that is about to be consumed starts a new line.
    vcl.lineNumber += 1
    vcl.lineStart = wrappedIdx(vcl.pos + 1)
    vcl.lineVal = newStringOfCap(84)

  result = vcl.buffer[vcl.pos]

  if not peek:
    # Record the byte for every active bookmark and for the current line.
    for idx in 0..<vcl.bookmarkVal.len: vcl.bookmarkVal[idx].add(result)
    vcl.lineVal.add(result)
    vcl.pos = wrappedIdx(vcl.pos + 1)
proc readLen*(vcl: var VCardLexer, bytesToRead: int, peek = false): string =
  ## Convenience procedure to read multiple bytes (if able) and return the
  ## value read.
  ##
  ## When `peek` is `true` the bytes are read under a temporary bookmark and
  ## the lexer returns to it afterwards, so the read position is left where it
  ## was. As with `returnToBookmark`, line tracking (`lineNumber`, `lineVal`)
  ## is not rewound.
  result = newStringOfCap(max(bytesToRead, 0))

  if peek: vcl.setBookmark
  for i in 0..<bytesToRead: result.add(vcl.read)
  if peek: vcl.returnToBookmark
proc readRune*(vcl: var VCardLexer, peek = false): Rune =
  ## Read one unicode rune off of the input stream. By default this will
  ## advance the lexer read position by the byte length of the rune read. If
  ## `peek` is set to `true`, this will leave the read position at the same
  ## logical position. The underlying buffer position may still change if, for
  ## example, the next rune is the beginning of a folded line wrap. In this
  ## case the internal buffer position will advance past that line wrap.
  if vcl.atEnd: vcl.fillBuffer()

  if vcl.isLineWrap:
    # Skip the three bytes of the fold. The position has to be wrapped: the
    # fold may straddle the end of the ring buffer.
    vcl.pos = wrappedIdx(vcl.pos + 3)
    vcl.lineNumber += 1
    vcl.lineStart = vcl.pos
    vcl.lineVal = newStringOfCap(84)
    if vcl.atEnd: vcl.fillBuffer()

  elif vcl.buffer[vcl.pos] == '\n' and not peek:
    # Same rule as `read`: only a newline that is actually consumed starts a
    # new line, so repeated peeks do not inflate the line count.
    vcl.lineNumber += 1
    vcl.lineStart = wrappedIdx(vcl.pos + 1)
    vcl.lineVal = newStringOfCap(84)

  # `runeLenAt` derives the encoded length from the lead byte alone.
  let runeLen = vcl.buffer.runeLenAt(vcl.pos)

  # Make sure the whole rune is buffered. When nothing at all is unread the
  # refill has already been attempted above.
  let unread = (vcl.bufEnd - vcl.pos + vcl.buffer.len) mod vcl.buffer.len
  if unread > 0 and unread < runeLen: vcl.fillBuffer()

  # Collect the rune's bytes in ring order before decoding, because the rune
  # may straddle the end of the ring buffer.
  var encoded = newStringOfCap(runeLen)
  for offset in 0..<runeLen:
    encoded.add(vcl.buffer[wrappedIdx(vcl.pos + offset)])
  result = encoded.runeAt(0)

  if not peek:
    # Record the rune for every active bookmark and for the current line.
    for idx in 0..<vcl.bookmarkVal.len: vcl.bookmarkVal[idx].add(result)
    vcl.lineVal.add(result)
    vcl.pos = wrappedIdx(vcl.pos + runeLen)
proc readRunesLen*(vcl: var VCardLexer, runesToRead: int, peek = false): string =
  ## Convenience procedure to read multiple runes (if able) and return the
  ## value read.
  ##
  ## When `peek` is `true` the runes are read under a temporary bookmark and
  ## the lexer returns to it afterwards, so the read position is left where it
  ## was. As with `returnToBookmark`, line tracking (`lineNumber`, `lineVal`)
  ## is not rewound.
  result = newStringOfCap(max(runesToRead, 0) * 4)

  if peek: vcl.setBookmark
  for i in 0..<runesToRead: result.add(vcl.readRune)
  if peek: vcl.returnToBookmark
proc peek*(vcl: var VCardLexer): char =
  ## Shorthand for `read(peek = true)`: return the next byte without
  ## consuming it.
  result = vcl.read(peek = true)
proc peekRune*(vcl: var VCardLexer): Rune =
  ## Shorthand for `readRune(peek = true)`: return the next rune without
  ## consuming it.
  result = vcl.readRune(peek = true)
proc getColNumber*(vcl: VCardLexer, pos: int): int =
  ## Calculate the column number of the buffer position `pos` relative to the
  ## start of the most recent line, i.e. the number of bytes between the two
  ## in ring order.
  ##
  ## A position equal to the line start is column 0. When `pos` is numerically
  ## smaller than the line start it is taken to have wrapped around the end of
  ## the buffer.
  if vcl.lineStart <= pos: pos - vcl.lineStart
  else: (vcl.buffer.len - vcl.lineStart) + pos
proc dumpLexerState*(l: VCardLexer): string =
  ## Render the lexer's indices, bookmarks and raw buffer contents as text,
  ## one `name = value` entry per line, for debugging.
  result.add("pos = " & $l.pos & "\p")
  result.add("bookmark = " & $l.bookmark & "\p")
  result.add("lineNumber = " & $l.lineNumber & "\p")
  result.add("lineStart = " & $l.lineStart & "\p")
  result.add("bufStart = " & $l.bufStart & "\p")
  result.add("bufEnd = " & $l.bufEnd & "\p")
  result.add("buffer = " & l.buffer & "\p")
## Unit Tests
## ============================================================================
proc runVcardLexerPrivateTests*() =
  ## Self-tests for the lexer's private internals (ring buffer boundaries,
  ## refilling, bookmarks, line unfolding and rune reading), written as plain
  ## `assert` statements over small in-memory string streams.

  const longTestString =
    "This is my test string. There are many like it but this one is mine."

  proc bufferIs(vcl: VCardLexer, s: string): bool =
    ## True when the buffer, read in ring order starting at `bufStart`,
    ## begins with the bytes of `s`.
    #debugEcho vcl.buffer & " : " & $vcl.bufStart & "-" & $vcl.bufEnd
    # for i in vcl.bufStart..<vcl.bufEnd:
    #   debugEcho $i & ": " & vcl.buffer[i]

    for i in 0..<s.len:
      # debugEcho "i:" & $i & "\tl.bufStart:" & $(vcl.bufStart + i)
      # debugEcho s[i] & " == " & vcl.buffer[vcl.bufStart + i]
      if s[i] != vcl.buffer[wrappedIdx(vcl.bufStart + i)]:
        return false
    return true

  proc readExpected(vcl: var VCardLexer, s: string): bool =
    ## Consume `s.len` bytes from the lexer, stopping with `false` at the
    ## first byte that differs from `s`.
    for i in 0..<s.len:
      if vcl.read != s[i]:
        return false
    return true

  # "can open and fill buffer":
  block:
    var l: VCardLexer
    l.open(newStringStream("test"))
    assert l.bufferIs("test")
    assert not l.isFull
    assert l.readExpected("test")

  # "refills buffer when emptied":
  block:
    var l: VCardLexer
    # A three-byte buffer holds two bytes: one slot always stays empty.
    l.open(newStringStream("test"), 3)
    assert l.bufferIs("te")
    assert l.isFull
    assert l.read == 't'
    assert l.read == 'e'
    assert l.read == 's'
    assert l.bufferIs("st")
    assert l.read == 't'

  # "isFull correctness":
  block:
    # The lexer is constructed directly (no input stream); the reads below
    # never reach the end of the buffered data, so no refill is attempted.
    var l = VCardLexer(
      pos: 0,
      bookmark: @[],
      buffer: "0123456789",
      bufStart: 0,
      bufEnd: 9)

    # s                 e
    # 0 1 2 3 4 5 6 7 8 9
    assert l.isFull

    # s p               e
    # 0 1 2 3 4 5 6 7 8 9
    discard l.read
    assert not l.isFull

    #     e s
    # 0 1 2 3 4 5 6 7 8 9
    l.bufStart = 3
    l.pos = 3
    l.bufEnd = 2
    assert l.isFull

    #     e s p
    # 0 1 2 3 4 5 6 7 8 9
    discard l.read
    assert l.pos == 4
    assert not l.isFull

    #                 e s
    # 0 1 2 3 4 5 6 7 8 9
    l.bufStart = 9
    l.pos = 9
    l.bufEnd = 8
    assert l.isFull

    # p               e s
    # 0 1 2 3 4 5 6 7 8 9
    discard l.read
    assert l.pos == 0
    assert not l.isFull

  # "handles wrapped lines":
  block:
    var l: VCardLexer
    # The CR LF SPACE sequence is a fold and must not show up in the output;
    # the plain CR LF that follows must.
    l.open(newStringStream("line\r\n wrap\r\nline 2"), 3)
    assert l.readExpected("line wrap\r\nline 2")

  # "fillBuffer correctness":
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString), 5)
    assert l.bufferIs(longTestString[0..<4])
    assert l.isFull
    assert l.bufStart == 0
    assert l.bufEnd == 4
    assert l.pos == 0
    assert l.readExpected("Th")
    assert not l.isFull
    assert not l.atEnd
    assert l.pos == 2
    # Refilling discards the two consumed bytes and wraps the end index.
    l.fillBuffer
    assert l.isFull
    assert l.bufEnd == 1
    assert l.pos == 2
    assert l.bufStart == 2

  # "bookmark preserves the buffer":
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString), 7)
    assert l.buffer.len == 7
    assert l.bufferIs(longTestString[0..<6])
    assert l.isFull
    assert l.bufEnd == 6
    assert l.pos == 0
    assert l.bookmark == @[]
    assert l.readExpected(longTestString[0..<5])
    assert not l.isFull
    assert not l.atEnd
    assert l.pos == 5
    l.setBookmark
    # read enough to require us to refill the buffer.
    assert l.bookmark == @[5]
    assert l.readExpected(longTestString[5..<10])
    assert l.pos == 3
    assert newStartIdx(l) == 5
    assert l.buffer.len == 7
    l.returnToBookmark
    assert l.bookmark == @[]
    assert l.pos == 5

  # "can set and unset multiple bookmarks"
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString))
    assert l.pos == 0
    assert l.bookmark == @[]
    assert l.readExpected("This is my ")
    l.setBookmark
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @[""]
    assert l.readExpected("test string")
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @["test string"]
    assert l.readSinceBookmark == "test string"
    l.setBookmark
    assert l.bookmark == @[11, 22]
    assert l.bookmarkVal == @["test string", ""]
    assert l.readExpected(". There are many")
    assert l.bookmarkVal == @["test string. There are many", ". There are many"]
    assert l.readSinceBookmark == ". There are many"
    assert l.pos == 38
    # Unsetting drops the bookmark but keeps the read position.
    l.unsetBookmark
    assert l.pos == 38
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @["test string. There are many"]
    assert l.readSinceBookmark == "test string. There are many"
    l.unsetBookmark
    assert l.pos == 38
    assert l.bookmark == @[]
    assert l.bookmarkVal == @[]
    assert l.readSinceBookmark == ""

  # "can set and return to multiple bookmarks"
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString))
    assert l.pos == 0
    assert l.bookmark == @[]
    assert l.readExpected("This is my ")
    l.setBookmark
    assert l.readExpected("test string")
    l.setBookmark
    assert l.bookmark == @[11, 22]
    assert l.readExpected(". There are many")
    assert l.bookmarkVal == @["test string. There are many", ". There are many"]
    assert l.pos == 38
    # Returning rewinds the position and trims the outer bookmark's value.
    l.returnToBookmark
    assert l.pos == 22
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @["test string"]
    assert l.readSinceBookmark == "test string"
    l.returnToBookmark
    assert l.pos == 11
    assert l.bookmark == @[]
    assert l.bookmarkVal == @[]

  # "readRune":
  block:
    var l: VCardLexer
    l.open(newStringStream("TEST"))
    assert l.bufferIs("TEST")
    # Peeking must not consume the rune.
    assert l.peekRune == Rune('T')
    assert l.readRune == Rune('T')
    assert l.readRune == Rune('E')
    assert l.readRune == Rune('S')
    assert l.readRune == Rune('T')
when isMainModule:
  # Run the private self-tests when this module is the program entry point.
  runVcardLexerPrivateTests()