# vCard-specific Lexer
# © 2022-2023 Jonathan Bernard

## This module defines a lexer with functionality useful for parsing vCard
## content. Specifically:
## - it understands the vCard line-folding logic and transparently joins folded
##   lines as it reads input off its input stream.
## - it supports multiple nested bookmarks to make look-ahead decisions more
##   convenient
## - it supports reading from the underlying stream byte-wise or as unicode
##   runes.
##
## This parser uses a ring buffer underneath, only growing the size of its
## buffer when it is completely full.

import std/[streams, unicode]

type VCardLexer* = object of RootObj
  input: Stream
  buffer*: string           ## buffer of bytes read
  bufStart: int             ## starting boundary for the buffer
  bufEnd: int               ## ending boundary for the buffer
  pos*: int                 ## current read position
  bookmark*: seq[int]       ## bookmark to support rewind functionality
  bookmarkVal*: seq[string] ## value read since the bookmark was set
  lineNumber*: int          ## how many newlines have we seen so far
  lineStart: int            ## buffer index for the start of the current line
  lineVal*: string          ## value read since the start of the current line

proc skipUtf8Bom(vcl: var VCardLexer) =
  ## Advance the read position by three bytes when the first three bytes of
  ## the buffer are the UTF-8 byte-order mark (`EF BB BF`).
  ##
  ## A buffer shorter than three bytes cannot hold a BOM, so it is left alone
  ## instead of being indexed out of bounds.
  if vcl.buffer.len < 3: return

  if vcl.buffer[0] == '\xEF' and
     vcl.buffer[1] == '\xBB' and
     vcl.buffer[2] == '\xBF':
    inc(vcl.pos, 3)

template wrappedIdx(idx: untyped): int =
  ## Map an index into the buffer bounds (mod). Expects a lexer named `vcl`
  ## to be in scope at the call site.
  idx mod vcl.buffer.len

func newStartIdx(vcl: VCardLexer): int =
  ## Get the latest safe index to use as a new start index. The implication is
  ## that anything prior to this index has been read and processed and can be
  ## safely overwritten in the buffer.
  ##
  ## This is the outermost bookmark when one is set, otherwise the current
  ## read position.
  if vcl.bookmark.len > 0: vcl.bookmark[0]
  else: vcl.pos

func isFull(vcl: VCardLexer): bool {.inline.} =
  ## True when the slot after `bufEnd` is the new start index, i.e. there is
  ## no room left to read into (one slot is always kept empty).
  wrappedIdx(vcl.bufEnd + 1) == vcl.newStartIdx

func atEnd(vcl: VCardLexer): bool {.inline.} =
  ## True when the read position has caught up with the end of buffered data.
  vcl.pos == vcl.bufEnd

proc doubleBuffer(vcl: var VCardLexer) =
  ## Double the capacity of the buffer, copying the contents of the current
  ## buffer into the beginning of the newly expanded buffer.
  ##
  ## After this call `bufStart` is 0, `bufEnd` is the number of bytes copied,
  ## and `pos` and every bookmark have been translated to the new layout.
  let
    oldBuf = vcl.buffer
    oldLen = oldBuf.len
    oldStart = vcl.bufStart

  vcl.buffer = newString(oldLen * 2)

  # Unroll the ring: walk from the old start to the old end (exclusive),
  # wrapping around the old capacity. The `copied == 0` clause forces at least
  # one pass so that `bufStart == bufEnd` copies the whole ring.
  var copied = 0
  var src = oldStart
  while src != vcl.bufEnd or copied == 0:
    vcl.buffer[copied] = oldBuf[src]
    inc(copied)
    src = (oldStart + copied) mod oldLen

  # An index's new location is its ring distance from the old start. A plain
  # `idx - oldStart` is only right for indices that have not wrapped; an index
  # that wrapped past the end of the old buffer (idx < oldStart) would come
  # out negative, so the distance is taken modulo the old capacity.
  template rebased(idx: int): int = (idx - oldStart + oldLen) mod oldLen

  vcl.pos = rebased(vcl.pos)

  # Every nested bookmark lives in the same buffer and must move with it, not
  # just the outermost one.
  for mark in vcl.bookmark.mitems: mark = rebased(mark)

  # NOTE(review): lineStart keeps the plain subtraction. It may refer to data
  # before bufStart that has already been discarded, in which case a negative
  # value preserves its distance to `pos`; confirm against whatever consumes
  # lineStart before changing this.
  vcl.lineStart -= oldStart

  # Now that we've updated all of the existing indices, we can reset the
  # buffer start and end indices to their new values.
  vcl.bufStart = 0
  vcl.bufEnd = copied

proc fillBuffer(vcl: var VCardLexer) =
  ## Read data into the buffer from the underlying stream until the buffer is
  ## full. If the buffer is already full, double the buffer beforehand.

  # Note that we do not *completely* fill the buffer. We always leave one index
  # of the array empty. This allows us to differentiate between an empty buffer
  # (`bufStart == bufEnd`) and a completely full buffer
  # (`bufStart == wrappedIdx(bufEnd + 1)`).
var charsRead: int # check to see if we have a full buffer if vcl.isFull: vcl.doubleBuffer() # discard used portions of the buffer vcl.bufStart = vcl.newStartIdx # We have three conditions that the ring buffer may be in: if vcl.bufEnd < vcl.bufStart: # The unused portion of the buffer is all in the middle and we can just # read date into the space from bufEnd (e) to bufStart (s). # e s # 0 1 2 3 4 5 6 7 8 9 charsRead = vcl.input.readDataStr(vcl.buffer, vcl.bufEnd ..< (vcl.bufStart - 1)) vcl.bufEnd += charsRead elif vcl.bufStart == 0: # The unused portion is entirely at the end of the buffer. We can read data # from the bufEnd (e) to the end of our buffer capacity. # s e # 0 1 2 3 4 5 6 7 8 9 charsRead = vcl.input.readDataStr(vcl.buffer, vcl.bufEnd ..< (vcl.buffer.len - 1)) vcl.bufEnd = wrappedIdx(vcl.bufEnd + charsRead) else: # The used portion of the buffer is in the middle, and the unused portion # is on either side ot that. We need to read from bufEnd (e) to the end of # our underlying buffer, and then from the start of our underlying buffer # to bufStart (s) # s e # 0 1 2 3 4 5 6 7 8 9 charsRead = vcl.input.readDataStr(vcl.buffer, vcl.bufEnd.. 0) assert(input != nil) vcl.input = input vcl.pos = 0 vcl.bookmark = @[] vcl.buffer = newString(bufLen) vcl.bufStart = 0 vcl.bufEnd = 0 vcl.lineNumber = 0 vcl.lineStart = 0 vcl.fillBuffer vcl.skipUtf8Bom proc setBookmark*(vcl: var VCardLexer) = ## Set a bookmark into the lexer's buffer. This will prevent the lexer from ## discarding data from this bookmark forward when it refills. ## ## The bookmark must be cleared using either `returnToBookmark` or ## `unsetBookmark`, otherwise this lexer will no function as a streaming ## lexer and will end up reading the entire remainder of the input stream ## into memory (if it can). ## ## This function can be called multiple times in order to create nested ## bookmarks. 
For example, we might set a bookmark at the beginning of a line ## to be able to reset if we fail to parse the line, then set a bookmark ## midway through when attempting to parse a parameter value. Care should be ## taken when nesting bookmarks as all bookmarks must be released to avoid ## the behavior described above. vcl.bookmark.add(vcl.pos) vcl.bookmarkVal.add(newStringOfCap(32)) proc returnToBookmark*(vcl: var VCardLexer) = ## Unset the most recent bookmark, resetting the lexer's read position to the ## position of the bookmark. if vcl.bookmark.len == 0: return vcl.pos = vcl.bookmark.pop() let valRead = vcl.bookmarkVal.pop() for idx in 0.. valRead.len: vcl.bookmarkVal[idx] = vcl.bookmarkVal[idx][0 ..< ^valRead.len] proc unsetBookmark*(vcl: var VCardLexer) = ## Discard the most recent bookmark, leaving the lexer's read position at its ## current position.. if vcl.bookmark.len == 0: return discard vcl.bookmark.pop() discard vcl.bookmarkVal.pop() proc readSinceBookmark*(vcl: var VCardLexer): string = ## Get the value read since the last bookmark. if vcl.bookmarkVal.len > 0: return vcl.bookmarkVal[^1] else: return "" proc isLineWrap(vcl: var VCardLexer, allowRefill = true): bool = ## Answers the question "is this a folded line"? Transparently handles the ## case where it needs to refill the buffer to answer this question. if vcl.buffer[vcl.pos] != '\r': return false # less than three characters in the buffer if wrappedIdx(vcl.pos + 3) > vcl.bufEnd: # Only try to refill the buffer once. If we re-enter and still don't have # three characters, we know we were unable to fill the buffer and are # likely at the end. if allowRefill: vcl.fillBuffer() return vcl.isLineWrap(false) else: return false # at least three characters in the buffer else: return vcl.buffer[wrappedIdx(vcl.pos + 1)] == '\n' and vcl.buffer[wrappedIdx(vcl.pos + 2)] == ' ' proc read*(vcl: var VCardLexer, peek = false): char = ## Read one byte off of the input stream. 
By default this will advance the ## lexer read position by one byte. If `peek` is set to `true`, this will ## leave the read position at the same logical position. The underlying ## buffer position may still change if, for example, the next byte is the ## beginning of a folded line wrap. In this case the internal buffer position ## will advance past that line wrap. if vcl.atEnd: vcl.fillBuffer() if vcl.isLineWrap: vcl.pos += 3 vcl.lineNumber += 1 vcl.lineStart = vcl.pos vcl.lineVal = newStringOfCap(84) if vcl.atEnd: vcl.fillBuffer() elif vcl.buffer[vcl.pos] == '\n' and not peek: vcl.lineNumber += 1 vcl.lineStart = wrappedIdx(vcl.pos + 1) vcl.lineVal = newStringOfCap(84) result = vcl.buffer[vcl.pos] if not peek: for idx in 0..