From 2bc5d5c74fc766d0772fd7ef14520a45cbf2aaf9 Mon Sep 17 00:00:00 2001 From: Jonathan Bernard Date: Sat, 1 Apr 2023 16:29:30 -0500 Subject: [PATCH] WIP vcard 3.0 lexer and parser. --- src/vcard/private/lexer.nim | 147 +++++++++++++++++++++ src/vcard3.nim | 251 +++++++++++++++++++++++++++++++++--- 2 files changed, 378 insertions(+), 20 deletions(-) create mode 100644 src/vcard/private/lexer.nim diff --git a/src/vcard/private/lexer.nim b/src/vcard/private/lexer.nim new file mode 100644 index 0000000..f20ca5b --- /dev/null +++ b/src/vcard/private/lexer.nim @@ -0,0 +1,147 @@ +import std/[streams, unicode] + +type VCardLexer* = object of RootObj + input: Stream + + pos*: int # current read position + bookmark*: int # bookmark to support rewind functionality + buffer*: string # buffer of bytes read + lineNumber*: int # how many newlines have we seen so far + lineStart: int # index into the buffer for the start of the current line + + bufStart: int # starting boundary for the buffer + bufEnd: int # ending boundary for the buffer + +proc skipUtf8Bom(vcl: var VCardLexer) = + if (vcl.buffer[0] == '\xEF') and (vcl.buffer[1] == '\xBB') and (vcl.buffer[2] == '\xBF'): + inc(vcl.pos, 3) + +proc newStartIdx(vcl: VCardLexer): int = + if vcl.bookmark > 0: vcl.bookmark else: vcl.pos + +proc doubleBuffer(vcl: var VCardLexer) = + let oldBuf = vcl.buffer + vcl.buffer = newString(oldBuf.len * 2) + + var newIdx = 0 + var oldIdx = vcl.bufStart + + while oldIdx != vcl.bufEnd or newIdx == 0: + vcl.buffer[newIdx] = oldBuf[oldIdx] + inc(newIdx) + oldIdx = (newIdx + vcl.bufStart) mod oldBuf.len + + vcl.pos -= vcl.bufStart + vcl.lineStart -= vcl.bufStart + if vcl.bookmark >= 0: vcl.bookmark -= vcl.bufStart + vcl.bufStart = 0 + vcl.bufEnd = newIdx + +proc fillBuffer(vcl: var VCardLexer) = + + var charsRead: int + + # check to see if we have a full buffer + if (vcl.bufStart == 0 and vcl.bufEnd == vcl.buffer.len) or + vcl.bufEnd == vcl.bufStart - 1: + vcl.doubleBuffer() + + # discard used portions of the buffer + vcl.bufStart = vcl.newStartIdx + + if vcl.bufEnd < vcl.bufStart: + charsRead = vcl.input.readDataStr(vcl.buffer, vcl.bufEnd ..< vcl.bufStart) + vcl.bufEnd += charsRead + else: + charsRead = vcl.input.readDataStr(vcl.buffer, vcl.bufEnd ..< vcl.buffer.len) + vcl.bufEnd += charsRead + if charsRead == vcl.buffer.len - vcl.bufEnd: + vcl.bufEnd = vcl.input.readDataStr(vcl.buffer, 0 ..< vcl.bufStart) + +proc close*(vcl: var VCardLexer) = vcl.input.close + +proc open*(vcl: var VCardLexer, input: Stream, bufLen = 16384) = + assert(bufLen > 0) + assert(input != nil) + vcl.input = input + vcl.pos = 0 + vcl.bookmark = -1 + vcl.buffer = newString(bufLen) + vcl.lineNumber = 0 + vcl.lineStart = 0 + vcl.fillBuffer + vcl.skipUtf8Bom + +proc setBookmark*(vcl: var VCardLexer) = + vcl.bookmark = vcl.pos + +proc returnToBookmark*(vcl: var VCardLexer) = + vcl.pos = vcl.bookmark + vcl.bookmark = -1 + +proc unsetBookmark*(vcl: var VCardLexer) = + vcl.bookmark = -1 + +proc readSinceBookmark*(vcl: var VCardLexer): string = + if vcl.pos < vcl.bookmark: + vcl.buffer[vcl.bookmark ..< vcl.buffer.len] & vcl.buffer[0 ..< vcl.pos] + else: vcl.buffer[vcl.pos ..< vcl.bookmark] + +template wrappedIdx(idx: untyped): int = idx mod vcl.buffer.len + +proc isLineWrap(vcl: var VCardLexer, allowRefill = true): bool = + if vcl.buffer[vcl.pos] != '\r': return false + + # less than three characters in the buffer + if wrappedIdx(vcl.pos + 3) > vcl.bufEnd: + if allowRefill: + vcl.fillBuffer() + return vcl.isLineWrap(false) + else: return false + + # at least three characters in the buffer + else: + return vcl.buffer[wrappedIdx(vcl.pos + 1)] == '\n' and + vcl.buffer[wrappedIdx(vcl.pos + 2)] == ' ' + +proc read*(vcl: var VCardLexer, peek = false): char = + if vcl.pos == vcl.bufEnd: vcl.fillBuffer() + + if vcl.isLineWrap: + vcl.pos += 3 + vcl.lineNumber += 1 + vcl.lineStart = vcl.pos + if vcl.pos == vcl.bufEnd: vcl.fillBuffer() + + elif vcl.buffer[vcl.pos] == '\n': + vcl.lineNumber += 1 + vcl.lineStart = wrappedIdx(vcl.pos + 1) + + result = vcl.buffer[vcl.pos] + if not peek: vcl.pos = wrappedIdx(vcl.pos + 1) + +proc readRune*(vcl: var VCardLexer, peek = false): Rune = + if vcl.pos == vcl.bufEnd: vcl.fillBuffer() + + if vcl.isLineWrap: + vcl.pos += 3 + vcl.lineNumber += 1 + vcl.lineStart = vcl.pos + if vcl.pos == vcl.bufEnd: vcl.fillBuffer() + + elif vcl.buffer[vcl.pos] == '\n': + vcl.lineNumber += 1 + vcl.lineStart = wrappedIdx(vcl.pos + 1) + + result = vcl.buffer.runeAt(vcl.pos) + if not peek: vcl.pos += vcl.buffer.runeLenAt(vcl.pos) + +proc peek*(vcl: var VCardLexer): char = + return vcl.read(peek = true) + +proc peekRune*(vcl: var VCardLexer): Rune = + return vcl.readRune(peek = true) + +proc getColNumber*(vcl: VCardLexer, pos: int): int = + if vcl.lineStart < pos: return pos - vcl.lineStart + else: return (vcl.buffer.len - vcl.lineStart) + pos diff --git a/src/vcard3.nim b/src/vcard3.nim index c28b918..c33c7d8 100644 --- a/src/vcard3.nim +++ b/src/vcard3.nim @@ -9,10 +9,10 @@ ## [rfc2426]: https://tools.ietf.org/html/rfc2426 ## [rfc6350]: https://tools.ietf.org/html/rfc6350 -import std/base64, std/lexbase, std/macros, std/options, std/sequtils, - std/streams, std/strutils, std/times +import std/[base64, lexbase, macros, options, sequtils, streams, strutils, + times, unicode] -import vcard/private/util +import vcard/private/[util, lexer] type #[ @@ -221,6 +221,7 @@ type nextContentId: int content*: seq[VC3_Content] +const CRLF = "\r\n" const DATE_FMT = "yyyy-MM-dd" const DATETIME_FMT = "yyyy-MM-dd'T'HH:mm:sszz" @@ -252,11 +253,6 @@ macro assignFields(assign: untyped, fields: varargs[untyped]): untyped = # Initializers # ============================================================================= -func clone(vc3: VCard3): VCard3 = - result = VCard3( - nextContentId: vc3.nextContentId, - content: vc3.content) - func newVC3_Name*(value: string, group = none[string]()): VC3_Name = return VC3_Name(name: "NAME", value: value, group: group) @@ -434,21 +430,23 @@ func newVC3_Org*( value: seq[string], isPText = false, language = none[string](), - xParams: seq[VC3_XParam] = @[]): VC3_Org = + xParams: seq[VC3_XParam] = @[], + group = none[string]()): VC3_Org = return assignFields( VC3_Org(name: "ORG"), - value, isPText, language, xParams) + value, isPText, language, xParams, group) func newVC3_Categories*( value: seq[string], isPText = false, language = none[string](), - xParams: seq[VC3_XParam] = @[]): VC3_Categories = + xParams: seq[VC3_XParam] = @[], + group = none[string]()): VC3_Categories = return assignFields( VC3_Categories(name: "CATEGORIES"), - value, isPText, language, xParams) + value, isPText, language, xParams, group) func newVC3_Note*( value: string, @@ -688,6 +686,7 @@ func updateOrAdd*[T](vc3: var VCard3, content: seq[T]): VCard3 = if existingIdx < 0: vc3.content.add(c) else: c.content[existingIdx] = c +#[ func setName*(vc3: var VCard3, name: string, group = none[string]()): void = var name = newVC3_Name(name, group) vc3.setContent(name) @@ -1074,12 +1073,81 @@ func addLogo*( result = vc3 result.addLogo(logo, valueType, binaryType, isInline, group) -func setAgent +func setAgent*( + vc3: var VCard3, + agent: string, + isInline = true, + group = none[string]()): void = + + var c = newVC3_Agent(agent, isInline, group) + vc3.add(c) + +func setAgent*( + vc3: VCard3, + agent: string, + isInline = true, + group = none[string]()): VCard3 = + + result = vc3 + result.setAgent(agent, isInline, group) + +func setOrg*( + vc3: var VCard3, + org: seq[string], + isPText = false, + language = none[string](), + xParams: seq[VC3_XParam] = @[], + group = none[string]()): void = + + var c = newVC3_Org(org, isPText, language, xParams, group) + vc3.setContent(c) + +func setOrg*( + vc3: VCard3, + org: seq[string], + isPText = false, + language = none[string](), + xParams: seq[VC3_XParam] = @[], + group = none[string]()): VCard3 = + + result = vc3 + result.setOrg(org, isPText, language, xParams, group) + +func setCategories*( + vc3: var VCard3, + categories: seq[string], + isPText = false, + language = none[string](), + xParams: seq[VC3_XParam] = @[], + group = none[string]()): void = + + var c = newVC3_Categories(categories, isPText, language, xParams, group) + vc3.setContent(c) + +func setCategories*( + vc3: VCard3, + categories: seq[string], + isPText = false, + language = none[string](), + xParams: seq[VC3_XParam] = @[], + group = none[string]()): VCard3 = + + result = vc3 + result.setCategories(categories, isPText, language, xParams, group) + +func addNote( + vc3: VCard3, + value: string, + language = none[string](), + isPText = false, + xParams: seq[VC3_XParam] = @[], + group = none[string]()): VCard3 = + + var c = newVC3_Note(value, language, isPText, xParams, group) + vc3.add(c) +]# #[ # TODO -agent -org -categories note prodid rev @@ -1241,8 +1309,151 @@ proc serialize(c: VC3_Content): string = return serialize(cast[VC3_BinaryContent](c)) proc `$`*(vc3: VCard3): string = - result = "BEGIN:vCard\r\n" - result &= "VERSION:3.0\r\n" + result = "BEGIN:vCard" & CRLF + result &= "VERSION:3.0" & CRLF for c in vc3.content.filterIt(not (it of VC3_Version)): - result &= foldContentLine(serialize(c)) & "\r\n" - result &= "END:vCard\r\n" + result &= foldContentLine(serialize(c)) & CRLF + result &= "END:vCard" & CRLF + + +# Parsing +# ============================================================================= + +type + VC3_ParseEvent = enum + peStart, + peContentLine, + peName, + peParam, + peParamName, + peParamValue, + prPText, + peQuoted, + peSafeChar, + peQSafeChar, + peValue, + peEnd + + VC3Parser = object of VCardLexer + filename: string + state: seq[VC3_ParseEvent] + + VCard3ParsingError = object of ValueError + +const NON_ASCII = { '\x80'..'\xFF' } +const WSP = {' ', '\t'} +const SAFE_CHARS = WSP + { '\x21', '\x23'..'\x2B', '\x2D'..'\x39', '\x3C'..'\x7E' } + NON_ASCII +const QSAFE_CHARS = WSP + { '\x21', '\x23'..'\x7E' } + NON_ASCII +const VALUE_CHAR = WSP + { '\x21'..'\x7E' } + NON_ASCII +const ALPHA_NUM = { 'a'..'z', 'A'..'Z', '0'..'9' } +const NAME_CHARS = { 'a'..'z', 'A'..'Z', '0'..'9' } + +proc error(p: VC3Parser, msg: string) = + raise newException(VCard3ParsingError, "$1($2, $3) Error: $4] " % + [ p.filename, $p.lineNumber, $p.getColNumber(p.pos), msg ]) + +proc readGroup(p: var VC3Parser): Option[string] = + p.setBookmark + + var ch = p.read + while ALPHA_NUM.contains(ch): ch = p.read + + if (ch == '.'): + p.unsetBookmark + return some(readSinceBookmark(p)[0..^1]) + else: + p.returnToBookmark + return none[string]() + +proc readName(p: var VC3Parser): string = + while ALPHA_ + +proc expect(p: var VC3Parser, expected: string, caseSensitive = false) = + p.setBookmark + + if caseSensitive: + for ch in expected: + if p.read != ch: + p.error("expected '$1' but found '$2'" % + [expected, p.readSinceBookmark]) + + else: + for rune in expected.runes: + if p.readRune.toLower != rune.toLower: + p.error("expected '$1' but found '$2'" % + [ expected, p.readSinceBookmark ]) + + p.unsetBookmark + +proc skip(p: var VC3Parser, expected: string, caseSensitive = false): bool = + p.setBookmark + if caseSensitive: + for ch in expected: + if p.read != ch: + p.returnToBookmark + return false + + else: + for rune in expected.runes: + if p.readRune.toLower != rune.toLower: + p.returnToBookmark + return false + + p.unsetBookmark + return true + +proc parseContentLines(p: var VC3Parser): seq[VC3_Content] = + while true: + let group = p.readGroup + let name = p.readName + if name.toLower == "end": + p.expect(":VCARD\r\n") + break + + +proc parseVCard3*(input: Stream, filename = "input"): seq[VCard3] = + var p: VC3Parser + lexer.open(p, input) + p.state = @[peStart] + + discard p.readGroup + p.expect("begin:vcard") + while (p.skip("\r\n", true)): discard + + + +proc parseVCard3*(content: string, filename = "input"): seq[VCard3] = + parseVCard3(newStringStream(content), filename) + +proc parseVCard3File*(filepath: string): seq[VCard3] = + parseVCard3(newFileStream(filepath, fmRead), filepath) + +#[ +Simplified Parsing Diagram + +```mermaid +stateDiagram-v2 + [*] --> StartVCard + StartVCard --> ContentLine: "BEGIN VCARD" CRLF + ContentLine --> EndVCard: "END VCARD" CRLF + ContentLine --> Name + Name --> Name: 0-9/a-z/-/. + Name --> Param: SEMICOLON + Name --> Value: COLON + Param --> Value: COLON + Value --> ContentLine: CRLF + + state Param { + [*] --> ParamName + ParamName --> ParamName: 0-9/a-z/-/. + ParamName --> ParamValue: "=" + ParamValue --> ParamValue: "," + ParamValue --> PText + ParamValue --> Quoted + PText --> PText: SAFE-CHAR + PText --> [*] + Quoted --> Quoted: QSAFE-CHAR + Quoted --> [*] + } +``` +]#