The lexer now tracks the data that has been read since the start of the current line. While this may have use in parsers, the immediate use is by the common error reporting procedure. The `common#error` procedure already reports the column and line number where an error occurs. The `common#expect` function is broadly used by parsers and generates the majority of parser errors. It now uses the lexer's record of the current line to format its error message with a direct pointer to the location of the unmet expectation.
424 lines
12 KiB
Nim
424 lines
12 KiB
Nim
import std/[streams, unicode]
|
|
|
|
type
  VCardLexer* = object of RootObj
    ## Lexer state for reading bytes from an input `Stream` through an
    ## internal buffer, with support for bookmarks (rewind points) and
    ## tracking of the current line.
    input: Stream
      ## stream the lexer reads its data from
    buffer*: string
      ## buffer of bytes read
    bufStart: int
      ## starting boundary for the buffer
    bufEnd: int
      ## ending boundary for the buffer
    pos*: int
      ## current read position
    bookmark*: seq[int]
      ## bookmarks to support rewind functionality
    bookmarkVal*: seq[string]
      ## value read since each bookmark was set
    lineNumber*: int
      ## how many newlines have we seen so far
    lineStart: int
      ## buffer index for the start of the current line
    lineVal*: string
      ## value read since the start of the current line
|
|
|
|
proc skipUtf8Bom(vcl: var VCardLexer) =
  ## Advance the read position by three bytes when the first three bytes of
  ## the buffer are the UTF-8 byte order mark (EF BB BF).
  ##
  ## A buffer shorter than three bytes cannot contain the mark, so it is left
  ## untouched instead of being indexed out of bounds.
  const bom = "\xEF\xBB\xBF"
  if vcl.buffer.len >= bom.len and
     vcl.buffer[0] == bom[0] and
     vcl.buffer[1] == bom[1] and
     vcl.buffer[2] == bom[2]:
    inc(vcl.pos, bom.len)
|
|
|
|
template wrappedIdx(idx: untyped): int =
  ## Reduce `idx` modulo the length of the buffer of the `vcl` that is in
  ## scope where this template is used.
  (idx) mod vcl.buffer.len
|
|
|
|
proc newStartIdx(vcl: VCardLexer): int =
  ## Buffer index of the first bookmark when any bookmark is set, otherwise
  ## the current read position.
  result = vcl.pos
  if vcl.bookmark.len > 0:
    result = vcl.bookmark[0]
|
|
|
|
func isFull(vcl: VCardLexer): bool {.inline.} =
  ## True when the slot following `bufEnd` (wrapping around the buffer) is
  ## the index reported by `newStartIdx`.
  let slotAfterEnd = wrappedIdx(vcl.bufEnd + 1)
  result = slotAfterEnd == vcl.newStartIdx
|
|
|
|
func atEnd(vcl: VCardLexer): bool {.inline.} =
  ## True when the read position has reached the end boundary of the buffer.
  result = vcl.bufEnd == vcl.pos
|
|
|
|
proc doubleBuffer(vcl: var VCardLexer) =
  ## Replace the buffer with one twice as long, copying the bytes from
  ## `bufStart` up to (not including) `bufEnd` to the front of the new
  ## buffer, and relocate the indices that point into it.
  ##
  ## The read position and *every* bookmark are relocated with wrap-around
  ## arithmetic, so an index that had wrapped below `bufStart` lands at its
  ## correct offset in the new buffer instead of becoming negative.
  let oldBuf = vcl.buffer
  let oldLen = oldBuf.len
  let oldStart = vcl.bufStart
  vcl.buffer = newString(oldLen * 2)

  var newIdx = 0
  var oldIdx = oldStart

  # `newIdx == 0` forces the first byte to be copied even when `bufStart`
  # and `bufEnd` are equal on entry.
  while oldIdx != vcl.bufEnd or newIdx == 0:
    vcl.buffer[newIdx] = oldBuf[oldIdx]
    inc(newIdx)
    oldIdx = (newIdx + oldStart) mod oldLen

  # Offset of an old buffer index measured from the old `bufStart`, which is
  # its index in the new buffer.
  template relocated(idx: int): int =
    (idx - oldStart + oldLen) mod oldLen

  vcl.pos = relocated(vcl.pos)
  for mark in vcl.bookmark.mitems:
    mark = relocated(mark)

  # NOTE(review): `lineStart` keeps its original plain subtraction, so it can
  # go negative when it lies before `bufStart`. It is not clear from this
  # file whether it may refer to data that has already been discarded, so it
  # is deliberately not relocated modulo the old length here.
  vcl.lineStart -= oldStart

  vcl.bufStart = 0
  vcl.bufEnd = newIdx
|
|
|
|
proc fillBuffer(vcl: var VCardLexer) =
  ## Read more data from the input stream into the free part of the buffer.
  ##
  ## The buffer is doubled first when it is full. Everything before the index
  ## reported by `newStartIdx` is then treated as free space, and `bufEnd` is
  ## advanced by the number of bytes actually read.

  var charsRead: int

  # check to see if we have a full buffer
  if vcl.isFull: vcl.doubleBuffer()

  # discard used portions of the buffer
  vcl.bufStart = vcl.newStartIdx

  if vcl.bufEnd < vcl.bufStart:
    # The end boundary has wrapped around and sits before the start: fill the
    # gap between them.
    charsRead = vcl.input.readDataStr(vcl.buffer,
      vcl.bufEnd ..< (vcl.bufStart - 1))
    vcl.bufEnd += charsRead

  elif vcl.bufStart == 0:
    # The start is at index 0: fill from the end boundary towards the
    # physical end of the buffer, leaving the last slot unused.
    charsRead = vcl.input.readDataStr(vcl.buffer,
      vcl.bufEnd ..< (vcl.buffer.len - 1))
    vcl.bufEnd = wrappedIdx(vcl.bufEnd + charsRead)

  else:
    # The start is past index 0 and the end boundary is at or after it: fill
    # to the physical end of the buffer, then continue from index 0.
    charsRead = vcl.input.readDataStr(vcl.buffer, vcl.bufEnd..<vcl.buffer.len)
    if charsRead == vcl.buffer.len - vcl.bufEnd:
      vcl.bufEnd = vcl.input.readDataStr(vcl.buffer, 0 ..< (vcl.bufStart - 1))
    else:
      # Short read: the bytes just stored must still be accounted for,
      # otherwise the next refill would write over them.
      vcl.bufEnd += charsRead
|
|
|
|
|
|
proc close*(vcl: var VCardLexer) =
  ## Close the lexer's input stream.
  close(vcl.input)
|
|
|
|
proc open*(vcl: var VCardLexer, input: Stream, bufLen = 16384) =
  ## Attach the lexer to `input`, allocate a buffer of `bufLen` bytes, reset
  ## all read state, read the first chunk of data and skip a leading UTF-8
  ## byte order mark.
  ##
  ## All per-stream state is reset, including the text captured for bookmarks
  ## and for the current line, so a lexer that was used before does not carry
  ## stale values over to the new input.
  assert(bufLen > 0)
  assert(input != nil)
  vcl.input = input
  vcl.pos = 0
  vcl.bookmark = @[]
  vcl.bookmarkVal = @[]
  vcl.buffer = newString(bufLen)
  vcl.bufStart = 0
  vcl.bufEnd = 0
  vcl.lineNumber = 0
  vcl.lineStart = 0
  vcl.lineVal = newStringOfCap(84)
  vcl.fillBuffer
  vcl.skipUtf8Bom
|
|
|
|
proc setBookmark*(vcl: var VCardLexer) =
  ## Push a bookmark for the current read position, together with an empty
  ## string that will collect what is read from here on.
  vcl.bookmarkVal.add(newStringOfCap(32))
  vcl.bookmark.add(vcl.pos)
|
|
|
|
proc returnToBookmark*(vcl: var VCardLexer) =
  ## Rewind the read position to the most recently set bookmark and remove
  ## that bookmark. Does nothing when no bookmark is set.
  ##
  ## The text read since the removed bookmark is also cut from the end of the
  ## value of every remaining bookmark. This includes a remaining bookmark
  ## whose value is exactly as long as the rewound text (one set at the same
  ## position), which ends up empty.
  if vcl.bookmark.len == 0: return
  vcl.pos = vcl.bookmark.pop()
  let rewoundLen = vcl.bookmarkVal.pop().len
  for val in vcl.bookmarkVal.mitems:
    if val.len >= rewoundLen:
      val.setLen(val.len - rewoundLen)
|
|
|
|
proc unsetBookmark*(vcl: var VCardLexer) =
  ## Drop the most recently set bookmark and its captured value without
  ## moving the read position. Does nothing when no bookmark is set.
  if vcl.bookmark.len > 0:
    discard vcl.bookmark.pop()
    discard vcl.bookmarkVal.pop()
|
|
|
|
proc readSinceBookmark*(vcl: var VCardLexer): string =
  ## The value captured for the most recently set bookmark, or the empty
  ## string when no bookmark value exists.
  result =
    if vcl.bookmarkVal.len == 0: ""
    else: vcl.bookmarkVal[^1]
|
|
|
|
proc isLineWrap(vcl: var VCardLexer, allowRefill = true): bool =
  ## True when the bytes at the read position are a folded line break: a
  ## carriage return, a newline, and one space or horizontal tab.
  ##
  ## When fewer than three bytes are buffered at the read position, the
  ## buffer is refilled once (unless `allowRefill` is false) and the check is
  ## repeated. If three bytes are still not available the result is false.
  if vcl.buffer[vcl.pos] != '\r': return false

  # Number of buffered bytes from the read position up to the end boundary,
  # taking into account that either index may have wrapped around.
  let buffered = (vcl.bufEnd - vcl.pos + vcl.buffer.len) mod vcl.buffer.len

  # less than three characters in the buffer
  if buffered < 3:
    if allowRefill:
      vcl.fillBuffer()
      return vcl.isLineWrap(false)
    else: return false

  # at least three characters in the buffer
  else:
    return vcl.buffer[wrappedIdx(vcl.pos + 1)] == '\n' and
      vcl.buffer[wrappedIdx(vcl.pos + 2)] in {' ', '\t'}
|
|
|
|
proc read*(vcl: var VCardLexer, peek = false): char =
  ## Return the byte at the read position, refilling the buffer first when
  ## the read position has reached its end.
  ##
  ## A folded line break at the read position (see `isLineWrap`) is skipped
  ## before the byte is taken; this happens for peeks as well, so the buffer
  ## position may move even when `peek` is true.
  ##
  ## When `peek` is false the byte is appended to the value of every bookmark
  ## and to `lineVal`, and the read position advances by one (wrapping around
  ## the buffer). When `peek` is true none of that happens, and a newline is
  ## not counted, so peeking at a newline and then reading it counts the line
  ## once.
  if vcl.atEnd: vcl.fillBuffer()

  if vcl.isLineWrap:
    # keep the position inside the buffer when the fold spans its physical end
    vcl.pos = wrappedIdx(vcl.pos + 3)
    vcl.lineNumber += 1
    vcl.lineStart = vcl.pos
    vcl.lineVal = newStringOfCap(84)
    if vcl.atEnd: vcl.fillBuffer()

  elif not peek and vcl.buffer[vcl.pos] == '\n':
    vcl.lineNumber += 1
    vcl.lineStart = wrappedIdx(vcl.pos + 1)
    vcl.lineVal = newStringOfCap(84)

  result = vcl.buffer[vcl.pos]
  if not peek:
    for idx in 0..<vcl.bookmarkVal.len: vcl.bookmarkVal[idx].add(result)
    vcl.lineVal.add(result)
    vcl.pos = wrappedIdx(vcl.pos + 1)
|
|
|
|
proc readLen*(vcl: var VCardLexer, bytesToRead: int, peek = false): string =
  ## Read `bytesToRead` bytes with `read` and return them as a string.
  ##
  ## When `peek` is true the bytes are returned without being consumed: the
  ## read position, the bookmark values, `lineVal`, `lineNumber` and
  ## `lineStart` are put back to what they were before the call.
  result = newStringOfCap(bytesToRead)

  if not peek:
    for i in 0..<bytesToRead: result.add(vcl.read)
    return

  let savedBookmarkVal = vcl.bookmarkVal
  let savedLineVal = vcl.lineVal
  let savedLineNumber = vcl.lineNumber
  # distance from the line start to the read position; used to restore
  # `lineStart` relative to the restored read position
  let lineOffset = vcl.pos - vcl.lineStart

  # A temporary bookmark makes refills keep the bytes being peeked at.
  vcl.setBookmark
  for i in 0..<bytesToRead: result.add(vcl.read)

  vcl.pos = vcl.bookmark.pop()
  vcl.bookmarkVal = savedBookmarkVal
  vcl.lineVal = savedLineVal
  vcl.lineNumber = savedLineNumber
  vcl.lineStart = vcl.pos - lineOffset
|
|
|
|
proc readRune*(vcl: var VCardLexer, peek = false): Rune =
  ## Return the UTF-8 encoded rune that starts at the read position,
  ## refilling the buffer first when the read position has reached its end.
  ##
  ## A folded line break at the read position (see `isLineWrap`) is skipped
  ## before the rune is taken; this happens for peeks as well, so the buffer
  ## position may move even when `peek` is true.
  ##
  ## When `peek` is false the rune is appended to the value of every bookmark
  ## and to `lineVal`, and the read position advances by the rune's byte
  ## length (wrapping around the buffer). When `peek` is true none of that
  ## happens, and a newline is not counted.
  if vcl.atEnd: vcl.fillBuffer()

  if vcl.isLineWrap:
    # keep the position inside the buffer when the fold spans its physical end
    vcl.pos = wrappedIdx(vcl.pos + 3)
    vcl.lineNumber += 1
    vcl.lineStart = vcl.pos
    vcl.lineVal = newStringOfCap(84)
    if vcl.atEnd: vcl.fillBuffer()

  elif not peek and vcl.buffer[vcl.pos] == '\n':
    vcl.lineNumber += 1
    vcl.lineStart = wrappedIdx(vcl.pos + 1)
    vcl.lineVal = newStringOfCap(84)

  # The byte length depends only on the lead byte, so it stays valid if the
  # refill below relocates the read position.
  let runeLen = vcl.buffer.runeLenAt(vcl.pos)

  # Make sure every byte of the rune has been read from the input.
  let buffered = (vcl.bufEnd - vcl.pos + vcl.buffer.len) mod vcl.buffer.len
  if buffered < runeLen: vcl.fillBuffer()

  if vcl.pos + runeLen <= vcl.buffer.len:
    result = vcl.buffer.runeAt(vcl.pos)
  else:
    # The rune spans the physical end of the buffer: gather its bytes into a
    # contiguous string before decoding.
    var bytes = newString(runeLen)
    for i in 0..<runeLen:
      bytes[i] = vcl.buffer[wrappedIdx(vcl.pos + i)]
    result = bytes.runeAt(0)

  if not peek:
    for idx in 0..<vcl.bookmarkVal.len: vcl.bookmarkVal[idx].add(result)
    vcl.lineVal.add(result)
    vcl.pos = wrappedIdx(vcl.pos + runeLen)
|
|
|
|
proc readRunesLen*(vcl: var VCardLexer, runesToRead: int, peek = false): string =
  ## Read `runesToRead` runes with `readRune` and return them as a UTF-8
  ## encoded string.
  ##
  ## When `peek` is true the runes are returned without being consumed: the
  ## read position, the bookmark values, `lineVal`, `lineNumber` and
  ## `lineStart` are put back to what they were before the call.
  result = newStringOfCap(runesToRead * 4)

  if not peek:
    for i in 0..<runesToRead: result.add(vcl.readRune)
    return

  let savedBookmarkVal = vcl.bookmarkVal
  let savedLineVal = vcl.lineVal
  let savedLineNumber = vcl.lineNumber
  # distance from the line start to the read position; used to restore
  # `lineStart` relative to the restored read position
  let lineOffset = vcl.pos - vcl.lineStart

  # A temporary bookmark makes refills keep the bytes being peeked at.
  vcl.setBookmark
  for i in 0..<runesToRead: result.add(vcl.readRune)

  vcl.pos = vcl.bookmark.pop()
  vcl.bookmarkVal = savedBookmarkVal
  vcl.lineVal = savedLineVal
  vcl.lineNumber = savedLineNumber
  vcl.lineStart = vcl.pos - lineOffset
|
|
|
|
proc peek*(vcl: var VCardLexer): char =
  ## The result of `read` called with `peek = true`.
  result = read(vcl, peek = true)
|
|
|
|
proc peekRune*(vcl: var VCardLexer): Rune =
  ## The result of `readRune` called with `peek = true`.
  result = readRune(vcl, peek = true)
|
|
|
|
proc getColNumber*(vcl: VCardLexer, pos: int): int =
  ## Distance from the start of the current line to buffer index `pos`.
  ##
  ## When `pos` is before `lineStart` the line is taken to have wrapped
  ## around the end of the buffer. A `pos` equal to `lineStart` is the first
  ## position of the line and yields 0.
  if vcl.lineStart <= pos: pos - vcl.lineStart
  else: (vcl.buffer.len - vcl.lineStart) + pos
|
|
|
|
proc dumpLexerState*(l: VCardLexer): string =
  ## Render the lexer's position, bookmarks, line tracking fields, buffer
  ## boundaries and buffer contents, one `name = value` entry per line.
  result = ""
  result.add("pos = " & $l.pos & "\p")
  result.add("bookmark = " & $l.bookmark & "\p")
  result.add("lineNumber = " & $l.lineNumber & "\p")
  result.add("lineStart = " & $l.lineStart & "\p")
  result.add("bufStart = " & $l.bufStart & "\p")
  result.add("bufEnd = " & $l.bufEnd & "\p")
  result.add("buffer = " & l.buffer & "\p")
|
|
|
|
## Unit Tests
|
|
## ============================================================================
|
|
proc runVcardLexerPrivateTests*() =
  ## Assertion-based tests for the lexer. They live in this module because
  ## they read and set fields and call procs that are not exported
  ## (`bufStart`, `bufEnd`, `isFull`, `atEnd`, `fillBuffer`, `newStartIdx`).

  const longTestString =
    "This is my test string. There are many like it but this one is mine."

  proc bufferIs(vcl: VCardLexer, s: string): bool =
    ## True when the buffer contents, starting at `bufStart` and wrapping
    ## around the end of the buffer, begin with `s`.
    #debugEcho vcl.buffer & " : " & $vcl.bufStart & "-" & $vcl.bufEnd
    # for i in vcl.bufStart..<vcl.bufEnd:
    #   debugEcho $i & ": " & vcl.buffer[i]

    for i in 0..<s.len:
      # debugEcho "i:" & $i & "\tl.bufStart:" & $(vcl.bufStart + i)
      # debugEcho s[i] & " == " & vcl.buffer[vcl.bufStart + i]
      if s[i] != vcl.buffer[wrappedIdx(vcl.bufStart + i)]:
        return false
    return true

  proc readExpected(vcl: var VCardLexer, s: string): bool =
    ## Read `s.len` bytes from the lexer; true when each one matches the
    ## corresponding byte of `s`. Stops at the first mismatch.
    for i in 0..<s.len:
      if vcl.read != s[i]:
        return false
    return true

  # "can open and fill buffer":
  block:
    var l: VCardLexer
    l.open(newStringStream("test"))
    assert l.bufferIs("test")
    assert not l.isFull
    assert l.readExpected("test")

  # "refills buffer when emptied":
  block:
    var l: VCardLexer
    l.open(newStringStream("test"), 3)
    assert l.bufferIs("te")
    assert l.isFull
    assert l.read == 't'
    assert l.read == 'e'
    assert l.read == 's'
    assert l.bufferIs("st")
    assert l.read == 't'

  # "isFull correctness":
  # In the diagrams below s = bufStart, p = pos and e = bufEnd.
  block:
    var l = VCardLexer(
      pos: 0,
      bookmark: @[],
      buffer: "0123456789",
      bufStart: 0,
      bufEnd: 9)

    # s                 e
    # 0 1 2 3 4 5 6 7 8 9
    assert l.isFull

    # s p               e
    # 0 1 2 3 4 5 6 7 8 9
    discard l.read
    assert not l.isFull

    #     e s
    # 0 1 2 3 4 5 6 7 8 9
    l.bufStart = 3
    l.pos = 3
    l.bufEnd = 2
    assert l.isFull

    #     e s p
    # 0 1 2 3 4 5 6 7 8 9
    discard l.read
    assert l.pos == 4
    assert not l.isFull

    #                 e s
    # 0 1 2 3 4 5 6 7 8 9
    l.bufStart = 9
    l.pos = 9
    l.bufEnd = 8
    assert l.isFull

    # p               e s
    # 0 1 2 3 4 5 6 7 8 9
    discard l.read
    assert l.pos == 0
    assert not l.isFull

  # "handles wrapped lines":
  block:
    var l: VCardLexer
    l.open(newStringStream("line\r\n wrap\r\nline 2"), 3)

    assert l.readExpected("line wrap\r\nline 2")

  # "fillBuffer correctness":
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString), 5)
    assert l.bufferIs(longTestString[0..<4])
    assert l.isFull
    assert l.bufStart == 0
    assert l.bufEnd == 4
    assert l.pos == 0
    assert l.readExpected("Th")
    assert not l.isFull
    assert not l.atEnd
    assert l.pos == 2

    # refill after two bytes were consumed: the end boundary wraps around
    l.fillBuffer
    assert l.isFull
    assert l.bufEnd == 1
    assert l.pos == 2
    assert l.bufStart == 2

  # "bookmark preserves the buffer":
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString), 7)
    assert l.buffer.len == 7
    assert l.bufferIs(longTestString[0..<6])
    assert l.isFull
    assert l.bufEnd == 6
    assert l.pos == 0
    assert l.bookmark == @[]
    assert l.readExpected(longTestString[0..<5])
    assert not l.isFull
    assert not l.atEnd
    assert l.pos == 5

    l.setBookmark
    # read enough to require us to refill the buffer.
    assert l.bookmark == @[5]
    assert l.readExpected(longTestString[5..<10])
    assert l.pos == 3
    assert newStartIdx(l) == 5
    assert l.buffer.len == 7

    l.returnToBookmark
    assert l.bookmark == @[]
    assert l.pos == 5

  # "can set and unset multiple bookmarks"
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString))
    assert l.pos == 0
    assert l.bookmark == @[]
    assert l.readExpected("This is my ")

    l.setBookmark
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @[""]

    assert l.readExpected("test string")
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @["test string"]
    assert l.readSinceBookmark == "test string"

    l.setBookmark
    assert l.bookmark == @[11, 22]
    assert l.bookmarkVal == @["test string", ""]

    assert l.readExpected(". There are many")
    assert l.bookmarkVal == @["test string. There are many", ". There are many"]
    assert l.readSinceBookmark == ". There are many"
    assert l.pos == 38

    # unsetting keeps the read position and the outer bookmark's value
    l.unsetBookmark
    assert l.pos == 38
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @["test string. There are many"]
    assert l.readSinceBookmark == "test string. There are many"

    l.unsetBookmark
    assert l.pos == 38
    assert l.bookmark == @[]
    assert l.bookmarkVal == @[]
    assert l.readSinceBookmark == ""

  # "can set and return to multiple bookmarks"
  block:
    var l: VCardLexer
    l.open(newStringStream(longTestString))
    assert l.pos == 0
    assert l.bookmark == @[]
    assert l.readExpected("This is my ")

    l.setBookmark
    assert l.readExpected("test string")
    l.setBookmark
    assert l.bookmark == @[11, 22]
    assert l.readExpected(". There are many")
    assert l.bookmarkVal == @["test string. There are many", ". There are many"]
    assert l.pos == 38

    # returning rewinds the position and trims the outer bookmark's value
    l.returnToBookmark
    assert l.pos == 22
    assert l.bookmark == @[11]
    assert l.bookmarkVal == @["test string"]
    assert l.readSinceBookmark == "test string"

    l.returnToBookmark
    assert l.pos == 11
    assert l.bookmark == @[]
    assert l.bookmarkVal == @[]

  # "readRune":
  block:
    var l: VCardLexer
    l.open(newStringStream("TEST"))
    assert l.bufferIs("TEST")
    assert l.peekRune == Rune('T')
    assert l.readRune == Rune('T')
    assert l.readRune == Rune('E')
    assert l.readRune == Rune('S')
    assert l.readRune == Rune('T')
|
# Run the private tests when this file is compiled as the main module.
when isMainModule:
  runVcardLexerPrivateTests()
|