Skip to content

Commit

Permalink
Unicode escape in string literals
Browse files Browse the repository at this point in the history
  • Loading branch information
GULPF committed Oct 16, 2018
1 parent f3c0703 commit 1532554
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 21 deletions.
82 changes: 64 additions & 18 deletions compiler/lexer.nim
Original file line number Diff line number Diff line change
Expand Up @@ -633,13 +633,52 @@ proc handleHexChar(L: var TLexer, xi: var int) =
of 'A'..'F':
xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10)
inc(L.bufpos)
else: discard
else:
lexMessage(L, errGenerated,
"expected a hex digit, but found: " & L.buf[L.bufpos])

proc handleDecChars(L: var TLexer, xi: var int) =
  ## Reads the run of decimal digits starting at `L.bufpos` and folds each
  ## one into `xi` in base 10, advancing `L.bufpos` past every digit used.
  ##
  ## `xi` is not reset first: the digits are accumulated on top of whatever
  ## value the caller passed in. The first non-digit character ends the
  ## loop and is left unconsumed.
  while true:
    let c = L.buf[L.bufpos]
    if c notin {'0'..'9'}: break
    xi = xi * 10 + (ord(c) - ord('0'))
    inc(L.bufpos)

proc addUnicodeCodePoint(s: var string, i: int) =
  ## Appends code point `i` to `s` as a UTF-8 style multi-byte sequence.
  ##
  ## The sequence length is picked with `<=%` comparisons: 1 byte up to
  ## 0x7F, 2 up to 0x7FF, 3 up to 0xFFFF, 4 up to 0x1FFFFF, 5 up to
  ## 0x3FFFFFF and 6 up to 0x7FFFFFFF. If `i` fits none of these ranges
  ## nothing is appended.
  # inlined toUTF-8 to avoid unicode and strutils dependencies.
  const leadMarker = [0b0000_0000, 0b1100_0000, 0b1110_0000,
                      0b1111_0000, 0b1111_1000, 0b1111_1100]
  let byteCount =
    if i <=% 127: 1
    elif i <=% 0x07FF: 2
    elif i <=% 0xFFFF: 3
    elif i <=% 0x001FFFFF: 4
    elif i <=% 0x03FFFFFF: 5
    elif i <=% 0x7FFFFFFF: 6
    else: 0
  if byteCount == 0:
    return
  let first = s.len
  s.setLen(first + byteCount)
  # Lead byte: the length marker combined with the topmost payload bits.
  let topShift = 6 * (byteCount - 1)
  s[first] = chr((i shr topShift) or leadMarker[byteCount - 1])
  # Continuation bytes: 0b10xxxxxx, six payload bits each, high to low.
  for k in 1 ..< byteCount:
    let shift = 6 * (byteCount - 1 - k)
    s[first + k] = chr(((i shr shift) and ones(6)) or 0b1000_0000)

proc getEscapedChar(L: var TLexer, tok: var TToken) =
inc(L.bufpos) # skip '\'
case L.buf[L.bufpos]
Expand Down Expand Up @@ -686,29 +725,36 @@ proc getEscapedChar(L: var TLexer, tok: var TToken) =
of '\\':
add(tok.literal, '\\')
inc(L.bufpos)
of 'x', 'X', 'u', 'U':
var tp = L.buf[L.bufpos]
of 'x', 'X':
inc(L.bufpos)
var xi = 0
handleHexChar(L, xi)
handleHexChar(L, xi)
if tp in {'u', 'U'}:
add(tok.literal, chr(xi))
of 'u', 'U':
if tok.tokType == tkCharLit:
lexMessage(L, errGenerated, "\\u not allowed in character literal")
inc(L.bufpos)
var xi = 0
if L.buf[L.bufpos] == '{':
inc(L.bufpos)
var start = L.bufpos
while L.buf[L.bufpos] != '}':
handleHexChar(L, xi)
if start == L.bufpos:
lexMessage(L, errGenerated,
"Unicode codepoint cannot be empty")
inc(L.bufpos)
if xi > 0x10FFFF:
let hex = ($L.buf)[start..L.bufpos-2]
lexMessage(L, errGenerated,
"Unicode codepoint must be lower than 0x10FFFF, but was: " & hex)
else:
handleHexChar(L, xi)
handleHexChar(L, xi)
# inlined toUTF-8 to avoid unicode and strutils dependencies.
if xi <=% 127:
add(tok.literal, xi.char )
elif xi <=% 0x07FF:
add(tok.literal, ((xi shr 6) or 0b110_00000).char )
add(tok.literal, ((xi and ones(6)) or 0b10_0000_00).char )
elif xi <=% 0xFFFF:
add(tok.literal, (xi shr 12 or 0b1110_0000).char )
add(tok.literal, (xi shr 6 and ones(6) or 0b10_0000_00).char )
add(tok.literal, (xi and ones(6) or 0b10_0000_00).char )
else: # value is 0xFFFF
add(tok.literal, "\xef\xbf\xbf" )
else:
add(tok.literal, chr(xi))
handleHexChar(L, xi)
handleHexChar(L, xi)
addUnicodeCodePoint(tok.literal, xi)
of '0'..'9':
if matchTwoChars(L, '0', {'0'..'9'}):
lexMessage(L, warnOctalEscape)
Expand Down
5 changes: 5 additions & 0 deletions doc/manual.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,11 @@ contain the following `escape sequences`:idx:\ :
``\e`` `escape`:idx: `[ESC]`:idx:
``\x`` HH `character with hex value HH`:idx:;
exactly two hex digits are allowed
``\u`` HHHH `unicode codepoint with value HHHH`:idx:;
exactly four hex digits are allowed
``\u`` {H+} `unicode codepoint`:idx:;
all hex digits enclosed in ``{}`` are used for
the codepoint
================== ===================================================


Expand Down
6 changes: 3 additions & 3 deletions tests/lexer/tstrlits.nim
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
discard """
file: "tstrlits.nim"
output: "a\"\"long string\"\"\"\"\"abc\"def_'2'●"
output: "a\"\"long string\"\"\"\"\"abc\"def_'2'●𝌆𝌆A"
"""
# Test the new different string literals

Expand All @@ -11,14 +11,14 @@ const

raw = r"abc""def"

escaped = "\x5f'\50'\u25cf"
escaped = "\x5f'\50'\u25cf\u{1D306}\u{1d306}\u{41}"


stdout.write(rawQuote)
stdout.write(tripleEmpty)
stdout.write(raw)
stdout.write(escaped)
#OUT a""long string"""""abc"def
#OUT a""long string"""""abc"def_'2'●𝌆𝌆A



0 comments on commit 1532554

Please sign in to comment.