From 5f5a92379fab59db0e7e3da5a9cec2a11be45ed8 Mon Sep 17 00:00:00 2001 From: Andreas Rumpf Date: Wed, 24 Mar 2021 14:46:19 +0100 Subject: [PATCH] custom integer literals (#17489) * user defined integer literals; refs #17020 * updated renderer.nim * use mlexerutils helper * imported all test cases from https://github.com/nim-lang/Nim/pull/17020 * final grammar updated --- changelog.md | 2 + compiler/docgen.nim | 2 +- compiler/lexer.nim | 348 +++++++++++------------ compiler/parser.nim | 18 +- compiler/renderer.nim | 18 +- compiler/semstmts.nim | 2 +- doc/grammar.txt | 3 +- doc/manual.rst | 54 +++- tests/lexer/mlexerutils.nim | 9 + tests/lexer/tcustom_numeric_literals.nim | 150 ++++++++++ tests/lexer/tstrlits.nim | 19 -- tests/lexer/tunary_minus.nim | 8 +- 12 files changed, 412 insertions(+), 221 deletions(-) create mode 100644 tests/lexer/mlexerutils.nim create mode 100644 tests/lexer/tcustom_numeric_literals.nim delete mode 100644 tests/lexer/tstrlits.nim diff --git a/changelog.md b/changelog.md index c0cd970247f5..922b8470a65f 100644 --- a/changelog.md +++ b/changelog.md @@ -266,6 +266,8 @@ - The unary minus in `-1` is now part of the integer literal, it is now parsed as a single token. This implies that edge cases like `-128'i8` finally work correctly. +- Custom numeric literals are now supported. + ## Compiler changes diff --git a/compiler/docgen.nim b/compiler/docgen.nim index dded231d773f..b24a24a2b1bd 100644 --- a/compiler/docgen.nim +++ b/compiler/docgen.nim @@ -416,7 +416,7 @@ proc nodeToHighlightedHtml(d: PDoc; n: PNode; result: var Rope; renderFlags: TRe of tkOpr: dispA(d.conf, result, "$1", "\\spanOperator{$1}", [escLit]) - of tkStrLit..tkTripleStrLit: + of tkStrLit..tkTripleStrLit, tkCustomLit: dispA(d.conf, result, "$1", "\\spanStringLit{$1}", [escLit]) of tkCharLit: diff --git a/compiler/lexer.nim b/compiler/lexer.nim index bcd3f0076780..b34b010c24ba 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -60,6 +60,7 @@ type tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit", tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit", tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit", + tkCustomLit = "tkCustomLit", tkParLe = "(", tkParRi = ")", tkBracketLe = "[", tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}", @@ -313,8 +314,7 @@ proc getNumber(L: var Lexer, result: var Token) = proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) = # Used to get slightly human friendlier err messages. - const literalishChars = {'A'..'F', 'a'..'f', '0'..'9', 'X', 'x', 'o', 'O', - 'c', 'C', 'b', 'B', '_', '.', '\'', 'd', 'i', 'u'} + const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''} var msgPos = L.bufpos var t: Token t.literal = "" @@ -326,15 +326,14 @@ proc getNumber(L: var Lexer, result: var Token) = t.literal.add(L.buf[L.bufpos]) inc(L.bufpos) matchChars(L, t, literalishChars) - if L.buf[L.bufpos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: - inc(L.bufpos) + if L.buf[L.bufpos] in literalishChars: t.literal.add(L.buf[L.bufpos]) + inc(L.bufpos) matchChars(L, t, {'0'..'9'}) L.bufpos = msgPos lexMessage(L, msgKind, msg % t.literal) var - startpos, endpos: int xi: BiggestInt isBase10 = true numDigits = 0 @@ -346,7 +345,7 @@ proc getNumber(L: var Lexer, result: var Token) = result.tokType = tkIntLit # int literal until we know better result.literal = "" result.base = base10 - startpos = L.bufpos + let startpos = L.bufpos tokenBegin(result, startpos) var isPositive = true @@ -395,201 +394,187 @@ proc getNumber(L: var Lexer, result: var Token) = discard matchUnderscoreChars(L, result, {'0'..'9'}) if L.buf[L.bufpos] in {'e', 'E'}: result.tokType = tkFloatLit - eatChar(L, result, 'e') + eatChar(L, result) if L.buf[L.bufpos] in {'+', '-'}: eatChar(L, result) discard matchUnderscoreChars(L, result, {'0'..'9'}) - endpos = L.bufpos + let endpos = L.bufpos # Second stage, find out if there's a datatype suffix and handle it var postPos = endpos + if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}: + let errPos = postPos + var customLitPossible = false if L.buf[postPos] == '\'': inc(postPos) + customLitPossible = true - case L.buf[postPos] - of 'f', 'F': - inc(postPos) - if (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): - result.tokType = tkFloat32Lit - inc(postPos, 2) - elif (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): - result.tokType = tkFloat64Lit - inc(postPos, 2) - elif (L.buf[postPos] == '1') and - (L.buf[postPos + 1] == '2') and - (L.buf[postPos + 2] == '8'): - result.tokType = tkFloat128Lit - inc(postPos, 3) - else: # "f" alone defaults to float32 - result.tokType = tkFloat32Lit - of 'd', 'D': # ad hoc convenience shortcut for f64 - inc(postPos) - result.tokType = tkFloat64Lit - of 'i', 'I': - inc(postPos) - if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): - result.tokType = tkInt64Lit - inc(postPos, 2) - elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): - result.tokType = tkInt32Lit - inc(postPos, 2) - elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'): - result.tokType = tkInt16Lit - inc(postPos, 2) - elif (L.buf[postPos] == '8'): - result.tokType = tkInt8Lit - inc(postPos) - else: - lexMessageLitNum(L, "invalid number: '$1'", startpos) - of 'u', 'U': - inc(postPos) - if (L.buf[postPos] == '6') and (L.buf[postPos + 1] == '4'): - result.tokType = tkUInt64Lit - inc(postPos, 2) - elif (L.buf[postPos] == '3') and (L.buf[postPos + 1] == '2'): - result.tokType = tkUInt32Lit - inc(postPos, 2) - elif (L.buf[postPos] == '1') and (L.buf[postPos + 1] == '6'): - result.tokType = tkUInt16Lit - inc(postPos, 2) - elif (L.buf[postPos] == '8'): - result.tokType = tkUInt8Lit - inc(postPos) + if L.buf[postPos] in SymChars: + var suffixAsLower = newStringOfCap(10) + var suffix = newStringOfCap(10) + while true: + let c = L.buf[postPos] + suffix.add c + suffixAsLower.add toLowerAscii(c) + inc postPos + if L.buf[postPos] notin SymChars+{'_'}: break + case suffix + of "f", "f32": result.tokType = tkFloat32Lit + of "d", "f64": result.tokType = tkFloat64Lit + of "f128": result.tokType = tkFloat128Lit + of "i8": result.tokType = tkInt8Lit + of "i16": result.tokType = tkInt16Lit + of "i32": result.tokType = tkInt32Lit + of "i64": result.tokType = tkInt64Lit + of "u": result.tokType = tkUIntLit + of "u8": result.tokType = tkUInt8Lit + of "u16": result.tokType = tkUInt16Lit + of "u32": result.tokType = tkUInt32Lit + of "u64": result.tokType = tkUInt64Lit else: - result.tokType = tkUIntLit + if customLitPossible: + # remember the position of the ``'`` so that the parser doesn't + # have to reparse the custom literal: + result.iNumber = len(result.literal) + result.literal.add '\'' + result.literal.add suffix + result.tokType = tkCustomLit + else: + lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) else: - lexMessageLitNum(L, "invalid number: '$1'", startpos) + lexMessageLitNum(L, "invalid number suffix: '$1'", errPos) # Is there still a literalish char awaiting? Then it's an error! if L.buf[postPos] in literalishChars or (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}): lexMessageLitNum(L, "invalid number: '$1'", startpos) - # Third stage, extract actual number - L.bufpos = startpos # restore position - var pos = startpos - try: - if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): - inc(pos, 2) - xi = 0 # it is a base prefix - - case L.buf[pos - 1] - of 'b', 'B': - result.base = base2 - while pos < endpos: - if L.buf[pos] != '_': - xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - # 'c', 'C' is deprecated - of 'o', 'c', 'C': - result.base = base8 - while pos < endpos: - if L.buf[pos] != '_': - xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - of 'x', 'X': - result.base = base16 - while pos < endpos: - case L.buf[pos] - of '_': - inc(pos) - of '0'..'9': - xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) - inc(pos) - of 'a'..'f': - xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + if result.tokType != tkCustomLit: + # Third stage, extract actual number + L.bufpos = startpos # restore position + var pos = startpos + try: + if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars): + inc(pos, 2) + xi = 0 # it is a base prefix + + case L.buf[pos - 1] + of 'b', 'B': + result.base = base2 + while pos < endpos: + if L.buf[pos] != '_': + xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0')) inc(pos) - of 'A'..'F': - xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + # 'c', 'C' is deprecated + of 'o', 'c', 'C': + result.base = base8 + while pos < endpos: + if L.buf[pos] != '_': + xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0')) inc(pos) - else: - break + of 'x', 'X': + result.base = base16 + while pos < endpos: + case L.buf[pos] + of '_': + inc(pos) + of '0'..'9': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0')) + inc(pos) + of 'a'..'f': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10) + inc(pos) + of 'A'..'F': + xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10) + inc(pos) + else: + break + else: + internalError(L.config, getLineInfo(L), "getNumber") + + case result.tokType + of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi + of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56) + of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48) + of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32) + of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi + of tkUInt8Lit: setNumber result.iNumber, xi and 0xff + of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff + of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff + of tkFloat32Lit: + setNumber result.fNumber, (cast[PFloat32](addr(xi)))[] + # note: this code is endian neutral! + # XXX: Test this on big endian machine! + of tkFloat64Lit, tkFloatLit: + setNumber result.fNumber, (cast[PFloat64](addr(xi)))[] + else: internalError(L.config, getLineInfo(L), "getNumber") + + # Bounds checks. Non decimal literals are allowed to overflow the range of + # the datatype as long as their pattern don't overflow _bitwise_, hence + # below checks of signed sizes against uint*.high is deliberate: + # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) + if result.tokType notin floatTypes: + let outOfRange = + case result.tokType + of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi + of tkInt8Lit: (xi > BiggestInt(uint8.high)) + of tkInt16Lit: (xi > BiggestInt(uint16.high)) + of tkInt32Lit: (xi > BiggestInt(uint32.high)) + else: false + + if outOfRange: + #echo "out of range num: ", result.iNumber, " vs ", xi + lexMessageLitNum(L, "number out of range: '$1'", startpos) + else: - internalError(L.config, getLineInfo(L), "getNumber") - - case result.tokType - of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi - of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56) - of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48) - of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32) - of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi - of tkUInt8Lit: setNumber result.iNumber, xi and 0xff - of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff - of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff - of tkFloat32Lit: - setNumber result.fNumber, (cast[PFloat32](addr(xi)))[] - # note: this code is endian neutral! - # XXX: Test this on big endian machine! - of tkFloat64Lit, tkFloatLit: - setNumber result.fNumber, (cast[PFloat64](addr(xi)))[] - else: internalError(L.config, getLineInfo(L), "getNumber") - - # Bounds checks. Non decimal literals are allowed to overflow the range of - # the datatype as long as their pattern don't overflow _bitwise_, hence - # below checks of signed sizes against uint*.high is deliberate: - # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK) - if result.tokType notin floatTypes: + case result.tokType + of floatTypes: + result.fNumber = parseFloat(result.literal) + of tkUInt64Lit, tkUIntLit: + var iNumber: uint64 + var len: int + try: + len = parseBiggestUInt(result.literal, iNumber) + except ValueError: + raise newException(OverflowDefect, "number out of range: " & $result.literal) + if len != result.literal.len: + raise newException(ValueError, "invalid integer: " & $result.literal) + result.iNumber = cast[int64](iNumber) + else: + var iNumber: int64 + var len: int + try: + len = parseBiggestInt(result.literal, iNumber) + except ValueError: + raise newException(OverflowDefect, "number out of range: " & $result.literal) + if len != result.literal.len: + raise newException(ValueError, "invalid integer: " & $result.literal) + result.iNumber = iNumber + + # Explicit bounds checks. let outOfRange = case result.tokType - of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi - of tkInt8Lit: (xi > BiggestInt(uint8.high)) - of tkInt16Lit: (xi > BiggestInt(uint16.high)) - of tkInt32Lit: (xi > BiggestInt(uint32.high)) + of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low + of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0 + of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low + of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0 + of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low + of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0 else: false if outOfRange: - #echo "out of range num: ", result.iNumber, " vs ", xi lexMessageLitNum(L, "number out of range: '$1'", startpos) - else: - case result.tokType - of floatTypes: - result.fNumber = parseFloat(result.literal) - of tkUInt64Lit, tkUIntLit: - var iNumber: uint64 - var len: int - try: - len = parseBiggestUInt(result.literal, iNumber) - except ValueError: - raise newException(OverflowDefect, "number out of range: " & $result.literal) - if len != result.literal.len: - raise newException(ValueError, "invalid integer: " & $result.literal) - result.iNumber = cast[int64](iNumber) - else: - var iNumber: int64 - var len: int - try: - len = parseBiggestInt(result.literal, iNumber) - except ValueError: - raise newException(OverflowDefect, "number out of range: " & $result.literal) - if len != result.literal.len: - raise newException(ValueError, "invalid integer: " & $result.literal) - result.iNumber = iNumber - - # Explicit bounds checks. - let outOfRange = - case result.tokType - of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low - of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0 - of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low - of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0 - of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low - of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0 - else: false - - if outOfRange: - lexMessageLitNum(L, "number out of range: '$1'", startpos) - - # Promote int literal to int64? Not always necessary, but more consistent - if result.tokType == tkIntLit: - if result.iNumber > high(int32) or result.iNumber < low(int32): - result.tokType = tkInt64Lit - - except ValueError: - lexMessageLitNum(L, "invalid number: '$1'", startpos) - except OverflowDefect, RangeDefect: - lexMessageLitNum(L, "number out of range: '$1'", startpos) + # Promote int literal to int64? Not always necessary, but more consistent + if result.tokType == tkIntLit: + if result.iNumber > high(int32) or result.iNumber < low(int32): + result.tokType = tkInt64Lit + + except ValueError: + lexMessageLitNum(L, "invalid number: '$1'", startpos) + except OverflowDefect, RangeDefect: + lexMessageLitNum(L, "number out of range: '$1'", startpos) tokenEnd(result, postPos-1) L.bufpos = postPos @@ -830,8 +815,9 @@ proc getString(L: var Lexer, tok: var Token, mode: StringMode) = inc(pos) L.bufpos = pos -proc getCharacter(L: var Lexer, tok: var Token) = +proc getCharacter(L: var Lexer; tok: var Token) = tokenBegin(tok, L.bufpos) + let startPos = L.bufpos inc(L.bufpos) # skip ' var c = L.buf[L.bufpos] case c @@ -842,10 +828,16 @@ proc getCharacter(L: var Lexer, tok: var Token) = else: tok.literal = $c inc(L.bufpos) - if L.buf[L.bufpos] != '\'': - lexMessage(L, errGenerated, "missing closing ' for character literal") - tokenEndIgnore(tok, L.bufpos) - inc(L.bufpos) # skip ' + if L.buf[L.bufpos] == '\'': + tokenEndIgnore(tok, L.bufpos) + inc(L.bufpos) # skip ' + else: + if startPos > 0 and L.buf[startPos-1] == '`': + tok.literal = "'" + L.bufpos = startPos+1 + else: + lexMessage(L, errGenerated, "missing closing ' for character literal") + tokenEndIgnore(tok, L.bufpos) proc getSymbol(L: var Lexer, tok: var Token) = var h: Hash = 0 diff --git a/compiler/parser.nim b/compiler/parser.nim index fe857c81bb47..b9a6ffb8ccdb 100644 --- a/compiler/parser.nim +++ b/compiler/parser.nim @@ -355,7 +355,7 @@ proc parseSymbol(p: var Parser, mode = smNormal): PNode = let node = newNodeI(nkIdent, lineinfo) node.ident = p.lex.cache.getIdent(accm) result.add(node) - of tokKeywordLow..tokKeywordHigh, tkSymbol, tkIntLit..tkCharLit: + of tokKeywordLow..tokKeywordHigh, tkSymbol, tkIntLit..tkCustomLit: result.add(newIdentNodeP(p.lex.cache.getIdent($p.tok), p)) getTok(p) else: @@ -627,7 +627,7 @@ proc identOrLiteral(p: var Parser, mode: PrimaryMode): PNode = #| | UINT_LIT | UINT8_LIT | UINT16_LIT | UINT32_LIT | UINT64_LIT #| | FLOAT_LIT | FLOAT32_LIT | FLOAT64_LIT #| | STR_LIT | RSTR_LIT | TRIPLESTR_LIT - #| | CHAR_LIT + #| | CHAR_LIT | CUSTOM_NUMERIC_LIT #| | NIL #| generalizedLit = GENERALIZED_STR_LIT | GENERALIZED_TRIPLESTR_LIT #| identOrLiteral = generalizedLit | symbol | literal @@ -710,6 +710,14 @@ proc identOrLiteral(p: var Parser, mode: PrimaryMode): PNode = of tkCharLit: result = newIntNodeP(nkCharLit, ord(p.tok.literal[0]), p) getTok(p) + of tkCustomLit: + let splitPos = p.tok.iNumber.int + let str = newStrNodeP(nkRStrLit, p.tok.literal.substr(0, splitPos-1), p) + let callee = newIdentNodeP(getIdent(p.lex.cache, p.tok.literal.substr(splitPos)), p) + result = newNodeP(nkDotExpr, p) + result.add str + result.add callee + getTok(p) of tkNil: result = newNodeP(nkNilLit, p) getTok(p) @@ -807,7 +815,7 @@ proc primarySuffix(p: var Parser, r: PNode, result = commandExpr(p, result, mode) break result = namedParams(p, result, nkCurlyExpr, tkCurlyRi) - of tkSymbol, tkAccent, tkIntLit..tkCharLit, tkNil, tkCast, + of tkSymbol, tkAccent, tkIntLit..tkCustomLit, tkNil, tkCast, tkOpr, tkDotDot, tkVar, tkOut, tkStatic, tkType, tkEnum, tkTuple, tkObject, tkProc: # XXX: In type sections we allow the free application of the @@ -1097,7 +1105,7 @@ proc isExprStart(p: Parser): bool = case p.tok.tokType of tkSymbol, tkAccent, tkOpr, tkNot, tkNil, tkCast, tkIf, tkFor, tkProc, tkFunc, tkIterator, tkBind, tkBuiltInMagics, - tkParLe, tkBracketLe, tkCurlyLe, tkIntLit..tkCharLit, tkVar, tkRef, tkPtr, + tkParLe, tkBracketLe, tkCurlyLe, tkIntLit..tkCustomLit, tkVar, tkRef, tkPtr, tkTuple, tkObject, tkWhen, tkCase, tkOut: result = true else: result = false @@ -1498,7 +1506,7 @@ proc parseReturnOrRaise(p: var Parser, kind: TNodeKind): PNode = #| yieldStmt = 'yield' optInd expr? #| discardStmt = 'discard' optInd expr? #| breakStmt = 'break' optInd expr? - #| continueStmt = 'break' optInd expr? + #| continueStmt = 'continue' optInd expr? result = newNodeP(kind, p) getTok(p) if p.tok.tokType == tkComment: diff --git a/compiler/renderer.nim b/compiler/renderer.nim index c36eaaf1186f..9ca485f6e523 100644 --- a/compiler/renderer.nim +++ b/compiler/renderer.nim @@ -942,7 +942,7 @@ proc skipHiddenNodes(n: PNode): PNode = else: break proc accentedName(g: var TSrcGen, n: PNode) = - const backticksNeeded = OpChars + {'[', '{'} + const backticksNeeded = OpChars + {'[', '{', '\''} if n == nil: return let isOperator = if n.kind == nkIdent and n.ident.s.len > 0 and n.ident.s[0] in backticksNeeded: true @@ -976,6 +976,11 @@ proc infixArgument(g: var TSrcGen, n: PNode, i: int) = if needsParenthesis: put(g, tkParRi, ")") +proc isCustomLit(n: PNode): bool = + n.len == 2 and n[0].kind == nkRStrLit and + (n[1].kind == nkIdent and n[1].ident.s.startsWith('\'')) or + (n[1].kind == nkSym and n[1].sym.name.s.startsWith('\'')) + proc gsub(g: var TSrcGen, n: PNode, c: TContext) = if isNil(n): return var @@ -1195,9 +1200,14 @@ proc gsub(g: var TSrcGen, n: PNode, c: TContext) = gcomma(g, n, c) put(g, tkBracketRi, "]") of nkDotExpr: - gsub(g, n, 0) - put(g, tkDot, ".") - gsub(g, n, 1) + if isCustomLit(n): + put(g, tkCustomLit, n[0].strVal) + gsub(g, n, 1) + else: + gsub(g, n, 0) + put(g, tkDot, ".") + if n.len > 1: + accentedName(g, n[1]) of nkBind: putWithSpace(g, tkBind, "bind") gsub(g, n, 0) diff --git a/compiler/semstmts.nim b/compiler/semstmts.nim index ff8f68ed03ab..fee43162e78c 100644 --- a/compiler/semstmts.nim +++ b/compiler/semstmts.nim @@ -1524,7 +1524,7 @@ proc semProcAnnotation(c: PContext, prc: PNode; return proc semInferredLambda(c: PContext, pt: TIdTable, n: PNode): PNode {.nosinks.} = - ## used for resolving 'auto' in lambdas based on their callsite + ## used for resolving 'auto' in lambdas based on their callsite var n = n let original = n[namePos].sym let s = original #copySym(original, false) diff --git a/doc/grammar.txt b/doc/grammar.txt index 0d5eef179eb1..d4f4a051587d 100644 --- a/doc/grammar.txt +++ b/doc/grammar.txt @@ -46,7 +46,7 @@ literal = | INT_LIT | INT8_LIT | INT16_LIT | INT32_LIT | INT64_LIT | UINT_LIT | UINT8_LIT | UINT16_LIT | UINT32_LIT | UINT64_LIT | FLOAT_LIT | FLOAT32_LIT | FLOAT64_LIT | STR_LIT | RSTR_LIT | TRIPLESTR_LIT - | CHAR_LIT + | CHAR_LIT | CUSTOM_NUMERIC_LIT | NIL generalizedLit = GENERALIZED_STR_LIT | GENERALIZED_TRIPLESTR_LIT identOrLiteral = generalizedLit | symbol | literal @@ -100,6 +100,7 @@ postExprBlocks = ':' stmt? ( IND{=} doBlock | IND{=} 'of' exprList ':' stmt | IND{=} 'elif' expr ':' stmt | IND{=} 'except' exprList ':' stmt + | IND{=} 'finally' ':' stmt | IND{=} 'else' ':' stmt )* exprStmt = simpleExpr (( '=' optInd expr colonBody? ) diff --git a/doc/manual.rst b/doc/manual.rst index db125163043f..a882eb945e4f 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -490,11 +490,17 @@ this. Another reason is that Nim can thus support `array[char, int]` or type is used for Unicode characters, it can represent any Unicode character. `Rune` is declared in the `unicode module `_. +A character literal that does not end in ``'`` interpreted as ``'`` if there +is a preceeding backtick token. There must be no whitespace between the preceeding +backtick token and the character literal. This special rule ensures that a declaration +like ``proc `'customLiteral`(s: string)`` is valid. See also +`Custom Numeric Literals <#custom-numeric-literals>`_. -Numerical constants -------------------- -Numerical constants are of a single type and have the form:: +Numeric Literals +---------------- + +Numeric literals have the form:: hexdigit = digit | 'A'..'F' | 'a'..'f' octdigit = '0'..'7' @@ -530,8 +536,10 @@ Numerical constants are of a single type and have the form:: FLOAT64_LIT = HEX_LIT '\'' FLOAT64_SUFFIX | (FLOAT_LIT | DEC_LIT | OCT_LIT | BIN_LIT) ['\''] FLOAT64_SUFFIX + CUSTOM_NUMERIC_LIT = (FLOAT_LIT | DEC_LIT | OCT_LIT | BIN_LIT) '\'' CUSTOM_NUMERIC_SUFFIX -As can be seen in the productions, numerical constants can contain underscores + +As can be seen in the productions, numeric literals can contain underscores for readability. Integer and floating-point literals may be given in decimal (no prefix), binary (prefix `0b`), octal (prefix `0o`), and hexadecimal (prefix `0x`) notation. @@ -579,7 +587,7 @@ is optional if it is not ambiguous (only hexadecimal floating-point literals with a type suffix can be ambiguous). -The type suffixes are: +The pre-defined type suffixes are: ================= ========================= Type Suffix Resulting type of literal @@ -611,6 +619,42 @@ the bit width of the datatype, it is accepted. Hence: 0b10000000'u8 == 0x80'u8 == 128, but, 0b10000000'i8 == 0x80'i8 == -1 instead of causing an overflow error. + +Custom Numeric Literals +~~~~~~~~~~~~~~~~~~~~~~~ + +If the suffix is not predefined, then the suffix is assumed to be a call +to a proc, template, macro or other callable identifier that is passed the +string containing the literal. The callable identifier needs to be declared +with a special ``'`` prefix: + +.. code-block:: nim + + import strutils + type u4 = distinct uint8 # a 4-bit unsigned integer aka "nibble" + proc `'u4`(n: string): u4 = + # The leading ' is required. + result = (parseInt(n) and 0x0F).u4 + + var x = 5'u4 + +More formally, a custom numeric literal `123'custom` is transformed +to r"123".`'custom` in the parsing step. There is no AST node kind that +corresponds to this transformation. The transformation naturally handles +the case that additional parameters are passed to the callee: + +.. code-block:: nim + + import strutils + type u4 = distinct uint8 # a 4-bit unsigned integer aka "nibble" + proc `'u4`(n: string; moreData: int): u4 = + result = (parseInt(n) and 0x0F).u4 + + var x = 5'u4(123) + +Custom numeric literals are covered by the grammar rule named `CUSTOM_NUMERIC_LIT`. + + Operators --------- diff --git a/tests/lexer/mlexerutils.nim b/tests/lexer/mlexerutils.nim new file mode 100644 index 000000000000..eae7a0006938 --- /dev/null +++ b/tests/lexer/mlexerutils.nim @@ -0,0 +1,9 @@ +import macros + +macro lispReprStr*(a: untyped): untyped = newLit(a.lispRepr) + +macro assertAST*(expected: string, struct: untyped): untyped = + var ast = newLit(struct.treeRepr) + result = quote do: + if `ast` != `expected`: + doAssert false, "\nGot:\n" & `ast`.indent(2) & "\nExpected:\n" & `expected`.indent(2) \ No newline at end of file diff --git a/tests/lexer/tcustom_numeric_literals.nim b/tests/lexer/tcustom_numeric_literals.nim new file mode 100644 index 000000000000..a2f355b4dea0 --- /dev/null +++ b/tests/lexer/tcustom_numeric_literals.nim @@ -0,0 +1,150 @@ +discard """ + targets: "c cpp js" +""" + +# Test tkStrNumLit + +import std/[macros, strutils] +import mlexerutils + +# AST checks + +assertAST dedent """ + StmtList + ProcDef + AccQuoted + Ident "\'" + Ident "wrap" + Empty + Empty + FormalParams + Ident "string" + IdentDefs + Ident "number" + Ident "string" + Empty + Empty + Empty + StmtList + Asgn + Ident "result" + Infix + Ident "&" + Infix + Ident "&" + StrLit "[[" + Ident "number" + StrLit "]]"""": + proc `'wrap`(number: string): string = + result = "[[" & number & "]]" + +assertAST dedent """ + StmtList + DotExpr + RStrLit "-38383839292839283928392839283928392839283.928493849385935898243e-50000" + Ident "\'wrap"""": + -38383839292839283928392839283928392839283.928493849385935898243e-50000'wrap + +proc `'wrap`(number: string): string = "[[" & number & "]]" +doAssert lispReprStr(-1'wrap) == """(DotExpr (RStrLit "-1") (Ident "\'wrap"))""" + +template main = + block: # basic suffix usage + template `'twrap`(number: string): untyped = + number.`'wrap` + proc extraContext(): string = + 22.40'wrap + proc `*`(left, right: string): string = + result = left & "times" & right + proc `+`(left, right: string): string = + result = left & "plus" & right + + doAssert 1'wrap == "[[1]]" + doAssert -1'wrap == "[[-1]]": + "unable to resolve a negative integer-suffix pattern" + doAssert 12345.67890'wrap == "[[12345.67890]]" + doAssert 1'wrap*1'wrap == "[[1]]times[[1]]": + "unable to resolve an operator between two suffixed numeric literals" + doAssert 1'wrap+ -1'wrap == "[[1]]plus[[-1]]": # will generate a compiler warning about inconsistent spacing + "unable to resolve a negative suffixed numeric literal following an operator" + doAssert 1'wrap + -1'wrap == "[[1]]plus[[-1]]" + doAssert 1'twrap == "[[1]]" + doAssert extraContext() == "[[22.40]]": + "unable to return a suffixed numeric literal by an implicit return" + doAssert 0x5a3a'wrap == "[[0x5a3a]]" + doAssert 0o5732'wrap == "[[0o5732]]" + doAssert 0b0101111010101'wrap == "[[0b0101111010101]]" + doAssert -38383839292839283928392839283928392839283.928493849385935898243e-50000'wrap == "[[-38383839292839283928392839283928392839283.928493849385935898243e-50000]]" + doAssert 1234.56'wrap == "[[1234.56]]": + "unable to properly account for context with suffixed numeric literals" + + block: # verify that the i64, f32, etc builtin suffixes still parse correctly + const expectedF32: float32 = 123.125 + proc `'f9`(number: string): string = # proc starts with 'f' just like 'f32' + "[[" & number & "]]" + proc `'f32a`(number: string): string = # looks even more like 'f32' + "[[" & number & "]]" + proc `'d9`(number: string): string = # proc starts with 'd' just like the d suffix + "[[" & number & "]]" + proc `'i9`(number: string): string = # proc starts with 'i' just like 'i64' + "[[" & number & "]]" + proc `'u9`(number: string): string = # proc starts with 'u' just like 'u8' + "[[" & number & "]]" + + doAssert 123.125f32 == expectedF32: + "failing to support non-quoted legacy f32 floating point suffix" + doAssert 123.125'f32 == expectedF32 + doAssert 123.125e0'f32 == expectedF32 + doAssert 1234.56'wrap == 1234.56'f9 + doAssert 1234.56'wrap == 1234.56'f32a + doAssert 1234.56'wrap == 1234.56'd9 + doAssert 1234.56'wrap == 1234.56'i9 + doAssert 1234.56'wrap == 1234.56'u9 + doAssert lispReprStr(1234.56'u9) == """(DotExpr (RStrLit "1234.56") (Ident "\'u9"))""": + "failed to properly build AST for suffix that starts with u" + doAssert -128'i8 == (-128).int8 + + block: # case checks + doAssert 1E2 == 100: + "lexer not handling upper-case exponent" + doAssert 1.0E2 == 100.0 + doAssert 1e2 == 100 + doAssert 0xdeadBEEF'wrap == "[[0xdeadBEEF]]": + "lexer not maintaining original case" + doAssert 0.1E12'wrap == "[[0.1E12]]" + doAssert 0.0e12'wrap == "[[0.0e12]]" + doAssert 0.0e+12'wrap == "[[0.0e+12]]" + doAssert 0.0e-12'wrap == "[[0.0e-12]]" + doAssert 0e-12'wrap == "[[0e-12]]" + + block: # macro and template usage + template `'foo`(a: string): untyped = (a, 2) + doAssert -12'foo == ("-12", 2) + template `'fooplus`(a: string, b: int): untyped = (a, b) + doAssert -12'fooplus(2) == ("-12", 2) + template `'fooplusopt`(a: string, b: int = 99): untyped = (a, b) + doAssert -12'fooplusopt(2) == ("-12", 2) + doAssert -12'fooplusopt() == ("-12", 99) + doAssert -12'fooplusopt == ("-12", 99) + macro `'bar`(a: static string): untyped = + var infix = newNimNode(nnkInfix) + infix.add newIdentNode("&") + infix.add newLit("got ") + infix.add newLit(a.repr) + result = newNimNode(nnkStmtList) + result.add infix + doAssert -12'bar == "got \"-12\"" + macro deb(a): untyped = newLit(a.repr) + doAssert deb(-12'bar) == "-12'bar" + # macro metawrap(): untyped = + # func wrap1(a: string): string = "{" & a & "}" + # func `'wrap2`(a: string): string = "{" & a & "}" + # result = quote do: + # let a1 = wrap1"-128" + # let a2 = -128'wrap2 + # metawrap() + # doAssert a1 == "{-128}" + # doAssert a2 == "{-128}" + +static: main() +main() diff --git a/tests/lexer/tstrlits.nim b/tests/lexer/tstrlits.nim deleted file mode 100644 index 8e8250a5bc2b..000000000000 --- a/tests/lexer/tstrlits.nim +++ /dev/null @@ -1,19 +0,0 @@ -discard """ - output: "a\"\"long string\"\"\"\"\"abc\"def_'2'●𝌆𝌆A" -""" -# Test the new different string literals - -const - tripleEmpty = """"long string"""""""" # "long string """"" - - rawQuote = r"a""" - - raw = r"abc""def" - - escaped = "\x5f'\50'\u25cf\u{1D306}\u{1d306}\u{41}" - - -stdout.write(rawQuote) -stdout.write(tripleEmpty) -stdout.write(raw) -stdout.writeLine(escaped) diff --git a/tests/lexer/tunary_minus.nim b/tests/lexer/tunary_minus.nim index 89f1b79ef72f..0aa861d53721 100644 --- a/tests/lexer/tunary_minus.nim +++ b/tests/lexer/tunary_minus.nim @@ -6,13 +6,7 @@ discard """ import std/[macros, strutils] -macro lispReprStr*(a: untyped): untyped = newLit(a.lispRepr) - -macro assertAST*(expected: string, struct: untyped): untyped = - var ast = newLit(struct.treeRepr) - result = quote do: - if `ast` != `expected`: - doAssert false, "\nGot:\n" & `ast`.indent(2) & "\nExpected:\n" & `expected`.indent(2) +import mlexerutils const one = 1 const minusOne = `-`(one)