Parser: add -fdollars-in-identifiers and -fno-dollars-in-identifiers options

- add the -f(no-)dollars-in-identifiers option
- add warnings controlled by -W(no-)dollar-in-identifier-extension
- the dollar symbol '$' is now exclusively part of extended_identifier tokens
- emit an error when an identifier is followed by a dollar symbol in -fno-dollars-in-identifiers mode
Vexu · Dec 4, 2021 · 086d4bb · 086d4bb
1 parent 407a3ea
commit 086d4bb
Show file tree

Hide file tree

Showing 6 changed files with 82 additions and 35 deletions.
diff --git a/src/Diagnostics.zig b/src/Diagnostics.zig
@@ -100,6 +100,7 @@ pub const Options = struct {
     @"unicode-zero-width": ?Kind = null,
     @"unicode-homoglyph": ?Kind = null,
     @"return-type": ?Kind = null,
+    @"dollar-in-identifier-extension": ?Kind = null,
 };
 
 const messages = struct {
@@ -1266,6 +1267,16 @@ const messages = struct {
         const extra = .str;
         const kind = .@"error";
     };
+    const dollar_in_identifier_extension = struct { // added by Parser.eatIdentifier at the first '$' found in an extended_identifier token
+        const msg = "'$' in identifier";
+        const opt = "dollar-in-identifier-extension"; // same spelling as the Options field that overrides the kind
+        const kind = .off; // default kind: off unless overridden through the option above
+        const suppress_language_option = "dollars_in_identifiers"; // tagKind returns .off when this LangOpts field is false
+    };
+    const dollars_in_identifiers = struct { // reported by Parser.eatIdentifier when langopts.dollars_in_identifiers is false and the token after an identifier is .invalid and starts with '$'
+        const msg = "illegal character '$' in identifier";
+        const kind = .@"error"; // no `opt` declared here, unlike dollar_in_identifier_extension
+    };
 };
 
 list: std.ArrayList(Message),
@@ -1484,6 +1495,7 @@ fn tagKind(diag: *Diagnostics, tag: Tag) Kind {
             }
             if (@hasDecl(info, "suppress_version")) if (comp.langopts.standard.atLeast(info.suppress_version)) return .off;
             if (@hasDecl(info, "suppress_gnu")) if (comp.langopts.standard.isExplicitGNU()) return .off;
+            if (@hasDecl(info, "suppress_language_option")) if (!@field(comp.langopts, info.suppress_language_option)) return .off;
             if (kind == .@"error" and diag.fatal_errors) kind = .@"fatal error";
             return kind;
         }

diff --git a/src/LangOpts.zig b/src/LangOpts.zig
@@ -72,6 +72,7 @@ const Standard = enum {
 standard: Standard = .default,
 /// -fshort-enums option, makes enums only take up as much space as they need to hold all the values.
 short_enums: bool = false,
+dollars_in_identifiers: bool = true, // set by -fdollars-in-identifiers / -fno-dollars-in-identifiers; Token.mayAppearInIdent returns this value for '$'
 
 pub fn setStandard(self: *LangOpts, name: []const u8) error{InvalidStandard}!void {
     self.standard = Standard.NameMap.get(name) orelse return error.InvalidStandard;

diff --git a/src/Parser.zig b/src/Parser.zig
@@ -136,21 +136,41 @@ fn checkIdentifierCodepoint(comp: *Compilation, codepoint: u21, loc: Source.Loca
 }
 
 fn eatIdentifier(p: *Parser) !?TokenIndex {
-    if (p.tok_ids[p.tok_i] == .identifier) {
-        defer p.tok_i += 1;
-        return p.tok_i;
-    } else if (p.tok_ids[p.tok_i] == .extended_identifier) {
-        defer p.tok_i += 1;
-        const slice = p.tokSlice(p.tok_i);
-        var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
-        var loc = p.pp.tokens.items(.loc)[p.tok_i];
-        while (it.nextCodepoint()) |c| {
-            if (try checkIdentifierCodepoint(p.pp.comp, c, loc)) break;
-            loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable;
+    switch (p.tok_ids[p.tok_i]) { // dispatch on the id of the current token
+        .identifier => {}, // plain identifier: nothing to validate, fall through to the consume step
+        .extended_identifier => { // run the '$' extension diagnostic, then the per-codepoint checks
+            const slice = p.tokSlice(p.tok_i);
+            var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
+            var loc = p.pp.tokens.items(.loc)[p.tok_i];
+
+            if (mem.indexOfScalar(u8, slice, '$')) |i| { // only the first '$' in the token is reported
+                loc.byte_offset += @intCast(u32, i); // point the diagnostic at the '$' itself
+                try p.pp.comp.diag.add(.{
+                    .tag = .dollar_in_identifier_extension,
+                    .loc = loc,
+                });
+                loc = p.pp.tokens.items(.loc)[p.tok_i]; // reset to the token start for the codepoint walk below
+            }
+
+            while (it.nextCodepoint()) |c| {
+                if (try checkIdentifierCodepoint(p.pp.comp, c, loc)) break; // stop walking once the check returns true
+                loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable; // advance loc past this codepoint
+            }
+        },
+        else => return null, // not an identifier: nothing is consumed
+    }
+    p.tok_i += 1; // consume the identifier token
+
+    // Handle illegal '$' characters in identifiers: when the option is off, Token.mayAppearInIdent rejects '$', so a '$' after the identifier shows up as its own .invalid token.
+    if (!p.pp.comp.langopts.dollars_in_identifiers) {
+        if (p.tok_ids[p.tok_i] == .invalid and p.tokSlice(p.tok_i)[0] == '$') {
+            try p.err(.dollars_in_identifiers);
+            p.tok_i += 1; // also skip the offending '$' token before failing
+            return error.ParsingFailed;
         }
-        return p.tok_i;
     }
-    return null;
+
+    return p.tok_i - 1; // index of the identifier consumed above (tok_i was advanced exactly once on this path)
 }
 
 fn expectIdentifier(p: *Parser) Error!TokenIndex {
@@ -161,6 +181,7 @@ fn expectIdentifier(p: *Parser) Error!TokenIndex {
         } });
         return error.ParsingFailed;
     }
+
     return (try p.eatIdentifier()) orelse unreachable;
 }
 
@@ -1244,25 +1265,10 @@ const InitDeclarator = struct { d: Declarator, initializer: NodeIndex = .none };
 fn attribute(p: *Parser) Error!Attribute {
     const name_tok = p.tok_i;
     switch (p.tok_ids[p.tok_i]) {
-        .identifier, .keyword_const, .keyword_const1, .keyword_const2 => {},
-        .extended_identifier => {
-            const slice = p.tokSlice(p.tok_i);
-            var it = std.unicode.Utf8View.initUnchecked(slice).iterator();
-            var loc = p.pp.tokens.items(.loc)[p.tok_i];
-            while (it.nextCodepoint()) |c| {
-                if (try checkIdentifierCodepoint(p.pp.comp, c, loc)) break;
-                loc.byte_offset += std.unicode.utf8CodepointSequenceLength(c) catch unreachable;
-            }
-        },
-        else => {
-            try p.errExtra(.expected_token, p.tok_i, .{ .tok_id = .{
-                .expected = .identifier,
-                .actual = p.tok_ids[p.tok_i],
-            } });
-            return error.ParsingFailed;
-        },
+        .keyword_const, .keyword_const1, .keyword_const2 => p.tok_i += 1,
+        else => _ = try p.expectIdentifier(),
     }
-    p.tok_i += 1;
+
     switch (p.tok_ids[p.tok_i]) {
         .comma, .r_paren => { // will be consumed in attributeList
             return Attribute{ .name = name_tok };

diff --git a/src/Tokenizer.zig b/src/Tokenizer.zig
@@ -583,6 +583,8 @@ pub const Token = struct {
     /// does not check basic character set chars because the tokenizer handles them separately to keep the common
     /// case on the fast path
     pub fn mayAppearInIdent(comp: *const Compilation, codepoint: u21, where: enum { start, inside }) bool {
+        if (codepoint == '$') return comp.langopts.dollars_in_identifiers;
+        if (codepoint < 0x7F) return false;
         return switch (where) {
             .start => if (comp.langopts.standard.atLeast(.c11))
                 CharInfo.isC11IdChar(codepoint) and !CharInfo.isC11DisallowedInitialIdChar(codepoint)
@@ -780,7 +782,7 @@ pub fn next(self: *Tokenizer) Token {
                 'u' => state = .u,
                 'U' => state = .U,
                 'L' => state = .L,
-                'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_', '$' => state = .identifier,
+                'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                 '=' => state = .equal,
                 '!' => state = .bang,
                 '|' => state = .pipe,
@@ -854,7 +856,7 @@ pub fn next(self: *Tokenizer) Token {
                 '1'...'9' => state = .integer_literal,
                 '\\' => state = .back_slash,
                 '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
-                else => if (c > 0x7F and Token.mayAppearInIdent(self.comp, c, .start)) {
+                else => if (Token.mayAppearInIdent(self.comp, c, .start)) {
                     state = .extended_identifier;
                 } else {
                     id = .invalid;
@@ -1074,9 +1076,9 @@ pub fn next(self: *Tokenizer) Token {
                 },
             },
             .identifier, .extended_identifier => switch (c) {
-                'a'...'z', 'A'...'Z', '_', '0'...'9', '$' => {},
+                'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                 else => {
-                    if (c <= 0x7F or !Token.mayAppearInIdent(self.comp, c, .inside)) {
+                    if (!Token.mayAppearInIdent(self.comp, c, .inside)) {
                         id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier;
                         break;
                     }

diff --git a/src/main.zig b/src/main.zig
@@ -63,6 +63,10 @@ const usage =
     \\  -fno-color-diagnostics  Disable colors in diagnostics
     \\  -fshort-enums           Use the narrowest possible integer type for enums.
     \\  -fno-short-enums        Use "int" as the tag type for enums.
+    \\  -fdollars-in-identifiers        
+    \\                          Allow '$' in identifiers
+    \\  -fno-dollars-in-identifiers     
+    \\                          Disallow '$' in identifiers
     \\  -I <dir>                Add directory to include search path
     \\  -isystem                Add directory to SYSTEM include search path
     \\  -o <file>               Write output to <file>
@@ -143,6 +147,10 @@ fn handleArgs(comp: *Compilation, args: [][]const u8) !void {
                 comp.langopts.short_enums = true;
             } else if (mem.eql(u8, arg, "-fno-short-enums")) {
                 comp.langopts.short_enums = false;
+            } else if (mem.eql(u8, arg, "-fdollars-in-identifiers")) {
+                comp.langopts.dollars_in_identifiers = true;
+            } else if (mem.eql(u8, arg, "-fno-dollars-in-identifiers")) {
+                comp.langopts.dollars_in_identifiers = false;
             } else if (mem.startsWith(u8, arg, "-I")) {
                 var path = arg["-I".len..];
                 if (path.len == 0) {

diff --git a/test/cases/dollars in identifiers.c b/test/cases/dollars in identifiers.c
@@ -0,0 +1,18 @@
+void foo$() { } // trailing dollar sign; not listed in EXPECTED_ERRORS, so no diagnostic is expected here
+
+void fib() {
+  int $test; // leading dollar sign; likewise absent from EXPECTED_ERRORS
+}
+
+void ano$ther() {} // dollar sign inside the name; likewise absent from EXPECTED_ERRORS
+
+#pragma GCC diagnostic warning "-Wdollar-in-identifier-extension"
+
+void identi$fier() {} // first entry in EXPECTED_ERRORS (11:12), after the pragma above
+
+void inside() {
+  int vari$able; // second entry in EXPECTED_ERRORS (14:11)
+}
+
+#define EXPECTED_ERRORS "dollars in identifiers.c:11:12: warning: '$' in identifier" \
+    "dollars in identifiers.c:14:11: warning: '$' in identifier"