Skip to content

Commit

Permalink
Parser: check that identifiers are in NFC
Browse files Browse the repository at this point in the history
  • Loading branch information
Vexu committed Nov 12, 2023
1 parent ddd5a8b commit e95f60c
Show file tree
Hide file tree
Showing 5 changed files with 645 additions and 0 deletions.
37 changes: 37 additions & 0 deletions src/aro/Diagnostics.zig
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ pub const Message = struct {
offset: u64,
pow_2_as_string: u8,
signed: i64,
normalized: []const u8,
none: void,
};
};
Expand Down Expand Up @@ -207,6 +208,7 @@ pub const Options = struct {
@"duplicate-embed-param": Kind = .default,
@"unsupported-embed-param": Kind = .default,
@"unused-result": Kind = .default,
normalized: Kind = .default,
};

const Diagnostics = @This();
Expand Down Expand Up @@ -469,6 +471,41 @@ pub fn renderMessage(comp: *Compilation, m: anytype, msg: Message) void {
printRt(m, prop.msg, .{"{s}"}, .{str});
}
},
.normalized => {
// Formatter callback used to print an identifier in a diagnostic: it walks the
// UTF-8 bytes codepoint by codepoint and writes low ASCII characters verbatim,
// while every other codepoint is written as a C-style universal character
// name (`\uXXXX` or `\UXXXXXXXX`, upper-case hex, zero padded).
const f = struct {
    /// `bytes` is iterated with `std.unicode.Utf8Iterator`; the format string
    /// and format options are ignored. Errors from `writer` are propagated.
    pub fn f(
        bytes: []const u8,
        comptime _: []const u8,
        _: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        var it: std.unicode.Utf8Iterator = .{
            .bytes = bytes,
            .i = 0,
        };
        while (it.nextCodepoint()) |codepoint| {
            if (codepoint < 0x7F) {
                // Strict `<`: DEL (0x7F) is not written raw, it takes the
                // escaped path below.
                try writer.writeByte(@intCast(codepoint));
            } else if (codepoint <= 0xFFFF) {
                // Everything up to and including U+FFFF fits in four hex
                // digits, so it uses the short `\u` form.
                try writer.writeAll("\\u");
                try std.fmt.formatInt(codepoint, 16, .upper, .{
                    .fill = '0',
                    .width = 4,
                }, writer);
            } else {
                // Codepoints above U+FFFF need the eight digit `\U` form.
                try writer.writeAll("\\U");
                try std.fmt.formatInt(codepoint, 16, .upper, .{
                    .fill = '0',
                    .width = 8,
                }, writer);
            }
        }
    }
}.f;
printRt(m, prop.msg, .{"{s}"}, .{
std.fmt.Formatter(f){ .data = msg.extra.normalized },
});
},
.none, .offset => m.write(prop.msg),
}

Expand Down
6 changes: 6 additions & 0 deletions src/aro/Diagnostics/messages.def
Original file line number Diff line number Diff line change
Expand Up @@ -2422,3 +2422,9 @@ attribute_int_out_of_range
.msg = "attribute value '{s}' out of range"
.kind = .@"error"
.extra = .str

identifier_not_normalized
.msg = "'{s}' is not in NFC"
.kind = .warning
.extra = .normalized
.opt = W("normalized")
18 changes: 18 additions & 0 deletions src/aro/Parser.zig
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ fn validateExtendedIdentifier(p: *Parser) !bool {
var invalid_char: u21 = undefined;
var loc = p.pp.tokens.items(.loc)[p.tok_i];

var normalized = true;
var last_canonical_class: char_info.CanonicalCombiningClass = .not_reordered;
const standard = p.comp.langopts.standard;
while (it.nextCodepoint()) |codepoint| {
defer {
Expand Down Expand Up @@ -260,6 +262,22 @@ fn validateExtendedIdentifier(p: *Parser) !bool {
if (!warned) {
warned = try checkIdentifierCodepointWarnings(p.comp, codepoint, loc);
}

// Check NFC normalization.
if (!normalized) continue;
const canonical_class = char_info.getCanonicalClass(codepoint);
if (@intFromEnum(last_canonical_class) > @intFromEnum(canonical_class) and
canonical_class != .not_reordered)
{
normalized = false;
try p.errStr(.identifier_not_normalized, p.tok_i, slice);
continue;
}
if (char_info.isNormalized(codepoint) != .yes) {
normalized = false;
try p.errExtra(.identifier_not_normalized, p.tok_i, .{ .normalized = slice });
}
last_canonical_class = canonical_class;
}

if (!valid_identifier) {
Expand Down
Loading

0 comments on commit e95f60c

Please sign in to comment.