diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index bab075114d50..a8fa1454a531 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error; -/// Decodes the UTF-8 codepoint encoded in the given slice of bytes. -/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. -/// If you already know the length at comptime, you can call one of -/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. +/// Deprecated. This function has an awkward API that is too easy to use incorrectly. pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 { return switch (bytes.len) { - 1 => @as(u21, bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3(bytes), - 4 => utf8Decode4(bytes), + 1 => bytes[0], + 2 => utf8Decode2(bytes[0..2].*), + 3 => utf8Decode3(bytes[0..3].*), + 4 => utf8Decode4(bytes[0..4].*), else => unreachable, }; } @@ -113,8 +110,7 @@ const Utf8Decode2Error = error{ Utf8ExpectedContinuation, Utf8OverlongEncoding, }; -pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 { - assert(bytes.len == 2); +pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 { assert(bytes[0] & 0b11100000 == 0b11000000); var value: u21 = bytes[0] & 0b00011111; @@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 { const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{ Utf8EncodesSurrogateHalf, }; -pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 { +pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 { const value = try utf8Decode3AllowSurrogateHalf(bytes); if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf; @@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{ Utf8ExpectedContinuation, Utf8OverlongEncoding, }; -pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 { - assert(bytes.len == 3); +pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 { assert(bytes[0] & 0b11110000 == 0b11100000); var value: u21 = bytes[0] & 0b00001111; @@ -165,8 +160,7 @@ const Utf8Decode4Error = error{ Utf8OverlongEncoding, Utf8CodepointTooLarge, }; -pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { - assert(bytes.len == 4); +pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 { assert(bytes[0] & 0b11111000 == 0b11110000); var value: u21 = bytes[0] & 0b00000111; @@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 { const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error; +/// Deprecated. This function has an awkward API that is too easy to use incorrectly. pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 { return switch (bytes.len) { - 1 => @as(u21, bytes[0]), - 2 => utf8Decode2(bytes), - 3 => utf8Decode3AllowSurrogateHalf(bytes), - 4 => utf8Decode4(bytes), + 1 => bytes[0], + 2 => utf8Decode2(bytes[0..2].*), + 3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*), + 4 => utf8Decode4(bytes[0..4].*), else => unreachable, }; } diff --git a/lib/std/zig/Ast.zig b/lib/std/zig/Ast.zig index f55d78b6cedd..1f734cef63f0 100644 --- a/lib/std/zig/Ast.zig +++ b/lib/std/zig/Ast.zig @@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A const token = tokenizer.next(); try tokens.append(gpa, .{ .tag = token.tag, - .start = @as(u32, @intCast(token.loc.start)), + .start = @intCast(token.loc.start), }); if (token.tag == .eof) break; } diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig index a6be743c2bc0..c24aa6d06325 100644 --- a/lib/std/zig/AstGen.zig +++ b/lib/std/zig/AstGen.zig @@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token .{raw_string[bad_index]}, ); }, + .empty_char_literal => { + return astgen.failOff(token, offset, "empty character literal", .{}); + }, } } @@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void { var msg: std.ArrayListUnmanaged(u8) = .{}; defer msg.deinit(gpa); - const token_starts = tree.tokens.items(.start); - const token_tags = tree.tokens.items(.tag); - var notes: std.ArrayListUnmanaged(u32) = .{}; defer notes.deinit(gpa); - const tok = parse_err.token + @intFromBool(parse_err.token_is_prev); - if (token_tags[tok] == .invalid) { - const bad_off: u32 = @intCast(tree.tokenSlice(tok).len); - const byte_abs = token_starts[tok] + bad_off; - try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{ - std.zig.fmtEscapes(tree.source[byte_abs..][0..1]), - })); - } - for (tree.errors[1..]) |note| { if (!note.is_note) break; diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig index 5130ce403757..530aa924d08c 100644 --- a/lib/std/zig/parser_test.zig +++ b/lib/std/zig/parser_test.zig @@ -6061,7 +6061,6 @@ test "recovery: invalid container members" { , &[_]Error{ .expected_expr, .expected_comma_after_field, - .expected_type_expr, .expected_semi_after_stmt, }); } diff --git a/lib/std/zig/string_literal.zig b/lib/std/zig/string_literal.zig index c160d16b07bd..69178098379b 100644 --- a/lib/std/zig/string_literal.zig +++ b/lib/std/zig/string_literal.zig @@ -1,6 +1,5 @@ const std = @import("../std.zig"); const assert = std.debug.assert; -const utf8Decode = std.unicode.utf8Decode; const utf8Encode = std.unicode.utf8Encode; pub const ParseError = error{ @@ -37,12 +36,16 @@ pub const Error = union(enum) { expected_single_quote: usize, /// The character at this index cannot be represented without an escape sequence. invalid_character: usize, + /// `''`. Not returned for string literals. + empty_char_literal, }; -/// Only validates escape sequence characters. -/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between. +/// Asserts the slice starts and ends with single-quotes. +/// Returns an error if there is not exactly one UTF-8 codepoint in between. pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { - assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\''); + if (slice.len < 3) return .{ .failure = .empty_char_literal }; + assert(slice[0] == '\''); + assert(slice[slice.len - 1] == '\''); switch (slice[1]) { '\\' => { @@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral { }, 0 => return .{ .failure = .{ .invalid_character = 1 } }, else => { - const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable; + const inner = slice[1 .. slice.len - 1]; + const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{ + .failure = .{ .invalid_unicode_codepoint = 1 }, + }; + if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } }; + const codepoint = switch (n) { + 1 => inner[0], + 2 => std.unicode.utf8Decode2(inner[0..2].*), + 3 => std.unicode.utf8Decode3(inner[0..3].*), + 4 => std.unicode.utf8Decode4(inner[0..4].*), + else => unreachable, + } catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } }; return .{ .success = codepoint }; }, } diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index 36cbf9a856fb..32e11b1b9a6d 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -320,7 +320,7 @@ pub const Token = struct { pub fn symbol(tag: Tag) []const u8 { return tag.lexeme() orelse switch (tag) { - .invalid => "invalid bytes", + .invalid => "invalid token", .identifier => "an identifier", .string_literal, .multiline_string_literal_line => "a string literal", .char_literal => "a character literal", @@ -338,22 +338,22 @@ pub const Tokenizer = struct { buffer: [:0]const u8, index: usize, - /// For debugging purposes + /// For debugging purposes. pub fn dump(self: *Tokenizer, token: *const Token) void { std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] }); } pub fn init(buffer: [:0]const u8) Tokenizer { - // Skip the UTF-8 BOM if present - const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0; - return Tokenizer{ + // Skip the UTF-8 BOM if present. + return .{ .buffer = buffer, - .index = src_start, + .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0, }; } const State = enum { start, + expect_newline, identifier, builtin, string_literal, @@ -361,10 +361,6 @@ pub const Tokenizer = struct { multiline_string_literal_line, char_literal, char_literal_backslash, - char_literal_hex_escape, - char_literal_unicode_escape_saw_u, - char_literal_unicode_escape, - char_literal_end, backslash, equal, bang, @@ -400,32 +396,38 @@ pub const Tokenizer = struct { period_2, period_asterisk, saw_at_sign, + invalid, }; + /// After this returns invalid, it will reset on the next newline, returning tokens starting from there. + /// An eof token will always be returned at the end. pub fn next(self: *Tokenizer) Token { var state: State = .start; - var result = Token{ - .tag = .eof, + var result: Token = .{ + .tag = undefined, .loc = .{ .start = self.index, .end = undefined, }, }; - var seen_escape_digits: usize = undefined; while (true) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { .start => switch (c) { 0 => { - if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; - } - break; - }, - ' ', '\n', '\t', '\r' => { + if (self.index == self.buffer.len) return .{ + .tag = .eof, + .loc = .{ + .start = self.index, + .end = self.index, + }, + }; + state = .invalid; + }, + '\r' => { + state = .expect_newline; + }, + ' ', '\n', '\t' => { result.loc.start = self.index + 1; }, '"' => { @@ -434,6 +436,7 @@ pub const Tokenizer = struct { }, '\'' => { state = .char_literal; + result.tag = .char_literal; }, 'a'...'z', 'A'...'Z', '_' => { state = .identifier; @@ -545,14 +548,37 @@ pub const Tokenizer = struct { result.tag = .number_literal; }, else => { + state = .invalid; + }, + }, + + .expect_newline => switch (c) { + '\n' => { + result.loc.start = self.index + 1; + state = .start; + }, + else => { + state = .invalid; + }, + }, + + .invalid => switch (c) { + 0 => if (self.index == self.buffer.len) { + result.tag = .invalid; + break; + }, + '\n' => { result.tag = .invalid; - result.loc.end = self.index; - self.index += std.unicode.utf8ByteSequenceLength(c) catch 1; - return result; + break; }, + else => continue, }, .saw_at_sign => switch (c) { + 0, '\n' => { + result.tag = .invalid; + break; + }, '"' => { result.tag = .identifier; state = .string_literal; @@ -562,8 +588,7 @@ pub const Tokenizer = struct { result.tag = .builtin; }, else => { - result.tag = .invalid; - break; + state = .invalid; }, }, @@ -698,7 +723,7 @@ pub const Tokenizer = struct { }, .identifier => switch (c) { - 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, + 'a'...'z', 'A'...'Z', '_', '0'...'9' => continue, else => { if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| { result.tag = tag; @@ -707,26 +732,37 @@ pub const Tokenizer = struct { }, }, .builtin => switch (c) { - 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, + 'a'...'z', 'A'...'Z', '_', '0'...'9' => continue, else => break, }, .backslash => switch (c) { + 0 => { + result.tag = .invalid; + break; + }, '\\' => { state = .multiline_string_literal_line; }, - else => { + '\n' => { result.tag = .invalid; break; }, + else => { + state = .invalid; + }, }, .string_literal => switch (c) { - 0, '\n' => { - result.tag = .invalid; - result.loc.end = self.index; + 0 => { if (self.index != self.buffer.len) { - self.index += 1; + state = .invalid; + continue; } - return result; + result.tag = .invalid; + break; + }, + '\n' => { + result.tag = .invalid; + break; }, '\\' => { state = .string_literal_backslash; @@ -735,150 +771,74 @@ pub const Tokenizer = struct { self.index += 1; break; }, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0x01...0x09, 0x0b...0x1f, 0x7f => { + state = .invalid; }, + else => continue, }, .string_literal_backslash => switch (c) { 0, '\n' => { result.tag = .invalid; - result.loc.end = self.index; - if (self.index != self.buffer.len) { - self.index += 1; - } - return result; + break; }, else => { state = .string_literal; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .char_literal => switch (c) { - 0, '\n', '\'' => { - result.tag = .invalid; - result.loc.end = self.index; + 0 => { if (self.index != self.buffer.len) { - self.index += 1; + state = .invalid; + continue; } - return result; + result.tag = .invalid; + break; + }, + '\n' => { + result.tag = .invalid; + break; }, '\\' => { state = .char_literal_backslash; }, - else => { - state = .char_literal_end; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + '\'' => { + self.index += 1; + break; }, + 0x01...0x09, 0x0b...0x1f, 0x7f => { + state = .invalid; + }, + else => continue, }, .char_literal_backslash => switch (c) { - 0, '\n' => { - result.tag = .invalid; - result.loc.end = self.index; + 0 => { if (self.index != self.buffer.len) { - self.index += 1; - } - return result; - }, - 'x' => { - state = .char_literal_hex_escape; - seen_escape_digits = 0; - }, - 'u' => { - state = .char_literal_unicode_escape_saw_u; - }, - else => { - state = .char_literal_end; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; - }, - }, - - .char_literal_hex_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => { - seen_escape_digits += 1; - if (seen_escape_digits == 2) { - state = .char_literal_end; + state = .invalid; + continue; } - }, - else => { result.tag = .invalid; break; }, - }, - - .char_literal_unicode_escape_saw_u => switch (c) { - '{' => { - state = .char_literal_unicode_escape; - }, - else => { - result.tag = .invalid; - break; - }, - }, - - .char_literal_unicode_escape => switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => {}, - '}' => { - state = .char_literal_end; // too many/few digits handled later - }, - else => { + '\n' => { result.tag = .invalid; break; }, - }, - - .char_literal_end => switch (c) { - '\'' => { - result.tag = .char_literal; - self.index += 1; - break; + 0x01...0x09, 0x0b...0x1f, 0x7f => { + state = .invalid; }, else => { - result.tag = .invalid; - break; + state = .char_literal; }, }, .multiline_string_literal_line => switch (c) { 0 => { if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; + state = .invalid; + continue; } break; }, @@ -886,17 +846,10 @@ pub const Tokenizer = struct { self.index += 1; break; }, - '\t' => {}, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0x01...0x08, 0x0b...0x1f, 0x7f => { + state = .invalid; }, + else => continue, }, .bang => switch (c) { @@ -1113,12 +1066,16 @@ pub const Tokenizer = struct { .line_comment_start => switch (c) { 0 => { if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; + state = .invalid; + continue; } - break; + return .{ + .tag = .eof, + .loc = .{ + .start = self.index, + .end = self.index, + }, + }; }, '/' => { state = .doc_comment_start; @@ -1127,105 +1084,74 @@ pub const Tokenizer = struct { result.tag = .container_doc_comment; state = .doc_comment; }, + '\r' => { + state = .expect_newline; + }, '\n' => { state = .start; result.loc.start = self.index + 1; }, - '\t' => { - state = .line_comment; + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; }, else => { state = .line_comment; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .doc_comment_start => switch (c) { - '/' => { - state = .line_comment; - }, - 0 => { - if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; - } + 0, '\n', '\r' => { result.tag = .doc_comment; break; }, - '\n' => { - result.tag = .doc_comment; - break; + '/' => { + state = .line_comment; }, - '\t' => { - state = .doc_comment; - result.tag = .doc_comment; + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; }, else => { state = .doc_comment; result.tag = .doc_comment; - - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .line_comment => switch (c) { 0 => { if (self.index != self.buffer.len) { - result.tag = .invalid; - result.loc.end = self.index; - self.index += 1; - return result; + state = .invalid; + continue; } - break; + return .{ + .tag = .eof, + .loc = .{ + .start = self.index, + .end = self.index, + }, + }; + }, + '\r' => { + state = .expect_newline; }, '\n' => { state = .start; result.loc.start = self.index + 1; }, - '\t' => {}, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; }, + else => continue, }, .doc_comment => switch (c) { - 0, '\n' => break, - '\t' => {}, - else => { - if (self.invalidCharacterLength()) |len| { - result.tag = .invalid; - result.loc.end = self.index; - self.index += len; - return result; - } - - self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + 0, '\n', '\r' => { + break; }, + 0x01...0x08, 0x0b...0x0c, 0x0e...0x1f, 0x7f => { + state = .invalid; + }, + else => continue, }, .int => switch (c) { '.' => state = .int_period, - '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {}, + '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue, 'e', 'E', 'p', 'P' => state = .int_exponent, else => break, }, @@ -1249,7 +1175,7 @@ pub const Tokenizer = struct { }, }, .float => switch (c) { - '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {}, + '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue, 'e', 'E', 'p', 'P' => state = .float_exponent, else => break, }, @@ -1263,57 +1189,9 @@ pub const Tokenizer = struct { } } - if (result.tag == .eof) { - result.loc.start = self.index; - } - result.loc.end = self.index; return result; } - - fn invalidCharacterLength(self: *Tokenizer) ?u3 { - const c0 = self.buffer[self.index]; - if (std.ascii.isAscii(c0)) { - if (c0 == '\r') { - if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') { - // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise - // they constitute an illegal byte! - return null; - } else { - return 1; - } - } else if (std.ascii.isControl(c0)) { - // ascii control codes are never allowed - // (note that \n was checked before we got here) - return 1; - } - // looks fine to me. - return null; - } else { - // check utf8-encoded character. - const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; - if (self.index + length > self.buffer.len) { - return @as(u3, @intCast(self.buffer.len - self.index)); - } - const bytes = self.buffer[self.index .. self.index + length]; - switch (length) { - 2 => { - const value = std.unicode.utf8Decode2(bytes) catch return length; - if (value == 0x85) return length; // U+0085 (NEL) - }, - 3 => { - const value = std.unicode.utf8Decode3(bytes) catch return length; - if (value == 0x2028) return length; // U+2028 (LS) - if (value == 0x2029) return length; // U+2029 (PS) - }, - 4 => { - _ = std.unicode.utf8Decode4(bytes) catch return length; - }, - else => unreachable, - } - return null; - } - } }; test "keywords" { @@ -1355,7 +1233,7 @@ test "code point literal with hex escape" { , &.{.char_literal}); try testTokenize( \\'\x1' - , &.{ .invalid, .invalid }); + , &.{.char_literal}); } test "newline in char literal" { @@ -1396,40 +1274,30 @@ test "code point literal with unicode escapes" { // Invalid unicode escapes try testTokenize( \\'\u' - , &.{ .invalid, .invalid }); + , &.{.char_literal}); try testTokenize( \\'\u{{' - , &.{ .invalid, .l_brace, .invalid }); + , &.{.char_literal}); try testTokenize( \\'\u{}' , &.{.char_literal}); try testTokenize( \\'\u{s}' - , &.{ - .invalid, - .identifier, - .r_brace, - .invalid, - }); + , &.{.char_literal}); try testTokenize( \\'\u{2z}' - , &.{ - .invalid, - .identifier, - .r_brace, - .invalid, - }); + , &.{.char_literal}); try testTokenize( \\'\u{4a' - , &.{ .invalid, .invalid }); // 4a is valid + , &.{.char_literal}); // Test old-style unicode literals try testTokenize( \\'\u0333' - , &.{ .invalid, .number_literal, .invalid }); + , &.{.char_literal}); try testTokenize( \\'\U0333' - , &.{ .invalid, .number_literal, .invalid }); + , &.{.char_literal}); } test "code point literal with unicode code point" { @@ -1465,24 +1333,15 @@ test "invalid token characters" { try testTokenize("`", &.{.invalid}); try testTokenize("'c", &.{.invalid}); try testTokenize("'", &.{.invalid}); - try testTokenize("''", &.{.invalid}); + try testTokenize("''", &.{.char_literal}); try testTokenize("'\n'", &.{ .invalid, .invalid }); } test "invalid literal/comment characters" { - try testTokenize("\"\x00\"", &.{ - .invalid, - .invalid, // Incomplete string literal starting after invalid - }); - try testTokenize("//\x00", &.{ - .invalid, - }); - try testTokenize("//\x1f", &.{ - .invalid, - }); - try testTokenize("//\x7f", &.{ - .invalid, - }); + try testTokenize("\"\x00\"", &.{.invalid}); + try testTokenize("//\x00", &.{.invalid}); + try testTokenize("//\x1f", &.{.invalid}); + try testTokenize("//\x7f", &.{.invalid}); } test "utf8" { @@ -1491,46 +1350,24 @@ test "utf8" { } test "invalid utf8" { - try testTokenize("//\x80", &.{ - .invalid, - }); - try testTokenize("//\xbf", &.{ - .invalid, - }); - try testTokenize("//\xf8", &.{ - .invalid, - }); - try testTokenize("//\xff", &.{ - .invalid, - }); - try testTokenize("//\xc2\xc0", &.{ - .invalid, - }); - try testTokenize("//\xe0", &.{ - .invalid, - }); - try testTokenize("//\xf0", &.{ - .invalid, - }); - try testTokenize("//\xf0\x90\x80\xc0", &.{ - .invalid, - }); + try testTokenize("//\x80", &.{}); + try testTokenize("//\xbf", &.{}); + try testTokenize("//\xf8", &.{}); + try testTokenize("//\xff", &.{}); + try testTokenize("//\xc2\xc0", &.{}); + try testTokenize("//\xe0", &.{}); + try testTokenize("//\xf0", &.{}); + try testTokenize("//\xf0\x90\x80\xc0", &.{}); } test "illegal unicode codepoints" { // unicode newline characters.U+0085, U+2028, U+2029 try testTokenize("//\xc2\x84", &.{}); - try testTokenize("//\xc2\x85", &.{ - .invalid, - }); + try testTokenize("//\xc2\x85", &.{}); try testTokenize("//\xc2\x86", &.{}); try testTokenize("//\xe2\x80\xa7", &.{}); - try testTokenize("//\xe2\x80\xa8", &.{ - .invalid, - }); - try testTokenize("//\xe2\x80\xa9", &.{ - .invalid, - }); + try testTokenize("//\xe2\x80\xa8", &.{}); + try testTokenize("//\xe2\x80\xa9", &.{}); try testTokenize("//\xe2\x80\xaa", &.{}); } @@ -1892,8 +1729,8 @@ test "multi line string literal with only 1 backslash" { } test "invalid builtin identifiers" { - try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren }); - try testTokenize("@0()", &.{ .invalid, .number_literal, .l_paren, .r_paren }); + try testTokenize("@()", &.{.invalid}); + try testTokenize("@0()", &.{.invalid}); } test "invalid token with unfinished escape right before eof" { @@ -1921,12 +1758,12 @@ test "saturating operators" { } test "null byte before eof" { - try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal }); + try testTokenize("123 \x00 456", &.{ .number_literal, .invalid }); try testTokenize("//\x00", &.{.invalid}); try testTokenize("\\\\\x00", &.{.invalid}); try testTokenize("\x00", &.{.invalid}); try testTokenize("// NUL\x00\n", &.{.invalid}); - try testTokenize("///\x00\n", &.{.invalid}); + try testTokenize("///\x00\n", &.{ .doc_comment, .invalid }); try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid }); } @@ -1936,6 +1773,9 @@ fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !v const token = tokenizer.next(); try std.testing.expectEqual(expected_token_tag, token.tag); } + // Last token should always be eof, even when the last token was invalid, + // in which case the tokenizer is in an invalid state, which can only be + // recovered by opinionated means outside the scope of this implementation. const last_token = tokenizer.next(); try std.testing.expectEqual(Token.Tag.eof, last_token.tag); try std.testing.expectEqual(source.len, last_token.loc.start); diff --git a/src/Package/Manifest.zig b/src/Package/Manifest.zig index d9c39f5ab3fe..eb82ef039da2 100644 --- a/src/Package/Manifest.zig +++ b/src/Package/Manifest.zig @@ -549,6 +549,9 @@ const Parse = struct { .{raw_string[bad_index]}, ); }, + .empty_char_literal => { + try p.appendErrorOff(token, offset, "empty character literal", .{}); + }, } } diff --git a/test/cases/compile_errors/empty_char_lit.zig b/test/cases/compile_errors/empty_char_lit.zig new file mode 100644 index 000000000000..99d80778b19b --- /dev/null +++ b/test/cases/compile_errors/empty_char_lit.zig @@ -0,0 +1,9 @@ +export fn entry() u8 { + return ''; +} + +// error +// backend=stage2 +// target=native +// +// :2:12: error: empty character literal diff --git a/test/cases/compile_errors/invalid_legacy_unicode_escape.zig b/test/cases/compile_errors/invalid_legacy_unicode_escape.zig index cc4e78f6e4ed..c22d70f3eba3 100644 --- a/test/cases/compile_errors/invalid_legacy_unicode_escape.zig +++ b/test/cases/compile_errors/invalid_legacy_unicode_escape.zig @@ -6,5 +6,4 @@ export fn entry() void { // backend=stage2 // target=native // -// :2:15: error: expected expression, found 'invalid bytes' -// :2:18: note: invalid byte: '1' +// :2:17: error: invalid escape character: 'U' diff --git a/test/cases/compile_errors/invalid_unicode_escape.zig b/test/cases/compile_errors/invalid_unicode_escape.zig index 1555f2be801a..956b4a37a2c7 100644 --- a/test/cases/compile_errors/invalid_unicode_escape.zig +++ b/test/cases/compile_errors/invalid_unicode_escape.zig @@ -6,6 +6,5 @@ export fn entry() void { // backend=stage2 // target=native // -// :2:15: error: expected expression, found 'invalid bytes' -// :2:21: note: invalid byte: 'z' +// :2:21: error: expected hex digit or '}', found 'z' diff --git a/test/cases/compile_errors/normal_string_with_newline.zig b/test/cases/compile_errors/normal_string_with_newline.zig index 19e15133ee34..f19ce59ec814 100644 --- a/test/cases/compile_errors/normal_string_with_newline.zig +++ b/test/cases/compile_errors/normal_string_with_newline.zig @@ -5,5 +5,4 @@ b"; // backend=stage2 // target=native // -// :1:13: error: expected expression, found 'invalid bytes' -// :1:15: note: invalid byte: '\n' +// :1:13: error: expected expression, found 'invalid token' diff --git a/test/compile_errors.zig b/test/compile_errors.zig index 5c5a574caf5b..c7a3be8f9fc6 100644 --- a/test/compile_errors.zig +++ b/test/compile_errors.zig @@ -42,8 +42,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host); case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{ - ":1:13: error: expected expression, found 'invalid bytes'", - ":1:19: note: invalid byte: '\\r'", + ":1:13: error: expected expression, found 'invalid token'", }); } @@ -179,8 +178,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { \\ return true; \\} , &[_][]const u8{ - ":1:1: error: expected type expression, found 'invalid bytes'", - ":1:1: note: invalid byte: '\\xff'", + ":1:1: error: expected type expression, found 'invalid token'", }); } @@ -222,8 +220,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("invalid byte in string", b.graph.host); case.addError("_ = \"\x01Q\";", &[_][]const u8{ - ":1:5: error: expected expression, found 'invalid bytes'", - ":1:6: note: invalid byte: '\\x01'", + ":1:5: error: expected expression, found 'invalid token'", }); } @@ -231,8 +228,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("invalid byte in comment", b.graph.host); case.addError("//\x01Q", &[_][]const u8{ - ":1:1: error: expected type expression, found 'invalid bytes'", - ":1:3: note: invalid byte: '\\x01'", + ":1:1: error: expected type expression, found 'invalid token'", }); } @@ -240,8 +236,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("control character in character literal", b.graph.host); case.addError("const c = '\x01';", &[_][]const u8{ - ":1:11: error: expected expression, found 'invalid bytes'", - ":1:12: note: invalid byte: '\\x01'", + ":1:11: error: expected expression, found 'invalid token'", }); } @@ -249,8 +244,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("invalid byte at start of token", b.graph.host); case.addError("x = \x00Q", &[_][]const u8{ - ":1:5: error: expected expression, found 'invalid bytes'", - ":1:5: note: invalid byte: '\\x00'", + ":1:5: error: expected expression, found 'invalid token'", }); } }