Recognize & skip the UTF-8 BOM
LemonBoy authored and andrewrk committed Sep 11, 2019
1 parent 0eddee4 commit f36b8fd
Showing 2 changed files with 17 additions and 3 deletions.
9 changes: 7 additions & 2 deletions src/tokenizer.cpp

```diff
@@ -407,9 +407,14 @@ void tokenize(Buf *buf, Tokenization *out) {
     t.buf = buf;
 
     out->line_offsets = allocate<ZigList<size_t>>(1);
-
     out->line_offsets->append(0);
-    for (t.pos = 0; t.pos < buf_len(t.buf); t.pos += 1) {
+
+    // Skip the UTF-8 BOM if present
+    if (buf_starts_with_mem(buf, "\xEF\xBB\xBF", 3)) {
+        t.pos += 3;
+    }
+
+    for (; t.pos < buf_len(t.buf); t.pos += 1) {
         uint8_t c = buf_ptr(t.buf)[t.pos];
         switch (t.state) {
             case TokenizeStateError:
```
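The three bytes `\xEF\xBB\xBF` are the UTF-8 encoding of U+FEFF, the byte order mark, so recognizing it only takes a comparison against the first three bytes of the source buffer; everything after it is tokenized as before. Below is a minimal standalone Zig sketch of the same check. `skipBom` is a hypothetical helper introduced for illustration and is not part of this commit.

```zig
const std = @import("std");
const mem = std.mem;

// Hypothetical helper mirroring the check above: return the offset where
// tokenization should start, skipping the three BOM bytes (EF BB BF, the
// UTF-8 encoding of U+FEFF) when they lead the buffer.
fn skipBom(source: []const u8) usize {
    return if (mem.startsWith(u8, source, "\xEF\xBB\xBF")) 3 else 0;
}

test "skipBom only skips a leading BOM" {
    std.debug.assert(skipBom("\xEF\xBB\xBFconst x = 1;") == 3);
    std.debug.assert(skipBom("const x = 1;") == 0);
    // A BOM that is not at the very start is left for the tokenizer to reject.
    std.debug.assert(skipBom("a\xEF\xBB\xBF") == 0);
}
```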
11 changes: 10 additions & 1 deletion std/zig/tokenizer.zig

```diff
@@ -222,9 +222,11 @@ pub const Tokenizer = struct {
                 },
             };
         } else {
+            // Skip the UTF-8 BOM if present
+            const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else usize(0);
             return Tokenizer{
                 .buffer = buffer,
-                .index = 0,
+                .index = src_start,
                 .pending_invalid_token = null,
             };
         }
@@ -1455,6 +1457,13 @@ test "tokenizer - line comment followed by identifier" {
     });
 }
 
+test "tokenizer - UTF-8 BOM is recognized and skipped" {
+    testTokenize("\xEF\xBB\xBFa;\n", [_]Token.Id{
+        Token.Id.Identifier,
+        Token.Id.Semicolon,
+    });
+}
+
 fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
     var tokenizer = Tokenizer.init(source);
     for (expected_tokens) |expected_token_id| {
```
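A rough usage sketch of the std-side change, assuming the era's `std.zig.Tokenizer` export and the `Token`/`next()` API implied by the tests above (none of which this commit adds): after `init`, `.index` already points past a leading BOM, so the first token of a BOM-prefixed buffer is the identifier rather than an invalid token.

```zig
const std = @import("std");
const mem = std.mem;

test "BOM-prefixed source starts at the first real token" {
    // Same input as the new test above: a BOM followed by `a;`.
    // Assumes Tokenizer is exported as std.zig.Tokenizer.
    var tokenizer = std.zig.Tokenizer.init("\xEF\xBB\xBFa;\n");

    // With this change, .index starts at 3 instead of 0.
    std.debug.assert(tokenizer.index == 3);

    // The first token is the identifier `a`, not an error token.
    const first = tokenizer.next();
    std.debug.assert(mem.eql(u8, @tagName(first.id), "Identifier"));
}
```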
