Skip to content

Commit f1efc63

Browse files
committed
perf(lexer): skip single space in read_next_token (#15513)
It's very common for tokens to be separated by a single space, e.g. `const x = 1`, `x === y`. Previously, a single space resulted in a call to the `SPS` byte handler, which consumes the space, followed by another trip around the loop in `Lexer::read_next_token`. Instead, branchlessly consume a single space (if there is one) before calling the byte handler. This gives between 2% and 7% perf improvement on parser benchmarks.

---

This also enables a further optimization (not yet implemented). The handler for whitespace (`SPS`) no longer has a hot path for single spaces - it's now only called for a tab, or for a 2nd space in a row. In both those cases, it's quite likely that more whitespace follows, so the handler can now be optimized for that case and continue consuming bytes until it finds one that *isn't* whitespace.

If the handlers for whitespace, line breaks, and comments all continue consuming bytes until they find a "real" token, then we can get rid of `Kind::Skip` and remove the loop from `read_next_token`. This would remove another unpredictable branch.
1 parent b310c28 commit f1efc63

File tree

1 file changed

+53
-10
lines changed
  • crates/oxc_parser/src/lexer

1 file changed

+53
-10
lines changed

crates/oxc_parser/src/lexer/mod.rs

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -329,25 +329,68 @@ impl<'a> Lexer<'a> {
329329
fn read_next_token(&mut self) -> Kind {
330330
self.trivia_builder.has_pure_comment = false;
331331
self.trivia_builder.has_no_side_effects_comment = false;
332+
333+
let end_pos = self.source.end();
332334
loop {
333-
let offset = self.offset();
334-
self.token.set_start(offset);
335-
336-
let Some(byte) = self.peek_byte() else {
337-
// Hint to compiler that this branch is rarely taken (only once at EOF)
338-
#[cold]
339-
fn eof() -> Kind {
340-
Kind::Eof
335+
// Single spaces between tokens are common, so consume a space before processing the next token.
336+
// Do this without a branch. This produces more instructions, but avoids an unpredictable branch.
337+
// Can only do this if there are at least 2 bytes left in source.
338+
// If there aren't 2 bytes left, delegate to `read_next_token_at_end` (cold branch).
339+
let mut pos = self.source.position();
340+
// SAFETY: `source.end()` is always equal to or after `source.position()`
341+
let remaining_bytes = unsafe { end_pos.offset_from(pos) };
342+
if remaining_bytes >= 2 {
343+
// Read next byte.
344+
// SAFETY: There are at least 2 bytes remaining in source.
345+
let byte = unsafe { pos.read() };
346+
347+
// If next byte is a space, advance by 1 byte.
348+
// Do this with maths, instead of a branch.
349+
let is_space = byte == b' ';
350+
// SAFETY: There are at least 2 bytes remaining in source, so advancing 1 byte cannot be out of bounds
351+
pos = unsafe { pos.add(usize::from(is_space)) };
352+
self.source.set_position(pos);
353+
354+
// Read next byte again, in case we skipped a space.
355+
// SAFETY: We checked above that there were at least 2 bytes to read,
356+
// and we skipped a maximum of 1 byte, so there's still at least 1 byte left to read.
357+
let byte = unsafe { pos.read() };
358+
359+
// Set token start
360+
let offset = self.source.offset_of(pos);
361+
self.token.set_start(offset);
362+
363+
// SAFETY: `byte` is byte value at current position in source
364+
let kind = unsafe { self.handle_byte(byte) };
365+
if kind != Kind::Skip {
366+
return kind;
341367
}
342-
return eof();
343-
};
368+
} else {
369+
// Only 0 or 1 bytes left in source.
370+
// Delegate to `#[cold]` function as this is a very rare case.
371+
return self.read_next_token_at_end();
372+
}
373+
}
374+
}
375+
376+
/// Cold path for reading next token where only 0 or 1 bytes are left in source.
377+
#[inline(never)]
378+
#[cold]
379+
fn read_next_token_at_end(&mut self) -> Kind {
380+
let offset = self.offset();
381+
self.token.set_start(offset);
344382

383+
if let Some(byte) = self.peek_byte() {
345384
// SAFETY: `byte` is byte value at current position in source
346385
let kind = unsafe { self.handle_byte(byte) };
347386
if kind != Kind::Skip {
348387
return kind;
349388
}
389+
// Last byte was whitespace/line break (`Kind::Skip`), so now at EOF
390+
self.token.set_start(offset + 1);
350391
}
392+
393+
Kind::Eof
351394
}
352395
}
353396

0 commit comments

Comments
 (0)