perf(lexer): skip single space in read_next_token (#15513)

overlookmotel · overlookmotel · commit f1efc6387dfd · 2025-11-09T16:45:48.000Z
It's very common for tokens to be separated by a single space, e.g. `const x = 1`, `x === y`.

Previously, a single space resulted in a call to the `SPS` byte handler, which consumed the space, followed by another trip around the loop in `Lexer::read_next_token`.

Instead, branchlessly consume a single space (if there is one) before calling the byte handler.

Gives between 2% and 7% perf improvement on parser benchmarks.

---

This also enables a further optimization (not yet implemented).

Now the handler for whitespace (`SPS`) no longer has a hot path for single spaces - it's now only called for a tab, or a 2nd space in a row (or, rarely, for a space that is the very last byte of the source, which goes through the cold `read_next_token_at_end` path). In the first two cases, it's quite likely there'll be more whitespace following it, so the handler can now be optimized for that case, and continue consuming bytes until it finds one that *isn't* whitespace.

If handlers for whitespace, line breaks, and comments all continue consuming bytes until they find a "real" token, then we can get rid of `Kind::Skip`, and remove the loop from `read_next_token`. This would remove another unpredictable branch.
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
@@ -329,25 +329,68 @@ impl<'a> Lexer<'a> {
     fn read_next_token(&mut self) -> Kind {
         self.trivia_builder.has_pure_comment = false;
         self.trivia_builder.has_no_side_effects_comment = false;
+
+        let end_pos = self.source.end();
         loop {
-            let offset = self.offset();
-            self.token.set_start(offset);
-
-            let Some(byte) = self.peek_byte() else {
-                // Hint to compiler that this branch is rarely taken (only once at EOF)
-                #[cold]
-                fn eof() -> Kind {
-                    Kind::Eof
+            // Single spaces between tokens are common, so consume a space before processing the next token.
+            // Do this without a branch. This produces more instructions, but avoids an unpredictable branch.
+            // Can only do this if there are at least 2 bytes left in source.
+            // If there aren't 2 bytes left, delegate to `read_next_token_at_end` (cold branch).
+            let mut pos = self.source.position();
+            // SAFETY: `source.end()` is always equal to or after `source.position()`
+            let remaining_bytes = unsafe { end_pos.offset_from(pos) };
+            if remaining_bytes >= 2 {
+                // Read next byte.
+                // SAFETY: There are at least 2 bytes remaining in source.
+                let byte = unsafe { pos.read() };
+
+                // If next byte is a space, advance by 1 byte.
+                // Do this with maths, instead of a branch.
+                let is_space = byte == b' ';
+                // SAFETY: There are at least 2 bytes remaining in source, so advancing 1 byte cannot be out of bounds
+                pos = unsafe { pos.add(usize::from(is_space)) };
+                self.source.set_position(pos);
+
+                // Read next byte again, in case we skipped a space.
+                // SAFETY: We checked above that there were at least 2 bytes to read,
+                // and we skipped a maximum of 1 byte, so there's still at least 1 byte left to read.
+                let byte = unsafe { pos.read() };
+
+                // Set token start
+                let offset = self.source.offset_of(pos);
+                self.token.set_start(offset);
+
+                // SAFETY: `byte` is byte value at current position in source
+                let kind = unsafe { self.handle_byte(byte) };
+                if kind != Kind::Skip {
+                    return kind;
                 }
-                return eof();
-            };
+            } else {
+                // Only 0 or 1 bytes left in source.
+                // Delegate to `#[cold]` function as this is a very rare case.
+                return self.read_next_token_at_end();
+            }
+        }
+    }
+
+    /// Cold path for reading next token where only 0 or 1 bytes are left in source.
+    #[inline(never)]
+    #[cold]
+    fn read_next_token_at_end(&mut self) -> Kind {
+        let offset = self.offset();
+        self.token.set_start(offset);
 
+        if let Some(byte) = self.peek_byte() {
             // SAFETY: `byte` is byte value at current position in source
             let kind = unsafe { self.handle_byte(byte) };
             if kind != Kind::Skip {
                 return kind;
             }
+            // Last byte was whitespace/line break (`Kind::Skip`), so now at EOF
+            self.token.set_start(offset + 1);
         }
+
+        Kind::Eof
     }
 }