From 070d3c475cfe203b687ca241cee4f773d9644ce3 Mon Sep 17 00:00:00 2001
From: overlookmotel <theoverlookmotel@gmail.com>
Date: Thu, 8 Feb 2024 19:48:35 +0000
Subject: [PATCH] perf(parser): lex identifiers as bytes not chars

---
 Cargo.lock                                    |   7 +
 Cargo.toml                                    |   1 +
 crates/oxc_parser/Cargo.toml                  |   1 +
 crates/oxc_parser/src/lexer/byte_handlers.rs  |  20 +-
 crates/oxc_parser/src/lexer/identifier.rs     | 326 ++++++++++--
 crates/oxc_parser/src/lexer/mod.rs            |   7 +
 crates/oxc_parser/src/lexer/search.rs         | 462 ++++++++++++++++++
 crates/oxc_parser/src/lexer/source.rs         |  68 ++-
 crates/oxc_parser/src/lexer/string_builder.rs |   2 +
 crates/oxc_parser/src/lexer/unicode.rs        |  13 +-
 10 files changed, 841 insertions(+), 66 deletions(-)
 create mode 100644 crates/oxc_parser/src/lexer/search.rs

diff --git a/Cargo.lock b/Cargo.lock
index 3cbadbab3c0ad..002ac5ae9ef14 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1603,6 +1603,7 @@ dependencies = [
  "oxc_span",
  "oxc_syntax",
  "rustc-hash",
+ "seq-macro",
  "serde_json",
 ]
 
@@ -2285,6 +2286,12 @@ version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0"
 
+[[package]]
+name = "seq-macro"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
+
 [[package]]
 name = "serde"
 version = "1.0.196"
diff --git a/Cargo.toml b/Cargo.toml
index 3b6a5709d0939..e50b18f4ff1e5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -111,6 +111,7 @@ regex                     = { version = "1.10.3" }
 rustc-hash                = { version = "1.1.0", default-features = false, features = ["std"] }
 ryu-js                    = { version = "1.0.0" }
 ropey                     = { version = "1.6.1" }
+seq-macro                 = { version = "0.3.5" }
 serde                     = { version = "1.0.196" }
 serde_json                = { version = "1.0.113" }
 syn                       = { version = "=1.0.109" }
diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml
index e6cc4f48b53bc..2da6d6fb22d7c 100644
--- a/crates/oxc_parser/Cargo.toml
+++ b/crates/oxc_parser/Cargo.toml
@@ -29,6 +29,7 @@ assert-unchecked = { workspace = true }
 bitflags         = { workspace = true }
 rustc-hash       = { workspace = true }
 num-bigint       = { workspace = true }
+seq-macro        = { workspace = true }
 
 [dev-dependencies]
 oxc_ast    = { workspace = true, features = ["serde"] }
diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs
index 7009e41b73082..ef720d115795d 100644
--- a/crates/oxc_parser/src/lexer/byte_handlers.rs
+++ b/crates/oxc_parser/src/lexer/byte_handlers.rs
@@ -1,4 +1,4 @@
-use super::{AutoCow, Kind, Lexer, LexerContext};
+use super::{Kind, Lexer, LexerContext};
 use crate::diagnostics;
 
 #[allow(clippy::unnecessary_safety_comment)]
@@ -137,7 +137,10 @@ macro_rules! ascii_byte_handler {
 /// (`a`-`z`, `A`-`Z`, `$` or `_`).
 ///
 /// Macro calls `Lexer::identifier_name_handler` to get the text of the identifier,
-/// and slices off first character.
+/// minus its first character.
+///
+/// `Lexer::identifier_name_handler` is an unsafe function, but if byte being consumed is ASCII,
+/// its requirements are met.
 ///
 /// # SAFETY
 /// Only use this macro to define byte handlers for ASCII characters.
@@ -156,7 +159,8 @@ macro_rules! ascii_byte_handler {
 /// const L_G: ByteHandler = {
 ///   #[allow(non_snake_case)]
 ///   fn L_G(lexer: &mut Lexer) -> Kind {
-///     let id_without_first_char = &lexer.identifier_name_handler()[1..];
+///     // SAFETY: This macro is only used for ASCII characters
+///     let id_without_first_char = unsafe { lexer.identifier_name_handler() };
 ///     match id_without_first_char {
 ///       "et" => Kind::Get,
 ///       "lobal" => Kind::Global,
@@ -169,7 +173,8 @@ macro_rules! ascii_byte_handler {
 macro_rules! ascii_identifier_handler {
     ($id:ident($str:ident) $body:expr) => {
         byte_handler!($id(lexer) {
-            let $str = &lexer.identifier_name_handler()[1..];
+            // SAFETY: This macro is only used for ASCII characters
+            let $str = unsafe { lexer.identifier_name_handler() };
             $body
         });
     };
@@ -439,12 +444,7 @@ ascii_byte_handler!(BTO(lexer) {
 
 // \
 ascii_byte_handler!(ESC(lexer) {
-    let mut builder = AutoCow::new(lexer);
-    lexer.consume_char();
-    builder.force_allocation_without_current_ascii_char(lexer);
-    lexer.identifier_unicode_escape_sequence(&mut builder, true);
-    let text = lexer.identifier_name(builder);
-    Kind::match_keyword(text)
+    lexer.identifier_backslash_handler()
 });
 
 // ]
diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs
index f28a3d6ec6869..f98141e4403cb 100644
--- a/crates/oxc_parser/src/lexer/identifier.rs
+++ b/crates/oxc_parser/src/lexer/identifier.rs
@@ -1,66 +1,300 @@
-use super::{AutoCow, Kind, Lexer, Span};
+use super::{
+    cold_branch,
+    search::{byte_search, safe_byte_match_table, SafeByteMatchTable},
+    Kind, Lexer, SourcePosition,
+};
 use crate::diagnostics;
 
-use oxc_syntax::identifier::{is_identifier_part, is_identifier_start};
+use std::cmp::max;
+
+use oxc_allocator::String;
+use oxc_span::Span;
+use oxc_syntax::identifier::{
+    is_identifier_part, is_identifier_part_unicode, is_identifier_start_unicode,
+};
+
+const MIN_ESCAPED_STR_LEN: usize = 16;
+
+static ASCII_ID_START_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| b.is_ascii_alphabetic() || b == b'_' || b == b'$');
+
+static NOT_ASCII_ID_CONTINUE_TABLE: SafeByteMatchTable =
+    safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$'));
+
+#[inline]
+fn is_identifier_start_ascii_byte(byte: u8) -> bool {
+    ASCII_ID_START_TABLE.matches(byte)
+}
 
 impl<'a> Lexer<'a> {
-    /// Section 12.7.1 Identifier Names
-    pub(super) fn identifier_name_handler(&mut self) -> &'a str {
-        let builder = AutoCow::new(self);
-        self.consume_char();
-        self.identifier_name(builder)
+    /// Handle identifier with ASCII start character.
+    /// Returns text of the identifier, minus its first char.
+    ///
+    /// Start character should not be consumed from `self.source` prior to calling this.
+    ///
+    /// This function is the "fast path" for the most common identifiers in JS code -
+    /// purely consisting of ASCII characters: `a`-`z`, `A`-`Z`, `0`-`9`, `_`, `$`.
+    /// JS syntax also allows Unicode identifiers and escapes (e.g. `\u{FF}`) in identifiers,
+    /// but they are very rare in practice. So this fast path will handle 99% of JS code.
+    ///
+    /// When Unicode or an escape is encountered, this function de-opts to paths which handle those
+    /// cases, but those paths are marked `#[cold]` to keep the ASCII fast path as fast as possible.
+    ///
+    /// The fast path uses pointers and unsafe code to minimize bounds checks etc.
+    /// The functions it delegates to for uncommon cases are both more complex, and less critical,
+    /// so they stick to safe code only.
+    ///
+    /// # SAFETY
+    /// * `self.source` must not be exhausted (at least 1 char remaining).
+    /// * Next char must be ASCII.
+    #[allow(clippy::missing_safety_doc)] // Clippy is wrong!
+    pub(super) unsafe fn identifier_name_handler(&mut self) -> &'a str {
+        // Advance past 1st byte.
+        // SAFETY: Caller guarantees not at EOF, and next byte is ASCII.
+        let after_first = self.source.position().add(1);
+
+        // Consume bytes which are part of identifier
+        byte_search! {
+            lexer: self,
+            table: NOT_ASCII_ID_CONTINUE_TABLE,
+            start: after_first,
+            handle_match: |next_byte| {
+                // Found a matching byte.
+                // Either end of identifier found, or a Unicode char, or `\` escape.
+                // Handle uncommon cases in cold branches to keep the common ASCII path
+                // as fast as possible.
+                if !next_byte.is_ascii() {
+                    return cold_branch(|| {
+                        // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
+                        // makes `start_pos` `source`'s position as it was at start of this function
+                        let start_pos = unsafe { after_first.sub(1) };
+                        &self.identifier_tail_unicode(start_pos)[1..]
+                    });
+                }
+                if next_byte == b'\\' {
+                    return cold_branch(|| {
+                        // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
+                        // makes `start_pos` `source`'s position as it was at start of this function
+                        let start_pos = unsafe { after_first.sub(1) };
+                        &self.identifier_backslash(start_pos, false)[1..]
+                    });
+                }
+
+                // Return identifier minus its first char.
+                // SAFETY: `after_first` was position of `lexer.source` at start of this search.
+                // Searching only proceeds in forwards direction, so `lexer.source.position()`
+                // cannot be before `after_first`.
+                unsafe { self.source.str_from_pos_to_current_unchecked(after_first) }
+            },
+            handle_eof: || {
+                // Return identifier minus its first char.
+                // SAFETY: `lexer.source` is positioned at EOF, so there is no valid value
+                // of `after_first` which could be after current position.
+                unsafe { self.source.str_from_pos_to_current_unchecked(after_first) }
+            },
+        };
     }
 
-    pub(super) fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str {
-        self.identifier_tail(builder)
+    /// Handle rest of identifier after first byte of a multi-byte Unicode char found.
+    /// Any number of characters can have already been consumed from `self.source` prior to it.
+    /// `self.source` should be positioned at start of Unicode character.
+    fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
+        let c = self.peek().unwrap();
+        if is_identifier_part_unicode(c) {
+            self.consume_char();
+            self.identifier_tail_after_unicode(start_pos)
+        } else {
+            // Reached end of identifier. Return identifier.
+            self.source.str_from_pos_to_current(start_pos)
+        }
     }
 
-    pub(super) fn private_identifier(&mut self) -> Kind {
-        let mut builder = AutoCow::new(self);
-        let start = self.offset();
-        match self.next_char() {
-            Some(c) if is_identifier_start(c) => {
-                builder.push_matching(c);
-            }
-            Some('\\') => {
-                builder.force_allocation_without_current_ascii_char(self);
-                self.identifier_unicode_escape_sequence(&mut builder, true);
-            }
-            Some(c) => {
-                #[allow(clippy::cast_possible_truncation)]
-                self.error(diagnostics::InvalidCharacter(
-                    c,
-                    Span::new(start, start + c.len_utf8() as u32),
-                ));
-                return Kind::Undetermined;
-            }
-            None => {
-                self.error(diagnostics::UnexpectedEnd(Span::new(start, start)));
-                return Kind::Undetermined;
+    /// Handle identifier after first char (which was Unicode) is dealt with.
+    ///
+    /// First char should have been consumed from `self.source` prior to calling this.
+    /// `start_pos` should be position of the start of the identifier (before first char was consumed).
+    pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
+        // Identifier contains a Unicode char, so probably contains more.
+        // So just iterate over chars now, instead of bytes.
+        while let Some(c) = self.peek() {
+            if is_identifier_part(c) {
+                self.consume_char();
+            } else if c == '\\' {
+                // This branch marked cold as escapes are uncommon
+                return cold_branch(|| self.identifier_backslash(start_pos, false));
+            } else {
+                break;
             }
         }
-        self.identifier_tail(builder);
-        Kind::PrivateIdentifier
+
+        // Return identifier
+        self.source.str_from_pos_to_current(start_pos)
     }
 
-    fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str {
-        // ident tail
-        while let Some(c) = self.peek() {
-            if !is_identifier_part(c) {
-                if c == '\\' {
+    /// Handle identifier starting with `\` escape.
+    pub fn identifier_backslash_handler(&mut self) -> Kind {
+        // Create arena string to hold unescaped identifier.
+        // We don't know how long identifier will end up being, so guess.
+        let str = String::with_capacity_in(MIN_ESCAPED_STR_LEN, self.allocator);
+
+        // Process escape and get rest of identifier
+        let id = self.identifier_on_backslash(str, true);
+        Kind::match_keyword(id)
+    }
+
+    /// Consume rest of identifier after a `\` escape is found.
+    ///
+    /// The `\` must not have been consumed from `self.source`.
+    /// `start_pos` must be position of start of identifier.
+    fn identifier_backslash(&mut self, start_pos: SourcePosition, is_start: bool) -> &'a str {
+        // Create arena string to hold unescaped identifier.
+        // We don't know how long identifier will end up being. Take a guess that total length
+        // will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum.
+        let so_far = self.source.str_from_pos_to_current(start_pos);
+        let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN);
+        let mut str = String::with_capacity_in(capacity, self.allocator);
+
+        // Push identifier up to this point into `str`
+        // `bumpalo::collections::string::String::push_str` is currently expensive due to
+        // inefficiency in bumpalo's implementation. But best we have right now.
+        str.push_str(so_far);
+
+        // Process escape and get rest of identifier
+        self.identifier_on_backslash(str, is_start)
+    }
+
+    /// Process rest of identifier after a `\` found.
+    ///
+    /// `self.source` should be positioned *on* the `\` (i.e. `\` has not been consumed yet).
+    /// `str` should contain the identifier up to before the escape.
+    /// `is_start` should be `true` if this is first char in the identifier, `false` otherwise.
+    fn identifier_on_backslash(&mut self, mut str: String<'a>, mut is_start: bool) -> &'a str {
+        'outer: loop {
+            // Consume `\`
+            self.consume_char();
+
+            // Consume escape sequence and add char to `str`
+            self.identifier_unicode_escape_sequence(&mut str, is_start);
+            is_start = false;
+
+            // Consume chars until reach end of identifier or another escape
+            let chunk_start = self.source.position();
+            loop {
+                let maybe_char = self.peek();
+                if maybe_char.is_some_and(is_identifier_part) {
                     self.consume_char();
-                    builder.force_allocation_without_current_ascii_char(self);
-                    self.identifier_unicode_escape_sequence(&mut builder, false);
                     continue;
                 }
+
+                // End of identifier, EOF, or another `\` escape.
+                // Push chunk since last escape to `str`.
+                let chunk = self.source.str_from_pos_to_current(chunk_start);
+                str.push_str(chunk);
+
+                if maybe_char != Some('\\') {
+                    // End of identifier or EOF
+                    break 'outer;
+                }
+
+                // Found another escape. Go back to start of outer loop.
                 break;
             }
-            self.consume_char();
-            builder.push_matching(c);
         }
-        let has_escape = builder.has_escape();
-        let text = builder.finish(self);
-        self.save_string(has_escape, text);
-        text
+
+        // Convert `str` to arena slice and save to `escaped_strings`
+        let id = str.into_bump_str();
+        self.save_string(true, id);
+        id
+    }
+
+    /// Entry point for a private identifier. i.e. after `#`.
+    /// `#` must be consumed before calling this.
+    ///
+    /// Like `identifier_name_handler`, this contains a fast path for identifiers which are pure ASCII.
+    /// Unicode characters and escapes are handled on paths marked `#[cold]` to keep the common ASCII
+    /// fast path as fast as possible.
+    pub fn private_identifier(&mut self) -> Kind {
+        // Handle EOF directly after `#`
+        let start_pos = self.source.position();
+        if start_pos.addr() == self.source.end_addr() {
+            return cold_branch(|| {
+                let start = self.offset();
+                self.error(diagnostics::UnexpectedEnd(Span::new(start, start)));
+                Kind::Undetermined
+            });
+        }
+
+        // Handle if not an ASCII identifier byte.
+        // SAFETY: Not at EOF, so safe to read a byte.
+        let b = unsafe { start_pos.read() };
+        if !is_identifier_start_ascii_byte(b) {
+            return self.private_identifier_not_ascii_id();
+        }
+
+        // SAFETY: Not at EOF, so can advance 1 byte without going out of bounds
+        let after_first = unsafe { start_pos.add(1) };
+
+        // Consume bytes which are part of identifier
+        byte_search! {
+            lexer: self,
+            table: NOT_ASCII_ID_CONTINUE_TABLE,
+            start: after_first,
+            handle_match: |next_byte| {
+                // Found a matching byte.
+                // Either end of identifier found, or a Unicode char, or `\` escape.
+                // Handle uncommon cases in cold branches to keep the common ASCII path
+                // as fast as possible.
+                if !next_byte.is_ascii() {
+                    return cold_branch(|| {
+                        // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
+                        // makes `start_pos` `source`'s position as it was at start of this function
+                        let start_pos = unsafe { after_first.sub(1) };
+                        self.identifier_tail_unicode(start_pos);
+                        Kind::PrivateIdentifier
+                    });
+                }
+                if next_byte == b'\\' {
+                    return cold_branch(|| {
+                        // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1
+                        // makes `start_pos` `source`'s position as it was at start of this function
+                        let start_pos = unsafe { after_first.sub(1) };
+                        self.identifier_backslash(start_pos, false);
+                        Kind::PrivateIdentifier
+                    });
+                }
+
+                Kind::PrivateIdentifier
+            },
+            handle_eof: || {
+                Kind::PrivateIdentifier
+            },
+        };
+    }
+
+    /// Handle private identifier whose first byte is not an ASCII identifier start byte.
+    #[cold]
+    fn private_identifier_not_ascii_id(&mut self) -> Kind {
+        let b = self.source.peek_byte().unwrap();
+        if !b.is_ascii() {
+            let c = self.peek().unwrap();
+            if is_identifier_start_unicode(c) {
+                let start_pos = self.source.position();
+                self.consume_char();
+                self.identifier_tail_after_unicode(start_pos);
+                return Kind::PrivateIdentifier;
+            }
+        } else if b == b'\\' {
+            // Assume Unicode characters are more common than `\` escapes, so mark this branch as cold
+            return cold_branch(|| {
+                self.identifier_backslash_handler();
+                Kind::PrivateIdentifier
+            });
+        }
+
+        // No identifier found
+        let start = self.offset();
+        let c = self.consume_char();
+        self.error(diagnostics::InvalidCharacter(c, Span::new(start, self.offset())));
+        Kind::Undetermined
     }
 }
diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs
index 26c9434a2e762..ac18ad91d2c30 100644
--- a/crates/oxc_parser/src/lexer/mod.rs
+++ b/crates/oxc_parser/src/lexer/mod.rs
@@ -16,6 +16,7 @@ mod number;
 mod numeric;
 mod punctuation;
 mod regex;
+mod search;
 mod source;
 mod string;
 mod string_builder;
@@ -303,3 +304,9 @@ impl<'a> Lexer<'a> {
         }
     }
 }
+
+/// Call a closure while hinting to compiler that this branch is rarely taken.
+#[cold]
+pub fn cold_branch<F: FnOnce() -> T, T>(f: F) -> T {
+    f()
+}
diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs
new file mode 100644
index 0000000000000..a9d9327a64d4b
--- /dev/null
+++ b/crates/oxc_parser/src/lexer/search.rs
@@ -0,0 +1,462 @@
+//! Structs and macros for searching source for combinations of byte values.
+//!
+//! * `ByteMatchTable` and `SafeByteMatchTable` are lookup table types for byte values.
+//! * `byte_match_table!` and `safe_byte_match_table!` macros create those tables at compile time.
+//! * `byte_search!` macro searches source text for first byte matching a byte table.
+
+/// Batch size for searching
+pub const SEARCH_BATCH_SIZE: usize = 32;
+
+/// Byte matcher lookup table.
+///
+/// Create table at compile time as a `static` or `const` with `byte_match_table!` macro.
+/// Test bytes against table with `ByteMatchTable::matches`.
+/// Or use `byte_search!` macro to search for first matching byte in source.
+///
+/// If the match pattern satisfies constraints of `SafeByteMatchTable`, use that instead.
+///
+/// # Examples
+/// ```
+/// use crate::lexer::search::{ByteMatchTable, byte_match_table};
+///
+/// static NOT_WHITESPACE: ByteMatchTable = byte_match_table!(|b| b != b' ' && b != b'\t');
+/// assert_eq!(NOT_WHITESPACE.matches(b'X'), true);
+/// assert_eq!(NOT_WHITESPACE.matches(b' '), false);
+///
+/// impl<'a> Lexer<'a> {
+///   fn eat_whitespace(&mut self) {
+///     // NB: Using `byte_search!` macro with a `ByteMatchTable` is unsafe
+///     unsafe {
+///       byte_search! {
+///         lexer: self,
+///         table: NOT_WHITESPACE,
+///         handle_match: |matched_byte, start| {},
+///         handle_eof: |start| {},
+///       };
+///     };
+///   }
+/// }
+/// ```
+// TODO: Delete this type + `byte_match_table!` macro if not used
+#[repr(C, align(64))]
+pub struct ByteMatchTable([bool; 256]);
+
+#[allow(dead_code)]
+impl ByteMatchTable {
+    /// Create new `ByteMatchTable`.
+    pub const fn new(bytes: [bool; 256]) -> Self {
+        let mut table = Self([false; 256]);
+        let mut i = 0;
+        loop {
+            table.0[i] = bytes[i];
+            i += 1;
+            if i == 256 {
+                break;
+            }
+        }
+        table
+    }
+
+    /// Declare that this table is being used for searching.
+    /// An unsafe function here, whereas for `SafeByteMatchTable` it's safe.
+    /// `byte_search!` macro calls `.use_table()` on whatever table it's provided, which makes
+    /// using the macro unsafe for `ByteMatchTable`, but safe for `SafeByteMatchTable`.
+    #[allow(clippy::unused_self)]
+    #[inline]
+    pub const unsafe fn use_table(&self) {}
+
+    /// Test a value against this `ByteMatchTable`.
+    #[inline]
+    pub const fn matches(&self, b: u8) -> bool {
+        self.0[b as usize]
+    }
+}
+
+/// Macro to create a `ByteMatchTable` at compile time.
+///
+/// `byte_match_table!(|b| b < 3)` expands to:
+///
+/// ```
+/// {
+///   use crate::lexer::search::ByteMatchTable;
+///   #[allow(clippy::eq_op)]
+///   const TABLE: ByteMatchTable = ByteMatchTable::new([
+///     (0u8 < 3),
+///     (1u8 < 3),
+///     (2u8 < 3),
+///     (3u8 < 3),
+///     /* ... */
+///     (254u8 < 3),
+///     (255u8 < 3),
+///   ]);
+///   TABLE
+/// }
+/// ```
+#[allow(unused_macros)]
+macro_rules! byte_match_table {
+    (|$byte:ident| $res:expr) => {{
+        use crate::lexer::search::ByteMatchTable;
+        // Clippy creates warnings because e.g. `byte_match_table!(|b| b == 0)`
+        // is expanded to `ByteMatchTable::new([(0 == 0), ... ])`
+        #[allow(clippy::eq_op)]
+        const TABLE: ByteMatchTable = seq_macro::seq!($byte in 0u8..=255 {
+            ByteMatchTable::new([ #($res,)* ])
+        });
+        TABLE
+    }};
+}
+#[allow(unused_imports)]
+pub(crate) use byte_match_table;
+
+/// Safe byte matcher lookup table.
+///
+/// Create table at compile time as a `static` or `const` with `safe_byte_match_table!` macro.
+/// Test bytes against table with `SafeByteMatchTable::matches`.
+/// Or use `byte_search!` macro to search for first matching byte in source.
+///
+/// Only difference between this and `ByteMatchTable` is that for `SafeByteMatchTable`,
+/// it must be guaranteed that `byte_search!` macro using this table will always end up with
+/// `lexer.source` positioned on a UTF-8 character boundary.
+///
+/// Usage of `byte_search!` macro with a `SafeByteMatchTable` table is safe,
+/// and does not require an `unsafe {}` block (unlike `ByteMatchTable`).
+///
+/// To make this guarantee, one of the following must be true:
+///
+/// 1. Table contains `true` for all byte values 192 - 247
+///    i.e. first byte of any multi-byte Unicode character matches.
+///    (NB: 248 - 255 cannot occur in UTF-8 strings)
+///    e.g. `safe_byte_match_table!(|b| b >= 192)`
+///         `safe_byte_match_table!(|b| !b.is_ascii())`
+///
+/// 2. Table contains `false` for all byte values 128 - 191
+///    i.e. the continuation bytes of any multi-byte Unicode chars will be consumed in full.
+///    e.g. `safe_byte_match_table!(|b| b < 128 || b >= 192)`
+///         `safe_byte_match_table!(|b| b.is_ascii())`
+///         `safe_byte_match_table!(|b| b == b' ' || b == b'\t')`
+///
+/// This is statically checked by `SafeByteMatchTable::new`, and will fail to compile if match
+/// pattern does not satisfy one of the above.
+///
+/// # Examples
+/// ```
+/// use crate::lexer::search::{SafeByteMatchTable, safe_byte_match_table};
+///
+/// static NOT_ASCII: SafeByteMatchTable = safe_byte_match_table!(|b| !b.is_ascii());
+/// assert_eq!(NOT_ASCII.matches(b'X'), false);
+/// assert_eq!(NOT_ASCII.matches(192), true);
+///
+/// impl<'a> Lexer<'a> {
+///   fn eat_ascii(&mut self) {
+///     // NB: Using `byte_search!` macro with a `SafeByteMatchTable` is safe
+///     byte_search! {
+///       lexer: self,
+///       table: NOT_ASCII,
+///       handle_match: |matched_byte, start| {},
+///       handle_eof: |start| {},
+///     };
+///   }
+/// }
+/// ```
+#[repr(C, align(64))]
+pub struct SafeByteMatchTable([bool; 256]);
+
+impl SafeByteMatchTable {
+    /// Create new `SafeByteMatchTable`.
+    pub const fn new(bytes: [bool; 256]) -> Self {
+        let mut table = Self([false; 256]);
+
+        // Check if contains either:
+        // 1. `true` for all byte values 192..248
+        // 2. `false` for all byte values 128..192
+        let mut unicode_start_all_match = true;
+        let mut unicode_cont_all_no_match = true;
+
+        let mut i = 0;
+        loop {
+            let matches = bytes[i];
+            table.0[i] = matches;
+
+            if matches {
+                if i >= 128 && i < 192 {
+                    unicode_cont_all_no_match = false;
+                }
+            } else if i >= 192 && i < 248 {
+                unicode_start_all_match = false;
+            }
+
+            i += 1;
+            if i == 256 {
+                break;
+            }
+        }
+
+        assert!(
+            unicode_start_all_match || unicode_cont_all_no_match,
+            "Cannot create a `SafeByteMatchTable` with an unsafe pattern"
+        );
+
+        table
+    }
+
+    /// Declare that this table is being used for searching.
+    /// A safe function here, whereas for `ByteMatchTable` it's unsafe.
+    /// `byte_search!` macro calls `.use_table()` on whatever table it's provided, which makes
+    /// using the macro unsafe for `ByteMatchTable`, but safe for `SafeByteMatchTable`.
+    #[allow(clippy::unused_self)]
+    #[inline]
+    pub const fn use_table(&self) {}
+
+    /// Test a value against this `SafeByteMatchTable`.
+    #[inline]
+    pub const fn matches(&self, b: u8) -> bool {
+        self.0[b as usize]
+    }
+}
+
+/// Macro to create a `SafeByteMatchTable` at compile time.
+///
+/// `safe_byte_match_table!(|b| !b.is_ascii())` expands to:
+///
+/// ```
+/// {
+///   use crate::lexer::search::SafeByteMatchTable;
+///   #[allow(clippy::eq_op)]
+///   const TABLE: SafeByteMatchTable = SafeByteMatchTable::new([
+///     (!0u8.is_ascii()),
+///     (!1u8.is_ascii()),
+///     /* ... */
+///     (!255u8.is_ascii()),
+///   ]);
+///   TABLE
+/// }
+/// ```
+macro_rules! safe_byte_match_table {
+    (|$byte:ident| $res:expr) => {{
+        use crate::lexer::search::SafeByteMatchTable;
+        #[allow(clippy::eq_op)]
+        const TABLE: SafeByteMatchTable = seq_macro::seq!($byte in 0u8..=255 {
+            // Clippy creates warnings because e.g. `safe_byte_match_table!(|b| b == 0)`
+            // is expanded to `SafeByteMatchTable::new([0 == 0, ... ])`
+            SafeByteMatchTable::new([#($res,)*])
+        });
+        TABLE
+    }};
+}
+pub(crate) use safe_byte_match_table;
+
+/// Macro to search for first byte matching a `ByteMatchTable` or `SafeByteMatchTable`.
+///
+/// Search processes source in batches of `SEARCH_BATCH_SIZE` bytes for speed.
+/// When not enough bytes remain in source for a batch, search source byte by byte.
+///
+/// This is a macro rather than a function for 2 reasons:
+/// 1. Searching is a bit faster when all the code is in a single function.
+/// 2. The `handle_match` section has to be repeated twice.
+///    This macro does that, so code using the macro can be DRY-er.
+///
+/// Used as follows:
+///
+/// ```
+/// static NOT_STUFF_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| !is_stuff(b));
+///
+/// impl<'a> Lexer<'a> {
+///   fn eat_stuff(&mut self) -> bool {
+///     byte_search! {
+///       lexer: self,
+///       table: NOT_STUFF_TABLE,
+///       handle_match: |matched_byte, start| {
+///         // Matching byte has been found.
+///         // `matched_byte` is `u8` value of first byte which matched the table.
+///         // `start` is `SourcePosition` where search began.
+///         // `lexer.source` is now positioned on first matching byte.
+///         // Handle the next matching byte (deal with any special cases).
+///         // Value this block evaluates to will be returned from enclosing function.
+///         matched_byte == b'X'
+///       },
+///       handle_eof: |start| {
+///         // No bytes from start position to end of source matched the table.
+///         // `start` is `SourcePosition` where search began.
+///         // `lexer.source` is now positioned at EOF.
+///         // Handle EOF in some way.
+///         // Value this block evaluates to will be returned from enclosing function.
+///         false
+///       },
+///     };
+///
+///     // This is unreachable.
+///     // Macro always exits current function with a `return` statement.
+///   }
+/// }
+/// ```
+///
+/// or provide the `SourcePosition` to start searching from:
+///
+/// ```
+/// impl<'a> Lexer<'a> {
+///   fn eat_stuff(&mut self) -> bool {
+///     let start = unsafe { self.source.position().add(1) };
+///     byte_search! {
+///       lexer: self,
+///       table: NOT_STUFF_TABLE,
+///       start: start,
+///       handle_match: |matched_byte| {
+///         // Matching byte has been found.
+///         // `matched_byte` is `u8` value of first byte which matched the table.
+///         // `lexer.source` is now positioned on first matching byte.
+///         // Handle the next matching byte (deal with any special cases).
+///         // Value this block evaluates to will be returned from enclosing function.
+///         true
+///       },
+///       handle_eof: || {
+///         // No bytes from start position to end of source matched the table.
+///         // `lexer.source` is now positioned at EOF.
+///         // Handle EOF in some way.
+///         // Value this block evaluates to will be returned from enclosing function.
+///         false
+///       },
+///     };
+///
+///     // This is unreachable.
+///     // Macro always exits current function with a `return` statement.
+///   }
+/// }
+/// ```
+///
+/// NB: The macro always causes enclosing function to return.
+/// It creates `return` statements with the value that `handle_match` / `handle_eof` blocks evaluate to.
+/// Any code after the `byte_search!` macro invocation is unreachable.
+///
+/// # SAFETY
+///
+/// This macro will consume bytes from `lexer.source` according to the `ByteMatchTable`
+/// or `SafeByteMatchTable` provided.
+///
+/// Using `byte_search!` with a `SafeByteMatchTable` is guaranteed to end up with `lexer.source`
+/// positioned on a UTF-8 character boundary when entering `handle_match`.
+/// Therefore it's safe to use `byte_search!` with a `SafeByteMatchTable`.
+///
+/// `ByteMatchTable` makes no such guarantee, and using `byte_search!` with a `ByteMatchTable` is unsafe.
+/// It is caller's responsibility to ensure that `lexer.source` is moved onto a UTF-8 character boundary.
+/// This is similar to the contracts of `Source`'s unsafe methods.
+macro_rules! byte_search {
+    // Standard version.
+    // `start` is calculated from current position of `lexer.source`.
+    (
+        lexer: $lexer:ident,
+        table: $table:ident,
+        handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
+        handle_eof: |$eof_start:ident| $eof_handler:expr,
+    ) => {{
+        let start = $lexer.source.position();
+        byte_search! {
+            lexer: $lexer,
+            table: $table,
+            start: start,
+            handle_match: |$match_byte, $match_start| $match_handler,
+            handle_eof: |$eof_start| $eof_handler,
+        }
+    }};
+
+    // Provide your own `start` position
+    (
+        lexer: $lexer:ident,
+        table: $table:ident,
+        start: $start:ident,
+        handle_match: |$match_byte:ident| $match_handler:expr,
+        handle_eof: || $eof_handler:expr,
+    ) => {
+        byte_search! {
+            lexer: $lexer,
+            table: $table,
+            start: $start,
+            handle_match: |$match_byte, __start| $match_handler,
+            handle_eof: |__start| $eof_handler,
+        }
+    };
+
+    // Actual implementation
+    (
+        lexer: $lexer:ident,
+        table: $table:ident,
+        start: $start:ident,
+        handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr,
+        handle_eof: |$eof_start:ident| $eof_handler:expr,
+    ) => {{
+        // SAFETY:
+        // If `$table` is a `SafeByteMatchTable`, it's guaranteed that `lexer.source`
+        // will be positioned on a UTF-8 character boundary before `handle_match` is called.
+        // If `$table` is a `ByteMatchTable`, no such guarantee is given, but call to
+        // `$table.use_table()` here makes using this macro unsafe, and it's the user's
+        // responsibility to uphold this invariant.
+        // Therefore we can assume this is taken care of one way or another, and wrap the calls
+        // to unsafe functions in this function with `unsafe {}`.
+        $table.use_table();
+
+        let mut pos = $start;
+        loop {
+            if pos.addr() <= $lexer.source.end_for_batch_search_addr() {
+                // Search a batch of `SEARCH_BATCH_SIZE` bytes.
+                // The compiler unrolls this loop.
+                // SAFETY:
+                // `pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures there are
+                // at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`.
+                // So calls to `pos.read()` and `pos.add(1)` in this loop cannot go out of bounds.
+                for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE {
+                    // SAFETY: `pos` cannot go out of bounds in this loop (see above).
+                    let $match_byte = unsafe { pos.read() };
+                    if $table.matches($match_byte) {
+                        // Found match.
+                        // Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
+                        // SAFETY: See above about UTF-8 character boundaries invariant.
+                        $lexer.source.set_position(pos);
+
+                        let $match_start = $start;
+                        return $match_handler;
+                    }
+
+                    // No match - continue searching
+                    // SAFETY: `pos` cannot go out of bounds in this loop (see above).
+                    // Also see above about UTF-8 character boundaries invariant.
+                    pos = unsafe { pos.add(1) };
+                }
+                // No match in batch - loop round and search next batch
+            } else {
+                // Not enough bytes remaining to process as a batch.
+                // This branch is marked `#[cold]` as it should be very uncommon in normal-length JS files.
+                // Very short JS files will be penalized, but they'll be very fast to parse anyway.
+                // TODO: Could extend very short files with padding during parser initialization
+                // to remove that problem.
+                return crate::lexer::cold_branch(|| {
+                    let end_addr = $lexer.source.end_addr();
+                    while pos.addr() < end_addr {
+                        // SAFETY: `pos` is not at end of source, so safe to read a byte
+                        let $match_byte = unsafe { pos.read() };
+                        if $table.matches($match_byte) {
+                            // Found match.
+                            // Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes.
+                            // SAFETY: See above about UTF-8 character boundaries invariant.
+                            $lexer.source.set_position(pos);
+
+                            let $match_start = $start;
+                            return $match_handler;
+                        }
+
+                        // No match - continue searching
+                        // SAFETY: `pos` is not at end of source, so safe to advance 1 byte.
+                        // See above about UTF-8 character boundaries invariant.
+                        pos = unsafe { pos.add(1) };
+                    }
+
+                    // EOF.
+                    // Advance `lexer.source`'s position to end of file.
+                    $lexer.source.set_position(pos);
+
+                    let $eof_start = $start;
+                    $eof_handler
+                });
+            }
+        }
+    }};
+}
+pub(crate) use byte_search;
diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs
index 4178048db1cce..fe1b2c9dd436f 100644
--- a/crates/oxc_parser/src/lexer/source.rs
+++ b/crates/oxc_parser/src/lexer/source.rs
@@ -1,5 +1,6 @@
 #![allow(clippy::unnecessary_safety_comment)]
 
+use super::search::SEARCH_BATCH_SIZE;
 use crate::{UniquePromise, MAX_LEN};
 
 use std::{marker::PhantomData, slice, str};
@@ -66,6 +67,10 @@ pub(super) struct Source<'a> {
     end: *const u8,
     /// Pointer to current position in source string
     ptr: *const u8,
+    /// Memory address past which not enough bytes remaining in source to process a batch of
+    /// `SEARCH_BATCH_SIZE` bytes in one go.
+    /// Must be `usize`, not a pointer, as if source is very short, a pointer could be out of bounds.
+    end_for_batch_search_addr: usize,
     /// Marker for immutable borrow of source string
     _marker: PhantomData<&'a str>,
 }
@@ -89,7 +94,13 @@ impl<'a> Source<'a> {
         // for direct pointer equality with `ptr` to check if at end of file.
         let end = unsafe { start.add(source_text.len()) };
 
-        Self { start, end, ptr: start, _marker: PhantomData }
+        // `saturating_sub` not `wrapping_sub` so that value doesn't wrap around if source
+        // is very short, and has very low memory address (e.g. 16). If that's the case,
+        // `end_for_batch_search_addr` will be 0, so a test whether any non-null pointer is past end
+        // will always test positive, and disable batch search.
+        let end_for_batch_search_addr = (end as usize).saturating_sub(SEARCH_BATCH_SIZE);
+
+        Self { start, end, ptr: start, end_for_batch_search_addr, _marker: PhantomData }
     }
 
     /// Get entire source text as `&str`.
@@ -127,6 +138,19 @@ impl<'a> Source<'a> {
         self.ptr == self.end
     }
 
+    /// Get end address.
+    #[inline]
+    pub(super) fn end_addr(&self) -> usize {
+        self.end as usize
+    }
+
+    /// Get last memory address at which a batch of `lexer::search::SEARCH_BATCH_SIZE` bytes
+    /// can be read without going out of bounds.
+    #[inline]
+    pub(super) fn end_for_batch_search_addr(&self) -> usize {
+        self.end_for_batch_search_addr
+    }
+
     /// Get current position.
     ///
     /// The `SourcePosition` returned is guaranteed to be within bounds of `&str` that `Source`
@@ -183,6 +207,40 @@ impl<'a> Source<'a> {
         self.ptr = pos.ptr;
     }
 
+    /// Get string slice from a `SourcePosition` up to the current position of `Source`.
+    pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str {
+        assert!(pos.ptr <= self.ptr);
+        // SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements
+        unsafe { self.str_from_pos_to_current_unchecked(pos) }
+    }
+
+    /// Get string slice from a `SourcePosition` up to the current position of `Source`,
+    /// without checks.
+    ///
+    /// SAFETY:
+    /// `pos` must not be after current position of `Source`.
+    /// This is always the case if both:
+    /// 1. `Source::set_position` has not been called since `pos` was created.
+    /// 2. `pos` has not been advanced with `SourcePosition::add`.
+    #[inline]
+    pub(super) unsafe fn str_from_pos_to_current_unchecked(&self, pos: SourcePosition) -> &'a str {
+        // SAFETY: Caller guarantees `pos` is not after current position of `Source`.
+        // `SourcePosition`s can only be created from a `Source`.
+        // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source`
+        // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another
+        // `Source` originated on another thread can "jump" onto this one.
+        // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be
+        // from this `Source`, therefore `pos.ptr` and `self.ptr` must both be within the same allocation
+        // and derived from the same original pointer.
+        // Invariants of `Source` and `SourcePosition` types guarantee that both are positioned
+        // on UTF-8 character boundaries. So slicing source text between these 2 points will always
+        // yield a valid UTF-8 string.
+        debug_assert!(pos.ptr <= self.ptr);
+        let len = self.ptr as usize - pos.addr();
+        let slice = slice::from_raw_parts(pos.ptr, len);
+        std::str::from_utf8_unchecked(slice)
+    }
+
     /// Get current position in source, relative to start of source.
     #[allow(clippy::cast_possible_truncation)]
     #[inline]
@@ -318,7 +376,6 @@ impl<'a> Source<'a> {
     ///
     /// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining`
     /// are *not* safe to call until one of above conditions is satisfied.
-    #[allow(dead_code)]
     #[inline]
     unsafe fn next_byte_unchecked(&mut self) -> u8 {
         // SAFETY: Caller guarantees not at end of file i.e. `ptr != end`.
@@ -422,6 +479,12 @@ impl<'a> SourcePosition<'a> {
         Self { ptr, _marker: PhantomData }
     }
 
+    /// Get memory address of `SourcePosition` as a `usize`.
+    #[inline]
+    pub(super) fn addr(self) -> usize {
+        self.ptr as usize
+    }
+
     /// Create new `SourcePosition` which is `n` bytes after this one.
     /// The provenance of the pointer `SourcePosition` contains is maintained.
     ///
@@ -430,7 +493,6 @@ impl<'a> SourcePosition<'a> {
     /// of `Source` this `SourcePosition` was created from.
     /// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`,
     /// just not past it.
-    #[allow(dead_code)]
     #[inline]
     pub(super) unsafe fn add(self, n: usize) -> Self {
         Self::new(self.ptr.add(n))
diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs
index 01e1c4ef05f33..3b6961c9fa612 100644
--- a/crates/oxc_parser/src/lexer/string_builder.rs
+++ b/crates/oxc_parser/src/lexer/string_builder.rs
@@ -55,6 +55,8 @@ impl<'a> AutoCow<'a> {
         self.value.is_some()
     }
 
+    // TODO: Delete this if not using it
+    #[allow(dead_code)]
     pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str {
         match self.value.take() {
             Some(s) => s.into_bump_str(),
diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs
index 01aac7d1dcfbc..651adffec0c1a 100644
--- a/crates/oxc_parser/src/lexer/unicode.rs
+++ b/crates/oxc_parser/src/lexer/unicode.rs
@@ -1,4 +1,4 @@
-use super::{AutoCow, Kind, Lexer, Span};
+use super::{Kind, Lexer, Span};
 use crate::diagnostics;
 
 use oxc_allocator::String;
@@ -21,10 +21,9 @@ impl<'a> Lexer<'a> {
         let c = self.peek().unwrap();
         match c {
             c if is_identifier_start_unicode(c) => {
-                let mut builder = AutoCow::new(self);
-                let c = self.consume_char();
-                builder.push_matching(c);
-                self.identifier_name(builder);
+                let start_pos = self.source.position();
+                self.consume_char();
+                self.identifier_tail_after_unicode(start_pos);
                 Kind::Ident
             }
             c if is_irregular_whitespace(c) => {
@@ -51,7 +50,7 @@ impl<'a> Lexer<'a> {
     ///   \u{ `CodePoint` }
     pub(super) fn identifier_unicode_escape_sequence(
         &mut self,
-        builder: &mut AutoCow<'a>,
+        str: &mut String<'a>,
         check_identifier_start: bool,
     ) {
         let start = self.offset();
@@ -98,7 +97,7 @@ impl<'a> Lexer<'a> {
             return;
         }
 
-        builder.push_different(ch);
+        str.push(ch);
     }
 
     /// String `UnicodeEscapeSequence`