From 070d3c475cfe203b687ca241cee4f773d9644ce3 Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Thu, 8 Feb 2024 19:48:35 +0000 Subject: [PATCH] perf(parser): lex identifiers as bytes not chars --- Cargo.lock | 7 + Cargo.toml | 1 + crates/oxc_parser/Cargo.toml | 1 + crates/oxc_parser/src/lexer/byte_handlers.rs | 20 +- crates/oxc_parser/src/lexer/identifier.rs | 326 ++++++++++-- crates/oxc_parser/src/lexer/mod.rs | 7 + crates/oxc_parser/src/lexer/search.rs | 462 ++++++++++++++++++ crates/oxc_parser/src/lexer/source.rs | 68 ++- crates/oxc_parser/src/lexer/string_builder.rs | 2 + crates/oxc_parser/src/lexer/unicode.rs | 13 +- 10 files changed, 841 insertions(+), 66 deletions(-) create mode 100644 crates/oxc_parser/src/lexer/search.rs diff --git a/Cargo.lock b/Cargo.lock index 3cbadbab3c0ad..002ac5ae9ef14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1603,6 +1603,7 @@ dependencies = [ "oxc_span", "oxc_syntax", "rustc-hash", + "seq-macro", "serde_json", ] @@ -2285,6 +2286,12 @@ version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + [[package]] name = "serde" version = "1.0.196" diff --git a/Cargo.toml b/Cargo.toml index 3b6a5709d0939..e50b18f4ff1e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,6 +111,7 @@ regex = { version = "1.10.3" } rustc-hash = { version = "1.1.0", default-features = false, features = ["std"] } ryu-js = { version = "1.0.0" } ropey = { version = "1.6.1" } +seq-macro = { version = "0.3.5" } serde = { version = "1.0.196" } serde_json = { version = "1.0.113" } syn = { version = "=1.0.109" } diff --git a/crates/oxc_parser/Cargo.toml b/crates/oxc_parser/Cargo.toml index e6cc4f48b53bc..2da6d6fb22d7c 100644 --- a/crates/oxc_parser/Cargo.toml +++ 
b/crates/oxc_parser/Cargo.toml @@ -29,6 +29,7 @@ assert-unchecked = { workspace = true } bitflags = { workspace = true } rustc-hash = { workspace = true } num-bigint = { workspace = true } +seq-macro = { workspace = true } [dev-dependencies] oxc_ast = { workspace = true, features = ["serde"] } diff --git a/crates/oxc_parser/src/lexer/byte_handlers.rs b/crates/oxc_parser/src/lexer/byte_handlers.rs index 7009e41b73082..ef720d115795d 100644 --- a/crates/oxc_parser/src/lexer/byte_handlers.rs +++ b/crates/oxc_parser/src/lexer/byte_handlers.rs @@ -1,4 +1,4 @@ -use super::{AutoCow, Kind, Lexer, LexerContext}; +use super::{Kind, Lexer, LexerContext}; use crate::diagnostics; #[allow(clippy::unnecessary_safety_comment)] @@ -137,7 +137,10 @@ macro_rules! ascii_byte_handler { /// (`a`-`z`, `A`-`Z`, `$` or `_`). /// /// Macro calls `Lexer::identifier_name_handler` to get the text of the identifier, -/// and slices off first character. +/// minus its first character. +/// +/// `Lexer::identifier_name_handler` is an unsafe function, but if byte being consumed is ASCII, +/// its requirements are met. /// /// # SAFETY /// Only use this macro to define byte handlers for ASCII characters. @@ -156,7 +159,8 @@ macro_rules! ascii_byte_handler { /// const L_G: ByteHandler = { /// #[allow(non_snake_case)] /// fn L_G(lexer: &mut Lexer) -> Kind { -/// let id_without_first_char = &lexer.identifier_name_handler()[1..]; +/// // SAFETY: This macro is only used for ASCII characters +/// let id_without_first_char = unsafe { lexer.identifier_name_handler() }; /// match id_without_first_char { /// "et" => Kind::Get, /// "lobal" => Kind::Global, @@ -169,7 +173,8 @@ macro_rules! ascii_byte_handler { macro_rules! 
ascii_identifier_handler { ($id:ident($str:ident) $body:expr) => { byte_handler!($id(lexer) { - let $str = &lexer.identifier_name_handler()[1..]; + // SAFETY: This macro is only used for ASCII characters + let $str = unsafe { lexer.identifier_name_handler() }; $body }); }; @@ -439,12 +444,7 @@ ascii_byte_handler!(BTO(lexer) { // \ ascii_byte_handler!(ESC(lexer) { - let mut builder = AutoCow::new(lexer); - lexer.consume_char(); - builder.force_allocation_without_current_ascii_char(lexer); - lexer.identifier_unicode_escape_sequence(&mut builder, true); - let text = lexer.identifier_name(builder); - Kind::match_keyword(text) + lexer.identifier_backslash_handler() }); // ] diff --git a/crates/oxc_parser/src/lexer/identifier.rs b/crates/oxc_parser/src/lexer/identifier.rs index f28a3d6ec6869..f98141e4403cb 100644 --- a/crates/oxc_parser/src/lexer/identifier.rs +++ b/crates/oxc_parser/src/lexer/identifier.rs @@ -1,66 +1,300 @@ -use super::{AutoCow, Kind, Lexer, Span}; +use super::{ + cold_branch, + search::{byte_search, safe_byte_match_table, SafeByteMatchTable}, + Kind, Lexer, SourcePosition, +}; use crate::diagnostics; -use oxc_syntax::identifier::{is_identifier_part, is_identifier_start}; +use std::cmp::max; + +use oxc_allocator::String; +use oxc_span::Span; +use oxc_syntax::identifier::{ + is_identifier_part, is_identifier_part_unicode, is_identifier_start_unicode, +}; + +const MIN_ESCAPED_STR_LEN: usize = 16; + +static ASCII_ID_START_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| b.is_ascii_alphabetic() || b == b'_' || b == b'$'); + +static NOT_ASCII_ID_CONTINUE_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| !(b.is_ascii_alphanumeric() || b == b'_' || b == b'$')); + +#[inline] +fn is_identifier_start_ascii_byte(byte: u8) -> bool { + ASCII_ID_START_TABLE.matches(byte) +} impl<'a> Lexer<'a> { - /// Section 12.7.1 Identifier Names - pub(super) fn identifier_name_handler(&mut self) -> &'a str { - let builder = AutoCow::new(self); - 
self.consume_char(); - self.identifier_name(builder) + /// Handle identifier with ASCII start character. + /// Returns text of the identifier, minus its first char. + /// + /// Start character should not be consumed from `self.source` prior to calling this. + /// + /// This function is the "fast path" for the most common identifiers in JS code - + /// purely consisting of ASCII characters: `a`-`z`, `A`-`Z`, `0`-`9`, `_`, `$`. + /// JS syntax also allows Unicode identifiers and escapes (e.g. `\u{FF}`) in identifiers, + /// but they are very rare in practice. So this fast path will handle 99% of JS code. + /// + /// When Unicode or an escape is encountered, this function de-opts to paths which handle those + /// cases, but those paths are marked `#[cold]` to keep the ASCII fast path as fast as possible. + /// + /// The fast path uses pointers and unsafe code to minimize bounds checks etc. + /// The functions it delegates to for uncommon cases are both more complex, and less critical, + /// so they stick to safe code only. + /// + /// # SAFETY + /// * `self.source` must not be exhausted (at least 1 char remaining). + /// * Next char must be ASCII. + #[allow(clippy::missing_safety_doc)] // Clippy is wrong! + pub(super) unsafe fn identifier_name_handler(&mut self) -> &'a str { + // Advance past 1st byte. + // SAFETY: Caller guarantees not at EOF, and next byte is ASCII. + let after_first = self.source.position().add(1); + + // Consume bytes which are part of identifier + byte_search! { + lexer: self, + table: NOT_ASCII_ID_CONTINUE_TABLE, + start: after_first, + handle_match: |next_byte| { + // Found a matching byte. + // Either end of identifier found, or a Unicode char, or `\` escape. + // Handle uncommon cases in cold branches to keep the common ASCII path + // as fast as possible. 
+ if !next_byte.is_ascii() { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + &self.identifier_tail_unicode(start_pos)[1..] + }); + } + if next_byte == b'\\' { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + &self.identifier_backslash(start_pos, false)[1..] + }); + } + + // Return identifier minus its first char. + // SAFETY: `after_first` was position of `lexer.source` at start of this search. + // Searching only proceeds in forwards direction, so `lexer.source.position()` + // cannot be before `after_first`. + unsafe { self.source.str_from_pos_to_current_unchecked(after_first) } + }, + handle_eof: || { + // Return identifier minus its first char. + // SAFETY: `lexer.source` is positioned at EOF, so there is no valid value + // of `after_first` which could be after current position. + unsafe { self.source.str_from_pos_to_current_unchecked(after_first) } + }, + }; } - pub(super) fn identifier_name(&mut self, builder: AutoCow<'a>) -> &'a str { - self.identifier_tail(builder) + /// Handle rest of identifier after first byte of a multi-byte Unicode char found. + /// Any number of characters can have already been consumed from `self.source` prior to it. + /// `self.source` should be positioned at start of Unicode character. + fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str { + let c = self.peek().unwrap(); + if is_identifier_part_unicode(c) { + self.consume_char(); + self.identifier_tail_after_unicode(start_pos) + } else { + // Reached end of identifier. Return identifier. 
+ self.source.str_from_pos_to_current(start_pos) + } } - pub(super) fn private_identifier(&mut self) -> Kind { - let mut builder = AutoCow::new(self); - let start = self.offset(); - match self.next_char() { - Some(c) if is_identifier_start(c) => { - builder.push_matching(c); - } - Some('\\') => { - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, true); - } - Some(c) => { - #[allow(clippy::cast_possible_truncation)] - self.error(diagnostics::InvalidCharacter( - c, - Span::new(start, start + c.len_utf8() as u32), - )); - return Kind::Undetermined; - } - None => { - self.error(diagnostics::UnexpectedEnd(Span::new(start, start))); - return Kind::Undetermined; + /// Handle identifier after first char (which was Unicode) is dealt with. + /// + /// First char should have been consumed from `self.source` prior to calling this. + /// `start_pos` should be position of the start of the identifier (before first char was consumed). + pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str { + // Identifier contains a Unicode chars, so probably contains more. + // So just iterate over chars now, instead of bytes. + while let Some(c) = self.peek() { + if is_identifier_part(c) { + self.consume_char(); + } else if c == '\\' { + // This branch marked cold as escapes are uncommon + return cold_branch(|| self.identifier_backslash(start_pos, false)); + } else { + break; } } - self.identifier_tail(builder); - Kind::PrivateIdentifier + + // Return identifier + self.source.str_from_pos_to_current(start_pos) } - fn identifier_tail(&mut self, mut builder: AutoCow<'a>) -> &'a str { - // ident tail - while let Some(c) = self.peek() { - if !is_identifier_part(c) { - if c == '\\' { + /// Handle identifier starting with `\` escape. + pub fn identifier_backslash_handler(&mut self) -> Kind { + // Create arena string to hold unescaped identifier. 
+ // We don't know how long identifier will end up being, so guess. + let str = String::with_capacity_in(MIN_ESCAPED_STR_LEN, self.allocator); + + // Process escape and get rest of identifier + let id = self.identifier_on_backslash(str, true); + Kind::match_keyword(id) + } + + /// Consume rest of identifier after a `\` escape is found. + /// + /// The `\` must not have be consumed from `lexer.source`. + /// `start_pos` must be position of start of identifier. + fn identifier_backslash(&mut self, start_pos: SourcePosition, is_start: bool) -> &'a str { + // Create arena string to hold unescaped identifier. + // We don't know how long identifier will end up being. Take a guess that total length + // will be double what we've seen so far, or `MIN_ESCAPED_STR_LEN` minimum. + let so_far = self.source.str_from_pos_to_current(start_pos); + let capacity = max(so_far.len() * 2, MIN_ESCAPED_STR_LEN); + let mut str = String::with_capacity_in(capacity, self.allocator); + + // Push identifier up this point into `str` + // `bumpalo::collections::string::String::push_str` is currently expensive due to + // inefficiency in bumpalo's implementation. But best we have right now. + str.push_str(so_far); + + // Process escape and get rest of identifier + self.identifier_on_backslash(str, is_start) + } + + /// Process rest of identifier after a `\` found. + /// + /// `self.source` should be positioned *on* the `\` (i.e. `\` has not been consumed yet). + /// `str` should contain the identifier up to before the escape. + /// `is_start` should be `true` if this is first char in the identifier, `false` otherwise. 
+ fn identifier_on_backslash(&mut self, mut str: String<'a>, mut is_start: bool) -> &'a str { + 'outer: loop { + // Consume `\` + self.consume_char(); + + // Consume escape sequence and add char to `str` + self.identifier_unicode_escape_sequence(&mut str, is_start); + is_start = false; + + // Consume chars until reach end of identifier or another escape + let chunk_start = self.source.position(); + loop { + let maybe_char = self.peek(); + if maybe_char.is_some_and(is_identifier_part) { self.consume_char(); - builder.force_allocation_without_current_ascii_char(self); - self.identifier_unicode_escape_sequence(&mut builder, false); continue; } + + // End of identifier, EOF, or another `\` escape. + // Push chunk since last escape to `str`. + let chunk = self.source.str_from_pos_to_current(chunk_start); + str.push_str(chunk); + + if maybe_char != Some('\\') { + // End of identifier or EOF + break 'outer; + } + + // Found another escape. Go back to start of outer loop. break; } - self.consume_char(); - builder.push_matching(c); } - let has_escape = builder.has_escape(); - let text = builder.finish(self); - self.save_string(has_escape, text); - text + + // Convert `str` to arena slice and save to `escaped_strings` + let id = str.into_bump_str(); + self.save_string(true, id); + id + } + + /// Entry point for a private identifier. i.e. after `#`. + /// `#` must be consumed before calling this. + /// + /// Like `identifier_name_handler`, this contains a fast path for identifiers which are pure ASCII. + /// Unicode characters and escapes are handled on paths marked `#[cold]` to keep the common ASCII + /// fast path as fast as possible. 
+ pub fn private_identifier(&mut self) -> Kind { + // Handle EOF directly after `#` + let start_pos = self.source.position(); + if start_pos.addr() == self.source.end_addr() { + return cold_branch(|| { + let start = self.offset(); + self.error(diagnostics::UnexpectedEnd(Span::new(start, start))); + Kind::Undetermined + }); + } + + // Handle if not an ASCII identifier byte. + // SAFETY: Not at EOF, so safe to read a byte. + let b = unsafe { start_pos.read() }; + if !is_identifier_start_ascii_byte(b) { + return self.private_identifier_not_ascii_id(); + } + + // SAFETY: Not at EOF, so can advance 1 byte without going out of bounds + let after_first = unsafe { start_pos.add(1) }; + + // Consume bytes which are part of identifier + byte_search! { + lexer: self, + table: NOT_ASCII_ID_CONTINUE_TABLE, + start: after_first, + handle_match: |next_byte| { + // Found a matching byte. + // Either end of identifier found, or a Unicode char, or `\` escape. + // Handle uncommon cases in cold branches to keep the common ASCII path + // as fast as possible. + if !next_byte.is_ascii() { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + self.identifier_tail_unicode(start_pos); + Kind::PrivateIdentifier + }); + } + if next_byte == b'\\' { + return cold_branch(|| { + // SAFETY: `after_first` is position after consuming 1 byte, so subtracting 1 + // makes `start_pos` `source`'s position as it was at start of this function + let start_pos = unsafe { after_first.sub(1) }; + self.identifier_backslash(start_pos, false); + Kind::PrivateIdentifier + }); + } + + Kind::PrivateIdentifier + }, + handle_eof: || { + Kind::PrivateIdentifier + }, + }; + } + + /// Handle private identifier whose first byte is not an ASCII identifier start byte. 
+ #[cold] + fn private_identifier_not_ascii_id(&mut self) -> Kind { + let b = self.source.peek_byte().unwrap(); + if !b.is_ascii() { + let c = self.peek().unwrap(); + if is_identifier_start_unicode(c) { + let start_pos = self.source.position(); + self.consume_char(); + self.identifier_tail_after_unicode(start_pos); + return Kind::PrivateIdentifier; + } + } else if b == b'\\' { + // Assume Unicode characters are more common than `\` escapes, so this branch as cold + return cold_branch(|| { + self.identifier_backslash_handler(); + Kind::PrivateIdentifier + }); + } + + // No identifier found + let start = self.offset(); + let c = self.consume_char(); + self.error(diagnostics::InvalidCharacter(c, Span::new(start, self.offset()))); + Kind::Undetermined } } diff --git a/crates/oxc_parser/src/lexer/mod.rs b/crates/oxc_parser/src/lexer/mod.rs index 26c9434a2e762..ac18ad91d2c30 100644 --- a/crates/oxc_parser/src/lexer/mod.rs +++ b/crates/oxc_parser/src/lexer/mod.rs @@ -16,6 +16,7 @@ mod number; mod numeric; mod punctuation; mod regex; +mod search; mod source; mod string; mod string_builder; @@ -303,3 +304,9 @@ impl<'a> Lexer<'a> { } } } + +/// Call a closure while hinting to compiler that this branch is rarely taken. +#[cold] +pub fn cold_branch T, T>(f: F) -> T { + f() +} diff --git a/crates/oxc_parser/src/lexer/search.rs b/crates/oxc_parser/src/lexer/search.rs new file mode 100644 index 0000000000000..a9d9327a64d4b --- /dev/null +++ b/crates/oxc_parser/src/lexer/search.rs @@ -0,0 +1,462 @@ +//! Structs and macros for searching source for combinations of byte values. +//! +//! * `ByteMatchTable` and `SafeByteMatchTable` are lookup table types for byte values. +//! * `byte_match_table!` and `safe_byte_match_table!` macros create those tables at compile time. +//! * `byte_search!` macro searches source text for first byte matching a byte table. + +/// Batch size for searching +pub const SEARCH_BATCH_SIZE: usize = 32; + +/// Byte matcher lookup table. 
+/// +/// Create table at compile time as a `static` or `const` with `byte_match_table!` macro. +/// Test bytes against table with `ByteMatchTable::matches`. +/// Or use `byte_search!` macro to search for first matching byte in source. +/// +/// If the match pattern satisfies constraints of `SafeByteMatchTable`, use that instead. +/// +/// # Examples +/// ``` +/// use crate::lexer::search::{ByteMatchTable, byte_match_table}; +/// +/// static NOT_WHITESPACE: ByteMatchTable = byte_match_table!(|b| b != b' ' && b != b'\t'); +/// assert_eq!(NOT_WHITESPACE.matches(b'X'), true); +/// assert_eq!(NOT_WHITESPACE.matches(b' '), false); +/// +/// impl<'a> Lexer<'a> { +/// fn eat_whitespace(&mut self) { +/// // NB: Using `byte_search!` macro with a `ByteMatchTable` is unsafe +/// unsafe { +/// byte_search! { +/// lexer: self, +/// table: NOT_WHITESPACE, +/// handle_match: |matched_byte, start| {}, +/// handle_eof: |start| {}, +/// }; +/// }; +/// } +/// } +/// ``` +// TODO: Delete this type + `byte_match_table!` macro if not used +#[repr(C, align(64))] +pub struct ByteMatchTable([bool; 256]); + +#[allow(dead_code)] +impl ByteMatchTable { + // Create new `ByteMatchTable`. + pub const fn new(bytes: [bool; 256]) -> Self { + let mut table = Self([false; 256]); + let mut i = 0; + loop { + table.0[i] = bytes[i]; + i += 1; + if i == 256 { + break; + } + } + table + } + + /// Declare that using this table for searching. + /// An unsafe function here, whereas for `SafeByteMatchTable` it's safe. + /// `byte_search!` macro calls `.use_table()` on whatever table it's provided, which makes + /// using the macro unsafe for `ByteMatchTable`, but safe for `SafeByteMatchTable`. + #[allow(clippy::unused_self)] + #[inline] + pub const unsafe fn use_table(&self) {} + + /// Test a value against this `ByteMatchTable`. + #[inline] + pub const fn matches(&self, b: u8) -> bool { + self.0[b as usize] + } +} + +/// Macro to create a `ByteMatchTable` at compile time. 
+/// +/// `byte_match_table!(|b| b < 3)` expands to: +/// +/// ``` +/// { +/// use crate::lexer::search::ByteMatchTable; +/// #[allow(clippy::eq_op)] +/// const TABLE: ByteMatchTable = ByteMatchTable::new([ +/// (0u8 < 3), +/// (1u8 < 3), +/// (2u8 < 3), +/// (3u8 < 3), +/// /* ... */ +/// (254u8 < 3), +/// (255u8 < 3), +/// ]); +/// TABLE +/// } +/// ``` +#[allow(unused_macros)] +macro_rules! byte_match_table { + (|$byte:ident| $res:expr) => {{ + use crate::lexer::search::ByteMatchTable; + // Clippy creates warnings because e.g. `byte_match_table!(|b| b == 0)` + // is expanded to `ByteMatchTable([(0 == 0), ... ])` + #[allow(clippy::eq_op)] + const TABLE: ByteMatchTable = seq_macro::seq!($byte in 0u8..=255 { + ByteMatchTable::new([ #($res,)* ]) + }); + TABLE + }}; +} +#[allow(unused_imports)] +pub(crate) use byte_match_table; + +/// Safe byte matcher lookup table. +/// +/// Create table at compile time as a `static` or `const` with `safe_byte_match_table!` macro. +/// Test bytes against table with `SafeByteMatchTable::matches`. +/// Or use `byte_search!` macro to search for first matching byte in source. +/// +/// Only difference between this and `ByteMatchTable` is that for `SafeByteMatchTable`, +/// it must be guaranteed that `byte_search!` macro using this table will always end up with +/// `lexer.source` positioned on a UTF-8 character boundary. +/// +/// Usage of `byte_search!` macro with a `SafeByteMatchTable` table is safe, +/// and does not require an `unsafe {}` block (unlike `ByteMatchTable`). +/// +/// To make this guarantee, one of the following must be true: +/// +/// 1. Table contains `true` for all byte values 192 - 247 +/// i.e. first byte of any multi-byte Unicode character matches. +/// (NB: 248 - 255 cannot occur in UTF-8 strings) +/// e.g. `safe_byte_match_table!(|b| b >= 192)` +/// `safe_byte_match_table!(|b| !b.is_ascii())` +/// +/// 2. Table contains `false` for all byte values 128 - 191 +/// i.e. 
the continuation bytes of any multi-byte Unicode chars will be consumed in full. +/// e.g. `safe_byte_match_table!(|b| b < 128 || b >= 192)` +/// `safe_byte_match_table!(|b| b.is_ascii())` +/// `safe_byte_match_table!(|b| b == ' ' || b == '\t')` +/// +/// This is statically checked by `SafeByteMatchTable::new`, and will fail to compile if match +/// pattern does not satisfy one of the above. +/// +/// # Examples +/// ``` +/// use crate::lexer::search::{SafeByteMatchTable, safe_byte_match_table}; +/// +/// static NOT_ASCII: SafeByteMatchTable = safe_byte_match_table!(|b| !b.is_ascii()); +/// assert_eq!(NOT_ASCII.matches(b'X'), false); +/// assert_eq!(NOT_ASCII.matches(192), true); +/// +/// impl<'a> Lexer<'a> { +/// fn eat_ascii(&mut self) { +/// // NB: Using `byte_search!` macro with a `SafeByteMatchTable` is safe +/// byte_search! { +/// lexer: self, +/// table: NOT_ASCII, +/// handle_match: |matched_byte, start| {}, +/// handle_eof: |start| {}, +/// }; +/// } +/// } +/// ``` +#[repr(C, align(64))] +pub struct SafeByteMatchTable([bool; 256]); + +impl SafeByteMatchTable { + // Create new `SafeByteMatchTable`. + pub const fn new(bytes: [bool; 256]) -> Self { + let mut table = Self([false; 256]); + + // Check if contains either: + // 1. `true` for all byte values 192..248 + // 2. `false` for all byte values 128..192 + let mut unicode_start_all_match = true; + let mut unicode_cont_all_no_match = true; + + let mut i = 0; + loop { + let matches = bytes[i]; + table.0[i] = matches; + + if matches { + if i >= 128 && i < 192 { + unicode_cont_all_no_match = false; + } + } else if i >= 192 && i < 248 { + unicode_start_all_match = false; + } + + i += 1; + if i == 256 { + break; + } + } + + assert!( + unicode_start_all_match || unicode_cont_all_no_match, + "Cannot create a `SafeByteMatchTable` with an unsafe pattern" + ); + + table + } + + /// Declare that using this table for searching. + /// A safe function here, whereas for `ByteMatchTable` it's unsafe. 
+ /// `byte_search!` macro calls `.use_table()` on whatever table it's provided, which makes + /// using the macro unsafe for `ByteMatchTable`, but safe for `SafeByteMatchTable`. + #[allow(clippy::unused_self)] + #[inline] + pub const fn use_table(&self) {} + + /// Test a value against this `SafeByteMatchTable`. + #[inline] + pub const fn matches(&self, b: u8) -> bool { + self.0[b as usize] + } +} + +/// Macro to create a `SafeByteMatchTable` at compile time. +/// +/// `safe_byte_match_table!(|b| !b.is_ascii())` expands to: +/// +/// ``` +/// { +/// use crate::lexer::search::SafeByteMatchTable; +/// #[allow(clippy::eq_op)] +/// const TABLE: SafeByteMatchTable = SafeByteMatchTable::new([ +/// (!0u8.is_ascii()), +/// (!1u8.is_ascii()), +/// /* ... */ +/// (!255u8.is_ascii()), +/// ]); +/// TABLE +/// } +/// ``` +macro_rules! safe_byte_match_table { + (|$byte:ident| $res:expr) => {{ + use crate::lexer::search::SafeByteMatchTable; + #[allow(clippy::eq_op)] + const TABLE: SafeByteMatchTable = seq_macro::seq!($byte in 0u8..=255 { + // Clippy creates warnings because e.g. `byte_match_table!(|b| b == 0)` + // is expanded to `SafeByteMatchTable([0 == 0, ... ])` + SafeByteMatchTable::new([#($res,)*]) + }); + TABLE + }}; +} +pub(crate) use safe_byte_match_table; + +/// Macro to search for first byte matching a `ByteMatchTable` or `SafeByteMatchTable`. +/// +/// Search processes source in batches of `SEARCH_BATCH_SIZE` bytes for speed. +/// When not enough bytes remaining in source for a batch, search source byte by byte. +/// +/// This is a macro rather than a function for 2 reasons: +/// 1. Searching is a bit faster when all the code is in a single function. +/// 2. The `handle_match` section has to be repeated twice. +/// This macro does that, so code using the macro can be DRY-er. 
+/// +/// Used as follows: +/// +/// ``` +/// static NOT_STUFF_TABLE: SafeByteMatchTable = safe_byte_match_table!(|b| !is_stuff(b)); +/// +/// impl<'a> Lexer<'a> { +/// fn eat_stuff(&mut self) -> bool { +/// byte_search! { +/// lexer: self, +/// table: NOT_STUFF_TABLE, +/// handle_match: |matched_byte, start| { +/// // Matching byte has been found. +/// // `matched_byte` is `u8` value of first byte which matched the table. +/// // `start` is `SourcePosition` where search began. +/// // `lexer.source` is now positioned on first matching byte. +/// // Handle the next matching byte (deal with any special cases). +/// // Value this block evaluates to will be returned from enclosing function. +/// matched_byte == b'X' +/// }, +/// handle_eof: |start| { +/// // No bytes from start position to end of source matched the table. +/// // `start` is `SourcePosition` where search began. +/// // `lexer.source` is now positioned at EOF. +/// // Handle EOF in some way. +/// // Value this block evaluates to will be returned from enclosing function. +/// false +/// }, +/// }; +/// +/// // This is unreachable. +/// // Macro always exits current function with a `return` statement. +/// } +/// } +/// ``` +/// +/// or provide the `SourcePosition` to start searching from: +/// +/// ``` +/// impl<'a> Lexer<'a> { +/// fn eat_stuff(&mut self) -> bool { +/// let start = unsafe { self.source.position().add(1) }; +/// byte_search! { +/// lexer: self, +/// table: NOT_STUFF_TABLE, +/// start: start, +/// handle_match: |matched_byte| { +/// // Matching byte has been found. +/// // `matched_byte` is `u8` value of first byte which matched the table. +/// // `lexer.source` is now positioned on first matching byte. +/// // Handle the next matching byte (deal with any special cases). +/// // Value this block evaluates to will be returned from enclosing function. +/// true +/// }, +/// handle_eof: || { +/// // No bytes from start position to end of source matched the table. 
+/// // `lexer.source` is now positioned at EOF. +/// // Handle EOF in some way. +/// // Value this block evaluates to will be returned from enclosing function. +/// false +/// }, +/// }; +/// +/// // This is unreachable. +/// // Macro always exits current function with a `return` statement. +/// } +/// } +/// ``` +/// +/// NB: The macro always causes enclosing function to return. +/// It creates `return` statements with the value that `handle_match` / `handle_eof` blocks evaluate to. +/// Any code after the `byte_search!` macro is unreachable. +/// +/// # SAFETY +/// +/// This macro will consume bytes from `lexer.source` according to the `ByteMatchTable` +/// or `SafeByteMatchTable` provided. +/// +/// Using `byte_search!` with a `SafeByteMatchTable` is guaranteed to end up with `lexer.source` +/// positioned on a UTF-8 character boundary when entering `handle_match`. +/// Therefore it's safe to use `byte_search!` with a `SafeByteMatchTable`. +/// +/// `ByteMatchTable` makes no such guarantee, and using `byte_search!` with a `ByteMatchTable` is unsafe. +/// It is caller's responsibility to ensure that `lexer.source` is moved onto a UTF-8 character boundary. +/// This is similar to the contracts of `Source`'s unsafe methods. +macro_rules! byte_search { + // Standard version. + // `start` is calculated from current position of `lexer.source`. + ( + lexer: $lexer:ident, + table: $table:ident, + handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr, + handle_eof: |$eof_start:ident| $eof_handler:expr, + ) => {{ + let start = $lexer.source.position(); + byte_search! { + lexer: $lexer, + table: $table, + start: start, + handle_match: |$match_byte, $match_start| $match_handler, + handle_eof: |$eof_start| $eof_handler, + } + }}; + + // Provide your own `start` position (the handlers do not receive it) + ( + lexer: $lexer:ident, + table: $table:ident, + start: $start:ident, + handle_match: |$match_byte:ident| $match_handler:expr, + handle_eof: || $eof_handler:expr, + ) => { + byte_search!
{ + lexer: $lexer, + table: $table, + start: $start, + handle_match: |$match_byte, __start| $match_handler, + handle_eof: |__start| $eof_handler, + } + }; + + // Actual implementation + ( + lexer: $lexer:ident, + table: $table:ident, + start: $start:ident, + handle_match: |$match_byte:ident, $match_start:ident| $match_handler:expr, + handle_eof: |$eof_start:ident| $eof_handler:expr, + ) => {{ + // SAFETY: + // If `$table` is a `SafeByteMatchTable`, it's guaranteed that `lexer.source` + // will be positioned on a UTF-8 character boundary before `handle_match` is called. + // If `$table` is a `ByteMatchTable`, no such guarantee is given, but call to + // `$table.use_table()` here makes using this macro unsafe, and it's the user's + // responsibility to uphold this invariant. + // Therefore we can assume this is taken care of one way or another, and wrap the calls + // to unsafe functions in this macro with `unsafe {}`. + $table.use_table(); + + let mut pos = $start; + loop { + if pos.addr() <= $lexer.source.end_for_batch_search_addr() { + // Search a batch of `SEARCH_BATCH_SIZE` bytes. + // The compiler unrolls this loop. + // SAFETY: + // `pos.addr() <= lexer.source.end_for_batch_search_addr()` check above ensures there are + // at least `SEARCH_BATCH_SIZE` bytes remaining in `lexer.source`. + // So calls to `pos.read()` and `pos.add(1)` in this loop cannot go out of bounds. + for _i in 0..crate::lexer::search::SEARCH_BATCH_SIZE { + // SAFETY: `pos` cannot go out of bounds in this loop (see above). + let $match_byte = unsafe { pos.read() }; + if $table.matches($match_byte) { + // Found match. + // Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes. + // SAFETY: See above about UTF-8 character boundaries invariant. + $lexer.source.set_position(pos); + + let $match_start = $start; + return $match_handler; + } + + // No match - continue searching + // SAFETY: `pos` cannot go out of bounds in this loop (see above).
+ // Also see above about UTF-8 character boundaries invariant. + pos = unsafe { pos.add(1) }; + } + // No match in batch - loop round and search next batch + } else { + // Not enough bytes remaining to process as a batch. + // This branch is marked `#[cold]` as it should be very uncommon in normal-length JS files. + // Very short JS files will be penalized, but they'll be very fast to parse anyway. + // TODO: Could extend very short files with padding during parser initialization + // to remove that problem. + return crate::lexer::cold_branch(|| { + let end_addr = $lexer.source.end_addr(); + while pos.addr() < end_addr { + // SAFETY: `pos` is not at end of source, so safe to read a byte + let $match_byte = unsafe { pos.read() }; + if $table.matches($match_byte) { + // Found match. + // Advance `lexer.source`'s position up to `pos`, consuming unmatched bytes. + // SAFETY: See above about UTF-8 character boundaries invariant. + $lexer.source.set_position(pos); + + let $match_start = $start; + return $match_handler; + } + + // No match - continue searching + // SAFETY: `pos` is not at end of source, so safe to advance 1 byte. + // See above about UTF-8 character boundaries invariant. + pos = unsafe { pos.add(1) }; + } + + // EOF. + // Advance `lexer.source`'s position to end of file.
+ $lexer.source.set_position(pos); + + let $eof_start = $start; + $eof_handler + }); + } + } + }}; +} +pub(crate) use byte_search; diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs index 4178048db1cce..fe1b2c9dd436f 100644 --- a/crates/oxc_parser/src/lexer/source.rs +++ b/crates/oxc_parser/src/lexer/source.rs @@ -1,5 +1,6 @@ #![allow(clippy::unnecessary_safety_comment)] +use super::search::SEARCH_BATCH_SIZE; use crate::{UniquePromise, MAX_LEN}; use std::{marker::PhantomData, slice, str}; @@ -66,6 +67,10 @@ pub(super) struct Source<'a> { end: *const u8, /// Pointer to current position in source string ptr: *const u8, + /// Memory address past which there are not enough bytes remaining in source to process a batch of + /// `SEARCH_BATCH_SIZE` bytes in one go. + /// Must be `usize`, not a pointer, as if source is very short, a pointer could be out of bounds. + end_for_batch_search_addr: usize, /// Marker for immutable borrow of source string _marker: PhantomData<&'a str>, } @@ -89,7 +94,13 @@ impl<'a> Source<'a> { // for direct pointer equality with `ptr` to check if at end of file. let end = unsafe { start.add(source_text.len()) }; - Self { start, end, ptr: start, _marker: PhantomData } + // `saturating_sub` not `wrapping_sub` so that value doesn't wrap around if source + // is very short, and has very low memory address (e.g. 16). If that's the case, + // `end_for_batch_search_addr` will be 0, so a test whether any non-null pointer is past end + // will always test positive, and disable batch search. + let end_for_batch_search_addr = (end as usize).saturating_sub(SEARCH_BATCH_SIZE); + + Self { start, end, ptr: start, end_for_batch_search_addr, _marker: PhantomData } } /// Get entire source text as `&str`. @@ -127,6 +138,19 @@ impl<'a> Source<'a> { self.ptr == self.end } + /// Get end address.
+ #[inline] + pub(super) fn end_addr(&self) -> usize { + self.end as usize + } + + /// Get last memory address at which a batch of `lexer::search::SEARCH_BATCH_SIZE` bytes + /// can be read without going out of bounds. + #[inline] + pub(super) fn end_for_batch_search_addr(&self) -> usize { + self.end_for_batch_search_addr + } + /// Get current position. /// /// The `SourcePosition` returned is guaranteed to be within bounds of `&str` that `Source` @@ -183,6 +207,40 @@ impl<'a> Source<'a> { self.ptr = pos.ptr; } + /// Get string slice from a `SourcePosition` up to the current position of `Source`. Panics if `pos` is after the current position. + pub(super) fn str_from_pos_to_current(&self, pos: SourcePosition) -> &'a str { + assert!(pos.ptr <= self.ptr); + // SAFETY: The above assertion satisfies `str_from_pos_to_current_unchecked`'s requirements + unsafe { self.str_from_pos_to_current_unchecked(pos) } + } + + /// Get string slice from a `SourcePosition` up to the current position of `Source`, + /// without checks. + /// + /// SAFETY: + /// `pos` must not be after current position of `Source`. + /// This is always the case if both: + /// 1. `Source::set_position` has not been called since `pos` was created. + /// 2. `pos` has not been advanced with `SourcePosition::add`. + #[inline] + pub(super) unsafe fn str_from_pos_to_current_unchecked(&self, pos: SourcePosition) -> &'a str { + // SAFETY: Caller guarantees `pos` is not after current position of `Source`. + // `SourcePosition`s can only be created from a `Source`. + // `Source::new` takes a `UniquePromise`, which guarantees that it's the only `Source` + // in existence on this thread. `Source` is not `Sync` or `Send`, so no possibility another + // `Source` originated on another thread can "jump" onto this one. + // This is sufficient to guarantee that any `SourcePosition` that parser/lexer holds must be + // from this `Source`, therefore `pos.ptr` and `self.ptr` must both be within the same allocation + // and derived from the same original pointer.
+ // Invariants of `Source` and `SourcePosition` types guarantee that both are positioned + // on UTF-8 character boundaries. So slicing source text between these 2 points will always + // yield a valid UTF-8 string. + debug_assert!(pos.ptr <= self.ptr); + let len = self.ptr as usize - pos.addr(); + let slice = slice::from_raw_parts(pos.ptr, len); + std::str::from_utf8_unchecked(slice) + } + /// Get current position in source, relative to start of source. #[allow(clippy::cast_possible_truncation)] #[inline] @@ -318,7 +376,6 @@ impl<'a> Source<'a> { /// /// In particular, safe methods `Source::next_char`, `Source::peek_char`, and `Source::remaining` /// are *not* safe to call until one of above conditions is satisfied. - #[allow(dead_code)] #[inline] unsafe fn next_byte_unchecked(&mut self) -> u8 { // SAFETY: Caller guarantees not at end of file i.e. `ptr != end`. @@ -422,6 +479,12 @@ impl<'a> SourcePosition<'a> { Self { ptr, _marker: PhantomData } } + /// Get memory address of `SourcePosition` as a `usize`. + #[inline] + pub(super) fn addr(self) -> usize { + self.ptr as usize + } + /// Create new `SourcePosition` which is `n` bytes after this one. /// The provenance of the pointer `SourcePosition` contains is maintained. /// @@ -430,7 +493,6 @@ impl<'a> SourcePosition<'a> { /// of `Source` this `SourcePosition` was created from. /// NB: It is legal to use `add` to create a `SourcePosition` which is *on* the end of `Source`, /// just not past it.
- #[allow(dead_code)] #[inline] pub(super) unsafe fn add(self, n: usize) -> Self { Self::new(self.ptr.add(n)) diff --git a/crates/oxc_parser/src/lexer/string_builder.rs b/crates/oxc_parser/src/lexer/string_builder.rs index 01e1c4ef05f33..3b6961c9fa612 100644 --- a/crates/oxc_parser/src/lexer/string_builder.rs +++ b/crates/oxc_parser/src/lexer/string_builder.rs @@ -55,6 +55,8 @@ impl<'a> AutoCow<'a> { self.value.is_some() } + // TODO: Delete this if not using it + #[allow(dead_code)] pub fn finish(mut self, lexer: &Lexer<'a>) -> &'a str { match self.value.take() { Some(s) => s.into_bump_str(), diff --git a/crates/oxc_parser/src/lexer/unicode.rs b/crates/oxc_parser/src/lexer/unicode.rs index 01aac7d1dcfbc..651adffec0c1a 100644 --- a/crates/oxc_parser/src/lexer/unicode.rs +++ b/crates/oxc_parser/src/lexer/unicode.rs @@ -1,4 +1,4 @@ -use super::{AutoCow, Kind, Lexer, Span}; +use super::{Kind, Lexer, Span}; use crate::diagnostics; use oxc_allocator::String; @@ -21,10 +21,9 @@ impl<'a> Lexer<'a> { let c = self.peek().unwrap(); match c { c if is_identifier_start_unicode(c) => { - let mut builder = AutoCow::new(self); - let c = self.consume_char(); - builder.push_matching(c); - self.identifier_name(builder); + let start_pos = self.source.position(); + self.consume_char(); + self.identifier_tail_after_unicode(start_pos); Kind::Ident } c if is_irregular_whitespace(c) => { @@ -51,7 +50,7 @@ impl<'a> Lexer<'a> { /// \u{ `CodePoint` } pub(super) fn identifier_unicode_escape_sequence( &mut self, - builder: &mut AutoCow<'a>, + str: &mut String<'a>, check_identifier_start: bool, ) { let start = self.offset(); @@ -98,7 +97,7 @@ impl<'a> Lexer<'a> { return; } - builder.push_different(ch); + str.push(ch); } /// String `UnicodeEscapeSequence`