From 9fae4aa4f2c9a651856b6d300f55de0b8f268e2c Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Mon, 3 Nov 2025 20:48:18 +0800 Subject: [PATCH 1/4] Remove generic --- crates/swc_ecma_parser/src/lexer/mod.rs | 14 +++++++------- crates/swc_ecma_parser/src/lexer/state.rs | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 2bf454fbe387..8930aae765e8 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -260,7 +260,7 @@ impl<'a> Lexer<'a> { if self.state.had_line_break && C == b'-' && self.eat(b'>') { self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); self.skip_line_comment(0); - self.skip_space::(); + self.skip_space(); return self.read_token(); } @@ -303,7 +303,7 @@ impl<'a> Lexer<'a> { if had_line_break_before_last && self.is_str("====") { self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); self.skip_line_comment(4); - self.skip_space::(); + self.skip_space(); return self.read_token(); } @@ -347,7 +347,7 @@ impl Lexer<'_> { if C == b'<' && self.is(b'!') && self.peek() == Some('-') && self.peek_ahead() == Some('-') { self.skip_line_comment(3); - self.skip_space::(); + self.skip_space(); self.emit_module_mode_error(start, SyntaxError::LegacyCommentInModule); return self.read_token(); @@ -399,7 +399,7 @@ impl Lexer<'_> { { self.emit_error_span(fixed_len_span(start, 7), SyntaxError::TS1185); self.skip_line_comment(5); - self.skip_space::(); + self.skip_space(); return self.read_token(); } @@ -875,7 +875,7 @@ impl<'a> Lexer<'a> { /// /// See https://tc39.github.io/ecma262/#sec-white-space #[inline(never)] - fn skip_space(&mut self) { + fn skip_space(&mut self) { loop { let (offset, newline) = { let mut skip = self::whitespace::SkipWhitespace { @@ -894,7 +894,7 @@ impl<'a> Lexer<'a> { self.state_mut().mark_had_line_break(); } - if LEX_COMMENTS && self.input().is_byte(b'/') { + if self.input().is_byte(b'/') { if let Some(c) = self.peek() { if c == '/' { self.skip_line_comment(2); @@ -2178,7 +2178,7 @@ impl<'a> Lexer<'a> { let span = fixed_len_span(start, 7); self.emit_error_span(span, SyntaxError::TS1185); self.skip_line_comment(5); - self.skip_space::(); + self.skip_space(); return self.error_span(span, SyntaxError::TS1185); } diff --git a/crates/swc_ecma_parser/src/lexer/state.rs b/crates/swc_ecma_parser/src/lexer/state.rs index befe021df7cd..9844a5f1ffb4 100644 --- a/crates/swc_ecma_parser/src/lexer/state.rs +++ b/crates/swc_ecma_parser/src/lexer/state.rs @@ -211,7 +211,7 @@ impl crate::input::Tokens for Lexer<'_> { } fn scan_jsx_open_el_terminal_token(&mut self) -> TokenAndSpan { - self.skip_space::(); + self.skip_space(); let start = self.input.cur_pos(); let res = match self.scan_jsx_attrs_terminal_token() { Ok(res) => Ok(res), @@ -382,7 +382,7 @@ impl Lexer<'_> { self.state.had_line_break = self.state.is_first; self.state.is_first = false; - self.skip_space::(); + self.skip_space(); *start = self.input.cur_pos(); if self.input.last_pos() == self.input.end_pos() { From e1c0129bcc0a1a2cf9b4c5a8ebb7a311e32254af Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Mon, 3 Nov 2025 21:34:22 +0800 Subject: [PATCH 2/4] Refactor skip_space --- crates/swc_ecma_parser/src/lexer/mod.rs | 39 --- crates/swc_ecma_parser/src/lexer/table.rs | 3 +- .../swc_ecma_parser/src/lexer/whitespace.rs | 309 +++++++----------- 3 files changed, 119 insertions(+), 232 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/mod.rs b/crates/swc_ecma_parser/src/lexer/mod.rs index 8930aae765e8..fba8994ed75e 100644 --- a/crates/swc_ecma_parser/src/lexer/mod.rs +++ b/crates/swc_ecma_parser/src/lexer/mod.rs @@ -871,45 +871,6 @@ impl<'a> Lexer<'a> { } } - /// Skip comments or whitespaces. - /// - /// See https://tc39.github.io/ecma262/#sec-white-space - #[inline(never)] - fn skip_space(&mut self) { - loop { - let (offset, newline) = { - let mut skip = self::whitespace::SkipWhitespace { - input: self.input().as_str(), - newline: false, - offset: 0, - }; - - skip.scan(); - - (skip.offset, skip.newline) - }; - - self.input_mut().bump_bytes(offset as usize); - if newline { - self.state_mut().mark_had_line_break(); - } - - if self.input().is_byte(b'/') { - if let Some(c) = self.peek() { - if c == '/' { - self.skip_line_comment(2); - continue; - } else if c == '*' { - self.skip_block_comment(); - continue; - } - } - } - - break; - } - } - /// Ensure that ident cannot directly follow numbers. fn ensure_not_ident(&mut self) -> LexResult<()> { match self.cur() { diff --git a/crates/swc_ecma_parser/src/lexer/table.rs b/crates/swc_ecma_parser/src/lexer/table.rs index 038792dcd4da..e1db2a89c331 100644 --- a/crates/swc_ecma_parser/src/lexer/table.rs +++ b/crates/swc_ecma_parser/src/lexer/table.rs @@ -19,8 +19,9 @@ use crate::{ pub(super) type ByteHandler = fn(&mut Lexer<'_>) -> LexResult; /// Lookup table mapping any incoming byte to a handler function defined below. +#[rustfmt::skip] pub(super) static BYTE_HANDLERS: [ByteHandler; 256] = [ - // 0 1 2 3 4 5 6 7 8 9 A B C D E F // +// 0 1 2 3 4 5 6 7 8 9 A B C D E F // ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 0 ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, ERR, // 1 ERR, EXL, QOT, HSH, IDN, PRC, AMP, QOT, PNO, PNC, ATR, PLS, COM, MIN, PRD, SLH, // 2 diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs index 6037843a60cc..c470b2661938 100644 --- a/crates/swc_ecma_parser/src/lexer/whitespace.rs +++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs @@ -1,12 +1,65 @@ +use crate::{byte_search, lexer::search::SafeByteMatchTable, safe_byte_match_table, Lexer}; + +/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated ``. +/// Considered a whitespace character in JS. +pub const ZWNBSP: char = '\u{feff}'; + +/// U+000B VERTICAL TAB, abbreviated ``. +pub const VT: char = '\u{b}'; + +/// U+000C FORM FEED, abbreviated ``. +pub const FF: char = '\u{c}'; + +/// U+00A0 NON-BREAKING SPACE, abbreviated ``. +pub const NBSP: char = '\u{a0}'; + +// U+0085 NEXT LINE, abbreviated ``. +const NEL: char = '\u{85}'; + +const OGHAM_SPACE_MARK: char = '\u{1680}'; + +const EN_QUAD: char = '\u{2000}'; + +// U+200B ZERO WIDTH SPACE, abbreviated ``. +const ZWSP: char = '\u{200b}'; + +// Narrow NO-BREAK SPACE, abbreviated ``. +const NNBSP: char = '\u{202f}'; + +// U+205F MEDIUM MATHEMATICAL SPACE, abbreviated ``. +const MMSP: char = '\u{205f}'; + +const IDEOGRAPHIC_SPACE: char = '\u{3000}'; + +#[inline] +pub fn is_irregular_whitespace(c: char) -> bool { + matches!( + c, + VT | FF | NBSP | ZWNBSP | NEL | OGHAM_SPACE_MARK | EN_QUAD + ..=ZWSP | NNBSP | MMSP | IDEOGRAPHIC_SPACE + ) +} + +/// U+2028 LINE SEPARATOR, abbreviated ``. +pub const LS: char = '\u{2028}'; + +/// U+2029 PARAGRAPH SEPARATOR, abbreviated ``. +pub const PS: char = '\u{2029}'; + +pub fn is_irregular_line_terminator(c: char) -> bool { + matches!(c, LS | PS) +} + /// Returns true if it's done -type ByteHandler = Option fn(&mut SkipWhitespace<'aa>) -> u32>; +type ByteHandler = fn(&mut Lexer<'_>) -> bool; /// Lookup table for whitespace +#[rustfmt::skip] static BYTE_HANDLERS: [ByteHandler; 256] = [ - // 0 1 2 3 4 5 6 7 8 9 A B C D E F // +// 0 1 2 3 4 5 6 7 8 9 A B C D E F // ___, ___, ___, ___, ___, ___, ___, ___, ___, SPC, NLN, SPC, SPC, NLN, ___, ___, // 0 ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 1 - SPC, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 2 + SPC, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, SLH, // 2 ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 3 ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 4 ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, ___, // 5 @@ -23,208 +76,80 @@ static BYTE_HANDLERS: [ByteHandler; 256] = [ ]; /// Stop -const ___: ByteHandler = None; +const ___: ByteHandler = |_| false; /// Newline -const NLN: ByteHandler = Some(|skip| { - skip.newline = true; - - 1 -}); +const NLN: ByteHandler = |lexer| { + static NOT_REGULAR_WHITESPACE_OR_LINE_BREAK_TABLE: SafeByteMatchTable = + safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | 0x0b | 0x0c | b'\r' | b'\n')); + + lexer.state.mark_had_line_break(); + byte_search! { + lexer: lexer, + table: NOT_REGULAR_WHITESPACE_OR_LINE_BREAK_TABLE, + handle_eof: return false, + }; + true +}; /// Space -const SPC: ByteHandler = Some(|_| 1); - -/// Unicode -const UNI: ByteHandler = Some(|skip| { - // Check byte patterns directly for more efficient Unicode character processing - let bytes = skip.input.as_bytes(); - let i = skip.offset as usize; - - // Check available bytes - let remaining_bytes = bytes.len() - i; - if remaining_bytes < 1 { - return 0; - } - - // Predict UTF-8 character length from the first byte - let first_byte = unsafe { *bytes.get_unchecked(i) }; - let char_len = if first_byte < 128 { - 1 - } else if first_byte < 224 { - if remaining_bytes < 2 { - return 0; - } - 2 - } else if first_byte < 240 { - if remaining_bytes < 3 { - return 0; - } - 3 - } else { - if remaining_bytes < 4 { - return 0; - } - 4 +const SPC: ByteHandler = |lexer| { + static NOT_SPC: SafeByteMatchTable = + safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | 0x0b | 0x0c)); + + byte_search! { + lexer: lexer, + table: NOT_SPC, + handle_eof: return false, }; + true +}; - // Fast path for common Unicode whitespace characters - // Check UTF-8 byte patterns directly - if char_len == 3 { - // LSEP (U+2028) - Line Separator: E2 80 A8 - if first_byte == 0xe2 - && unsafe { *bytes.get_unchecked(i + 1) } == 0x80 - && unsafe { *bytes.get_unchecked(i + 2) } == 0xa8 - { - skip.newline = true; - return 3; - } - - // PSEP (U+2029) - Paragraph Separator: E2 80 A9 - if first_byte == 0xe2 - && unsafe { *bytes.get_unchecked(i + 1) } == 0x80 - && unsafe { *bytes.get_unchecked(i + 2) } == 0xa9 - { - skip.newline = true; - return 3; - } +const SLH: ByteHandler = |lexer| match lexer.peek() { + Some('/') => { + lexer.skip_line_comment(2); + true } + Some('*') => { + lexer.skip_block_comment(); + true + } + _ => false, +}; - // Process with general method if not handled by fast path - let s = unsafe { - // Safety: `skip.offset` is always valid - skip.input.get_unchecked(skip.offset as usize..) - }; - - let c = unsafe { - // Safety: byte handlers are only called when `skip.input` is not empty - s.chars().next().unwrap_unchecked() - }; - +/// Unicode +const UNI: ByteHandler = |lexer| { + let c = lexer.cur().unwrap(); match c { - // Byte Order Mark (BOM) - '\u{feff}' => {} - // Line break characters already handled above - '\u{2028}' | '\u{2029}' => { - skip.newline = true; + c if is_irregular_whitespace(c) => { + lexer.bump(); + true } - // Other whitespace characters - _ if c.is_whitespace() => {} - // Not a whitespace character - _ => return 0, - } - - c.len_utf8() as u32 -}); - -/// API is taked from oxc by Boshen (https://github.com/Boshen/oxc/pull/26) -pub(super) struct SkipWhitespace<'a> { - pub input: &'a str, - - /// Total offset - pub offset: u32, - - /// Found newline - pub newline: bool, -} - -impl SkipWhitespace<'_> { - #[inline(always)] - pub fn scan(&mut self) { - let bytes = self.input.as_bytes(); - let len = bytes.len(); - let mut pos = self.offset as usize; - debug_assert!(pos == 0); - debug_assert!(pos <= len); - - // Optimization: return immediately if input is empty - if pos == len { - return; + c if is_irregular_line_terminator(c) => { + lexer.bump(); + lexer.state.mark_had_line_break(); + true } - + _ => false, + } +}; + +impl<'a> Lexer<'a> { + /// Skip comments or whitespaces. + /// + /// See https://tc39.github.io/ecma262/#sec-white-space + #[inline(never)] + pub fn skip_space(&mut self) { loop { - // Optimization 1: Process consecutive spaces (most common case) at once - let mut byte = unsafe { *bytes.get_unchecked(pos) }; - - // Handle consecutive space characters (very common case) - if byte == b' ' { - pos += 1; - // Skip spaces repeatedly (process multiple spaces at once) - while pos < len && unsafe { *bytes.get_unchecked(pos) } == b' ' { - pos += 1; - } - - // Check if we've reached the end of input - if pos >= len { - break; - } - - // Get current byte again - byte = unsafe { *bytes.get_unchecked(pos) }; - } - - // Optimization 2: Handle other common whitespace characters - match byte { - b'\n' => { - pos += 1; - self.newline = true; - - if pos >= len { - break; - } - continue; - } - b'\r' => { - pos += 1; - - // Handle CR+LF sequence (Windows line break) - if pos < len && unsafe { *bytes.get_unchecked(pos) } == b'\n' { - pos += 1; - self.newline = true; - } else { - self.newline = true; // Treat standalone CR as line - // break too - } - - if pos >= len { - break; - } - continue; - } - // Case where handler is needed - _ => { - debug_assert!(byte != b' ' && byte != b'\n' && byte != b'\r'); - // Temporarily update offset - self.offset = pos as u32; - - // Use handler table - let handler = unsafe { BYTE_HANDLERS.get_unchecked(byte as usize) }; - - match handler { - Some(handler) => { - let delta = handler(self); - if delta == 0 { - // Non-whitespace character found - // offset is already updated - return; - } - pos = (self.offset + delta) as usize; - - if pos >= len { - break; - } - } - None => { - // Non-whitespace character found - // offset is already updated - return; - } - } - } + let byte = match self.input.as_str().as_bytes().first() { + Some(&v) => v, + None => return, + }; + + let handler = unsafe { *(&BYTE_HANDLERS as *const ByteHandler).offset(byte as isize) }; + if !handler(self) { + break; } } - - // Update offset to final position - self.offset = pos as u32; } } From 56d16419645dcfa89d95ad3828a836259dcf43ac Mon Sep 17 00:00:00 2001 From: CPunisher <1343316114@qq.com> Date: Tue, 4 Nov 2025 11:05:05 +0800 Subject: [PATCH 3/4] Organize --- .../swc_ecma_parser/src/lexer/whitespace.rs | 64 +++++++++++-------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/crates/swc_ecma_parser/src/lexer/whitespace.rs b/crates/swc_ecma_parser/src/lexer/whitespace.rs index c470b2661938..9a79df81e3ac 100644 --- a/crates/swc_ecma_parser/src/lexer/whitespace.rs +++ b/crates/swc_ecma_parser/src/lexer/whitespace.rs @@ -1,38 +1,44 @@ use crate::{byte_search, lexer::search::SafeByteMatchTable, safe_byte_match_table, Lexer}; -/// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated ``. -/// Considered a whitespace character in JS. -pub const ZWNBSP: char = '\u{feff}'; - /// U+000B VERTICAL TAB, abbreviated ``. -pub const VT: char = '\u{b}'; +const B_VT: u8 = 0x0b; /// U+000C FORM FEED, abbreviated ``. -pub const FF: char = '\u{c}'; +const B_FF: u8 = 0x0c; -/// U+00A0 NON-BREAKING SPACE, abbreviated ``. -pub const NBSP: char = '\u{a0}'; +// https://github.com/oxc-project/oxc/blob/ec6721c458d64c5b27b78542aa205d70b06edf9a/crates/oxc_syntax/src/identifier.rs#L70 +#[inline] +pub fn is_irregular_whitespace(c: char) -> bool { + /// U+FEFF ZERO WIDTH NO-BREAK SPACE, abbreviated ``. + /// Considered a whitespace character in JS. + const ZWNBSP: char = '\u{feff}'; -// U+0085 NEXT LINE, abbreviated ``. -const NEL: char = '\u{85}'; + /// U+000B VERTICAL TAB, abbreviated ``. + const VT: char = '\u{b}'; -const OGHAM_SPACE_MARK: char = '\u{1680}'; + /// U+000C FORM FEED, abbreviated ``. + const FF: char = '\u{c}'; -const EN_QUAD: char = '\u{2000}'; + /// U+00A0 NON-BREAKING SPACE, abbreviated ``. + const NBSP: char = '\u{a0}'; -// U+200B ZERO WIDTH SPACE, abbreviated ``. -const ZWSP: char = '\u{200b}'; + /// U+0085 NEXT LINE, abbreviated ``. + const NEL: char = '\u{85}'; -// Narrow NO-BREAK SPACE, abbreviated ``. -const NNBSP: char = '\u{202f}'; + const OGHAM_SPACE_MARK: char = '\u{1680}'; -// U+205F MEDIUM MATHEMATICAL SPACE, abbreviated ``. -const MMSP: char = '\u{205f}'; + const EN_QUAD: char = '\u{2000}'; -const IDEOGRAPHIC_SPACE: char = '\u{3000}'; + /// U+200B ZERO WIDTH SPACE, abbreviated ``. + const ZWSP: char = '\u{200b}'; -#[inline] -pub fn is_irregular_whitespace(c: char) -> bool { + /// Narrow NO-BREAK SPACE, abbreviated ``. + const NNBSP: char = '\u{202f}'; + + /// U+205F MEDIUM MATHEMATICAL SPACE, abbreviated ``. + const MMSP: char = '\u{205f}'; + + const IDEOGRAPHIC_SPACE: char = '\u{3000}'; matches!( c, VT | FF | NBSP | ZWNBSP | NEL | OGHAM_SPACE_MARK | EN_QUAD @@ -40,13 +46,15 @@ pub fn is_irregular_whitespace(c: char) -> bool { ) } -/// U+2028 LINE SEPARATOR, abbreviated ``. -pub const LS: char = '\u{2028}'; +// https://github.com/oxc-project/oxc/blob/ec6721c458d64c5b27b78542aa205d70b06edf9a/crates/oxc_syntax/src/identifier.rs#L102 +#[inline] +pub fn is_irregular_line_terminator(c: char) -> bool { + /// U+2028 LINE SEPARATOR, abbreviated ``. + const LS: char = '\u{2028}'; -/// U+2029 PARAGRAPH SEPARATOR, abbreviated ``. -pub const PS: char = '\u{2029}'; + /// U+2029 PARAGRAPH SEPARATOR, abbreviated ``. + const PS: char = '\u{2029}'; -pub fn is_irregular_line_terminator(c: char) -> bool { matches!(c, LS | PS) } @@ -81,7 +89,7 @@ const ___: ByteHandler = |_| false; /// Newline const NLN: ByteHandler = |lexer| { static NOT_REGULAR_WHITESPACE_OR_LINE_BREAK_TABLE: SafeByteMatchTable = - safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | 0x0b | 0x0c | b'\r' | b'\n')); + safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | B_VT | B_FF | b'\r' | b'\n')); lexer.state.mark_had_line_break(); byte_search! { @@ -95,7 +103,7 @@ const NLN: ByteHandler = |lexer| { /// Space const SPC: ByteHandler = |lexer| { static NOT_SPC: SafeByteMatchTable = - safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | 0x0b | 0x0c)); + safe_byte_match_table!(|b| !matches!(b, b' ' | b'\t' | B_VT | B_FF)); byte_search! { lexer: lexer, From 0a74aa153931e532775276c60010e0ddb703bba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Donny/=EA=B0=95=EB=8F=99=EC=9C=A4?= Date: Tue, 4 Nov 2025 12:29:09 +0900 Subject: [PATCH 4/4] Create fluffy-cobras-agree.md --- .changeset/fluffy-cobras-agree.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changeset/fluffy-cobras-agree.md diff --git a/.changeset/fluffy-cobras-agree.md b/.changeset/fluffy-cobras-agree.md new file mode 100644 index 000000000000..dc331e244c65 --- /dev/null +++ b/.changeset/fluffy-cobras-agree.md @@ -0,0 +1,6 @@ +--- +swc_core: patch +swc_ecma_parser: patch +--- + +perf(es/parser): optimize `skip_space`