From a36061e1db217e069769c930b3f8e7eafaf974bd Mon Sep 17 00:00:00 2001 From: overlookmotel Date: Wed, 9 Apr 2025 18:38:21 +0100 Subject: [PATCH] fix(codegen): escape `" = { NU, __, __, __, __, __, __, BE, BK, __, NL, VT, FF, CR, __, __, // 0 __, __, __, __, __, __, __, __, __, __, __, ES, __, __, __, __, // 1 __, __, DQ, __, DO, __, __, SQ, __, __, __, __, __, __, __, __, // 2 - __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 + __, __, __, __, __, __, __, __, __, __, __, __, LT, __, __, __, // 3 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 BQ, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 @@ -380,9 +381,10 @@ type ByteHandler = unsafe fn(&mut Codegen, &mut PrintStringState); /// Indexed by `escape as usize - 1` (where `escape` is not `Escape::__`). /// Must be in same order as discriminants in `Escape`. /// -/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 128 bytes in total. -/// Aligned on 128, so occupies a pair of L1 cache lines. -static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([ +/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 136 bytes in total. +/// Aligned on 128, so first 16 occupy a pair of L1 cache lines. +/// The last will be in separate cache line, but it should be vanishingly rare that it's accessed. +static BYTE_HANDLERS: Aligned128<[ByteHandler; 17]> = Aligned128([ print_null, print_bell, print_backspace, @@ -396,6 +398,7 @@ static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([ print_double_quote, print_backtick, print_dollar, + print_less_than, print_ls_or_ps, print_non_breaking_space, print_lossy_replacement, @@ -574,6 +577,42 @@ unsafe fn print_dollar(codegen: &mut Codegen, state: &mut PrintStringState) { } } +// < +unsafe fn print_less_than(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'<')); + + // Get slice of remaining bytes, including leading `<` + let slice = state.bytes.as_slice(); + + // SAFETY: Next byte is `<`, which is ASCII + unsafe { state.consume_byte_unchecked() }; + + // We have to check 2nd byte separately as `next8_lower_case == *b"= 8 && slice[1] == b'/' { + // Compiler condenses these operations to an 8-byte read, u64 AND, and u64 compare. + // https://godbolt.org/z/9ndYnbj53 + let next8: [u8; 8] = slice[0..8].try_into().unwrap(); + let mut next8_lower_case = [0; 8]; + for i in 0..8 { + // `| 32` converts ASCII upper case letters to lower case. `<` and `/` are unaffected. + next8_lower_case[i] = next8[i] | 32; + } + + if next8_lower_case == *b"= 8` check above ensures there are 6 bytes left, after consuming 2 already. + // `script` / `SCRIPT` is all ASCII bytes, so skipping them leaves `bytes` iterator + // positioned on UTF-8 char boundary. + unsafe { state.consume_bytes_unchecked::<6>() }; + codegen.print_str("\\/"); + } + } +} + // 0xE2 - first byte of or unsafe fn print_ls_or_ps(codegen: &mut Codegen, state: &mut PrintStringState) { debug_assert_eq!(state.peek(), Some(0xE2)); diff --git a/crates/oxc_codegen/tests/integration/esbuild.rs b/crates/oxc_codegen/tests/integration/esbuild.rs index 6e75a376d4939..3e00a70cef7be 100644 --- a/crates/oxc_codegen/tests/integration/esbuild.rs +++ b/crates/oxc_codegen/tests/integration/esbuild.rs @@ -1011,6 +1011,10 @@ fn test_jsx_single_line() { fn test_avoid_slash_script() { // Positive cases test("x = '