Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 46 additions & 7 deletions crates/oxc_codegen/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -335,9 +335,10 @@ enum Escape {
DQ = 11, // " - Double quote
BQ = 12, // ` - Backtick quote
DO = 13, // $ - Dollar sign
LS = 14, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
NB = 15, // NBSP - Non-breaking space (first byte)
LO = 16, // � - U+FFFD lossy replacement character (first byte)
LT = 14, // < - Less-than sign
LS = 15, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
NB = 16, // NBSP - Non-breaking space (first byte)
LO = 17, // � - U+FFFD lossy replacement character (first byte)
}

/// Struct which ensures content is aligned on 128.
Expand All @@ -357,7 +358,7 @@ static ESCAPES: Aligned128<[Escape; 256]> = {
NU, __, __, __, __, __, __, BE, BK, __, NL, VT, FF, CR, __, __, // 0
__, __, __, __, __, __, __, __, __, __, __, ES, __, __, __, __, // 1
__, __, DQ, __, DO, __, __, SQ, __, __, __, __, __, __, __, __, // 2
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
__, __, __, __, __, __, __, __, __, __, __, __, LT, __, __, __, // 3
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
BQ, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
Expand All @@ -380,9 +381,10 @@ type ByteHandler = unsafe fn(&mut Codegen, &mut PrintStringState);
/// Indexed by `escape as usize - 1` (where `escape` is not `Escape::__`).
/// Must be in same order as discriminants in `Escape`.
///
/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 128 bytes in total.
/// Aligned on 128, so occupies a pair of L1 cache lines.
static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 136 bytes in total.
/// Aligned on 128, so first 16 occupy a pair of L1 cache lines.
/// The last will be in separate cache line, but it should be vanishingly rare that it's accessed.
static BYTE_HANDLERS: Aligned128<[ByteHandler; 17]> = Aligned128([
print_null,
print_bell,
print_backspace,
Expand All @@ -396,6 +398,7 @@ static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
print_double_quote,
print_backtick,
print_dollar,
print_less_than,
print_ls_or_ps,
print_non_breaking_space,
print_lossy_replacement,
Expand Down Expand Up @@ -574,6 +577,42 @@ unsafe fn print_dollar(codegen: &mut Codegen, state: &mut PrintStringState) {
}
}

// <
unsafe fn print_less_than(codegen: &mut Codegen, state: &mut PrintStringState) {
debug_assert_eq!(state.peek(), Some(b'<'));

// Get slice of remaining bytes, including leading `<`
let slice = state.bytes.as_slice();

// SAFETY: Next byte is `<`, which is ASCII
unsafe { state.consume_byte_unchecked() };

// We have to check 2nd byte separately as `next8_lower_case == *b"</script"`
// would also match `<\x0Fscript` (0xF | 32 == b'/').
if slice.len() >= 8 && slice[1] == b'/' {
// Compiler condenses these operations to an 8-byte read, u64 AND, and u64 compare.
// https://godbolt.org/z/9ndYnbj53
let next8: [u8; 8] = slice[0..8].try_into().unwrap();
let mut next8_lower_case = [0; 8];
for i in 0..8 {
// `| 32` converts ASCII upper case letters to lower case. `<` and `/` are unaffected.
next8_lower_case[i] = next8[i] | 32;
}

if next8_lower_case == *b"</script" {
// Flush up to and including `<`. Skip `/`. Write `\/` instead. Then skip over `script`.
// Next chunk starts with `script`.
// SAFETY: We already consumed `<`. Next byte is `/`, which is ASCII.
unsafe { state.flush_and_consume_byte(codegen) };
// SAFETY: `slice.len() >= 8` check above ensures there are 6 bytes left, after consuming 2 already.
// `script` / `SCRIPT` is all ASCII bytes, so skipping them leaves `bytes` iterator
// positioned on UTF-8 char boundary.
unsafe { state.consume_bytes_unchecked::<6>() };
codegen.print_str("\\/");
}
}
}

// 0xE2 - first byte of <LS> or <PS>
unsafe fn print_ls_or_ps(codegen: &mut Codegen, state: &mut PrintStringState) {
debug_assert_eq!(state.peek(), Some(0xE2));
Expand Down
4 changes: 4 additions & 0 deletions crates/oxc_codegen/tests/integration/esbuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,10 @@ fn test_jsx_single_line() {
fn test_avoid_slash_script() {
// Positive cases
test("x = '</script'", "x = \"<\\/script\";\n");
test("x = '</SCRIPT'", "x = \"<\\/SCRIPT\";\n");
test("x = '</ScRiPt'", "x = \"<\\/ScRiPt\";\n");
test("x = 'abc </script def'", "x = \"abc <\\/script def\";\n");
test("x = 'abc </ScRiPt def'", "x = \"abc <\\/ScRiPt def\";\n");
test("x = `</script`", "x = `<\\/script`;\n");
test("x = `</SCRIPT`", "x = `<\\/SCRIPT`;\n");
test("x = `</ScRiPt`", "x = `<\\/ScRiPt`;\n");
Expand Down
Loading