Skip to content

Commit 33dcd44

Browse files
overlookmotelxu-cheng
authored and committed
fix(codegen): escape "</script"
1 parent 21c8852 commit 33dcd44

File tree

2 files changed

+50
-7
lines changed

2 files changed

+50
-7
lines changed

crates/oxc_codegen/src/str.rs

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -340,9 +340,10 @@ enum Escape {
340340
DQ = 11, // " - Double quote
341341
BQ = 12, // ` - Backtick quote
342342
DO = 13, // $ - Dollar sign
343-
LS = 14, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
344-
NB = 15, // NBSP - Non-breaking space (first byte)
345-
LO = 16, // � - U+FFFD lossy replacement character (first byte)
343+
LT = 14, // < - Less-than sign
344+
LS = 15, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
345+
NB = 16, // NBSP - Non-breaking space (first byte)
346+
LO = 17, // � - U+FFFD lossy replacement character (first byte)
346347
}
347348

348349
/// Struct which ensures content is aligned on 128.
@@ -362,7 +363,7 @@ static ESCAPES: Aligned128<[Escape; 256]> = {
362363
NU, __, __, __, __, __, __, BE, BK, __, NL, VT, FF, CR, __, __, // 0
363364
__, __, __, __, __, __, __, __, __, __, __, ES, __, __, __, __, // 1
364365
__, __, DQ, __, DO, __, __, SQ, __, __, __, __, __, __, __, __, // 2
365-
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
366+
__, __, __, __, __, __, __, __, __, __, __, __, LT, __, __, __, // 3
366367
__, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
367368
__, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
368369
BQ, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
@@ -385,9 +386,10 @@ type ByteHandler = unsafe fn(&mut Codegen, &mut PrintStringState);
385386
/// Indexed by `escape as usize - 1` (where `escape` is not `Escape::__`).
386387
/// Must be in same order as discriminants in `Escape`.
387388
///
388-
/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 128 bytes in total.
389-
/// Aligned on 128, so occupies a pair of L1 cache lines.
390-
static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
389+
/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 136 bytes in total.
390+
/// Aligned on 128, so first 16 occupy a pair of L1 cache lines.
391+
/// The last will be in a separate cache line, but it should be vanishingly rare that it's accessed.
392+
static BYTE_HANDLERS: Aligned128<[ByteHandler; 17]> = Aligned128([
391393
print_null,
392394
print_bell,
393395
print_backspace,
@@ -401,6 +403,7 @@ static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
401403
print_double_quote,
402404
print_backtick,
403405
print_dollar,
406+
print_less_than,
404407
print_ls_or_ps,
405408
print_non_breaking_space,
406409
print_lossy_replacement,
@@ -579,6 +582,42 @@ unsafe fn print_dollar(codegen: &mut Codegen, state: &mut PrintStringState) {
579582
}
580583
}
581584

585+
// <
586+
/// Byte handler for `<`. Consumes the `<` and, when it begins `</script` (letters matched
/// ASCII case-insensitively), replaces the following `/` with `\/` in the output,
/// so that e.g. `</ScRiPt` is printed as `<\/ScRiPt`.
///
/// For any other `<`, this function only consumes the byte and prints nothing itself.
///
/// # Safety
/// NOTE(review): presumably the caller must guarantee that the next byte in `state` is `<`.
/// That is only verified by a `debug_assert_eq!`, and the unchecked consume below relies on it.
unsafe fn print_less_than(codegen: &mut Codegen, state: &mut PrintStringState) {
587+
debug_assert_eq!(state.peek(), Some(b'<'));
588+
589+
// Get slice of remaining bytes, including leading `<`.
// This is taken before `<` is consumed, so `slice[0]` is the `<` and `slice[1]` is the byte after it.
590+
let slice = state.bytes.as_slice();
591+
592+
// SAFETY: Next byte is `<`, which is ASCII
593+
unsafe { state.consume_byte_unchecked() };
594+
595+
// We have to check 2nd byte separately as `next8_lower_case == *b"</script"`
596+
// would also match `<\x0Fscript` (0xF | 32 == b'/').
// The `slice.len() >= 8` check also guarantees `slice[0..8]` below is in bounds.
597+
if slice.len() >= 8 && slice[1] == b'/' {
598+
// Compiler condenses these operations to an 8-byte read, u64 AND, and u64 compare.
599+
// https://godbolt.org/z/9ndYnbj53
600+
let next8: [u8; 8] = slice[0..8].try_into().unwrap();
601+
let mut next8_lower_case = [0; 8];
602+
for i in 0..8 {
603+
// `| 32` converts ASCII upper case letters to lower case. `<` and `/` are unaffected.
604+
next8_lower_case[i] = next8[i] | 32;
605+
}
606+
607+
if next8_lower_case == *b"</script" {
608+
// Flush up to and including `<`. Skip `/`. Write `\/` instead. Then skip over `script`.
609+
// Next chunk starts with `script`.
// The letters are not printed here, so their original case is kept in the output.
610+
// SAFETY: We already consumed `<`. Next byte is `/`, which is ASCII.
611+
unsafe { state.flush_and_consume_byte(codegen) };
612+
// SAFETY: `slice.len() >= 8` check above ensures there are 6 bytes left, after consuming 2 already.
613+
// `script` / `SCRIPT` is all ASCII bytes, so skipping them leaves `bytes` iterator
614+
// positioned on UTF-8 char boundary.
615+
unsafe { state.consume_bytes_unchecked::<6>() };
// Emit the escaped replacement for the `/` that was skipped above.
616+
codegen.print_str("\\/");
617+
}
618+
}
619+
}
620+
582621
// 0xE2 - first byte of <LS> or <PS>
583622
unsafe fn print_ls_or_ps(codegen: &mut Codegen, state: &mut PrintStringState) {
584623
debug_assert_eq!(state.peek(), Some(0xE2));

crates/oxc_codegen/tests/integration/esbuild.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,6 +1018,10 @@ fn test_jsx_single_line() {
10181018
fn test_avoid_slash_script() {
10191019
// Positive cases
10201020
test("x = '</script'", "x = \"<\\/script\";\n");
1021+
test("x = '</SCRIPT'", "x = \"<\\/SCRIPT\";\n");
1022+
test("x = '</ScRiPt'", "x = \"<\\/ScRiPt\";\n");
1023+
test("x = 'abc </script def'", "x = \"abc <\\/script def\";\n");
1024+
test("x = 'abc </ScRiPt def'", "x = \"abc <\\/ScRiPt def\";\n");
10211025
test("x = `</script`", "x = `<\\/script`;\n");
10221026
test("x = `</SCRIPT`", "x = `<\\/SCRIPT`;\n");
10231027
test("x = `</ScRiPt`", "x = `<\\/ScRiPt`;\n");

0 commit comments

Comments
 (0)