fix(codegen): escape "</script"

overlookmotel · xu-cheng · commit 33dcd441c02b · 2025-06-17T11:29:33.000-07:00
diff --git a/crates/oxc_codegen/src/str.rs b/crates/oxc_codegen/src/str.rs
@@ -340,9 +340,10 @@ enum Escape {
     DQ = 11, // "     - Double quote
     BQ = 12, // `     - Backtick quote
     DO = 13, // $     - Dollar sign
-    LS = 14, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
-    NB = 15, // NBSP  - Non-breaking space (first byte)
-    LO = 16, // �     - U+FFFD lossy replacement character (first byte)
+    LT = 14, // <     - Less-than sign
+    LS = 15, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
+    NB = 16, // NBSP  - Non-breaking space (first byte)
+    LO = 17, // �     - U+FFFD lossy replacement character (first byte)
 }
 
 /// Struct which ensures content is aligned on 128.
@@ -362,7 +363,7 @@ static ESCAPES: Aligned128<[Escape; 256]> = {
         NU, __, __, __, __, __, __, BE, BK, __, NL, VT, FF, CR, __, __, // 0
         __, __, __, __, __, __, __, __, __, __, __, ES, __, __, __, __, // 1
         __, __, DQ, __, DO, __, __, SQ, __, __, __, __, __, __, __, __, // 2
-        __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
+        __, __, __, __, __, __, __, __, __, __, __, __, LT, __, __, __, // 3
         __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
         __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
         BQ, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
@@ -385,9 +386,10 @@ type ByteHandler = unsafe fn(&mut Codegen, &mut PrintStringState);
 /// Indexed by `escape as usize - 1` (where `escape` is not `Escape::__`).
 /// Must be in same order as discriminants in `Escape`.
 ///
-/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 128 bytes in total.
-/// Aligned on 128, so occupies a pair of L1 cache lines.
-static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
+/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 136 bytes in total.
+/// Aligned on 128, so first 16 occupy a pair of L1 cache lines.
+/// The last will be in separate cache line, but it should be vanishingly rare that it's accessed.
+static BYTE_HANDLERS: Aligned128<[ByteHandler; 17]> = Aligned128([
     print_null,
     print_bell,
     print_backspace,
@@ -401,6 +403,7 @@ static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
     print_double_quote,
     print_backtick,
     print_dollar,
+    print_less_than,
     print_ls_or_ps,
     print_non_breaking_space,
     print_lossy_replacement,
@@ -579,6 +582,42 @@ unsafe fn print_dollar(codegen: &mut Codegen, state: &mut PrintStringState) {
     }
 }
 
+// <
+unsafe fn print_less_than(codegen: &mut Codegen, state: &mut PrintStringState) {
+    debug_assert_eq!(state.peek(), Some(b'<'));
+
+    // Get slice of remaining bytes, including leading `<`
+    let slice = state.bytes.as_slice();
+
+    // SAFETY: Next byte is `<`, which is ASCII
+    unsafe { state.consume_byte_unchecked() };
+
+    // We have to check 2nd byte separately as `next8_lower_case == *b"</script"`
+    // would also match `<\x0Fscript` (0xF | 32 == b'/').
+    if slice.len() >= 8 && slice[1] == b'/' {
+        // Compiler condenses these operations to an 8-byte read, u64 AND, and u64 compare.
+        // https://godbolt.org/z/9ndYnbj53
+        let next8: [u8; 8] = slice[0..8].try_into().unwrap();
+        let mut next8_lower_case = [0; 8];
+        for i in 0..8 {
+            // `| 32` converts ASCII upper case letters to lower case. `<` and `/` are unaffected.
+            next8_lower_case[i] = next8[i] | 32;
+        }
+
+        if next8_lower_case == *b"</script" {
+            // Flush up to and including `<`. Skip `/`. Write `\/` instead. Then skip over `script`.
+            // Next chunk starts with `script`.
+            // SAFETY: We already consumed `<`. Next byte is `/`, which is ASCII.
+            unsafe { state.flush_and_consume_byte(codegen) };
+            // SAFETY: `slice.len() >= 8` check above ensures there are 6 bytes left, after consuming 2 already.
+            // `script` / `SCRIPT` is all ASCII bytes, so skipping them leaves `bytes` iterator
+            // positioned on UTF-8 char boundary.
+            unsafe { state.consume_bytes_unchecked::<6>() };
+            codegen.print_str("\\/");
+        }
+    }
+}
+
 // 0xE2 - first byte of <LS> or <PS>
 unsafe fn print_ls_or_ps(codegen: &mut Codegen, state: &mut PrintStringState) {
     debug_assert_eq!(state.peek(), Some(0xE2));
diff --git a/crates/oxc_codegen/tests/integration/esbuild.rs b/crates/oxc_codegen/tests/integration/esbuild.rs
@@ -1018,6 +1018,10 @@ fn test_jsx_single_line() {
 fn test_avoid_slash_script() {
     // Positive cases
     test("x = '</script'", "x = \"<\\/script\";\n");
+    test("x = '</SCRIPT'", "x = \"<\\/SCRIPT\";\n");
+    test("x = '</ScRiPt'", "x = \"<\\/ScRiPt\";\n");
+    test("x = 'abc </script def'", "x = \"abc <\\/script def\";\n");
+    test("x = 'abc </ScRiPt def'", "x = \"abc <\\/ScRiPt def\";\n");
     test("x = `</script`", "x = `<\\/script`;\n");
     test("x = `</SCRIPT`", "x = `<\\/SCRIPT`;\n");
     test("x = `</ScRiPt`", "x = `<\\/ScRiPt`;\n");