@@ -340,9 +340,10 @@ enum Escape {
340340 DQ = 11 , // " - Double quote
341341 BQ = 12 , // ` - Backtick quote
342342 DO = 13 , // $ - Dollar sign
343- LS = 14 , // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
344- NB = 15 , // NBSP - Non-breaking space (first byte)
345- LO = 16 , // � - U+FFFD lossy replacement character (first byte)
343+ LT = 14 , // < - Less-than sign
344+ LS = 15 , // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte)
345+ NB = 16 , // NBSP - Non-breaking space (first byte)
346+ LO = 17 , // � - U+FFFD lossy replacement character (first byte)
346347}
347348
348349/// Struct which ensures content is aligned on 128.
@@ -362,7 +363,7 @@ static ESCAPES: Aligned128<[Escape; 256]> = {
362363 NU , __, __, __, __, __, __, BE , BK , __, NL , VT , FF , CR , __, __, // 0
363364 __, __, __, __, __, __, __, __, __, __, __, ES , __, __, __, __, // 1
364365 __, __, DQ , __, DO , __, __, SQ , __, __, __, __, __, __, __, __, // 2
365- __, __, __, __, __, __, __, __, __, __, __, __, __ , __, __, __, // 3
366+ __, __, __, __, __, __, __, __, __, __, __, __, LT , __, __, __, // 3
366367 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
367368 __, __, __, __, __, __, __, __, __, __, __, __, BS , __, __, __, // 5
368369 BQ , __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
@@ -385,9 +386,10 @@ type ByteHandler = unsafe fn(&mut Codegen, &mut PrintStringState);
385386/// Indexed by `escape as usize - 1` (where `escape` is not `Escape::__`).
386387/// Must be in same order as discriminants in `Escape`.
387388///
388- /// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 128 bytes in total.
389- /// Aligned on 128, so occupies a pair of L1 cache lines.
390- static BYTE_HANDLERS : Aligned128 < [ ByteHandler ; 16 ] > = Aligned128 ( [
389+ /// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 136 bytes in total.
390+ /// Aligned on 128, so first 16 occupy a pair of L1 cache lines.
391+ /// The last handler falls in a separate cache line, but it should be vanishingly rare that it's accessed.
392+ static BYTE_HANDLERS : Aligned128 < [ ByteHandler ; 17 ] > = Aligned128 ( [
391393 print_null,
392394 print_bell,
393395 print_backspace,
@@ -401,6 +403,7 @@ static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([
401403 print_double_quote,
402404 print_backtick,
403405 print_dollar,
406+ print_less_than,
404407 print_ls_or_ps,
405408 print_non_breaking_space,
406409 print_lossy_replacement,
@@ -579,6 +582,42 @@ unsafe fn print_dollar(codegen: &mut Codegen, state: &mut PrintStringState) {
579582 }
580583}
581584
585+ // <
586+ unsafe fn print_less_than ( codegen : & mut Codegen , state : & mut PrintStringState ) {
587+ debug_assert_eq ! ( state. peek( ) , Some ( b'<' ) ) ;
588+
589+ // Get slice of remaining bytes, including leading `<`
590+ let slice = state. bytes . as_slice ( ) ;
591+
592+ // SAFETY: Next byte is `<`, which is ASCII
593+ unsafe { state. consume_byte_unchecked ( ) } ;
594+
595+ // We have to check 2nd byte separately as `next8_lower_case == *b"</script"`
596+ // would also match `<\x0Fscript` (0xF | 32 == b'/').
597+ if slice. len ( ) >= 8 && slice[ 1 ] == b'/' {
598+ // Compiler condenses these operations to an 8-byte read, u64 AND, and u64 compare.
599+ // https://godbolt.org/z/9ndYnbj53
600+ let next8: [ u8 ; 8 ] = slice[ 0 ..8 ] . try_into ( ) . unwrap ( ) ;
601+ let mut next8_lower_case = [ 0 ; 8 ] ;
602+ for i in 0 ..8 {
603+ // `| 32` converts ASCII upper case letters to lower case. `<` and `/` are unaffected.
604+ next8_lower_case[ i] = next8[ i] | 32 ;
605+ }
606+
607+ if next8_lower_case == * b"</script" {
608+ // Flush up to and including `<`. Skip `/`. Write `\/` instead. Then skip over `script`.
609+ // Next chunk starts with `script`.
610+ // SAFETY: We already consumed `<`. Next byte is `/`, which is ASCII.
611+ unsafe { state. flush_and_consume_byte ( codegen) } ;
612+ // SAFETY: `slice.len() >= 8` check above ensures there are 6 bytes left, after consuming 2 already.
613+ // `script` / `SCRIPT` is all ASCII bytes, so skipping them leaves `bytes` iterator
614+ // positioned on UTF-8 char boundary.
615+ unsafe { state. consume_bytes_unchecked :: < 6 > ( ) } ;
616+ codegen. print_str ( "\\ /" ) ;
617+ }
618+ }
619+ }
620+
582621// 0xE2 - first byte of <LS> or <PS>
583622unsafe fn print_ls_or_ps ( codegen : & mut Codegen , state : & mut PrintStringState ) {
584623 debug_assert_eq ! ( state. peek( ) , Some ( 0xE2 ) ) ;
0 commit comments