22//!
33//! Code adapted from
44//! * [esbuild](https://github.com/evanw/esbuild/blob/v0.24.0/internal/js_printer/js_printer.go)
5+
56#![ warn( missing_docs) ]
67
8+ use std:: { cmp, slice} ;
9+
10+ use oxc_data_structures:: pointer_ext:: PointerExt ;
11+
712mod binary_expr_visitor;
813mod comment;
914mod context;
@@ -30,7 +35,7 @@ use crate::{
3035 comment:: CommentsMap ,
3136 operator:: Operator ,
3237 sourcemap_builder:: SourcemapBuilder ,
33- str:: { Quote , is_script_close_tag} ,
38+ str:: { Quote , cold_branch , is_script_close_tag} ,
3439} ;
3540pub use crate :: {
3641 context:: Context ,
@@ -236,35 +241,113 @@ impl<'a> Codegen<'a> {
236241 /// Push str into the buffer, escaping `</script` to `<\/script`.
237242 #[ inline]
238243 pub fn print_str_escaping_script_close_tag ( & mut self , s : & str ) {
239- let slice = s. as_bytes ( ) ;
244+ // `</script` will be very rare. So we try to make the search as quick as possible by:
245+ // 1. Searching for `<` first, and only checking if followed by `/script` once `<` is found.
246+ // 2. Searching longer strings for `<` in chunks of 16 bytes using SIMD, and only doing the
247+ // more expensive byte-by-byte search once a `<` is found.
248+
249+ let bytes = s. as_bytes ( ) ;
240250 let mut consumed = 0 ;
241- let mut i = 0 ;
242-
243- // Only check when remaining string has length larger than 8
244- while i + 8 <= slice. len ( ) {
245- if is_script_close_tag ( & slice[ i..i + 8 ] ) {
246- // Push str up to and including `<`. Skip `/`. Write `\/` instead.
247- // Skip over `script` - it'll be written in next chunk.
248- // SAFETY:
249- // The slice is guaranteed to be a valid UTF-8 string.
250- // `consumed` is always on a UTF-8 char boundary.
251- // `i` is on `<`, so `i + 1` is a UTF-8 char boundary.
252- unsafe {
253- self . code . print_bytes_unchecked ( & slice[ consumed..=i] ) ;
251+
252+ #[ expect( clippy:: unnecessary_safety_comment) ]
253+ // Search range of bytes for `</script`, byte by byte.
254+ //
255+ // Bytes between `ptr` and `last_ptr` (inclusive) are searched for `<`.
256+ // If `<` is found, the following 7 bytes are checked to see if they're `/script`.
257+ //
258+ // SAFETY:
259+ // * `ptr` and `last_ptr` must be within bounds of `bytes`.
260+ // * `last_ptr` must be greater than or equal to `ptr`.
261+ // * `last_ptr` must be no later than 8 bytes before end of string.
262+ // i.e. safe to read 8 bytes at `last_ptr`.
263+ let mut search_bytes = |mut ptr : * const u8 , last_ptr| {
264+ loop {
265+ // SAFETY: `ptr` is always less than or equal to `last_ptr`.
266+ // `last_ptr` is within bounds of `bytes`, so safe to read a byte at `ptr`.
267+ let byte = unsafe { * ptr. as_ref ( ) . unwrap_unchecked ( ) } ;
268+ if byte == b'<' {
269+ // SAFETY: `ptr <= last_ptr`, and `last_ptr` points to no later than
270+ // 8 bytes before end of string, so safe to read 8 bytes from `ptr`
271+ let slice = unsafe { slice:: from_raw_parts ( ptr, 8 ) } ;
272+ if is_script_close_tag ( slice) {
273+ // Push str up to and including `<`. Skip `/`. Write `\/` instead.
274+ // SAFETY:
275+ // `consumed` is initially 0, and only updated below to be after `/`,
276+ // so in bounds, and on a UTF-8 char boundary.
277+ // `index` is on `<`, so `index + 1` is in bounds and a UTF-8 char boundary.
278+ // `consumed` is always less than `index + 1` as it's set on a previous round.
279+ unsafe {
280+ let index = ptr. offset_from_usize ( bytes. as_ptr ( ) ) ;
281+ let before = bytes. get_unchecked ( consumed..=index) ;
282+ self . code . print_bytes_unchecked ( before) ;
283+
284+ // Set `consumed` to after `/`
285+ consumed = index + 2 ;
286+ }
287+ self . print_str ( "\\ /" ) ;
288+ // Note: We could advance `ptr` by 8 bytes here to skip over `</script`,
289+ // but this branch will be very rarely taken, so it's better to keep it simple
290+ }
254291 }
255- self . code . print_str ( "\\ /" ) ;
256- consumed = i + 2 ;
257- i += 8 ;
258- } else {
259- i += 1 ;
292+
293+ if ptr == last_ptr {
294+ break ;
295+ }
296+ // SAFETY: `ptr` is less than `last_ptr`, which is in bounds, so safe to increment `ptr`
297+ ptr = unsafe { ptr. add ( 1 ) } ;
298+ }
299+ } ;
300+
301+ // Search string in chunks of 16 bytes
302+ let mut chunks = bytes. chunks_exact ( 16 ) ;
303+ for ( chunk_index, chunk) in chunks. by_ref ( ) . enumerate ( ) {
304+ #[ expect( clippy:: missing_panics_doc, reason = "infallible" ) ]
305+ let chunk: & [ u8 ; 16 ] = chunk. try_into ( ) . unwrap ( ) ;
306+
307+ // Compiler vectorizes this loop to a few SIMD ops
308+ let mut contains_lt = false ;
309+ for & byte in chunk {
310+ if byte == b'<' {
311+ contains_lt = true ;
312+ }
313+ }
314+
315+ if contains_lt {
316+ // Chunk contains at least one `<`.
317+ // Find them, and check if they're the start of `</script`.
318+ //
319+ // SAFETY: `index` is byte index of start of chunk.
320+ // We search bytes starting with first byte of chunk, and ending with last byte of chunk.
321+ // i.e. `index` to `index + 15` (inclusive).
322+ // If this chunk is towards the end of the string, reduce the range of bytes searched
323+ // so the last byte searched has at least 7 further bytes after it.
324+ // i.e. safe to read 8 bytes at `last_ptr`.
325+ cold_branch ( || unsafe {
326+ let index = chunk_index * 16 ;
327+ let remaining_bytes = bytes. len ( ) - index;
328+ let last_offset = cmp:: min ( remaining_bytes - 8 , 15 ) ;
329+ let ptr = bytes. as_ptr ( ) . add ( index) ;
330+ let last_ptr = ptr. add ( last_offset) ;
331+ search_bytes ( ptr, last_ptr) ;
332+ } ) ;
260333 }
261334 }
262335
263- // SAFETY:
264- // The slice guarantees to be a valid UTF-8 string.
265- // The consumed index is always pointed to a UTF-8 char boundary.
336+ // Search last chunk byte-by-byte.
337+ // Skip this if fewer than 8 bytes remaining, because fewer than 8 bytes can't contain `</script`.
338+ let last_chunk = chunks. remainder ( ) ;
339+ if last_chunk. len ( ) >= 8 {
340+ let ptr = last_chunk. as_ptr ( ) ;
341+ // SAFETY: `last_chunk.len() >= 8`, so `- 8` cannot wrap.
342+ // `last_chunk.as_ptr().add(last_chunk.len() - 8)` is in bounds of `last_chunk`.
343+ let last_ptr = unsafe { ptr. add ( last_chunk. len ( ) - 8 ) } ;
344+ search_bytes ( ptr, last_ptr) ;
345+ }
346+
347+ // SAFETY: `consumed` is either 0, or after `/`, so on a UTF-8 char boundary, and in bounds
266348 unsafe {
267- self . code . print_bytes_unchecked ( & slice[ consumed..] ) ;
349+ let remaining = bytes. get_unchecked ( consumed..) ;
350+ self . code . print_bytes_unchecked ( remaining) ;
268351 }
269352 }
270353
0 commit comments