@@ -95,13 +95,14 @@ where
9595 let res = unescape_char_or_byte ( & mut chars, mode) ;
9696 callback ( 0 ..( src. len ( ) - chars. as_str ( ) . len ( ) ) , res) ;
9797 }
98- Str | ByteStr => unescape_non_raw_common ( src, mode, callback) ,
99- RawStr | RawByteStr => check_raw_common ( src, mode, callback) ,
100- RawCStr => check_raw_common ( src, mode, & mut |r, mut result| {
101- if let Ok ( '\0' ) = result {
102- result = Err ( EscapeError :: NulInCStr ) ;
98+ Str | ByteStr => Unescape :: new ( src, |chars| scan_escape ( chars, mode) )
99+ . for_each ( |( res, r) | callback ( r, res) ) ,
100+ RawStr | RawByteStr => check_raw_common ( src, mode) . for_each ( |( res, r) | callback ( r, res) ) ,
101+ RawCStr => check_raw_common ( src, mode) . for_each ( |( mut res, r) | {
102+ if let Ok ( '\0' ) = res {
103+ res = Err ( EscapeError :: NulInCStr ) ;
103104 }
104- callback ( r, result )
105+ callback ( r, res ) ;
105106 } ) ,
106107 CStr => unreachable ! ( ) ,
107108 }
@@ -147,12 +148,13 @@ where
147148 F : FnMut ( Range < usize > , Result < MixedUnit , EscapeError > ) ,
148149{
149150 match mode {
150- CStr => unescape_non_raw_common ( src, mode , & mut |r , mut result | {
151- if let Ok ( MixedUnit :: Char ( '\0' ) ) = result {
152- result = Err ( EscapeError :: NulInCStr ) ;
151+ CStr => Unescape :: new ( src, |chars| scan_escape ( chars , mode ) ) . for_each ( | ( mut res , r ) | {
152+ if let Ok ( MixedUnit :: Char ( '\0' ) ) = res {
153+ res = Err ( EscapeError :: NulInCStr ) ;
153154 }
154- callback ( r, result )
155+ callback ( r, res ) ;
155156 } ) ,
157+
156158 Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable ! ( ) ,
157159 }
158160}
@@ -301,7 +303,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
301303 }
302304
303305 break std:: char:: from_u32 ( value) . ok_or ( {
304- if value > 0x10FFFF {
306+ if value > char :: MAX as u32 {
305307 EscapeError :: OutOfRangeUnicodeEscape
306308 } else {
307309 EscapeError :: LoneSurrogateUnicodeEscape
@@ -340,94 +342,141 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
340342 Ok ( res)
341343}
342344
343- /// Takes a contents of a string literal (without quotes) and produces a
344- /// sequence of escaped characters or errors.
345- fn unescape_non_raw_common < F , T : From < char > + From < u8 > > ( src : & str , mode : Mode , callback : & mut F )
346- where
347- F : FnMut ( Range < usize > , Result < T , EscapeError > ) ,
345+ /// Iterator that removes string continuations and interprets other backslash-escapes
346+ struct Unescape < ' s , T : From < char > + From < u8 > , F : FnMut ( & mut Chars < ' _ > ) -> Result < T , EscapeError > > {
347+ state : State ,
348+ chars : Chars < ' s > ,
349+ pos : usize ,
350+ scan_escape : F ,
351+ }
352+
353+ /// States for `Unescape` iterator state machine
354+ enum State {
355+ Start ,
356+ UnskippedWhitespace ( usize ) ,
357+ }
358+
359+ impl < T : From < char > + From < u8 > , F : FnMut ( & mut Chars < ' _ > ) -> Result < T , EscapeError > > Iterator
360+ for Unescape < ' _ , T , F >
348361{
349- let mut chars = src. chars ( ) ;
350- let allow_unicode_chars = mode. allow_unicode_chars ( ) ; // get this outside the loop
362+ type Item = ( Result < T , EscapeError > , Range < usize > ) ;
351363
352- // The `start` and `end` computation here is complicated because
353- // `skip_ascii_whitespace` makes us to skip over chars without counting
354- // them in the range computation.
355- while let Some ( c) = chars. next ( ) {
356- let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
357- let res = match c {
358- '\\' => {
359- match chars. clone ( ) . next ( ) {
360- Some ( '\n' ) => {
361- // Rust language specification requires us to skip whitespaces
362- // if unescaped '\' character is followed by '\n'.
363- // For details see [Rust language reference]
364+ fn next ( & mut self ) -> Option < Self :: Item > {
365+ match self . state {
366+ State :: Start => self . start ( ) ,
367+ State :: UnskippedWhitespace ( end) => self . unskipped_whitespace ( end) ,
368+ }
369+ }
370+ }
371+
372+ impl < ' s , T : From < char > + From < u8 > , F : FnMut ( & mut Chars < ' _ > ) -> Result < T , EscapeError > >
373+ Unescape < ' s , T , F >
374+ {
375+ pub ( crate ) fn new ( s : & ' s str , scan_escape : F ) -> Self {
376+ Self { state : State :: Start , chars : s. chars ( ) , pos : 0 , scan_escape }
377+ }
378+
379+ fn start ( & mut self ) -> Option < <Self as Iterator >:: Item > {
380+ if let Some ( c) = self . chars . next ( ) {
381+ match c {
382+ '\\' => {
383+ // peek
384+ if Some ( '\n' ) == self . chars . clone ( ) . next ( ) {
385+ assert_eq ! ( Some ( '\n' ) , self . chars. next( ) ) ;
386+ // skip whitespace for backslash newline, see [Rust language reference]
364387 // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
365- skip_ascii_whitespace ( & mut chars, start, & mut |range, err| {
366- callback ( range, Err ( err) )
367- } ) ;
368- continue ;
388+ self . skip_whitespace ( )
389+ } else {
390+ let mut chars_clone = self . chars . clone ( ) ;
391+ let res = ( self . scan_escape ) ( & mut chars_clone) ;
392+ let bytes_diff = self . chars . as_str ( ) . len ( ) - chars_clone. as_str ( ) . len ( ) ;
393+ let end = self . pos + 1 + bytes_diff;
394+ self . chars = chars_for_escape;
395+ let range = self . pos ..end;
396+ self . pos = end;
397+ Some ( ( res, range) )
369398 }
370- _ => scan_escape :: < T > ( & mut chars, mode) ,
399+ }
400+ c => {
401+ let res = match c {
402+ '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
403+ '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
404+ c => Ok ( c) ,
405+ } ;
406+ let end = self . pos + c. len_utf8 ( ) ;
407+ let range = self . pos ..end;
408+ self . pos = end;
409+ Some ( ( res. map ( T :: from) , range) )
371410 }
372411 }
373- '"' => Err ( EscapeError :: EscapeOnlyChar ) ,
374- '\r' => Err ( EscapeError :: BareCarriageReturn ) ,
375- _ => ascii_check ( c, allow_unicode_chars) . map ( T :: from) ,
376- } ;
377- let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
378- callback ( start..end, res) ;
412+ } else {
413+ None
414+ }
379415 }
380- }
381416
382- fn skip_ascii_whitespace < F > ( chars : & mut Chars < ' _ > , start : usize , callback : & mut F )
383- where
384- F : FnMut ( Range < usize > , EscapeError ) ,
385- {
386- let tail = chars. as_str ( ) ;
387- let first_non_space = tail
388- . bytes ( )
389- . position ( |b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r' )
390- . unwrap_or ( tail. len ( ) ) ;
391- if tail[ 1 ..first_non_space] . contains ( '\n' ) {
392- // The +1 accounts for the escaping slash.
393- let end = start + first_non_space + 1 ;
394- callback ( start..end, EscapeError :: MultipleSkippedLinesWarning ) ;
417+ /// Skip ASCII whitespace, except for the formfeed character
418+ /// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
419+ /// Warns on unescaped newline and following non-ASCII whitespace.
420+ fn skip_whitespace ( & mut self ) -> Option < <Self as Iterator >:: Item > {
421+ // the escaping slash and newline characters add 2 bytes
422+ let mut end = self . pos + 2 ;
423+ let mut contains_nl = false ;
424+ // manual next_if loop
425+ loop {
426+ let mut chars_clone = self . chars . clone ( ) ;
427+ match chars_clone. next ( ) {
428+ Some ( c) if c. is_ascii_whitespace ( ) && c != '\x0c' => {
429+ self . chars = chars_clone;
430+ end += 1 ;
431+ contains_nl = contains_nl || c == '\n' ;
432+ }
433+ _ => break ,
434+ }
435+ }
436+ if contains_nl {
437+ self . state = State :: UnskippedWhitespace ( end) ;
438+ Some ( ( Err ( EscapeError :: MultipleSkippedLinesWarning ) , self . pos ..end) )
439+ } else {
440+ self . unskipped_whitespace ( end)
441+ }
395442 }
396- let tail = & tail[ first_non_space..] ;
397- if let Some ( c) = tail. chars ( ) . next ( ) {
398- if c. is_whitespace ( ) {
399- // For error reporting, we would like the span to contain the character that was not
400- // skipped. The +1 is necessary to account for the leading \ that started the escape.
401- let end = start + first_non_space + c. len_utf8 ( ) + 1 ;
402- callback ( start..end, EscapeError :: UnskippedWhitespaceWarning ) ;
443+
444+ /// Helper for `skip_whitespace`
445+ fn unskipped_whitespace ( & mut self , end : usize ) -> Option < <Self as Iterator >:: Item > {
446+ self . state = State :: Start ;
447+ // peek
448+ if let Some ( c) = self . chars . clone ( ) . next ( ) {
449+ let range = self . pos ..end + c. len_utf8 ( ) ;
450+ self . pos = end;
451+ if c. is_whitespace ( ) {
452+ // for error reporting, include the character that was not skipped in the span
453+ Some ( ( Err ( EscapeError :: UnskippedWhitespaceWarning ) , range) )
454+ } else {
455+ self . start ( )
456+ }
457+ } else {
458+ None
403459 }
404460 }
405- * chars = tail. chars ( ) ;
406461}
407462
408463/// Takes a contents of a string literal (without quotes) and produces a
409464/// sequence of characters or errors.
410465/// NOTE: Raw strings do not perform any explicit character escaping, here we
411466/// only produce errors on bare CR.
412- fn check_raw_common < F > ( src : & str , mode : Mode , callback : & mut F )
413- where
414- F : FnMut ( Range < usize > , Result < char , EscapeError > ) ,
415- {
416- let mut chars = src. chars ( ) ;
467+ fn check_raw_common (
468+ src : & str ,
469+ mode : Mode ,
470+ ) -> impl Iterator < Item = ( Result < char , EscapeError > , Range < usize > ) > + ' _ {
417471 let allow_unicode_chars = mode. allow_unicode_chars ( ) ; // get this outside the loop
418472
419- // The `start` and `end` computation here matches the one in
420- // `unescape_non_raw_common` for consistency, even though this function
421- // doesn't have to worry about skipping any chars.
422- while let Some ( c) = chars. next ( ) {
423- let start = src. len ( ) - chars. as_str ( ) . len ( ) - c. len_utf8 ( ) ;
473+ src. char_indices ( ) . map ( move |( pos, c) | {
424474 let res = match c {
425475 '\r' => Err ( EscapeError :: BareCarriageReturnInRawString ) ,
426476 _ => ascii_check ( c, allow_unicode_chars) ,
427477 } ;
428- let end = src. len ( ) - chars. as_str ( ) . len ( ) ;
429- callback ( start..end, res) ;
430- }
478+ ( res, pos..pos + c. len_utf8 ( ) )
479+ } )
431480}
432481
433482#[ inline]
0 commit comments