@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326326 // we are sure that index within string
327327 normalized. push_str ( & text[ 0 ..i] ) ;
328328
329- let mut pos = normalize_xml_eol_step ( & mut normalized, bytes , i, '\n' ) ;
329+ let mut pos = normalize_xml_eol_step ( & mut normalized, text , i, '\n' ) ;
330330 while let Some ( i) = memchr3 ( b'\r' , 0xC2 , 0xE2 , & bytes[ pos..] ) {
331331 let index = pos + i;
332332 // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333333 // we are sure that index within string
334334 normalized. push_str ( & text[ pos..index] ) ;
335- pos = normalize_xml_eol_step ( & mut normalized, bytes , index, '\n' ) ;
335+ pos = normalize_xml_eol_step ( & mut normalized, text , index, '\n' ) ;
336336 }
337337 if let Some ( rest) = text. get ( pos..) {
338338 normalized. push_str ( rest) ;
@@ -378,21 +378,30 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378378///
379379/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380380/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381- fn normalize_xml_eol_step ( normalized : & mut String , input : & [ u8 ] , index : usize , ch : char ) -> usize {
381+ fn normalize_xml_eol_step ( normalized : & mut String , text : & str , index : usize , ch : char ) -> usize {
382+ let input = text. as_bytes ( ) ;
382383 match input[ index] {
383384 b'\r' => {
384- normalized. push ( ch) ;
385385 if index + 1 < input. len ( ) {
386386 let next = input[ index + 1 ] ;
387387 if next == b'\n' {
388+ normalized. push ( ch) ;
388389 return index + 2 ; // skip \r\n
389390 }
390391 // Because input is correct UTF-8 and in UTF-8 every character has
391392 // an unique prefix, byte C2 means only start of #x85 character
392393 if next == 0xC2 {
393- return index + 3 ; // skip UTF-8 encoding of #xD #x85 characters (0d c2 85)
394+ if index + 2 < input. len ( ) && input[ index + 2 ] == 0x85 {
395+ normalized. push ( ch) ;
396+ } else {
397+ // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
398+ // we are sure that index within string
399+ normalized. push_str ( & text[ index..index + 3 ] ) ;
400+ }
401+ return index + 3 ; // skip \r + UTF-8 encoding of character (c2 xx)
394402 }
395403 }
404+ normalized. push ( ch) ;
396405 index + 1 // skip \r
397406 }
398407 b'\n' => {
@@ -401,13 +410,25 @@ fn normalize_xml_eol_step(normalized: &mut String, input: &[u8], index: usize, c
401410 }
402411 // Start of UTF-8 encoding of #x85 character (c2 85)
403412 0xC2 => {
404- normalized. push ( ch) ;
405- index + 2 // skip UTF-8 encoding of #x85 character (c2 85)
413+ if index + 1 < input. len ( ) && input[ index + 1 ] == 0x85 {
414+ normalized. push ( ch) ;
415+ } else {
416+ // NOTE: unsafe { text.get_unchecked(index..index + 2) } could be used because
417+ // we are sure that index within string
418+ normalized. push_str ( & text[ index..index + 2 ] ) ;
419+ }
420+ index + 2 // skip UTF-8 encoding of character (c2 xx)
406421 }
407422 // Start of UTF-8 encoding of #x2028 character (e2 80 a8)
408423 0xE2 => {
409- normalized. push ( ch) ;
410- index + 3 // skip UTF-8 encoding of #x2028 character (e2 80 a8)
424+ if index + 2 < input. len ( ) && input[ index + 1 ] == 0x80 && input[ index + 2 ] == 0xA8 {
425+ normalized. push ( ch) ;
426+ } else {
427+ // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
428+ // we are sure that index within string
429+ normalized. push_str ( & text[ index..index + 3 ] ) ;
430+ }
431+ index + 3 // skip UTF-8 encoding of character (e2 xx xx)
411432 }
412433
413434 x => unreachable ! (
@@ -2094,6 +2115,102 @@ mod normalization {
20942115 "\n \n \n \n \n \n some\n \n \n text" ,
20952116 ) ;
20962117 }
2118+
/// Checks every character whose 2-byte UTF-8 encoding starts with `0xC2`.
///
/// The lead byte `0xC2` (`0b11000010`) combined with a continuation byte of the
/// form `10xxxxxx` (`0x80..=0xBF`) covers exactly the 64 code points
/// `U+0080..=U+00BF`. Only `U+0085` (`c2 85`) is expected to be replaced by
/// `\n`; every other character must come back unchanged.
#[test]
fn utf8_0xc2() {
    let candidates = '\u{0080}'..='\u{00BF}';
    assert_eq!(candidates.clone().count(), 64);

    let mut buf = [0u8; 2];
    for ch in candidates {
        let input: &str = ch.encode_utf8(&mut buf);
        let bytes = input.as_bytes();
        // Attached to the assertion so a failure names the offending character.
        let description = format!("UTF-8 [{:02x} {:02x}] = `{}`", bytes[0], bytes[1], ch);

        let expected = if ch == '\u{0085}' { "\n" } else { input };
        assert_eq!(normalize_xml_eols(input), expected, "{}", description);
    }
}
2148+
/// Checks `\r` followed by every character whose 2-byte UTF-8 encoding starts
/// with `0xC2`.
///
/// The lead byte `0xC2` (`0b11000010`) combined with a continuation byte of the
/// form `10xxxxxx` (`0x80..=0xBF`) covers exactly the 64 code points
/// `U+0080..=U+00BF`. The pair `\r` + `U+0085` (`0d c2 85`) is expected to
/// collapse into a single `\n`.
///
/// NOTE(review): for every other character this test pins the input as
/// returned unchanged, i.e. the leading `\r` is kept. XML 1.1 section 2.11
/// lists "any #xD character that is not immediately followed by #xA or #x85"
/// among the sequences translated to #xA, which would make the expected value
/// `\n` followed by the character. Confirm which behavior is intended before
/// relying on this expectation.
#[test]
fn utf8_0x0d_0xc2() {
    let candidates = '\u{0080}'..='\u{00BF}';
    assert_eq!(candidates.clone().count(), 64);

    // First byte is always `\r`; the last two are overwritten on each iteration.
    let mut buf = [b'\r', 0, 0];
    for ch in candidates {
        ch.encode_utf8(&mut buf[1..]);
        // Attached to the assertion so a failure names the offending character.
        let description = format!(
            "UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
            buf[0], buf[1], buf[2], ch
        );
        let input = std::str::from_utf8(&buf).expect(&description);

        let expected = if ch == '\u{0085}' { "\n" } else { input };
        assert_eq!(normalize_xml_eols(input), expected, "{}", description);
    }
}
2181+
/// Checks every character whose 3-byte UTF-8 encoding starts with `0xE2`.
///
/// The lead byte `0xE2` (`0b11100010`) combined with two continuation bytes of
/// the form `10xxxxxx` covers exactly the 4096 code points `U+2000..=U+2FFF`.
/// Only `U+2028` (`e2 80 a8`) is expected to be replaced by `\n`; every other
/// character must come back unchanged.
#[test]
fn utf8_0xe2() {
    let candidates = '\u{2000}'..='\u{2FFF}';
    assert_eq!(candidates.clone().count(), 4096);

    let mut buf = [0u8; 3];
    for ch in candidates {
        let input: &str = ch.encode_utf8(&mut buf);
        let bytes = input.as_bytes();
        // Attached to the assertion so a failure names the offending character.
        let description = format!(
            "UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
            bytes[0], bytes[1], bytes[2], ch
        );

        let expected = if ch == '\u{2028}' { "\n" } else { input };
        assert_eq!(normalize_xml_eols(input), expected, "{}", description);
    }
}
20972214 }
20982215
20992216 mod html {
0 commit comments