@@ -40,7 +40,11 @@ use str;
4040use sync:: Arc ;
4141use sys_common:: AsInner ;
4242
43- const UTF8_REPLACEMENT_CHARACTER : & str = "\u{FFFD} " ;
43+ /// Replacement for surrogate byte sequences in lossy conversion. This uses the
44+ /// Unicode replacement character (U+FFFD). It is repeated three times for
45+ /// consistency with lossy conversion of such byte sequences in other code paths
46+ /// (anything using core's `run_utf8_validation` function).
47+ const SURROGATE_REPLACEMENT : & str = "\u{FFFD} \u{FFFD} \u{FFFD} " ;
4448
4549/// A Unicode code point: from U+0000 to U+10FFFF.
4650///
@@ -346,20 +350,14 @@ impl Wtf8Buf {
346350
347351 /// Consumes the WTF-8 string and converts it lossily to UTF-8.
348352 ///
349- /// This does not copy the data (but may overwrite parts of it in place) .
353+ /// This avoids copying the data when possible; a copy is made only if the string contains surrogates.
350354 ///
351- /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
352- pub fn into_string_lossy ( mut self ) -> String {
353- let mut pos = 0 ;
354- loop {
355- match self . next_surrogate ( pos) {
356- Some ( ( surrogate_pos, _) ) => {
357- pos = surrogate_pos + 3 ;
358- self . bytes [ surrogate_pos..pos]
359- . copy_from_slice ( UTF8_REPLACEMENT_CHARACTER . as_bytes ( ) ) ;
360- } ,
361- None => return unsafe { String :: from_utf8_unchecked ( self . bytes ) }
362- }
355+ /// Surrogates are replaced with `"\u{FFFD}\u{FFFD}\u{FFFD}"` (three
356+ /// instances of the replacement character “�”).
357+ pub fn into_string_lossy ( self ) -> String {
358+ match self . next_surrogate ( 0 ) {
359+ None => unsafe { String :: from_utf8_unchecked ( self . bytes ) } ,
360+ Some ( _) => self . as_slice ( ) . to_string_lossy ( ) . into_owned ( ) ,
363361 }
364362 }
365363
@@ -460,7 +458,7 @@ impl fmt::Display for Wtf8 {
460458 formatter. write_str ( unsafe {
461459 str:: from_utf8_unchecked ( & wtf8_bytes[ pos .. surrogate_pos] )
462460 } ) ?;
463- formatter. write_str ( UTF8_REPLACEMENT_CHARACTER ) ?;
461+ formatter. write_str ( SURROGATE_REPLACEMENT ) ?;
464462 pos = surrogate_pos + 3 ;
465463 } ,
466464 None => {
@@ -554,7 +552,8 @@ impl Wtf8 {
554552 /// Lossily converts the string to UTF-8.
555553 /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
556554 ///
557- /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
555+ /// Surrogates are replaced with `"\u{FFFD}\u{FFFD}\u{FFFD}"` (three
556+ /// instances of the replacement character “�”).
558557 ///
559558 /// This only copies the data if necessary (if it contains any surrogate).
560559 pub fn to_string_lossy ( & self ) -> Cow < str > {
@@ -565,13 +564,13 @@ impl Wtf8 {
565564 let wtf8_bytes = & self . bytes ;
566565 let mut utf8_bytes = Vec :: with_capacity ( self . len ( ) ) ;
567566 utf8_bytes. extend_from_slice ( & wtf8_bytes[ ..surrogate_pos] ) ;
568- utf8_bytes. extend_from_slice ( UTF8_REPLACEMENT_CHARACTER . as_bytes ( ) ) ;
567+ utf8_bytes. extend_from_slice ( SURROGATE_REPLACEMENT . as_bytes ( ) ) ;
569568 let mut pos = surrogate_pos + 3 ;
570569 loop {
571570 match self . next_surrogate ( pos) {
572571 Some ( ( surrogate_pos, _) ) => {
573572 utf8_bytes. extend_from_slice ( & wtf8_bytes[ pos .. surrogate_pos] ) ;
574- utf8_bytes. extend_from_slice ( UTF8_REPLACEMENT_CHARACTER . as_bytes ( ) ) ;
573+ utf8_bytes. extend_from_slice ( SURROGATE_REPLACEMENT . as_bytes ( ) ) ;
575574 pos = surrogate_pos + 3 ;
576575 } ,
577576 None => {
@@ -1095,7 +1094,7 @@ mod tests {
10951094 let mut string = Wtf8Buf :: from_str ( "aé 💩" ) ;
10961095 assert_eq ! ( string. clone( ) . into_string_lossy( ) , String :: from( "aé 💩" ) ) ;
10971096 string. push ( CodePoint :: from_u32 ( 0xD800 ) . unwrap ( ) ) ;
1098- assert_eq ! ( string. clone( ) . into_string_lossy( ) , String :: from( "aé 💩�" ) ) ;
1097+ assert_eq ! ( string. clone( ) . into_string_lossy( ) , String :: from( "aé 💩��� " ) ) ;
10991098 }
11001099
11011100 #[ test]
@@ -1238,7 +1237,7 @@ mod tests {
12381237 assert_eq ! ( Wtf8 :: from_str( "aé 💩" ) . to_string_lossy( ) , Cow :: Borrowed ( "aé 💩" ) ) ;
12391238 let mut string = Wtf8Buf :: from_str ( "aé 💩" ) ;
12401239 string. push ( CodePoint :: from_u32 ( 0xD800 ) . unwrap ( ) ) ;
1241- let expected: Cow < str > = Cow :: Owned ( String :: from ( "aé 💩�" ) ) ;
1240+ let expected: Cow < str > = Cow :: Owned ( String :: from ( "aé 💩��� " ) ) ;
12421241 assert_eq ! ( string. to_string_lossy( ) , expected) ;
12431242 }
12441243
@@ -1253,7 +1252,7 @@ mod tests {
12531252
12541253 let mut string = Wtf8Buf :: from_str ( "aé 💩" ) ;
12551254 string. push ( CodePoint :: from_u32 ( 0xD800 ) . unwrap ( ) ) ;
1256- assert_eq ! ( "aé 💩�" , d( string. as_inner( ) ) ) ;
1255+ assert_eq ! ( "aé 💩��� " , d( string. as_inner( ) ) ) ;
12571256 }
12581257
12591258 #[ test]
0 commit comments