@@ -40,7 +40,11 @@ use str;
40
40
use sync:: Arc ;
41
41
use sys_common:: AsInner ;
42
42
43
- const UTF8_REPLACEMENT_CHARACTER : & str = "\u{FFFD} " ;
43
+ /// Replacement for surrogate byte sequences in lossy conversion. This uses the
44
+ /// Unicode replacement character (U+FFFD). It is repeated three times for
45
+ /// consistency with lossy conversion of such byte sequences in other code paths
46
+ /// (anything using core's `run_utf8_validation` function).
47
+ const SURROGATE_REPLACEMENT : & str = "\u{FFFD} \u{FFFD} \u{FFFD} " ;
44
48
45
49
/// A Unicode code point: from U+0000 to U+10FFFF.
46
50
///
@@ -346,20 +350,14 @@ impl Wtf8Buf {
346
350
347
351
/// Consumes the WTF-8 string and converts it lossily to UTF-8.
348
352
///
349
- /// This does not copy the data (but may overwrite parts of it in place) .
353
+ /// This avoids copying the data when possible (a copy is made only if it contains a surrogate).
350
354
///
351
- /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
352
- pub fn into_string_lossy ( mut self ) -> String {
353
- let mut pos = 0 ;
354
- loop {
355
- match self . next_surrogate ( pos) {
356
- Some ( ( surrogate_pos, _) ) => {
357
- pos = surrogate_pos + 3 ;
358
- self . bytes [ surrogate_pos..pos]
359
- . copy_from_slice ( UTF8_REPLACEMENT_CHARACTER . as_bytes ( ) ) ;
360
- } ,
361
- None => return unsafe { String :: from_utf8_unchecked ( self . bytes ) }
362
- }
355
+ /// Surrogates are replaced with `"\u{FFFD}\u{FFFD}\u{FFFD}"` (three
356
+ /// instances of the replacement character “�”).
357
+ pub fn into_string_lossy ( self ) -> String {
358
+ match self . next_surrogate ( 0 ) {
359
+ None => unsafe { String :: from_utf8_unchecked ( self . bytes ) } ,
360
+ Some ( _) => self . as_slice ( ) . to_string_lossy ( ) . into_owned ( ) ,
363
361
}
364
362
}
365
363
@@ -460,7 +458,7 @@ impl fmt::Display for Wtf8 {
460
458
formatter. write_str ( unsafe {
461
459
str:: from_utf8_unchecked ( & wtf8_bytes[ pos .. surrogate_pos] )
462
460
} ) ?;
463
- formatter. write_str ( UTF8_REPLACEMENT_CHARACTER ) ?;
461
+ formatter. write_str ( SURROGATE_REPLACEMENT ) ?;
464
462
pos = surrogate_pos + 3 ;
465
463
} ,
466
464
None => {
@@ -554,7 +552,8 @@ impl Wtf8 {
554
552
/// Lossily converts the string to UTF-8.
555
553
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
556
554
///
557
- /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
555
+ /// Surrogates are replaced with `"\u{FFFD}\u{FFFD}\u{FFFD}"` (three
556
+ /// instances of the replacement character “�”).
558
557
///
559
558
/// This only copies the data if necessary (if it contains any surrogate).
560
559
pub fn to_string_lossy ( & self ) -> Cow < str > {
@@ -565,13 +564,13 @@ impl Wtf8 {
565
564
let wtf8_bytes = & self . bytes ;
566
565
let mut utf8_bytes = Vec :: with_capacity ( self . len ( ) ) ;
567
566
utf8_bytes. extend_from_slice ( & wtf8_bytes[ ..surrogate_pos] ) ;
568
- utf8_bytes. extend_from_slice ( UTF8_REPLACEMENT_CHARACTER . as_bytes ( ) ) ;
567
+ utf8_bytes. extend_from_slice ( SURROGATE_REPLACEMENT . as_bytes ( ) ) ;
569
568
let mut pos = surrogate_pos + 3 ;
570
569
loop {
571
570
match self . next_surrogate ( pos) {
572
571
Some ( ( surrogate_pos, _) ) => {
573
572
utf8_bytes. extend_from_slice ( & wtf8_bytes[ pos .. surrogate_pos] ) ;
574
- utf8_bytes. extend_from_slice ( UTF8_REPLACEMENT_CHARACTER . as_bytes ( ) ) ;
573
+ utf8_bytes. extend_from_slice ( SURROGATE_REPLACEMENT . as_bytes ( ) ) ;
575
574
pos = surrogate_pos + 3 ;
576
575
} ,
577
576
None => {
@@ -1095,7 +1094,7 @@ mod tests {
1095
1094
let mut string = Wtf8Buf :: from_str ( "aé 💩" ) ;
1096
1095
assert_eq ! ( string. clone( ) . into_string_lossy( ) , String :: from( "aé 💩" ) ) ;
1097
1096
string. push ( CodePoint :: from_u32 ( 0xD800 ) . unwrap ( ) ) ;
1098
- assert_eq ! ( string. clone( ) . into_string_lossy( ) , String :: from( "aé 💩�" ) ) ;
1097
+ assert_eq ! ( string. clone( ) . into_string_lossy( ) , String :: from( "aé 💩��� " ) ) ;
1099
1098
}
1100
1099
1101
1100
#[ test]
@@ -1238,7 +1237,7 @@ mod tests {
1238
1237
assert_eq ! ( Wtf8 :: from_str( "aé 💩" ) . to_string_lossy( ) , Cow :: Borrowed ( "aé 💩" ) ) ;
1239
1238
let mut string = Wtf8Buf :: from_str ( "aé 💩" ) ;
1240
1239
string. push ( CodePoint :: from_u32 ( 0xD800 ) . unwrap ( ) ) ;
1241
- let expected: Cow < str > = Cow :: Owned ( String :: from ( "aé 💩�" ) ) ;
1240
+ let expected: Cow < str > = Cow :: Owned ( String :: from ( "aé 💩��� " ) ) ;
1242
1241
assert_eq ! ( string. to_string_lossy( ) , expected) ;
1243
1242
}
1244
1243
@@ -1253,7 +1252,7 @@ mod tests {
1253
1252
1254
1253
let mut string = Wtf8Buf :: from_str ( "aé 💩" ) ;
1255
1254
string. push ( CodePoint :: from_u32 ( 0xD800 ) . unwrap ( ) ) ;
1256
- assert_eq ! ( "aé 💩�" , d( string. as_inner( ) ) ) ;
1255
+ assert_eq ! ( "aé 💩��� " , d( string. as_inner( ) ) ) ;
1257
1256
}
1258
1257
1259
1258
#[ test]
0 commit comments