Skip to content

Commit a6a3871

Browse files
committed
give consistent UTF-8 lossy conversion
With this commit, lossy UTF-8 conversion of OsStr/OsString on Windows outputs three Unicode replacement characters (U+FFFD), one per byte, for each surrogate byte sequence instead of just one, making it consistent with lossy conversion on Unix and with the lossy conversion of raw byte sequences. Fixes #56786.
1 parent f4b07e0 commit a6a3871

File tree

2 files changed

+24
-23
lines changed

2 files changed

+24
-23
lines changed

src/libstd/ffi/os_str.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -564,12 +564,14 @@ impl OsStr {
564564
///
565565
/// // Here the values 0x0066 and 0x006f correspond to 'f' and 'o'
566566
/// // respectively. The value 0xD800 is a lone surrogate half, invalid
567-
/// // in a UTF-16 sequence.
567+
/// // in a UTF-16 sequence. Encoded in UTF-8 form this takes three
568+
/// // bytes; notice that lossy conversion of these particular invalid
569+
/// // UTF-8 sequences results in one replacement character per byte.
568570
/// let source = [0x0066, 0x006f, 0xD800, 0x006f];
569571
/// let os_string = OsString::from_wide(&source[..]);
570572
/// let os_str = os_string.as_os_str();
571573
///
572-
/// assert_eq!(os_str.to_string_lossy(), "fo�o");
574+
/// assert_eq!(os_str.to_string_lossy(), "fo���o");
573575
/// }
574576
/// ```
575577
#[stable(feature = "rust1", since = "1.0.0")]

src/libstd/sys_common/wtf8.rs

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,11 @@ use str;
4040
use sync::Arc;
4141
use sys_common::AsInner;
4242

43-
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
43+
/// Replacement for surrogate byte sequences in lossy conversion. This uses the
44+
/// Unicode replacement character (U+FFFD). It is repeated three times for
45+
/// consistency with lossy conversion of such byte sequences in other code paths
46+
/// (anything using core's `run_utf8_validation` function).
47+
const SURROGATE_REPLACEMENT: &str = "\u{FFFD}\u{FFFD}\u{FFFD}";
4448

4549
/// A Unicode code point: from U+0000 to U+10FFFF.
4650
///
@@ -346,20 +350,14 @@ impl Wtf8Buf {
346350

347351
/// Consumes the WTF-8 string and converts it lossily to UTF-8.
348352
///
349-
/// This does not copy the data (but may overwrite parts of it in place).
353+
/// This avoids copying the data when possible (that is, when it contains no surrogates).
350354
///
351-
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
352-
pub fn into_string_lossy(mut self) -> String {
353-
let mut pos = 0;
354-
loop {
355-
match self.next_surrogate(pos) {
356-
Some((surrogate_pos, _)) => {
357-
pos = surrogate_pos + 3;
358-
self.bytes[surrogate_pos..pos]
359-
.copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
360-
},
361-
None => return unsafe { String::from_utf8_unchecked(self.bytes) }
362-
}
355+
/// Surrogates are replaced with `"\u{FFFD}\u{FFFD}\u{FFFD}"` (three
356+
/// instances of the replacement character “�”).
357+
pub fn into_string_lossy(self) -> String {
358+
match self.next_surrogate(0) {
359+
None => unsafe { String::from_utf8_unchecked(self.bytes) },
360+
Some(_) => self.as_slice().to_string_lossy().into_owned(),
363361
}
364362
}
365363

@@ -460,7 +458,7 @@ impl fmt::Display for Wtf8 {
460458
formatter.write_str(unsafe {
461459
str::from_utf8_unchecked(&wtf8_bytes[pos .. surrogate_pos])
462460
})?;
463-
formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
461+
formatter.write_str(SURROGATE_REPLACEMENT)?;
464462
pos = surrogate_pos + 3;
465463
},
466464
None => {
@@ -554,7 +552,8 @@ impl Wtf8 {
554552
/// Lossily converts the string to UTF-8.
555553
/// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
556554
///
557-
/// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
555+
/// Surrogates are replaced with `"\u{FFFD}\u{FFFD}\u{FFFD}"` (three
556+
/// instances of the replacement character “�”).
558557
///
559558
/// This only copies the data if necessary (if it contains any surrogate).
560559
pub fn to_string_lossy(&self) -> Cow<str> {
@@ -565,13 +564,13 @@ impl Wtf8 {
565564
let wtf8_bytes = &self.bytes;
566565
let mut utf8_bytes = Vec::with_capacity(self.len());
567566
utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
568-
utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
567+
utf8_bytes.extend_from_slice(SURROGATE_REPLACEMENT.as_bytes());
569568
let mut pos = surrogate_pos + 3;
570569
loop {
571570
match self.next_surrogate(pos) {
572571
Some((surrogate_pos, _)) => {
573572
utf8_bytes.extend_from_slice(&wtf8_bytes[pos .. surrogate_pos]);
574-
utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
573+
utf8_bytes.extend_from_slice(SURROGATE_REPLACEMENT.as_bytes());
575574
pos = surrogate_pos + 3;
576575
},
577576
None => {
@@ -1095,7 +1094,7 @@ mod tests {
10951094
let mut string = Wtf8Buf::from_str("aé 💩");
10961095
assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
10971096
string.push(CodePoint::from_u32(0xD800).unwrap());
1098-
assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
1097+
assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩���"));
10991098
}
11001099

11011100
#[test]
@@ -1238,7 +1237,7 @@ mod tests {
12381237
assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
12391238
let mut string = Wtf8Buf::from_str("aé 💩");
12401239
string.push(CodePoint::from_u32(0xD800).unwrap());
1241-
let expected: Cow<str> = Cow::Owned(String::from("aé 💩�"));
1240+
let expected: Cow<str> = Cow::Owned(String::from("aé 💩���"));
12421241
assert_eq!(string.to_string_lossy(), expected);
12431242
}
12441243

@@ -1253,7 +1252,7 @@ mod tests {
12531252

12541253
let mut string = Wtf8Buf::from_str("aé 💩");
12551254
string.push(CodePoint::from_u32(0xD800).unwrap());
1256-
assert_eq!("aé 💩�", d(string.as_inner()));
1255+
assert_eq!("aé 💩���", d(string.as_inner()));
12571256
}
12581257

12591258
#[test]

0 commit comments

Comments
 (0)