Skip to content

Commit 6174b8d

Browse files
committed
Refactor low-level UTF-16 decoding.
* Rename `utf16_items` to `decode_utf16`. "Items" is meaningless.
* Move it to `rustc_unicode::char`, exposed in `std::char`.
* Generalize it to any `u16` iterable, not just `&[u16]`.
* Make it yield `Result` instead of a custom `Utf16Item` enum that was isomorphic to `Result`. This enables using the `FromIterator for Result` impl.
* Add a `REPLACEMENT_CHARACTER` constant.
* Document how `result.unwrap_or(REPLACEMENT_CHARACTER)` replaces `Utf16Item::to_char_lossy`.
1 parent c408b78 commit 6174b8d

File tree

10 files changed

+164
-61
lines changed

10 files changed

+164
-61
lines changed

src/libcollections/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
#![feature(unicode)]
5757
#![feature(unique)]
5858
#![feature(unsafe_no_drop_flag, filling_drop)]
59+
#![feature(decode_utf16)]
5960
#![feature(utf8_error)]
6061
#![cfg_attr(test, feature(rand, test))]
6162

src/libcollections/string.rs

+3-10
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ use core::ops::{self, Deref, Add, Index};
2020
use core::ptr;
2121
use core::slice;
2222
use core::str::pattern::Pattern;
23+
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
2324
use rustc_unicode::str as unicode_str;
24-
use rustc_unicode::str::Utf16Item;
2525

2626
use borrow::{Cow, IntoCow};
2727
use range::RangeArgument;
@@ -267,14 +267,7 @@ impl String {
267267
/// ```
268268
#[stable(feature = "rust1", since = "1.0.0")]
269269
pub fn from_utf16(v: &[u16]) -> Result<String, FromUtf16Error> {
270-
let mut s = String::with_capacity(v.len());
271-
for c in unicode_str::utf16_items(v) {
272-
match c {
273-
Utf16Item::ScalarValue(c) => s.push(c),
274-
Utf16Item::LoneSurrogate(_) => return Err(FromUtf16Error(())),
275-
}
276-
}
277-
Ok(s)
270+
decode_utf16(v.iter().cloned()).collect::<Result<_, _>>().map_err(|_| FromUtf16Error(()))
278271
}
279272

280273
/// Decode a UTF-16 encoded vector `v` into a string, replacing
@@ -294,7 +287,7 @@ impl String {
294287
#[inline]
295288
#[stable(feature = "rust1", since = "1.0.0")]
296289
pub fn from_utf16_lossy(v: &[u16]) -> String {
297-
unicode_str::utf16_items(v).map(|c| c.to_char_lossy()).collect()
290+
decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect()
298291
}
299292

300293
/// Creates a new `String` from a length, capacity, and pointer.

src/libcoretest/char.rs

+9
Original file line numberDiff line numberDiff line change
@@ -211,3 +211,12 @@ fn test_len_utf16() {
211211
assert!('\u{a66e}'.len_utf16() == 1);
212212
assert!('\u{1f4a9}'.len_utf16() == 2);
213213
}
214+
215+
#[test]
fn test_decode_utf16() {
    // Runs the decoder over `units`, gathers every item it yields, and
    // compares the result with `expected`.
    fn check(units: &[u16], expected: &[Result<char, u16>]) {
        let decoded: Vec<_> = ::std::char::decode_utf16(units.iter().cloned()).collect();
        assert_eq!(decoded, expected);
    }

    // A lone leading surrogate followed by ordinary units: the surrogate is
    // reported as an error and the units after it are still decoded.
    check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);
    // Same situation with NUL as the unit following the lone surrogate.
    check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);
}

src/libcoretest/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#![feature(float_from_str_radix)]
2020
#![feature(flt2dec)]
2121
#![feature(dec2flt)]
22+
#![feature(decode_utf16)]
2223
#![feature(fmt_radix)]
2324
#![feature(iter_arith)]
2425
#![feature(iter_arith)]

src/librustc_unicode/char.rs

+113
Original file line numberDiff line numberDiff line change
@@ -503,3 +503,116 @@ impl char {
503503
ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
504504
}
505505
}
506+
507+
/// An iterator that decodes UTF-16 encoded codepoints from an iterator of `u16`s.
508+
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
509+
#[derive(Clone)]
510+
pub struct DecodeUtf16<I> where I: Iterator<Item=u16> {
511+
iter: I,
512+
buf: Option<u16>,
513+
}
514+
515+
/// Create an iterator over the UTF-16 encoded codepoints in `iterable`,
516+
/// returning unpaired surrogates as `Err`s.
517+
///
518+
/// # Examples
519+
///
520+
/// ```
521+
/// #![feature(decode_utf16)]
522+
///
523+
/// use std::char::decode_utf16;
524+
///
525+
/// fn main() {
526+
/// // 𝄞mus<invalid>ic<invalid>
527+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
528+
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
529+
/// 0xD834];
530+
///
531+
/// assert_eq!(decode_utf16(v.iter().cloned()).collect::<Vec<_>>(),
532+
/// vec![Ok('𝄞'),
533+
/// Ok('m'), Ok('u'), Ok('s'),
534+
/// Err(0xDD1E),
535+
/// Ok('i'), Ok('c'),
536+
/// Err(0xD834)]);
537+
/// }
538+
/// ```
539+
///
540+
/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
541+
///
542+
/// ```
543+
/// #![feature(decode_utf16)]
544+
///
545+
/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
546+
///
547+
/// fn main() {
548+
/// // 𝄞mus<invalid>ic<invalid>
549+
/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
550+
/// 0x0073, 0xDD1E, 0x0069, 0x0063,
551+
/// 0xD834];
552+
///
553+
/// assert_eq!(decode_utf16(v.iter().cloned())
554+
/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
555+
/// .collect::<String>(),
556+
/// "𝄞mus�ic�");
557+
/// }
558+
/// ```
559+
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
560+
#[inline]
561+
pub fn decode_utf16<I: IntoIterator<Item=u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
562+
DecodeUtf16 {
563+
iter: iterable.into_iter(),
564+
buf: None,
565+
}
566+
}
567+
568+
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
569+
impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
570+
type Item = Result<char, u16>;
571+
572+
fn next(&mut self) -> Option<Result<char, u16>> {
573+
let u = match self.buf.take() {
574+
Some(buf) => buf,
575+
None => match self.iter.next() {
576+
Some(u) => u,
577+
None => return None
578+
}
579+
};
580+
581+
if u < 0xD800 || 0xDFFF < u {
582+
// not a surrogate
583+
Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
584+
} else if u >= 0xDC00 {
585+
// a trailing surrogate
586+
Some(Err(u))
587+
} else {
588+
let u2 = match self.iter.next() {
589+
Some(u2) => u2,
590+
// eof
591+
None => return Some(Err(u))
592+
};
593+
if u2 < 0xDC00 || u2 > 0xDFFF {
594+
// not a trailing surrogate so we're not a valid
595+
// surrogate pair, so rewind to redecode u2 next time.
596+
self.buf = Some(u2);
597+
return Some(Err(u))
598+
}
599+
600+
// all ok, so lets decode it.
601+
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
602+
Some(Ok(unsafe { from_u32_unchecked(c) }))
603+
}
604+
}
605+
606+
#[inline]
607+
fn size_hint(&self) -> (usize, Option<usize>) {
608+
let (low, high) = self.iter.size_hint();
609+
// we could be entirely valid surrogates (2 elements per
610+
// char), or entirely non-surrogates (1 element per char)
611+
(low / 2, high)
612+
}
613+
}
614+
615+
/// U+FFFD REPLACEMENT CHARACTER (�) is used in Unicode to represent a decoding error.
616+
/// It can occur, for example, when giving ill-formed UTF-8 bytes to `String::from_utf8_lossy`.
617+
#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")]
618+
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';

src/librustc_unicode/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ mod tables;
4646
mod u_str;
4747
pub mod char;
4848

49+
#[allow(deprecated)]
4950
pub mod str {
5051
pub use u_str::{UnicodeStr, SplitWhitespace};
5152
pub use u_str::{utf8_char_width, is_utf16, Utf16Items, Utf16Item};

src/librustc_unicode/u_str.rs

+24-39
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
//! This module provides functionality to `str` that requires the Unicode methods provided by the
1414
//! unicode parts of the CharExt trait.
1515
16+
use char::{DecodeUtf16, decode_utf16};
1617
use core::char;
17-
use core::iter::Filter;
18+
use core::iter::{Cloned, Filter};
1819
use core::slice;
1920
use core::str::Split;
2021

@@ -119,11 +120,18 @@ pub fn is_utf16(v: &[u16]) -> bool {
119120

120121
/// An iterator that decodes UTF-16 encoded codepoints from a vector
121122
/// of `u16`s.
123+
#[deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
124+
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
125+
#[allow(deprecated)]
122126
#[derive(Clone)]
123127
pub struct Utf16Items<'a> {
124-
iter: slice::Iter<'a, u16>
128+
decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>
125129
}
130+
126131
/// The possibilities for values decoded from a `u16` stream.
132+
#[deprecated(since = "1.4.0", reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
133+
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
134+
#[allow(deprecated)]
127135
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
128136
pub enum Utf16Item {
129137
/// A valid codepoint.
@@ -132,6 +140,7 @@ pub enum Utf16Item {
132140
LoneSurrogate(u16)
133141
}
134142

143+
#[allow(deprecated)]
135144
impl Utf16Item {
136145
/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
137146
/// replacement character (U+FFFD).
@@ -144,49 +153,22 @@ impl Utf16Item {
144153
}
145154
}
146155

156+
#[deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
157+
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
158+
#[allow(deprecated)]
147159
impl<'a> Iterator for Utf16Items<'a> {
148160
type Item = Utf16Item;
149161

150162
fn next(&mut self) -> Option<Utf16Item> {
151-
let u = match self.iter.next() {
152-
Some(u) => *u,
153-
None => return None
154-
};
155-
156-
if u < 0xD800 || 0xDFFF < u {
157-
// not a surrogate
158-
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(u as u32) }))
159-
} else if u >= 0xDC00 {
160-
// a trailing surrogate
161-
Some(Utf16Item::LoneSurrogate(u))
162-
} else {
163-
// preserve state for rewinding.
164-
let old = self.iter.clone();
165-
166-
let u2 = match self.iter.next() {
167-
Some(u2) => *u2,
168-
// eof
169-
None => return Some(Utf16Item::LoneSurrogate(u))
170-
};
171-
if u2 < 0xDC00 || u2 > 0xDFFF {
172-
// not a trailing surrogate so we're not a valid
173-
// surrogate pair, so rewind to redecode u2 next time.
174-
self.iter = old.clone();
175-
return Some(Utf16Item::LoneSurrogate(u))
176-
}
177-
178-
// all ok, so lets decode it.
179-
let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
180-
Some(Utf16Item::ScalarValue(unsafe { char::from_u32_unchecked(c) }))
181-
}
163+
self.decoder.next().map(|result| match result {
164+
Ok(c) => Utf16Item::ScalarValue(c),
165+
Err(s) => Utf16Item::LoneSurrogate(s),
166+
})
182167
}
183168

184169
#[inline]
185170
fn size_hint(&self) -> (usize, Option<usize>) {
186-
let (low, high) = self.iter.size_hint();
187-
// we could be entirely valid surrogates (2 elements per
188-
// char), or entirely non-surrogates (1 element per char)
189-
(low / 2, high)
171+
self.decoder.size_hint()
190172
}
191173
}
192174

@@ -196,7 +178,7 @@ impl<'a> Iterator for Utf16Items<'a> {
196178
/// # Examples
197179
///
198180
/// ```
199-
/// #![feature(unicode)]
181+
/// #![feature(unicode, decode_utf16)]
200182
///
201183
/// extern crate rustc_unicode;
202184
///
@@ -216,8 +198,11 @@ impl<'a> Iterator for Utf16Items<'a> {
216198
/// LoneSurrogate(0xD834)]);
217199
/// }
218200
/// ```
201+
#[deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
202+
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
203+
#[allow(deprecated)]
219204
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
220-
Utf16Items { iter : v.iter() }
205+
Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
221206
}
222207

223208
/// Iterator adaptor for encoding `char`s to UTF-16.

src/libserialize/json.rs

+6-6
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,6 @@ use std::str::FromStr;
209209
use std::string;
210210
use std::{char, f64, fmt, str};
211211
use std;
212-
use rustc_unicode::str as unicode_str;
213-
use rustc_unicode::str::Utf16Item;
214212

215213
use Encodable;
216214

@@ -1712,11 +1710,13 @@ impl<T: Iterator<Item=char>> Parser<T> {
17121710
_ => return self.error(UnexpectedEndOfHexEscape),
17131711
}
17141712

1715-
let buf = [n1, try!(self.decode_hex_escape())];
1716-
match unicode_str::utf16_items(&buf).next() {
1717-
Some(Utf16Item::ScalarValue(c)) => res.push(c),
1718-
_ => return self.error(LoneLeadingSurrogateInHexEscape),
1713+
let n2 = try!(self.decode_hex_escape());
1714+
if n2 < 0xDC00 || n2 > 0xDFFF {
1715+
return self.error(LoneLeadingSurrogateInHexEscape)
17191716
}
1717+
let c = (((n1 - 0xD800) as u32) << 10 |
1718+
(n2 - 0xDC00) as u32) + 0x1_0000;
1719+
res.push(char::from_u32(c).unwrap());
17201720
}
17211721

17221722
n => match char::from_u32(n as u32) {

src/libstd/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@
242242
#![feature(unicode)]
243243
#![feature(unique)]
244244
#![feature(unsafe_no_drop_flag, filling_drop)]
245+
#![feature(decode_utf16)]
245246
#![feature(vec_push_all)]
246247
#![feature(vec_resize)]
247248
#![feature(wrapping)]

src/libstd/sys/common/wtf8.rs

+5-6
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ use hash::{Hash, Hasher};
3737
use iter::FromIterator;
3838
use mem;
3939
use ops;
40-
use rustc_unicode::str::{Utf16Item, utf16_items};
4140
use slice;
4241
use str;
4342
use string::String;
@@ -186,14 +185,14 @@ impl Wtf8Buf {
186185
/// will always return the original code units.
187186
pub fn from_wide(v: &[u16]) -> Wtf8Buf {
188187
let mut string = Wtf8Buf::with_capacity(v.len());
189-
for item in utf16_items(v) {
188+
for item in char::decode_utf16(v.iter().cloned()) {
190189
match item {
191-
Utf16Item::ScalarValue(c) => string.push_char(c),
192-
Utf16Item::LoneSurrogate(s) => {
190+
Ok(ch) => string.push_char(ch),
191+
Err(surrogate) => {
193192
// Surrogates are known to be in the code point range.
194-
let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
193+
let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
195194
// Skip the WTF-8 concatenation check,
196-
// surrogate pairs are already decoded by utf16_items
195+
// surrogate pairs are already decoded by decode_utf16
197196
string.push_code_point_unchecked(code_point)
198197
}
199198
}

0 commit comments

Comments
 (0)