Skip to content

Commit 1476105

Browse files
committed
Auto merge of #40189 - SimonSapin:one-width, r=alexcrichton
Reduce std_unicode’s public API * Only keep one copy of the `UTF8_CHAR_WIDTH` table instead of one in each of libcore and libstd_unicode. * Move the `utf8_char_width` function to `core::str` under the `str_internals` unstable feature. * Remove `std_unicode::str::is_utf16`. It was only accessible through the `#[unstable]` crate std_unicode. It has never been used in the compiler or standard library since 47e7a05 added it in 2012 “for OS API interop”. It can be replaced with a one-liner: ```rust fn is_utf16(slice: &[u16]) -> bool { std::char::decode_utf16(slice.iter().cloned()).all(|r| r.is_ok()) } ```
2 parents 042728e + 24b39c5 commit 1476105

File tree

8 files changed

+12
-118
lines changed

8 files changed

+12
-118
lines changed

src/libcollections/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
#![feature(slice_patterns)]
5555
#![feature(specialization)]
5656
#![feature(staged_api)]
57+
#![feature(str_internals)]
5758
#![feature(trusted_len)]
5859
#![feature(unicode)]
5960
#![feature(unique)]

src/libcollections/string.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,9 @@ use core::iter::{FromIterator, FusedIterator};
6262
use core::mem;
6363
use core::ops::{self, Add, AddAssign, Index, IndexMut};
6464
use core::ptr;
65+
use core::str as core_str;
6566
use core::str::pattern::Pattern;
6667
use std_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
67-
use std_unicode::str as unicode_str;
6868

6969
use borrow::{Cow, ToOwned};
7070
use range::RangeArgument;
@@ -575,7 +575,7 @@ impl String {
575575
if byte < 128 {
576576
// subseqidx handles this
577577
} else {
578-
let w = unicode_str::utf8_char_width(byte);
578+
let w = core_str::utf8_char_width(byte);
579579

580580
match w {
581581
2 => {

src/libcollectionstest/str.rs

-65
Original file line numberDiff line numberDiff line change
@@ -540,71 +540,6 @@ fn from_utf8_mostly_ascii() {
540540
}
541541
}
542542

543-
#[test]
544-
fn test_is_utf16() {
545-
use std_unicode::str::is_utf16;
546-
547-
macro_rules! pos {
548-
($($e:expr),*) => { { $(assert!(is_utf16($e));)* } }
549-
}
550-
551-
// non-surrogates
552-
pos!(&[0x0000],
553-
&[0x0001, 0x0002],
554-
&[0xD7FF],
555-
&[0xE000]);
556-
557-
// surrogate pairs (randomly generated with Python 3's
558-
// .encode('utf-16be'))
559-
pos!(&[0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
560-
&[0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
561-
&[0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
562-
563-
// mixtures (also random)
564-
pos!(&[0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
565-
&[0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
566-
&[0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
567-
568-
// negative tests
569-
macro_rules! neg {
570-
($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } }
571-
}
572-
573-
neg!(
574-
// surrogate + regular unit
575-
&[0xdb45, 0x0000],
576-
// surrogate + lead surrogate
577-
&[0xd900, 0xd900],
578-
// unterminated surrogate
579-
&[0xd8ff],
580-
// trail surrogate without a lead
581-
&[0xddb7]);
582-
583-
// random byte sequences that Python 3's .decode('utf-16be')
584-
// failed on
585-
neg!(&[0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
586-
&[0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
587-
&[0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
588-
&[0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
589-
&[0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
590-
&[0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
591-
&[0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
592-
&[0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
593-
&[0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
594-
&[0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
595-
&[0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
596-
&[0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
597-
&[0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
598-
&[0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
599-
&[0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
600-
&[0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
601-
&[0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
602-
&[0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
603-
&[0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
604-
&[0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
605-
&[0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
606-
}
607-
608543
#[test]
609544
fn test_as_bytes() {
610545
// no null

src/libcollectionstest/string.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ fn test_from_utf16() {
129129
let s_as_utf16 = s.encode_utf16().collect::<Vec<u16>>();
130130
let u_as_string = String::from_utf16(&u).unwrap();
131131

132-
assert!(::std_unicode::str::is_utf16(&u));
132+
assert!(::std_unicode::char::decode_utf16(u.iter().cloned()).all(|r| r.is_ok()));
133133
assert_eq!(s_as_utf16, u);
134134

135135
assert_eq!(u_as_string, s);

src/libcore/str/mod.rs

+7
Original file line numberDiff line numberDiff line change
@@ -1352,6 +1352,13 @@ static UTF8_CHAR_WIDTH: [u8; 256] = [
13521352
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
13531353
];
13541354

1355+
/// Given a first byte, determine how many bytes are in this UTF-8 character
1356+
#[unstable(feature = "str_internals", issue = "0")]
1357+
#[inline]
1358+
pub fn utf8_char_width(b: u8) -> usize {
1359+
return UTF8_CHAR_WIDTH[b as usize] as usize;
1360+
}
1361+
13551362
/// Mask of the value bits of a continuation byte
13561363
const CONT_MASK: u8 = 0b0011_1111;
13571364
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte

src/libstd/io/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@
256256
#![stable(feature = "rust1", since = "1.0.0")]
257257

258258
use cmp;
259-
use std_unicode::str as core_str;
259+
use core::str as core_str;
260260
use error as std_error;
261261
use fmt;
262262
use result;

src/libstd_unicode/lib.rs

-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ pub mod char;
4747
#[allow(deprecated)]
4848
pub mod str {
4949
pub use u_str::{SplitWhitespace, UnicodeStr};
50-
pub use u_str::{is_utf16, utf8_char_width};
5150
pub use u_str::Utf16Encoder;
5251
}
5352

src/libstd_unicode/u_str.rs

-48
Original file line numberDiff line numberDiff line change
@@ -77,54 +77,6 @@ impl UnicodeStr for str {
7777
}
7878
}
7979

80-
// https://tools.ietf.org/html/rfc3629
81-
static UTF8_CHAR_WIDTH: [u8; 256] = [
82-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
83-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
84-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
85-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
86-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
87-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
88-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
89-
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
90-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
91-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
92-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
93-
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
94-
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
95-
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
96-
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
97-
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
98-
];
99-
100-
/// Given a first byte, determine how many bytes are in this UTF-8 character
101-
#[inline]
102-
pub fn utf8_char_width(b: u8) -> usize {
103-
return UTF8_CHAR_WIDTH[b as usize] as usize;
104-
}
105-
106-
/// Determines if a vector of `u16` contains valid UTF-16
107-
pub fn is_utf16(v: &[u16]) -> bool {
108-
let mut it = v.iter();
109-
macro_rules! next { ($ret:expr) => {
110-
match it.next() { Some(u) => *u, None => return $ret }
111-
}
112-
}
113-
loop {
114-
let u = next!(true);
115-
116-
match char::from_u32(u as u32) {
117-
Some(_) => {}
118-
None => {
119-
let u2 = next!(false);
120-
if u < 0xD7FF || u > 0xDBFF || u2 < 0xDC00 || u2 > 0xDFFF {
121-
return false;
122-
}
123-
}
124-
}
125-
}
126-
}
127-
12880
/// Iterator adaptor for encoding `char`s to UTF-16.
12981
#[derive(Clone)]
13082
pub struct Utf16Encoder<I> {

0 commit comments

Comments
 (0)