-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathnot_quite_std.rs
252 lines (223 loc) · 7.89 KB
/
not_quite_std.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
//! The code in this module is copied from Rust standard library
//! (the `std` crate and crates it is a facade for)
//! at commit 16d80de231abb2b1756f3951ffd4776d681035eb,
//! with the signature changed to use `Wtf8Buf`, `Wtf8`, and `CodePoint`
//! instead of `String`, `&str`, and `char`.
//!
//! FIXME: if and when this is moved into the standard library,
//! try to avoid the code duplication.
//! Maybe by having private generic code that is monomorphized to UTF-8 and WTF-8?
use core::char;
use core::mem;
use core::slice;
use super::{Wtf8Buf, Wtf8, CodePoint, IllFormedUtf16CodeUnits};
// UTF-8 tag bits and per-length code point limits used when encoding.
// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs

/// Tag OR-ed into every continuation byte (`10xx_xxxx`).
const TAG_CONT: u8 = 0x80;
/// Tag OR-ed into the first byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_B: u8 = 0xC0;
/// Tag OR-ed into the first byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_B: u8 = 0xE0;
/// Tag OR-ed into the first byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_B: u8 = 0xF0;

/// Exclusive upper bound of the values that fit in one byte (7 payload bits).
const MAX_ONE_B: u32 = 1 << 7;
/// Exclusive upper bound of the values that fit in two bytes (11 payload bits).
const MAX_TWO_B: u32 = 1 << 11;
/// Exclusive upper bound of the values that fit in three bytes (16 payload bits).
const MAX_THREE_B: u32 = 1 << 16;
/// Write the UTF-8-style encoding of `code` into the start of `dst`.
///
/// Returns `Some(n)` with the number of bytes written (1 to 4), or `None`
/// when `dst` is too short for the encoding; nothing is written in that case.
/// The value of `code` is not validated.
///
/// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
#[inline]
fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
    // Marked #[inline] to allow llvm optimizing it away

    // The encoded width depends only on the magnitude of `code`.
    let width: usize = if code < MAX_ONE_B {
        1
    } else if code < MAX_TWO_B {
        2
    } else if code < MAX_THREE_B {
        3
    } else {
        4
    };

    if dst.len() < width {
        return None;
    }

    match width {
        1 => {
            dst[0] = code as u8;
        }
        2 => {
            dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
            dst[1] = (code & 0x3F) as u8 | TAG_CONT;
        }
        3 => {
            dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
            dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
            dst[2] = (code & 0x3F) as u8 | TAG_CONT;
        }
        _ => {
            dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
            dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
            dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
            dst[3] = (code & 0x3F) as u8 | TAG_CONT;
        }
    }
    Some(width)
}
/// Write the UTF-16 encoding of `ch` into the start of `dst`.
///
/// Values up to `0xFFFF` are stored as a single unit (surrogates are passed
/// through unchanged); larger values are split into a surrogate pair.
/// Returns the number of units written, or `None` when `dst` is too short,
/// in which case nothing is written.
///
/// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
#[inline]
fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
    // Marked #[inline] to allow llvm optimizing it away
    if ch <= 0xFFFF {
        // The BMP falls through (assuming non-surrogate, as it should)
        return match dst.first_mut() {
            Some(slot) => {
                *slot = ch as u16;
                Some(1)
            }
            None => None,
        };
    }

    if dst.len() < 2 {
        return None;
    }

    // Supplementary planes break into surrogates.
    ch -= 0x1_0000;
    let lead = 0xD800 | ((ch >> 10) as u16);
    let trail = 0xDC00 | ((ch as u16) & 0x3FF);
    dst[0] = lead;
    dst[1] = trail;
    Some(2)
}
/// Read one code point from the front of `bytes`, advancing the iterator
/// past the bytes that were consumed.
///
/// Returns `None` only when the iterator is already exhausted. A first byte
/// below 128 is returned as-is. Otherwise the first byte selects a two-,
/// three- or four-byte sequence and the following bytes are folded in as
/// continuation bytes.
///
/// No validation is performed here: continuation bytes are masked rather
/// than checked, and a continuation byte that is missing because the
/// iterator ran out is treated as `0`.
///
/// Copied from core::str::next_code_point
#[inline]
pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
// Decode UTF-8
let x = match bytes.next() {
None => return None,
Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
Some(&next_byte) => next_byte,
};
// Multibyte case follows
// Decode from a byte combination out of: [[[x y] z] w]
// NOTE: Performance is sensitive to the exact formulation here
// `init` keeps the low five bits of the first byte (the two-byte payload).
let init = utf8_first_byte(x, 2);
let y = unwrap_or_0(bytes.next());
// Two-byte result; overwritten below for longer sequences.
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
let z = unwrap_or_0(bytes.next());
// Payload bits of `y` and `z` combined (12 bits).
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
// [x y z w] case
// use only the lower 3 bits of `init`
let w = unwrap_or_0(bytes.next());
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}
Some(ch)
}
/// Return the payload bits of a leading byte: `byte` masked with
/// `0x7F >> width`, widened to `u32`.
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask: u8 = 0x7F >> width;
    let payload = byte & payload_mask;
    payload as u32
}
/// Fold the continuation byte `byte` into the accumulator `ch`: the
/// accumulator is shifted left by six bits and the byte's value bits
/// (selected with `CONT_MASK`) are placed in the vacated low bits.
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let shifted = ch << 6;
    let value_bits = (byte & CONT_MASK) as u32;
    shifted | value_bits
}
/// Return the byte behind `opt`, or `0` when there is none.
#[inline]
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
    opt.cloned().unwrap_or(0)
}
/// Selects the six value bits of a continuation byte (`00xx_xxxx`),
/// discarding its two tag bits.
const CONT_MASK: u8 = 0x3F;
/// Append the encoding of `code_point` (1 to 4 bytes) to the end of `string`.
///
/// Copied from String::push
/// This does **not** include the WTF-8 concatenation check.
///
/// The bytes are encoded into a zero-initialised stack buffer and then
/// appended. The previous implementation built a `&mut [u8]` over the
/// vector's uninitialised spare capacity, which `slice::from_raw_parts_mut`
/// does not permit; going through a stack buffer needs no `unsafe` at all.
#[inline]
pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) {
    // An encoding never needs more than 4 bytes.
    let mut buf = [0u8; 4];
    // With a 4-byte buffer `encode_utf8_raw` always has enough room; the
    // `unwrap_or(0)` fallback (append nothing) mirrors the original code.
    let used = encode_utf8_raw(code_point.to_u32(), &mut buf).unwrap_or(0);
    string.bytes.extend_from_slice(&buf[..used]);
}
/// Report whether `index` falls on a code point boundary of `slice`.
///
/// The end of the slice (`index == slice.len()`) counts as a boundary.
/// Any other index is a boundary when the byte stored there is not a
/// continuation byte (`10xx_xxxx`); an index with no byte is not a boundary.
///
/// Copied from core::str::StrPrelude::is_char_boundary
#[inline]
pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
    if index == slice.len() {
        true
    } else {
        slice.bytes
            .get(index)
            .map_or(false, |&byte| (byte & 0b1100_0000) != 0b1000_0000)
    }
}
/// Return the sub-slice of `s` covering bytes `begin..end` without any
/// bounds or boundary checks.
///
/// # Safety
///
/// The pointer arithmetic and the raw slice construction are unchecked, so
/// the caller must guarantee that `begin <= end` and that both lie within
/// the bytes of `s`. Presumably both must also be code point boundaries for
/// the result to be well-formed; this function does not verify that.
///
/// Copied from core::str::raw::slice_unchecked
#[inline]
pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
    let start = s.bytes.as_ptr().offset(begin as isize);
    let byte_len = end - begin;
    let raw: &[u8] = slice::from_raw_parts(start, byte_len);
    // Reinterpret the byte slice as a `Wtf8` slice, as the original did.
    mem::transmute(raw)
}
/// Panic with a message reporting that `begin` and/or `end` do not lie on a
/// character boundary of `s`. This function never returns.
///
/// Marked `#[inline(never)]`, so the panic formatting code is not inlined
/// into callers.
///
/// # Panics
///
/// Always. If `begin > end` the assertion below fires first, with its own
/// message, instead of the boundary message.
///
/// Copied from core::str::raw::slice_error_fail
#[inline(never)]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
// A reversed range is reported by the assertion rather than the message below.
assert!(begin <= end);
panic!("index {} and/or {} in {:?} do not lie on character boundary",
begin, end, s);
}
/// Produce the next UTF-16 code unit for `iter`.
///
/// A code point that needs two units is emitted across two calls: the first
/// unit is returned immediately and the second is parked in `iter.extra`,
/// where `0` means that nothing is parked. Returns `None` once the
/// underlying code point iterator is exhausted and nothing is parked.
///
/// Copied from core::str::Utf16CodeUnits::next
pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option<u16> {
    // Hand out the unit parked by the previous call, if any.
    let parked = iter.extra;
    if parked != 0 {
        iter.extra = 0;
        return Some(parked);
    }

    let code_point = match iter.code_points.next() {
        Some(code_point) => code_point,
        None => return None,
    };

    let mut units = [0u16; 2];
    let written = encode_utf16_raw(code_point.to_u32(), &mut units).unwrap_or(0);
    if written == 2 {
        // Keep the second unit for the next call.
        iter.extra = units[1];
    }
    Some(units[0])
}
/// Iterator that decodes a stream of UTF-16 code units into `char`s,
/// reporting each unpaired surrogate as an `Err` carrying that code unit.
///
/// Copied from src/librustc_unicode/char.rs
pub struct DecodeUtf16<I>
    where I: Iterator<Item = u16>
{
    /// Source of UTF-16 code units.
    iter: I,
    /// A code unit that was read while looking for a trailing surrogate but
    /// turned out not to be one; it is decoded first on the next call.
    buf: Option<u16>,
}

/// Create an iterator over the code points encoded by `iterable`, yielding
/// `Ok(char)` for decoded values and `Err(unit)` for unpaired surrogates.
///
/// Copied from src/librustc_unicode/char.rs
#[inline]
pub fn decode_utf16<I: IntoIterator<Item = u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
    DecodeUtf16 {
        iter: iterable.into_iter(),
        buf: None,
    }
}

/// Copied from src/librustc_unicode/char.rs
impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
    type Item = Result<char, u16>;

    /// Decode the next item: `Ok(char)` for a non-surrogate unit or a valid
    /// surrogate pair, `Err(unit)` for a surrogate that is not part of a pair.
    fn next(&mut self) -> Option<Result<char, u16>> {
        // Prefer the unit left over from a failed pairing attempt.
        let u = match self.buf.take() {
            Some(buf) => buf,
            None => match self.iter.next() {
                Some(u) => u,
                None => return None,
            },
        };

        if u < 0xD800 || 0xDFFF < u {
            // not a surrogate
            // SAFETY: `u` is outside 0xD800..=0xDFFF and below 0x1_0000,
            // so it is a valid `char` value.
            Some(Ok(unsafe { char::from_u32_unchecked(u as u32) }))
        } else if u >= 0xDC00 {
            // a trailing surrogate
            Some(Err(u))
        } else {
            let u2 = match self.iter.next() {
                Some(u2) => u2,
                // eof
                None => return Some(Err(u)),
            };
            if u2 < 0xDC00 || u2 > 0xDFFF {
                // not a trailing surrogate so we're not a valid
                // surrogate pair, so rewind to redecode u2 next time.
                self.buf = Some(u2);
                return Some(Err(u));
            }

            // all ok, so lets decode it.
            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
            // SAFETY: a leading/trailing surrogate pair always decodes to a
            // value in 0x1_0000..=0x10_FFFF, which is a valid `char`.
            Some(Ok(unsafe { char::from_u32_unchecked(c) }))
        }
    }

    /// Bounds on the number of items still to be yielded.
    ///
    /// The code unit parked in `self.buf` (if any) is not part of
    /// `self.iter` any more, yet it still contributes to an item, so it has
    /// to be counted in both bounds. Ignoring it made the upper bound too
    /// small (e.g. `Some(0)` while one item was still pending).
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();
        let buffered = if self.buf.is_some() { 1 } else { 0 };
        // we could be entirely valid surrogates (2 elements per
        // char), or entirely non-surrogates (1 element per char).
        // With a buffered unit there are `low + 1` units left, which yield
        // at least `low / 2 + 1` items.
        (low / 2 + buffered, high.and_then(|h| h.checked_add(buffered)))
    }
}