Skip to content

Commit 69996c2

Browse files
author
bors-servo
committed
Auto merge of #147 - servo:ascii, r=Manishearth
Use std::ascii instead of duplicating it.
2 parents: a2736b4 + 096369c; commit 69996c2

File tree

6 files changed

+23
-164
lines changed

6 files changed

+23
-164
lines changed

src/tokenizer/buffer_queue.rs

+4-8
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,9 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10-
use util::str::AsciiCast;
1110
use util::smallcharset::SmallCharSet;
1211

13-
use std::str::CharRange;
12+
use std::ascii::AsciiExt;
1413
use std::collections::VecDeque;
1514

1615
use tendril::StrTendril;
@@ -122,19 +121,16 @@ impl BufferQueue {
122121
return None;
123122
}
124123

125-
for c in pat.chars() {
124+
for pattern_byte in pat.bytes() {
126125
if buffers_exhausted >= self.buffers.len() {
127126
return None;
128127
}
129128
let ref buf = self.buffers[buffers_exhausted];
130129

131-
let d = buf.char_at(consumed_from_last);
132-
match (c.to_ascii_opt(), d.to_ascii_opt()) {
133-
(Some(c), Some(d)) => if c.eq_ignore_case(d) { () } else { return Some(false) },
134-
_ => return Some(false),
130+
if !buf.as_bytes()[consumed_from_last].eq_ignore_ascii_case(&pattern_byte) {
131+
return Some(false)
135132
}
136133

137-
// d was an ASCII character; size must be 1 byte
138134
consumed_from_last += 1;
139135
if consumed_from_last >= buf.len() {
140136
buffers_exhausted += 1;

src/tokenizer/mod.rs

+6-4
Original file line number | Diff line number | Diff line change
@@ -26,9 +26,10 @@ use self::char_ref::{CharRef, CharRefTokenizer};
2626

2727
use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet};
2828

29-
use util::str::{lower_ascii, lower_ascii_letter};
29+
use util::str::lower_ascii_letter;
3030
use util::smallcharset::SmallCharSet;
3131

32+
use std::ascii::AsciiExt;
3233
use std::mem::replace;
3334
use std::default::Default;
3435
use std::borrow::Cow::{self, Borrowed};
@@ -739,7 +740,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
739740
'/' => go!(self: to SelfClosingStartTag),
740741
'>' => go!(self: emit_tag Data),
741742
'\0' => go!(self: error; push_tag '\u{fffd}'),
742-
c => go!(self: push_tag (lower_ascii(c))),
743+
c => go!(self: push_tag (c.to_ascii_lowercase())),
743744
}},
744745

745746
//§ script-data-escaped-less-than-sign-state
@@ -1039,7 +1040,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
10391040
'\t' | '\n' | '\x0C' | ' ' => (),
10401041
'\0' => go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName),
10411042
'>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1042-
c => go!(self: create_doctype; push_doctype_name (lower_ascii(c)); to DoctypeName),
1043+
c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1044+
to DoctypeName),
10431045
}},
10441046

10451047
//§ doctype-name-state
@@ -1048,7 +1050,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
10481050
=> go!(self: to AfterDoctypeName),
10491051
'>' => go!(self: emit_doctype; to Data),
10501052
'\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1051-
c => go!(self: push_doctype_name (lower_ascii(c))),
1053+
c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
10521054
}},
10531055

10541056
//§ after-doctype-name-state

src/tree_builder/actions.rs

+2-1
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,9 @@ use tree_builder::rules::TreeBuilderStep;
2020
use tokenizer::{Attribute, Tag, StartTag, EndTag};
2121
use tokenizer::states::{RawData, RawKind};
2222

23-
use util::str::{AsciiExt, to_escaped_string};
23+
use util::str::to_escaped_string;
2424

25+
use std::ascii::AsciiExt;
2526
use std::{slice, fmt};
2627
use std::mem::replace;
2728
use std::iter::{Rev, Enumerate};

src/tree_builder/data.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,8 @@
99

1010
use tokenizer::Doctype;
1111
use tree_builder::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks};
12-
use util::str::AsciiExt;
1312

13+
use std::ascii::AsciiExt;
1414
use tendril::StrTendril;
1515

1616
// These should all be lowercase, for ASCII-case-insensitive matching.
@@ -104,7 +104,7 @@ pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool
104104
}
105105

106106
fn opt_to_ascii_lower(x: Option<&str>) -> Option<String> {
107-
x.map(|y| y.to_ascii_lower())
107+
x.map(|y| y.to_ascii_lowercase())
108108
}
109109

110110
let name = opt_tendril_as_slice(&doctype.name);

src/tree_builder/rules.rs

+2-1
Original file line number | Diff line number | Diff line change
@@ -17,8 +17,9 @@ use tree_builder::interface::{TreeSink, Quirks, AppendNode, NextParserState};
1717
use tokenizer::{Tag, StartTag, EndTag};
1818
use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext, Quiescent};
1919

20-
use util::str::{AsciiExt, is_ascii_whitespace};
20+
use util::str::is_ascii_whitespace;
2121

22+
use std::ascii::AsciiExt;
2223
use std::mem::replace;
2324
use std::borrow::Cow::Borrowed;
2425
use std::borrow::ToOwned;

src/util/str.rs

+7-148
Original file line number | Diff line number | Diff line change
@@ -19,178 +19,37 @@ pub fn to_escaped_string<T: fmt::Debug>(x: &T) -> String {
1919
buf.escape_default()
2020
}
2121

22-
// FIXME: The ASCII stuff is largely copied from std::ascii
23-
// (see rust-lang/rust#16801).
24-
25-
pub static ASCII_LOWER_MAP: [u8; 256] = [
26-
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
27-
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
28-
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
29-
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
30-
b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
31-
b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
32-
b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
33-
b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
34-
b'@',
35-
36-
b'a', b'b', b'c', b'd', b'e', b'f', b'g',
37-
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
38-
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
39-
b'x', b'y', b'z',
40-
41-
b'[', b'\\', b']', b'^', b'_',
42-
b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
43-
b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
44-
b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
45-
b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
46-
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
47-
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
48-
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
49-
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
50-
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
51-
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
52-
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
53-
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
54-
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
55-
0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
56-
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
57-
0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
58-
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
59-
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
60-
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
61-
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
62-
];
63-
64-
#[derive(Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
65-
pub struct Ascii {
66-
chr: u8,
67-
}
68-
69-
impl Ascii {
70-
pub fn to_char(self) -> char {
71-
self.chr as char
72-
}
73-
74-
#[inline]
75-
pub fn is_alphabetic(&self) -> bool {
76-
(self.chr >= 0x41 && self.chr <= 0x5A) || (self.chr >= 0x61 && self.chr <= 0x7A)
77-
}
78-
79-
#[inline]
80-
pub fn is_digit(&self) -> bool {
81-
self.chr >= 0x30 && self.chr <= 0x39
82-
}
83-
84-
#[inline]
85-
pub fn is_alphanumeric(&self) -> bool {
86-
self.is_alphabetic() || self.is_digit()
87-
}
88-
89-
#[inline]
90-
pub fn to_lowercase(self) -> Ascii {
91-
Ascii { chr: ASCII_LOWER_MAP[self.chr as usize] }
92-
}
93-
94-
#[inline]
95-
pub fn eq_ignore_case(self, other: Ascii) -> bool {
96-
ASCII_LOWER_MAP[self.chr as usize] == ASCII_LOWER_MAP[other.chr as usize]
97-
}
98-
}
99-
100-
pub trait AsciiCast {
101-
fn to_ascii_opt(&self) -> Option<Ascii>;
102-
}
103-
104-
impl AsciiCast for char {
105-
#[inline]
106-
fn to_ascii_opt(&self) -> Option<Ascii> {
107-
let n = *self as u32;
108-
if n < 0x80 {
109-
Some(Ascii { chr: n as u8 })
110-
} else {
111-
None
112-
}
113-
}
114-
}
115-
116-
pub trait AsciiExt<T> {
117-
fn to_ascii_lower(&self) -> T;
118-
fn eq_ignore_ascii_case(&self, other: Self) -> bool;
119-
}
120-
121-
impl<'a> AsciiExt<Vec<u8>> for &'a [u8] {
122-
#[inline]
123-
fn to_ascii_lower(&self) -> Vec<u8> {
124-
self.iter().map(|&byte| ASCII_LOWER_MAP[byte as usize]).collect()
125-
}
126-
127-
#[inline]
128-
fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
129-
self.len() == other.len() && self.iter().zip(other.iter()).all(
130-
|(byte_self, byte_other)| {
131-
ASCII_LOWER_MAP[*byte_self as usize] ==
132-
ASCII_LOWER_MAP[*byte_other as usize]
133-
}
134-
)
135-
}
136-
}
137-
138-
impl<'a> AsciiExt<String> for &'a str {
139-
#[inline]
140-
fn to_ascii_lower(&self) -> String {
141-
// Vec<u8>::to_ascii_lower() preserves the UTF-8 invariant.
142-
unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lower()) }
143-
}
144-
145-
#[inline]
146-
fn eq_ignore_ascii_case(&self, other: &str) -> bool {
147-
self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
148-
}
149-
}
150-
15122
/// If `c` is an ASCII letter, return the corresponding lowercase
15223
/// letter, otherwise None.
15324
pub fn lower_ascii_letter(c: char) -> Option<char> {
154-
match c.to_ascii_opt() {
155-
Some(a) => if a.is_alphabetic() { Some(a.to_lowercase().to_char()) } else { None },
156-
_ => None,
25+
match c {
26+
'a' ... 'z' => Some(c),
27+
'A' ... 'Z' => Some((c as u8 - b'A' + b'a') as char),
28+
_ => None
15729
}
15830
}
15931

160-
/// Map ASCII uppercase to lowercase; preserve other characters.
161-
pub fn lower_ascii(c: char) -> char {
162-
lower_ascii_letter(c).unwrap_or(c)
163-
}
164-
16532
/// Is the character an ASCII alphanumeric character?
16633
pub fn is_ascii_alnum(c: char) -> bool {
167-
c.to_ascii_opt().map_or(false, |a| a.is_alphanumeric())
34+
matches!(c, '0'...'9' | 'a'...'z' | 'A'...'Z')
16835
}
16936

17037
/// ASCII whitespace characters, as defined by
17138
/// tree construction modes that treat them specially.
17239
pub fn is_ascii_whitespace(c: char) -> bool {
173-
match c {
174-
'\t' | '\r' | '\n' | '\x0C' | ' ' => true,
175-
_ => false,
176-
}
40+
matches!(c, '\t' | '\r' | '\n' | '\x0C' | ' ')
17741
}
17842

17943
#[cfg(test)]
18044
#[allow(non_snake_case)]
18145
mod test {
182-
use super::{is_ascii_alnum, lower_ascii, lower_ascii_letter};
46+
use super::{is_ascii_alnum, lower_ascii_letter};
18347

18448
test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a'));
18549
test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a'));
18650
test_eq!(lower_letter_symbol_is_None, lower_ascii_letter('!'), None);
18751
test_eq!(lower_letter_nonascii_is_None, lower_ascii_letter('\u{a66e}'), None);
18852

189-
test_eq!(lower_a_is_a, lower_ascii('a'), 'a');
190-
test_eq!(lower_A_is_a, lower_ascii('A'), 'a');
191-
test_eq!(lower_symbol_unchanged, lower_ascii('!'), '!');
192-
test_eq!(lower_nonascii_unchanged, lower_ascii('\u{a66e}'), '\u{a66e}');
193-
19453
test_eq!(is_alnum_a, is_ascii_alnum('a'), true);
19554
test_eq!(is_alnum_A, is_ascii_alnum('A'), true);
19655
test_eq!(is_alnum_1, is_ascii_alnum('1'), true);

0 commit comments

Comments
 (0)