Auto merge of #147 - servo:ascii, r=Manishearth

bors-servo · bors-servo · commit 69996c278c7c · 2015-07-02T09:11:54.000-06:00
Use std::ascii instead of duplicating it.
diff --git a/src/tokenizer/buffer_queue.rs b/src/tokenizer/buffer_queue.rs
@@ -7,10 +7,9 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-use util::str::AsciiCast;
 use util::smallcharset::SmallCharSet;
 
-use std::str::CharRange;
+use std::ascii::AsciiExt;
 use std::collections::VecDeque;
 
 use tendril::StrTendril;
@@ -122,19 +121,16 @@ impl BufferQueue {
             return None;
         }
 
-        for c in pat.chars() {
+        for pattern_byte in pat.bytes() {
             if buffers_exhausted >= self.buffers.len() {
                 return None;
             }
             let ref buf = self.buffers[buffers_exhausted];
 
-            let d = buf.char_at(consumed_from_last);
-            match (c.to_ascii_opt(), d.to_ascii_opt()) {
-                (Some(c), Some(d)) => if c.eq_ignore_case(d) { () } else { return Some(false) },
-                _ => return Some(false),
+            if !buf.as_bytes()[consumed_from_last].eq_ignore_ascii_case(&pattern_byte) {
+                return Some(false)
             }
 
-            // d was an ASCII character; size must be 1 byte
             consumed_from_last += 1;
             if consumed_from_last >= buf.len() {
                 buffers_exhausted += 1;
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
@@ -26,9 +26,10 @@ use self::char_ref::{CharRef, CharRefTokenizer};
 
 use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet};
 
-use util::str::{lower_ascii, lower_ascii_letter};
+use util::str::lower_ascii_letter;
 use util::smallcharset::SmallCharSet;
 
+use std::ascii::AsciiExt;
 use std::mem::replace;
 use std::default::Default;
 use std::borrow::Cow::{self, Borrowed};
@@ -739,7 +740,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '/'  => go!(self: to SelfClosingStartTag),
                 '>'  => go!(self: emit_tag Data),
                 '\0' => go!(self: error; push_tag '\u{fffd}'),
-                c    => go!(self: push_tag (lower_ascii(c))),
+                c    => go!(self: push_tag (c.to_ascii_lowercase())),
             }},
 
             //§ script-data-escaped-less-than-sign-state
@@ -1039,7 +1040,8 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 '\t' | '\n' | '\x0C' | ' ' => (),
                 '\0' => go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName),
                 '>'  => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
-                c    => go!(self: create_doctype; push_doctype_name (lower_ascii(c)); to DoctypeName),
+                c    => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase()); 
+                                  to DoctypeName),
             }},
 
             //§ doctype-name-state
@@ -1048,7 +1050,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                      => go!(self: to AfterDoctypeName),
                 '>'  => go!(self: emit_doctype; to Data),
                 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
-                c    => go!(self: push_doctype_name (lower_ascii(c))),
+                c    => go!(self: push_doctype_name (c.to_ascii_lowercase())),
             }},
 
             //§ after-doctype-name-state
diff --git a/src/tree_builder/actions.rs b/src/tree_builder/actions.rs
@@ -20,8 +20,9 @@ use tree_builder::rules::TreeBuilderStep;
 use tokenizer::{Attribute, Tag, StartTag, EndTag};
 use tokenizer::states::{RawData, RawKind};
 
-use util::str::{AsciiExt, to_escaped_string};
+use util::str::to_escaped_string;
 
+use std::ascii::AsciiExt;
 use std::{slice, fmt};
 use std::mem::replace;
 use std::iter::{Rev, Enumerate};
diff --git a/src/tree_builder/data.rs b/src/tree_builder/data.rs
@@ -9,8 +9,8 @@
 
 use tokenizer::Doctype;
 use tree_builder::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks};
-use util::str::AsciiExt;
 
+use std::ascii::AsciiExt;
 use tendril::StrTendril;
 
 // These should all be lowercase, for ASCII-case-insensitive matching.
@@ -104,7 +104,7 @@ pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool
     }
 
     fn opt_to_ascii_lower(x: Option<&str>) -> Option<String> {
-        x.map(|y| y.to_ascii_lower())
+        x.map(|y| y.to_ascii_lowercase())
     }
 
     let name = opt_tendril_as_slice(&doctype.name);
diff --git a/src/tree_builder/rules.rs b/src/tree_builder/rules.rs
@@ -17,8 +17,9 @@ use tree_builder::interface::{TreeSink, Quirks, AppendNode, NextParserState};
 use tokenizer::{Tag, StartTag, EndTag};
 use tokenizer::states::{Rcdata, Rawtext, ScriptData, Plaintext, Quiescent};
 
-use util::str::{AsciiExt, is_ascii_whitespace};
+use util::str::is_ascii_whitespace;
 
+use std::ascii::AsciiExt;
 use std::mem::replace;
 use std::borrow::Cow::Borrowed;
 use std::borrow::ToOwned;
diff --git a/src/util/str.rs b/src/util/str.rs
@@ -19,178 +19,37 @@ pub fn to_escaped_string<T: fmt::Debug>(x: &T) -> String {
     buf.escape_default()
 }
 
-// FIXME: The ASCII stuff is largely copied from std::ascii
-// (see rust-lang/rust#16801).
-
-pub static ASCII_LOWER_MAP: [u8; 256] = [
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
-    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
-    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
-    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
-    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
-    b'@',
-
-          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
-    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
-    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
-    b'x', b'y', b'z',
-
-                      b'[', b'\\', b']', b'^', b'_',
-    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
-    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
-    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
-    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
-    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
-    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
-    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
-    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
-    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
-    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
-    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
-    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
-    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
-    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
-    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
-    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
-    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
-];
-
-#[derive(Copy, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
-pub struct Ascii {
-    chr: u8,
-}
-
-impl Ascii {
-    pub fn to_char(self) -> char {
-        self.chr as char
-    }
-
-    #[inline]
-    pub fn is_alphabetic(&self) -> bool {
-        (self.chr >= 0x41 && self.chr <= 0x5A) || (self.chr >= 0x61 && self.chr <= 0x7A)
-    }
-
-    #[inline]
-    pub fn is_digit(&self) -> bool {
-        self.chr >= 0x30 && self.chr <= 0x39
-    }
-
-    #[inline]
-    pub fn is_alphanumeric(&self) -> bool {
-        self.is_alphabetic() || self.is_digit()
-    }
-
-    #[inline]
-    pub fn to_lowercase(self) -> Ascii {
-        Ascii { chr: ASCII_LOWER_MAP[self.chr as usize] }
-    }
-
-    #[inline]
-    pub fn eq_ignore_case(self, other: Ascii) -> bool {
-        ASCII_LOWER_MAP[self.chr as usize] == ASCII_LOWER_MAP[other.chr as usize]
-    }
-}
-
-pub trait AsciiCast {
-    fn to_ascii_opt(&self) -> Option<Ascii>;
-}
-
-impl AsciiCast for char {
-    #[inline]
-    fn to_ascii_opt(&self) -> Option<Ascii> {
-        let n = *self as u32;
-        if n < 0x80 {
-            Some(Ascii { chr: n as u8 })
-        } else {
-            None
-        }
-    }
-}
-
-pub trait AsciiExt<T> {
-    fn to_ascii_lower(&self) -> T;
-    fn eq_ignore_ascii_case(&self, other: Self) -> bool;
-}
-
-impl<'a> AsciiExt<Vec<u8>> for &'a [u8] {
-    #[inline]
-    fn to_ascii_lower(&self) -> Vec<u8> {
-        self.iter().map(|&byte| ASCII_LOWER_MAP[byte as usize]).collect()
-    }
-
-    #[inline]
-    fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
-        self.len() == other.len() && self.iter().zip(other.iter()).all(
-            |(byte_self, byte_other)| {
-                ASCII_LOWER_MAP[*byte_self as usize] ==
-                    ASCII_LOWER_MAP[*byte_other as usize]
-            }
-        )
-    }
-}
-
-impl<'a> AsciiExt<String> for &'a str {
-    #[inline]
-    fn to_ascii_lower(&self) -> String {
-        // Vec<u8>::to_ascii_lower() preserves the UTF-8 invariant.
-        unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lower()) }
-    }
-
-    #[inline]
-    fn eq_ignore_ascii_case(&self, other: &str) -> bool {
-        self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
-    }
-}
-
 /// If `c` is an ASCII letter, return the corresponding lowercase
 /// letter, otherwise None.
 pub fn lower_ascii_letter(c: char) -> Option<char> {
-    match c.to_ascii_opt() {
-        Some(a) => if a.is_alphabetic() { Some(a.to_lowercase().to_char()) } else { None },
-        _ => None,
+    match c {
+        'a' ... 'z' => Some(c),
+        'A' ... 'Z' => Some((c as u8 - b'A' + b'a') as char),
+        _ => None
     }
 }
 
-/// Map ASCII uppercase to lowercase; preserve other characters.
-pub fn lower_ascii(c: char) -> char {
-    lower_ascii_letter(c).unwrap_or(c)
-}
-
 /// Is the character an ASCII alphanumeric character?
 pub fn is_ascii_alnum(c: char) -> bool {
-    c.to_ascii_opt().map_or(false, |a| a.is_alphanumeric())
+    matches!(c, '0'...'9' | 'a'...'z' | 'A'...'Z')
 }
 
 /// ASCII whitespace characters, as defined by
 /// tree construction modes that treat them specially.
 pub fn is_ascii_whitespace(c: char) -> bool {
-    match c {
-        '\t' | '\r' | '\n' | '\x0C' | ' ' => true,
-        _ => false,
-    }
+    matches!(c, '\t' | '\r' | '\n' | '\x0C' | ' ')
 }
 
 #[cfg(test)]
 #[allow(non_snake_case)]
 mod test {
-    use super::{is_ascii_alnum, lower_ascii, lower_ascii_letter};
+    use super::{is_ascii_alnum, lower_ascii_letter};
 
     test_eq!(lower_letter_a_is_a, lower_ascii_letter('a'), Some('a'));
     test_eq!(lower_letter_A_is_a, lower_ascii_letter('A'), Some('a'));
     test_eq!(lower_letter_symbol_is_None, lower_ascii_letter('!'), None);
     test_eq!(lower_letter_nonascii_is_None, lower_ascii_letter('\u{a66e}'), None);
 
-    test_eq!(lower_a_is_a, lower_ascii('a'), 'a');
-    test_eq!(lower_A_is_a, lower_ascii('A'), 'a');
-    test_eq!(lower_symbol_unchanged, lower_ascii('!'), '!');
-    test_eq!(lower_nonascii_unchanged, lower_ascii('\u{a66e}'), '\u{a66e}');
-
     test_eq!(is_alnum_a, is_ascii_alnum('a'), true);
     test_eq!(is_alnum_A, is_ascii_alnum('A'), true);
     test_eq!(is_alnum_1, is_ascii_alnum('1'), true);

Original file line number	Diff line number	Diff line change
`@@ -9,8 +9,8 @@`
`9`	`9`
`10`	`10`	`use tokenizer::Doctype;`
`11`	`11`	`use tree_builder::interface::{QuirksMode, Quirks, LimitedQuirks, NoQuirks};`
`12`		`-use util::str::AsciiExt;`
`13`	`12`
	`13`	`+use std::ascii::AsciiExt;`
`14`	`14`	`use tendril::StrTendril;`
`15`	`15`
`16`	`16`	`// These should all be lowercase, for ASCII-case-insensitive matching.`
`@@ -104,7 +104,7 @@ pub fn doctype_error_and_quirks(doctype: &Doctype, iframe_srcdoc: bool) -> (bool`
`104`	`104`	`}`
`105`	`105`
`106`	`106`	`fn opt_to_ascii_lower(x: Option<&str>) -> Option<String> {`
`107`		`- x.map(|y| y.to_ascii_lower())`
	`107`	`+ x.map(|y| y.to_ascii_lowercase())`
`108`	`108`	`}`
`109`	`109`
`110`	`110`	`let name = opt_tendril_as_slice(&doctype.name);`