diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 0ba0558be4..1d10572f6d 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -581,6 +581,9 @@ impl Parser { _ => unreachable!(), }, start => { + if !self.flags.unicode { + let _ = try!(self.codepoint_to_one_byte(start)); + } self.bump(); try!(self.parse_class_range(&mut class, start)); } @@ -610,11 +613,6 @@ impl Parser { fn parse_class_range(&mut self, class: &mut CharClass, start: char) -> Result<()> { if !self.bump_if('-') { - // Make sure we haven't parsed Unicode literals when we shouldn't have. - if !self.flags.unicode { - let _ = try!(self.codepoint_to_one_byte(start)); - } - // Not a range, so just push a singleton range. class.ranges.push(ClassRange::one(start)); return Ok(()); @@ -647,7 +645,13 @@ impl Parser { // Because `parse_escape` can never return `LeftParen`. _ => unreachable!(), }, - _ => self.bump(), + _ => { + let c = self.bump(); + if !self.flags.unicode { + let _ = try!(self.codepoint_to_one_byte(c)); + } + c + } }; if end < start { // e.g., [z-a] @@ -656,11 +660,6 @@ impl Parser { end: end, })); } - // Make sure we haven't parsed Unicode literals when we shouldn't have. - if !self.flags.unicode { - let _ = try!(self.codepoint_to_one_byte(start)); - let _ = try!(self.codepoint_to_one_byte(end)); - } class.ranges.push(ClassRange::new(start, end)); Ok(()) } @@ -2015,6 +2014,8 @@ mod tests { assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')]))); assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)]))); + assert_eq!(pb(r"(?-u)[\xFF]"), + Expr::ClassBytes(bclass(&[(0xFF, 0xFF)]))); assert_eq!(pb("(?-u)[\n]"), Expr::ClassBytes(bclass(&[(b'\n', b'\n')]))); assert_eq!(pb(r"(?-u)[\n]"), @@ -2418,8 +2419,8 @@ mod tests { #[test] fn unicode_class_literal_not_allowed() { let flags = Flags { allow_bytes: true, .. Flags::default() }; - test_err!(r"(?-u)[☃]", 7, ErrorKind::UnicodeNotAllowed, flags); - test_err!(r"(?-u)[☃-☃]", 9, ErrorKind::UnicodeNotAllowed, flags); + test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags); + test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags); } #[test] diff --git a/src/backtrack.rs b/src/backtrack.rs index b0e0e02035..3c06254c6b 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { ip = inst.goto1; } EmptyLook(ref inst) => { - let prev = self.input.previous_char(at); - let next = self.input.next_char(at); - if inst.matches(prev, next) { + if self.input.is_empty_match(at, inst) { ip = inst.goto; } else { return false; diff --git a/src/dfa.rs b/src/dfa.rs index d216f2cbae..0216f25620 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -1847,7 +1847,7 @@ mod tests { expected == got && state.flags() == StateFlags(flags) } QuickCheck::new() - .gen(StdGen::new(self::rand::thread_rng(), 70_000)) + .gen(StdGen::new(self::rand::thread_rng(), 10_000)) .quickcheck(p as fn(Vec, u8) -> bool); } diff --git a/src/exec.rs b/src/exec.rs index 755342497b..65a3935a72 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -921,7 +921,7 @@ impl<'c> ExecNoSync<'c> { matches, slots, quit_after_match, - ByteInput::new(text), + ByteInput::new(text, self.ro.nfa.only_utf8), start) } else { pikevm::Fsm::exec( @@ -949,7 +949,7 @@ impl<'c> ExecNoSync<'c> { &self.cache, matches, slots, - ByteInput::new(text), + ByteInput::new(text, self.ro.nfa.only_utf8), start) } else { backtrack::Bounded::exec( diff --git a/src/input.rs b/src/input.rs index f96a6be075..a8547d5902 100644 --- a/src/input.rs +++ b/src/input.rs @@ -16,8 +16,9 @@ use std::u32; use syntax; -use utf8::{decode_utf8, decode_last_utf8}; use literals::LiteralSearcher; +use prog::InstEmptyLook; +use utf8::{decode_utf8, decode_last_utf8}; /// Represents a location in the input. #[derive(Clone, Copy, Debug)] @@ -83,6 +84,10 @@ pub trait Input { /// If no such character could be decoded, then `Char` is absent. fn previous_char(&self, at: InputAt) -> Char; + /// Return true if the given empty width instruction matches at the + /// input position given. + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; + /// Scan the input for a matching prefix. fn prefix_at( &self, @@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T { fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) } + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + (**self).is_empty_match(at, empty) + } + fn prefix_at( &self, prefixes: &LiteralSearcher, @@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + c.is_none() || c == '\n' + } + EndLine => { + let c = self.next_char(at); + c.is_none() || c == '\n' + } + StartText => self.previous_char(at).is_none(), + EndText => self.next_char(at).is_none(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_byte() == c2.is_word_byte() + } + } + } + fn prefix_at( &self, prefixes: &LiteralSearcher, @@ -178,12 +219,18 @@ impl<'t> Input for CharInput<'t> { /// easy access to necessary Unicode decoding (used for word boundary look /// ahead/look behind). #[derive(Clone, Copy, Debug)] -pub struct ByteInput<'t>(&'t [u8]); +pub struct ByteInput<'t> { + text: &'t [u8], + only_utf8: bool, +} impl<'t> ByteInput<'t> { /// Return a new byte-based input reader for the given string. - pub fn new(s: &'t [u8]) -> ByteInput<'t> { - ByteInput(s) + pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { + ByteInput { + text: text, + only_utf8: only_utf8, + } } } @@ -191,7 +238,7 @@ impl<'t> ops::Deref for ByteInput<'t> { type Target = [u8]; fn deref(&self) -> &[u8] { - self.0 + self.text } } @@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> { decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() } + fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { + use prog::EmptyLook::*; + match empty.look { + StartLine => { + let c = self.previous_char(at); + c.is_none() || c == '\n' + } + EndLine => { + let c = self.next_char(at); + c.is_none() || c == '\n' + } + StartText => self.previous_char(at).is_none(), + EndText => self.next_char(at).is_none(), + WordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() != c2.is_word_char() + } + NotWordBoundary => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + c1.is_word_char() == c2.is_word_char() + } + WordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() != c2.is_word_byte() + } + NotWordBoundaryAscii => { + let (c1, c2) = (self.previous_char(at), self.next_char(at)); + if self.only_utf8 { + // If we must match UTF-8, then we can't match word + // boundaries at invalid UTF-8. + if c1.is_none() && !at.is_start() { + return false; + } + if c2.is_none() && !at.is_end() { + return false; + } + } + c1.is_word_byte() == c2.is_word_byte() + } + } + } + fn prefix_at( &self, prefixes: &LiteralSearcher, @@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> { } fn len(&self) -> usize { - self.0.len() + self.text.len() } fn as_bytes(&self) -> &[u8] { - self.0 + &self.text } } @@ -276,7 +375,7 @@ impl Char { pub fn is_word_byte(self) -> bool { match char::from_u32(self.0) { None => false, - Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8), + Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8), Some(_) => false, } } diff --git a/src/pikevm.rs b/src/pikevm.rs index a18011bab0..b96f0e7588 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> { nlist.set.insert(ip); match self.prog[ip] { EmptyLook(ref inst) => { - let prev = self.input.previous_char(at); - let next = self.input.next_char(at); - if inst.matches(prev, next) { + if self.input.is_empty_match(at, inst) { ip = inst.goto; } } diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs index c127e8aa28..9beb7c0cb1 100644 --- a/tests/word_boundary_ascii.rs +++ b/tests/word_boundary_ascii.rs @@ -2,6 +2,7 @@ // For Unicode word boundaries, the tests are precisely inverted. matiter!(ascii1, r"\bx\b", "áxβ", (2, 3)); matiter!(ascii2, r"\Bx\B", "áxβ"); +matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5)); // We can still get Unicode mode in byte regexes. matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ"); diff --git a/tests/word_boundary_unicode.rs b/tests/word_boundary_unicode.rs index 42bcba51b4..43612a91ac 100644 --- a/tests/word_boundary_unicode.rs +++ b/tests/word_boundary_unicode.rs @@ -5,3 +5,4 @@ matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3)); matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3)); matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ"); +matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));