Match (?-u:\B) correctly in the NFA engines when valid UTF-8 is requi…

…red. This commit fixes a bug where matching (?-u:\B) (that is, "not an ASCII word boundary") in the NFA engines could produce match positions at invalid UTF-8 sequence boundaries. The specific problem is that determining whether (?-u:\B) matches or not relies on knowing whether we must report matches only at UTF-8 boundaries, and this wasn't actually being taken into account. (Instead, we prefer to enforce this invariant in the compiler, so that the matching engines mostly don't have to care about it.) But of course, the zero-width assertions are kind of a special case all around, so we need to handle ASCII word boundaries differently depending on whether we require valid UTF-8. This bug was noticed because the DFA actually handles this correctly (by encoding ASCII word boundaries into the state machine itself, which in turn guarantees the valid UTF-8 invariant) while the NFAs don't, leading to an inconsistency. Fix #241.
rust-lang · Jul 10, 2016 · cd99d4d · cd99d4d
1 parent 81297f0
commit cd99d4d
Show file tree

Hide file tree

Showing 8 changed files with 128 additions and 30 deletions.
diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
@@ -581,6 +581,9 @@ impl Parser {
                     _ => unreachable!(),
                 },
                 start => {
+                    if !self.flags.unicode {
+                        let _ = try!(self.codepoint_to_one_byte(start));
+                    }
                     self.bump();
                     try!(self.parse_class_range(&mut class, start));
                 }
@@ -610,11 +613,6 @@ impl Parser {
     fn parse_class_range(&mut self, class: &mut CharClass, start: char)
                         -> Result<()> {
         if !self.bump_if('-') {
-            // Make sure we haven't parsed Unicode literals when we shouldn't have.
-            if !self.flags.unicode {
-                let _ = try!(self.codepoint_to_one_byte(start));
-            }
-
             // Not a range, so just push a singleton range.
             class.ranges.push(ClassRange::one(start));
             return Ok(());
@@ -647,7 +645,13 @@ impl Parser {
                 // Because `parse_escape` can never return `LeftParen`.
                 _ => unreachable!(),
             },
-            _ => self.bump(),
+            _ => {
+                let c = self.bump();
+                if !self.flags.unicode {
+                    let _ = try!(self.codepoint_to_one_byte(c));
+                }
+                c
+            }
         };
         if end < start {
             // e.g., [z-a]
@@ -656,11 +660,6 @@ impl Parser {
                 end: end,
             }));
         }
-        // Make sure we haven't parsed Unicode literals when we shouldn't have.
-        if !self.flags.unicode {
-            let _ = try!(self.codepoint_to_one_byte(start));
-            let _ = try!(self.codepoint_to_one_byte(end));
-        }
         class.ranges.push(ClassRange::new(start, end));
         Ok(())
     }
@@ -2015,6 +2014,8 @@ mod tests {
 
         assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
         assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
+        assert_eq!(pb(r"(?-u)[\xFF]"),
+                   Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
         assert_eq!(pb("(?-u)[\n]"),
                    Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
         assert_eq!(pb(r"(?-u)[\n]"),
@@ -2418,8 +2419,8 @@ mod tests {
     #[test]
     fn unicode_class_literal_not_allowed() {
         let flags = Flags { allow_bytes: true, .. Flags::default() };
-        test_err!(r"(?-u)[☃]", 7, ErrorKind::UnicodeNotAllowed, flags);
-        test_err!(r"(?-u)[☃-☃]", 9, ErrorKind::UnicodeNotAllowed, flags);
+        test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
+        test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
     }
 
     #[test]

diff --git a/src/backtrack.rs b/src/backtrack.rs
@@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
                     ip = inst.goto1;
                 }
                 EmptyLook(ref inst) => {
-                    let prev = self.input.previous_char(at);
-                    let next = self.input.next_char(at);
-                    if inst.matches(prev, next) {
+                    if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     } else {
                         return false;

diff --git a/src/dfa.rs b/src/dfa.rs
@@ -1847,7 +1847,7 @@ mod tests {
             expected == got && state.flags() == StateFlags(flags)
         }
         QuickCheck::new()
-            .gen(StdGen::new(self::rand::thread_rng(), 70_000))
+            .gen(StdGen::new(self::rand::thread_rng(), 10_000))
             .quickcheck(p as fn(Vec<u32>, u8) -> bool);
     }
 

diff --git a/src/exec.rs b/src/exec.rs
@@ -921,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
                 matches,
                 slots,
                 quit_after_match,
-                ByteInput::new(text),
+                ByteInput::new(text, self.ro.nfa.only_utf8),
                 start)
         } else {
             pikevm::Fsm::exec(
@@ -949,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
                 &self.cache,
                 matches,
                 slots,
-                ByteInput::new(text),
+                ByteInput::new(text, self.ro.nfa.only_utf8),
                 start)
         } else {
             backtrack::Bounded::exec(

diff --git a/src/input.rs b/src/input.rs
@@ -16,8 +16,9 @@ use std::u32;
 
 use syntax;
 
-use utf8::{decode_utf8, decode_last_utf8};
 use literals::LiteralSearcher;
+use prog::InstEmptyLook;
+use utf8::{decode_utf8, decode_last_utf8};
 
 /// Represents a location in the input.
 #[derive(Clone, Copy, Debug)]
@@ -83,6 +84,10 @@ pub trait Input {
     /// If no such character could be decoded, then `Char` is absent.
     fn previous_char(&self, at: InputAt) -> Char;
 
+    /// Return true if the given empty width instruction matches at the
+    /// input position given.
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
+
     /// Scan the input for a matching prefix.
     fn prefix_at(
         &self,
@@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {
 
     fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        (**self).is_empty_match(at, empty)
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
     }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use prog::EmptyLook::*;
+        match empty.look {
+            StartLine => {
+                let c = self.previous_char(at);
+                c.is_none() || c == '\n'
+            }
+            EndLine => {
+                let c = self.next_char(at);
+                c.is_none() || c == '\n'
+            }
+            StartText => self.previous_char(at).is_none(),
+            EndText => self.next_char(at).is_none(),
+            WordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() != c2.is_word_char()
+            }
+            NotWordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() == c2.is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_byte() != c2.is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_byte() == c2.is_word_byte()
+            }
+        }
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -178,20 +219,26 @@ impl<'t> Input for CharInput<'t> {
 /// easy access to necessary Unicode decoding (used for word boundary look
 /// ahead/look behind).
 #[derive(Clone, Copy, Debug)]
-pub struct ByteInput<'t>(&'t [u8]);
+pub struct ByteInput<'t> {
+    text: &'t [u8],
+    only_utf8: bool,
+}
 
 impl<'t> ByteInput<'t> {
     /// Return a new byte-based input reader for the given string.
-    pub fn new(s: &'t [u8]) -> ByteInput<'t> {
-        ByteInput(s)
+    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
+        ByteInput {
+            text: text,
+            only_utf8: only_utf8,
+        }
     }
 }
 
 impl<'t> ops::Deref for ByteInput<'t> {
     type Target = [u8];
 
     fn deref(&self) -> &[u8] {
-        self.0
+        self.text
     }
 }
 
@@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
     }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use prog::EmptyLook::*;
+        match empty.look {
+            StartLine => {
+                let c = self.previous_char(at);
+                c.is_none() || c == '\n'
+            }
+            EndLine => {
+                let c = self.next_char(at);
+                c.is_none() || c == '\n'
+            }
+            StartText => self.previous_char(at).is_none(),
+            EndText => self.next_char(at).is_none(),
+            WordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() != c2.is_word_char()
+            }
+            NotWordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() == c2.is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match word
+                    // boundaries at invalid UTF-8.
+                    if c1.is_none() && !at.is_start() {
+                        return false;
+                    }
+                    if c2.is_none() && !at.is_end() {
+                        return false;
+                    }
+                }
+                c1.is_word_byte() != c2.is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match word
+                    // boundaries at invalid UTF-8.
+                    if c1.is_none() && !at.is_start() {
+                        return false;
+                    }
+                    if c2.is_none() && !at.is_end() {
+                        return false;
+                    }
+                }
+                c1.is_word_byte() == c2.is_word_byte()
+            }
+        }
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
     }
 
     fn len(&self) -> usize {
-        self.0.len()
+        self.text.len()
     }
 
     fn as_bytes(&self) -> &[u8] {
-        self.0
+        &self.text
     }
 }
 
@@ -276,7 +375,7 @@ impl Char {
     pub fn is_word_byte(self) -> bool {
         match char::from_u32(self.0) {
             None => false,
-            Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
+            Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
             Some(_) => false,
         }
     }

diff --git a/src/pikevm.rs b/src/pikevm.rs
@@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
             nlist.set.insert(ip);
             match self.prog[ip] {
                 EmptyLook(ref inst) => {
-                    let prev = self.input.previous_char(at);
-                    let next = self.input.next_char(at);
-                    if inst.matches(prev, next) {
+                    if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     }
                 }

diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs
@@ -2,6 +2,7 @@
 // For Unicode word boundaries, the tests are precisely inverted.
 matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
 matiter!(ascii2, r"\Bx\B", "áxβ");
+matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
 
 // We can still get Unicode mode in byte regexes.
 matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");

diff --git a/tests/word_boundary_unicode.rs b/tests/word_boundary_unicode.rs
@@ -5,3 +5,4 @@ matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
 
 matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
 matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,4 @@ matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));

		matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
		matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
		matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));