From cd99d4df64e89c3b9a95f66441045400152527a8 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 9 Jul 2016 22:20:59 -0400
Subject: [PATCH] Match (?-u:\B) correctly in the NFA engines when valid UTF-8
 is required.

This commit fixes a bug where matching (?-u:\B) (that is, "not an ASCII
word boundary") in the NFA engines could produce match positions at
invalid UTF-8 sequence boundaries.

The specific problem is that determining whether (?-u:\B) matches or not
relies on knowing whether we must report matches only at UTF-8
boundaries, and this wasn't actually being taken into account. (Instead,
we prefer to enforce this invariant in the compiler, so that the
matching engines mostly don't have to care about it.) But of course, the
zero-width assertions are kind of a special case all around, so we need
to handle ASCII word boundaries differently depending on whether we
require valid UTF-8.

This bug was noticed because the DFA actually handles this correctly (by
encoding ASCII word boundaries into the state machine itself, which in
turn guarantees the valid UTF-8 invariant) while the NFAs don't, leading
to an inconsistency.

Fix #241.
---
 regex-syntax/src/parser.rs     |  27 ++++----
 src/backtrack.rs               |   4 +-
 src/dfa.rs                     |   2 +-
 src/exec.rs                    |   4 +-
 src/input.rs                   | 115 ++++++++++++++++++++++++++++++---
 src/pikevm.rs                  |   4 +-
 tests/word_boundary_ascii.rs   |   1 +
 tests/word_boundary_unicode.rs |   1 +
 8 files changed, 128 insertions(+), 30 deletions(-)

diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs
index 0ba0558be4..1d10572f6d 100644
--- a/regex-syntax/src/parser.rs
+++ b/regex-syntax/src/parser.rs
@@ -581,6 +581,9 @@ impl Parser {
                 _ => unreachable!(),
             },
             start => {
+                if !self.flags.unicode {
+                    let _ = try!(self.codepoint_to_one_byte(start));
+                }
                 self.bump();
                 try!(self.parse_class_range(&mut class, start));
             }
@@ -610,11 +613,6 @@ impl Parser {
     fn parse_class_range(&mut self, class: &mut CharClass, start: char)
                          -> Result<()> {
         if !self.bump_if('-') {
-            // Make sure we haven't parsed Unicode literals when we shouldn't have.
-            if !self.flags.unicode {
-                let _ = try!(self.codepoint_to_one_byte(start));
-            }
-            // Not a range, so just push a singleton range.
             class.ranges.push(ClassRange::one(start));
             return Ok(());
         }
@@ -647,7 +645,13 @@ impl Parser {
                 // Because `parse_escape` can never return `LeftParen`.
                 _ => unreachable!(),
             },
-            _ => self.bump(),
+            _ => {
+                let c = self.bump();
+                if !self.flags.unicode {
+                    let _ = try!(self.codepoint_to_one_byte(c));
+                }
+                c
+            }
         };
         if end < start {
             // e.g., [z-a]
@@ -656,11 +660,6 @@ impl Parser {
                 end: end,
             }));
         }
-        // Make sure we haven't parsed Unicode literals when we shouldn't have.
-        if !self.flags.unicode {
-            let _ = try!(self.codepoint_to_one_byte(start));
-            let _ = try!(self.codepoint_to_one_byte(end));
-        }
         class.ranges.push(ClassRange::new(start, end));
         Ok(())
     }
@@ -2015,6 +2014,8 @@ mod tests {
         assert_eq!(pb(r"(?-u)[a]"),
                    Expr::ClassBytes(bclass(&[(b'a', b'a')])));
         assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
+        assert_eq!(pb(r"(?-u)[\xFF]"),
+                   Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
         assert_eq!(pb("(?-u)[\n]"),
                    Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
         assert_eq!(pb(r"(?-u)[\n]"),
@@ -2418,8 +2419,8 @@ mod tests {
     #[test]
     fn unicode_class_literal_not_allowed() {
         let flags = Flags { allow_bytes: true, .. Flags::default() };
-        test_err!(r"(?-u)[☃]", 7, ErrorKind::UnicodeNotAllowed, flags);
-        test_err!(r"(?-u)[☃-☃]", 9, ErrorKind::UnicodeNotAllowed, flags);
+        test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
+        test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
     }
 
     #[test]
diff --git a/src/backtrack.rs b/src/backtrack.rs
index b0e0e02035..3c06254c6b 100644
--- a/src/backtrack.rs
+++ b/src/backtrack.rs
@@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
                     ip = inst.goto1;
                 }
                 EmptyLook(ref inst) => {
-                    let prev = self.input.previous_char(at);
-                    let next = self.input.next_char(at);
-                    if inst.matches(prev, next) {
+                    if self.input.is_empty_match(at, inst) {
                         ip = inst.goto;
                     } else {
                         return false;
diff --git a/src/dfa.rs b/src/dfa.rs
index d216f2cbae..0216f25620 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -1847,7 +1847,7 @@ mod tests {
             expected == got && state.flags() == StateFlags(flags)
         }
         QuickCheck::new()
-            .gen(StdGen::new(self::rand::thread_rng(), 70_000))
+            .gen(StdGen::new(self::rand::thread_rng(), 10_000))
             .quickcheck(p as fn(Vec, u8) -> bool);
     }
 
diff --git a/src/exec.rs b/src/exec.rs
index 755342497b..65a3935a72 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -921,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
                 matches,
                 slots,
                 quit_after_match,
-                ByteInput::new(text),
+                ByteInput::new(text, self.ro.nfa.only_utf8),
                 start)
         } else {
             pikevm::Fsm::exec(
@@ -949,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
                 &self.cache,
                 matches,
                 slots,
-                ByteInput::new(text),
+                ByteInput::new(text, self.ro.nfa.only_utf8),
                 start)
         } else {
             backtrack::Bounded::exec(
diff --git a/src/input.rs b/src/input.rs
index f96a6be075..a8547d5902 100644
--- a/src/input.rs
+++ b/src/input.rs
@@ -16,8 +16,9 @@ use std::u32;
 
 use syntax;
 
-use utf8::{decode_utf8, decode_last_utf8};
 use literals::LiteralSearcher;
+use prog::InstEmptyLook;
+use utf8::{decode_utf8, decode_last_utf8};
 
 /// Represents a location in the input.
 #[derive(Clone, Copy, Debug)]
@@ -83,6 +84,10 @@ pub trait Input {
     /// If no such character could be decoded, then `Char` is absent.
     fn previous_char(&self, at: InputAt) -> Char;
 
+    /// Return true if the given empty width instruction matches at the
+    /// input position given.
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
+
     /// Scan the input for a matching prefix.
     fn prefix_at(
         &self,
@@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {
     fn previous_char(&self, at: InputAt) -> Char {
         (**self).previous_char(at)
     }
+
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        (**self).is_empty_match(at, empty)
+    }
 
     fn prefix_at(
         &self,
@@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
     }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use prog::EmptyLook::*;
+        match empty.look {
+            StartLine => {
+                let c = self.previous_char(at);
+                c.is_none() || c == '\n'
+            }
+            EndLine => {
+                let c = self.next_char(at);
+                c.is_none() || c == '\n'
+            }
+            StartText => self.previous_char(at).is_none(),
+            EndText => self.next_char(at).is_none(),
+            WordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() != c2.is_word_char()
+            }
+            NotWordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() == c2.is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_byte() != c2.is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_byte() == c2.is_word_byte()
+            }
+        }
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -178,12 +219,18 @@ impl<'t> Input for CharInput<'t> {
 /// easy access to necessary Unicode decoding (used for word boundary look
 /// ahead/look behind).
 #[derive(Clone, Copy, Debug)]
-pub struct ByteInput<'t>(&'t [u8]);
+pub struct ByteInput<'t> {
+    text: &'t [u8],
+    only_utf8: bool,
+}
 
 impl<'t> ByteInput<'t> {
     /// Return a new byte-based input reader for the given string.
-    pub fn new(s: &'t [u8]) -> ByteInput<'t> {
-        ByteInput(s)
+    pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
+        ByteInput {
+            text: text,
+            only_utf8: only_utf8,
+        }
     }
 }
 
@@ -191,7 +238,7 @@ impl<'t> ops::Deref for ByteInput<'t> {
     type Target = [u8];
 
     fn deref(&self) -> &[u8] {
-        self.0
+        self.text
     }
 }
 
@@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
     }
 
+    fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
+        use prog::EmptyLook::*;
+        match empty.look {
+            StartLine => {
+                let c = self.previous_char(at);
+                c.is_none() || c == '\n'
+            }
+            EndLine => {
+                let c = self.next_char(at);
+                c.is_none() || c == '\n'
+            }
+            StartText => self.previous_char(at).is_none(),
+            EndText => self.next_char(at).is_none(),
+            WordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() != c2.is_word_char()
+            }
+            NotWordBoundary => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                c1.is_word_char() == c2.is_word_char()
+            }
+            WordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match word
+                    // boundaries at invalid UTF-8.
+                    if c1.is_none() && !at.is_start() {
+                        return false;
+                    }
+                    if c2.is_none() && !at.is_end() {
+                        return false;
+                    }
+                }
+                c1.is_word_byte() != c2.is_word_byte()
+            }
+            NotWordBoundaryAscii => {
+                let (c1, c2) = (self.previous_char(at), self.next_char(at));
+                if self.only_utf8 {
+                    // If we must match UTF-8, then we can't match word
+                    // boundaries at invalid UTF-8.
+                    if c1.is_none() && !at.is_start() {
+                        return false;
+                    }
+                    if c2.is_none() && !at.is_end() {
+                        return false;
+                    }
+                }
+                c1.is_word_byte() == c2.is_word_byte()
+            }
+        }
+    }
+
     fn prefix_at(
         &self,
         prefixes: &LiteralSearcher,
@@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
     }
 
     fn len(&self) -> usize {
-        self.0.len()
+        self.text.len()
     }
 
     fn as_bytes(&self) -> &[u8] {
-        self.0
+        &self.text
     }
 }
 
@@ -276,7 +375,7 @@ impl Char {
     pub fn is_word_byte(self) -> bool {
         match char::from_u32(self.0) {
             None => false,
-            Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
+            Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
             Some(_) => false,
         }
     }
diff --git a/src/pikevm.rs b/src/pikevm.rs
index a18011bab0..b96f0e7588 100644
--- a/src/pikevm.rs
+++ b/src/pikevm.rs
@@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
         nlist.set.insert(ip);
         match self.prog[ip] {
             EmptyLook(ref inst) => {
-                let prev = self.input.previous_char(at);
-                let next = self.input.next_char(at);
-                if inst.matches(prev, next) {
+                if self.input.is_empty_match(at, inst) {
                     ip = inst.goto;
                 }
             }
diff --git a/tests/word_boundary_ascii.rs b/tests/word_boundary_ascii.rs
index c127e8aa28..9beb7c0cb1 100644
--- a/tests/word_boundary_ascii.rs
+++ b/tests/word_boundary_ascii.rs
@@ -2,6 +2,7 @@
 // For Unicode word boundaries, the tests are precisely inverted.
 matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
 matiter!(ascii2, r"\Bx\B", "áxβ");
+matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));
 
 // We can still get Unicode mode in byte regexes.
 matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
diff --git a/tests/word_boundary_unicode.rs b/tests/word_boundary_unicode.rs
index 42bcba51b4..43612a91ac 100644
--- a/tests/word_boundary_unicode.rs
+++ b/tests/word_boundary_unicode.rs
@@ -5,3 +5,4 @@ matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));
 
 matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
 matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
+matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));
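
The sketch below is not part of the patch; it only illustrates, from the
caller's side, the behavior that the new ascii3 tests pin down. It is a
minimal example written against the regex 0.1-era API that this patch
targets, where find_iter is assumed to yield (start, end) byte offsets;
later releases return Match values instead and may treat (?-u:\B) in
&str regexes differently. The haystack and the expected offsets are
taken directly from the ascii3 tests added above.

extern crate regex;

use regex::bytes;
use regex::Regex;

fn main() {
    // "0" followed by U+7EF5E, which encodes as four UTF-8 bytes, so the
    // UTF-8 boundaries are offsets 0, 1 and 5; offsets 2, 3 and 4 fall
    // inside the codepoint.
    let haystack = "0\u{7EF5E}";

    // A str-based regex must report matches only at UTF-8 boundaries, so
    // (?-u:\B) matches only at the end of the haystack, mirroring the
    // ascii3 test in tests/word_boundary_unicode.rs.
    let re = Regex::new(r"(?-u:\B)").unwrap();
    let matches: Vec<(usize, usize)> = re.find_iter(haystack).collect();
    assert_eq!(matches, vec![(5, 5)]);

    // A byte-based regex has no such requirement, so it also matches
    // inside the multi-byte codepoint, mirroring the ascii3 test in
    // tests/word_boundary_ascii.rs. The (?-u) is written explicitly so
    // the example does not depend on the builder's default flags.
    let re = bytes::Regex::new(r"(?-u:\B)").unwrap();
    let matches: Vec<(usize, usize)> =
        re.find_iter(haystack.as_bytes()).collect();
    assert_eq!(matches, vec![(2, 2), (3, 3), (4, 4), (5, 5)]);
}

The contrast is exactly the invariant the commit message describes: the
str-backed search rejects the zero-width positions that would split the
four-byte codepoint, while the byte-backed search is free to report them.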