Skip to content

Commit

Permalink
Match (?-u:\B) correctly in the NFA engines when valid UTF-8 is requi…
Browse files Browse the repository at this point in the history
…red.

This commit fixes a bug where matching (?-u:\B) (that is, "not an ASCII
word boundary") in the NFA engines could produce match positions at invalid
UTF-8 sequence boundaries. The specific problem is that determining whether
(?-u:\B) matches or not relies on knowing whether we must report matches
only at UTF-8 boundaries, and this wasn't actually being taken into
account. (Instead, we prefer to enforce this invariant in the compiler, so
that the matching engines mostly don't have to care about it.) But of
course, the zero-width assertions are kind of a special case all around,
so we need to handle ASCII word boundaries differently depending on
whether we require valid UTF-8.

This bug was noticed because the DFA actually handles this correctly (by
encoding ASCII word boundaries into the state machine itself, which in turn
guarantees the valid UTF-8 invariant) while the NFAs don't, leading to an
inconsistency.

Fix #241.
  • Loading branch information
BurntSushi committed Jul 10, 2016
1 parent 81297f0 commit cd99d4d
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 30 deletions.
27 changes: 14 additions & 13 deletions regex-syntax/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,6 +581,9 @@ impl Parser {
_ => unreachable!(),
},
start => {
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(start));
}
self.bump();
try!(self.parse_class_range(&mut class, start));
}
Expand Down Expand Up @@ -610,11 +613,6 @@ impl Parser {
fn parse_class_range(&mut self, class: &mut CharClass, start: char)
-> Result<()> {
if !self.bump_if('-') {
// Make sure we haven't parsed Unicode literals when we shouldn't have.
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(start));
}

// Not a range, so just push a singleton range.
class.ranges.push(ClassRange::one(start));
return Ok(());
Expand Down Expand Up @@ -647,7 +645,13 @@ impl Parser {
// Because `parse_escape` can never return `LeftParen`.
_ => unreachable!(),
},
_ => self.bump(),
_ => {
let c = self.bump();
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(c));
}
c
}
};
if end < start {
// e.g., [z-a]
Expand All @@ -656,11 +660,6 @@ impl Parser {
end: end,
}));
}
// Make sure we haven't parsed Unicode literals when we shouldn't have.
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(start));
let _ = try!(self.codepoint_to_one_byte(end));
}
class.ranges.push(ClassRange::new(start, end));
Ok(())
}
Expand Down Expand Up @@ -2015,6 +2014,8 @@ mod tests {

assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
assert_eq!(pb(r"(?-u)[\xFF]"),
Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
assert_eq!(pb("(?-u)[\n]"),
Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
assert_eq!(pb(r"(?-u)[\n]"),
Expand Down Expand Up @@ -2418,8 +2419,8 @@ mod tests {
#[test]
fn unicode_class_literal_not_allowed() {
let flags = Flags { allow_bytes: true, .. Flags::default() };
test_err!(r"(?-u)[☃]", 7, ErrorKind::UnicodeNotAllowed, flags);
test_err!(r"(?-u)[☃-☃]", 9, ErrorKind::UnicodeNotAllowed, flags);
test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
}

#[test]
Expand Down
4 changes: 1 addition & 3 deletions src/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
ip = inst.goto1;
}
EmptyLook(ref inst) => {
let prev = self.input.previous_char(at);
let next = self.input.next_char(at);
if inst.matches(prev, next) {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
} else {
return false;
Expand Down
2 changes: 1 addition & 1 deletion src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1847,7 +1847,7 @@ mod tests {
expected == got && state.flags() == StateFlags(flags)
}
QuickCheck::new()
.gen(StdGen::new(self::rand::thread_rng(), 70_000))
.gen(StdGen::new(self::rand::thread_rng(), 10_000))
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
}

Expand Down
4 changes: 2 additions & 2 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -921,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
matches,
slots,
quit_after_match,
ByteInput::new(text),
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
} else {
pikevm::Fsm::exec(
Expand Down Expand Up @@ -949,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
&self.cache,
matches,
slots,
ByteInput::new(text),
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
} else {
backtrack::Bounded::exec(
Expand Down
115 changes: 107 additions & 8 deletions src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ use std::u32;

use syntax;

use utf8::{decode_utf8, decode_last_utf8};
use literals::LiteralSearcher;
use prog::InstEmptyLook;
use utf8::{decode_utf8, decode_last_utf8};

/// Represents a location in the input.
#[derive(Clone, Copy, Debug)]
Expand Down Expand Up @@ -83,6 +84,10 @@ pub trait Input {
/// If no such character could be decoded, then `Char` is absent.
fn previous_char(&self, at: InputAt) -> Char;

/// Return true if the given empty width instruction matches at the
/// input position given.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;

/// Scan the input for a matching prefix.
fn prefix_at(
&self,
Expand All @@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {

fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }

fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
(**self).is_empty_match(at, empty)
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand Down Expand Up @@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
}

fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
use prog::EmptyLook::*;
match empty.look {
StartLine => {
let c = self.previous_char(at);
c.is_none() || c == '\n'
}
EndLine => {
let c = self.next_char(at);
c.is_none() || c == '\n'
}
StartText => self.previous_char(at).is_none(),
EndText => self.next_char(at).is_none(),
WordBoundary => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_char() != c2.is_word_char()
}
NotWordBoundary => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_char() == c2.is_word_char()
}
WordBoundaryAscii => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_byte() != c2.is_word_byte()
}
NotWordBoundaryAscii => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_byte() == c2.is_word_byte()
}
}
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand All @@ -178,20 +219,26 @@ impl<'t> Input for CharInput<'t> {
/// easy access to necessary Unicode decoding (used for word boundary look
/// ahead/look behind).
#[derive(Clone, Copy, Debug)]
pub struct ByteInput<'t>(&'t [u8]);
pub struct ByteInput<'t> {
text: &'t [u8],
only_utf8: bool,
}

impl<'t> ByteInput<'t> {
/// Return a new byte-based input reader for the given string.
pub fn new(s: &'t [u8]) -> ByteInput<'t> {
ByteInput(s)
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
ByteInput {
text: text,
only_utf8: only_utf8,
}
}
}

impl<'t> ops::Deref for ByteInput<'t> {
type Target = [u8];

fn deref(&self) -> &[u8] {
self.0
self.text
}
}

Expand All @@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
}

fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
use prog::EmptyLook::*;
match empty.look {
StartLine => {
let c = self.previous_char(at);
c.is_none() || c == '\n'
}
EndLine => {
let c = self.next_char(at);
c.is_none() || c == '\n'
}
StartText => self.previous_char(at).is_none(),
EndText => self.next_char(at).is_none(),
WordBoundary => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_char() != c2.is_word_char()
}
NotWordBoundary => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
c1.is_word_char() == c2.is_word_char()
}
WordBoundaryAscii => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
if self.only_utf8 {
// If we must match UTF-8, then we can't match word
// boundaries at invalid UTF-8.
if c1.is_none() && !at.is_start() {
return false;
}
if c2.is_none() && !at.is_end() {
return false;
}
}
c1.is_word_byte() != c2.is_word_byte()
}
NotWordBoundaryAscii => {
let (c1, c2) = (self.previous_char(at), self.next_char(at));
if self.only_utf8 {
// If we must match UTF-8, then we can't match word
// boundaries at invalid UTF-8.
if c1.is_none() && !at.is_start() {
return false;
}
if c2.is_none() && !at.is_end() {
return false;
}
}
c1.is_word_byte() == c2.is_word_byte()
}
}
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand All @@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
}

fn len(&self) -> usize {
self.0.len()
self.text.len()
}

fn as_bytes(&self) -> &[u8] {
self.0
&self.text
}
}

Expand Down Expand Up @@ -276,7 +375,7 @@ impl Char {
pub fn is_word_byte(self) -> bool {
match char::from_u32(self.0) {
None => false,
Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
Some(_) => false,
}
}
Expand Down
4 changes: 1 addition & 3 deletions src/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
nlist.set.insert(ip);
match self.prog[ip] {
EmptyLook(ref inst) => {
let prev = self.input.previous_char(at);
let next = self.input.next_char(at);
if inst.matches(prev, next) {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
}
}
Expand Down
1 change: 1 addition & 0 deletions tests/word_boundary_ascii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// For Unicode word boundaries, the tests are precisely inverted.
matiter!(ascii1, r"\bx\b", "áxβ", (2, 3));
matiter!(ascii2, r"\Bx\B", "áxβ");
matiter!(ascii3, r"\B", "0\u{7EF5E}", (2, 2), (3, 3), (4, 4), (5, 5));

// We can still get Unicode mode in byte regexes.
matiter!(unicode1, r"(?u:\b)x(?u:\b)", "áxβ");
Expand Down
1 change: 1 addition & 0 deletions tests/word_boundary_unicode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ matiter!(unicode2, r"\Bx\B", "áxβ", (2, 3));

matiter!(ascii1, r"(?-u:\b)x(?-u:\b)", "áxβ", (2, 3));
matiter!(ascii2, r"(?-u:\B)x(?-u:\B)", "áxβ");
matiter!(ascii3, r"(?-u:\B)", "0\u{7EF5E}", (5, 5));

0 comments on commit cd99d4d

Please sign in to comment.