
Commit 55a1fc9

Fixes #169.

This fixes an issue where the prefix literals could contain alternates
such that an Aho-Corasick match could be ambiguous. More specifically,
Aho-Corasick returns matches in the order in which they appear in the
text rather than in "leftmost first" order. This means that if one
alternate is a substring of another, then Aho-Corasick may report a
match for the substring first, even if the other alternate matches
further to the left. In essence, this causes a prefix scan to skip too
far ahead in the text.

We fix this by making the prefix literals unambiguous: the literals are
truncated so that none of them is a substring of another. The
implementation is somewhat expensive, but we constrain the number of
literals greatly, so the quadratic runtime shouldn't be noticeable.
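
To make the failure mode concrete, here is a minimal standalone sketch. It is
not code from this commit and does not use the aho-corasick crate; the helper
name and the assumed prefix set for z*azb (namely "z" and "azb", cf. the
unambiguous test below) are illustrative only. The helper reports whichever
literal match ends earliest in the haystack, which models the first match a
plain Aho-Corasick scan yields.

// A standalone sketch (not part of this commit): report whichever literal
// match *ends* earliest, mimicking a plain Aho-Corasick scan that yields
// matches in text order rather than leftmost-first.
fn first_text_order_match(
    lits: &[&[u8]],
    haystack: &[u8],
) -> Option<(usize, usize)> {
    let mut best: Option<(usize, usize)> = None;
    for lit in lits {
        // Find the first occurrence of this literal, if any.
        for (start, window) in haystack.windows(lit.len()).enumerate() {
            if window == *lit {
                let end = start + lit.len();
                // Keep whichever candidate ends first in the text.
                if best.map_or(true, |(_, e)| end < e) {
                    best = Some((start, end));
                }
                break;
            }
        }
    }
    best
}

fn main() {
    // Assumed prefix alternates for z*azb: "z" and "azb". Since "z" is a
    // substring of "azb", the scan reports "z" at offset 1 and skips past
    // the leftmost match at offset 0.
    let ambiguous = [&b"z"[..], &b"azb"[..]];
    assert_eq!(first_text_order_match(&ambiguous, b"azb"), Some((1, 2)));

    // After truncation the set is {"a", "z"}: no literal is a substring of
    // another, so the first match in text order is also the leftmost one.
    let unambiguous = [&b"a"[..], &b"z"[..]];
    assert_eq!(first_text_order_match(&unambiguous, b"azb"), Some((0, 1)));
}

The actual fix is the unambiguous method on AlternateLiterals in the diff
below, which truncates the literals until no literal is a substring of
another.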
BurntSushi committed Feb 19, 2016
1 parent 277926f commit 55a1fc9
Showing 3 changed files with 187 additions and 89 deletions.
271 changes: 183 additions & 88 deletions src/literals.rs
@@ -19,6 +19,7 @@ use memchr::{memchr, memchr2, memchr3};
use char_utf8::encode_utf8;
use inst::{Insts, Inst, InstBytes, InstRanges};

#[derive(Clone, Eq, PartialEq)]
pub struct AlternateLiterals {
at_match: bool,
literals: Vec<Vec<u8>>,
@@ -29,10 +30,11 @@ impl AlternateLiterals {
if self.literals.is_empty() {
Literals::empty()
} else {
let at_match = self.at_match;
let alts = self.unambiguous();
let at_match = alts.at_match;
Literals {
at_match: at_match,
matcher: LiteralMatcher::new(self),
matcher: LiteralMatcher::new(alts),
}
}
}
@@ -53,6 +55,13 @@ impl AlternateLiterals {
self.literals.len() >= 1 && self.literals.iter().all(|s| s.len() == 1)
}

fn all_same_length(&self) -> bool {
if self.literals.is_empty() {
return true;
}
self.literals.iter().all(|s| s.len() == self.literals[0].len())
}

fn is_one_literal(&self) -> bool {
self.literals.len() == 1
}
@@ -123,6 +132,71 @@ impl AlternateLiterals {
}
}
}

/// Returns a new set of alternate literals that are guaranteed to be
/// unambiguous.
///
/// Stated differently, the set of literals returned is guaranteed to never
/// result in any overlapping matches.
///
/// Duplicate literals are dropped. Literals that are otherwise distinct
/// may be truncated.
fn unambiguous(&self) -> AlternateLiterals {
fn position(needle: &[u8], mut haystack: &[u8]) -> Option<usize> {
let mut i = 0;
while haystack.len() >= needle.len() {
if needle == &haystack[..needle.len()] {
return Some(i);
}
i += 1;
haystack = &haystack[1..];
}
None
}

// This function is a bit gratuitous and allocation heavy, but in
// general, we limit the number of alternates to be pretty small.

if self.all_same_length() {
return self.clone();
}
let mut new = AlternateLiterals {
at_match: self.at_match,
literals: Vec::with_capacity(self.literals.len()),
};
'OUTER:
for lit1 in &self.literals {
if new.literals.is_empty() {
new.literals.push(lit1.clone());
continue;
}
let mut candidate = lit1.clone();
for lit2 in &mut new.literals {
if &candidate == lit2 {
// If the literal is already in the set, then we can
// just drop it.
continue 'OUTER;
}
if lit1.len() <= lit2.len() {
if let Some(i) = position(&candidate, lit2) {
new.at_match = false;
lit2.truncate(i);
}
} else {
if let Some(i) = position(lit2, &candidate) {
new.at_match = false;
candidate.truncate(i);
}
}
}
new.literals.push(candidate);
}
new.literals.retain(|lit| !lit.is_empty());
// This is OK only because the alternates are unambiguous.
new.literals.sort();
new.literals.dedup();
new
}
}

pub struct BuildPrefixes<'a> {
@@ -324,20 +398,59 @@ enum LiteralMatcher {
/// A single byte prefix.
Byte(u8),
/// A set of two or more single byte prefixes.
/// This could be reduced to a bitset, which would use only 32 bytes,
/// but I don't think we care.
Bytes {
chars: Vec<u8>,
sparse: Vec<bool>,
},
/// A single substring. (Likely using Boyer-Moore with memchr.)
Single(SingleSearch),
/// A full Aho-Corasick DFA. A "full" DFA in this case means that all of
/// the failure transitions have been expanded and the entire DFA is
/// represented by a memory inefficient sparse matrix. This makes matching
/// extremely fast. We only use this "full" DFA when the number of bytes
/// in our literals does not exceed 250. This generally leads to a DFA that
/// consumes 250KB of memory.
FullAutomaton(FullAcAutomaton<Vec<u8>>),
/// An Aho-Corasick automaton.
AC(FullAcAutomaton<Vec<u8>>),
}

impl LiteralMatcher {
/// Create a new prefix matching machine.
fn new(mut alts: AlternateLiterals) -> Self {
use self::LiteralMatcher::*;

if alts.is_empty() {
Empty
} else if alts.distinct_single_bytes() >= 26 {
// Why do we do this? Well, it's a heuristic to prevent thrashing.
// Basically, if our literal matcher has lots of literals that are
// a single byte, then we lose a lot of the benefits of fast
// literal searching. In particular, single bytes have a high
// probability of matching. In a regex that rarely matches, we end
// up ping-ponging between the literal matcher and the regex engine
// for every byte of input. That's bad juju.
//
// Note that we only count distinct starting bytes from literals of
// length 1. For literals longer than that, we assume they have
// a lower probability of matching.
//
// This particular heuristic would be triggered on, e.g.,
// `[a-z].+`. The prefix here is a single byte that is very likely
// to match on any given byte in the input, so it's quicker just
// to let the matching engine process it.
//
// TODO(burntsushi): Consider lowering the threshold!
Empty
} else if alts.is_single_byte() {
Byte(alts.literals[0][0])
} else if alts.all_single_bytes() {
let mut set = vec![false; 256];
let mut bytes = vec![];
for lit in alts.literals {
bytes.push(lit[0]);
set[lit[0] as usize] = true;
}
Bytes { chars: bytes, sparse: set }
} else if alts.is_one_literal() {
Single(SingleSearch::new(alts.literals.pop().unwrap()))
} else {
AC(AcAutomaton::new(alts.literals).into_full())
}
}
}

impl Literals {
@@ -377,7 +490,7 @@ impl Literals {
Single(ref searcher) => {
searcher.find(haystack).map(|i| (i, i + searcher.pat.len()))
}
FullAutomaton(ref aut) => {
AC(ref aut) => {
aut.find(haystack).next().map(|m| (m.start, m.end))
}
}
@@ -396,34 +509,7 @@ impl Literals {
Byte(_) => 1,
Bytes { ref chars, .. } => chars.len(),
Single(_) => 1,
FullAutomaton(ref aut) => aut.len(),
}
}

/// Returns true iff the prefix match preserves priority.
///
/// For example, given the alternation `ab|a` and the target string `ab`,
/// does the prefix machine guarantee that `ab` will match? (A full
/// Aho-Corasick automaton does not!)
pub fn preserves_priority(&self) -> bool {
use self::LiteralMatcher::*;
match self.matcher {
Empty => true,
Byte(_) => true,
Bytes{..} => true,
Single(_) => true,
FullAutomaton(ref aut) => {
// Okay, so the automaton can respect priority in one
// particular case: when every pattern is of the same length.
// The trick is that the automaton will report the leftmost
// match, which in this case, corresponds to the correct
// match for the regex engine. If any other alternate matches
// at the same position, then they must be exactly equivalent.

// Guaranteed at least one prefix by construction, so use
// that for the length.
aut.patterns().iter().all(|p| p.len() == aut.pattern(0).len())
}
AC(ref aut) => aut.len(),
}
}

@@ -440,7 +526,7 @@ impl Literals {
(single.pat.len() * mem::size_of::<u8>())
+ (single.shift.len() * mem::size_of::<usize>())
}
FullAutomaton(ref aut) => aut.heap_bytes(),
AC(ref aut) => aut.heap_bytes(),
}
}

@@ -465,52 +551,7 @@ impl Literals {
chars.iter().map(|&byte| vec![byte]).collect()
}
Single(ref searcher) => vec![searcher.pat.clone()],
FullAutomaton(ref aut) => aut.patterns().iter().cloned().collect(),
}
}
}

impl LiteralMatcher {
/// Create a new prefix matching machine.
fn new(mut alts: AlternateLiterals) -> Self {
use self::LiteralMatcher::*;

if alts.is_empty() {
Empty
} else if alts.distinct_single_bytes() >= 26 {
// Why do we do this? Well, it's a heuristic to prevent thrashing.
// Basically, if our literal matcher has lots of literals that are
// a single byte, then we lose a lot of the benefits of fast
// literal searching. In particular, single bytes have a high
// probability of matching. In a regex that rarely matches, we end
// up ping-ponging between the literal matcher and the regex engine
// for every byte of input. That's bad juju.
//
// Note that we only count distinct starting bytes from literals of
// length 1. For literals longer than that, we assume they have
// a lower probability of matching.
//
// This particular heuristic would be triggered on, e.g.,
// `[a-z].+`. The prefix here is a single byte that is very likely
// to match on any given byte in the input, so it's quicker just
// to let the matching engine process it.
//
// TODO(burntsushi): Consider lowering the threshold!
Empty
} else if alts.is_single_byte() {
Byte(alts.literals[0][0])
} else if alts.all_single_bytes() {
let mut set = vec![false; 256];
let mut bytes = vec![];
for lit in alts.literals {
bytes.push(lit[0]);
set[lit[0] as usize] = true;
}
Bytes { chars: bytes, sparse: set }
} else if alts.is_one_literal() {
Single(SingleSearch::new(alts.literals.pop().unwrap()))
} else {
FullAutomaton(AcAutomaton::new(alts.literals).into_full())
AC(ref aut) => aut.patterns().iter().cloned().collect(),
}
}
}
Expand Down Expand Up @@ -586,6 +627,19 @@ fn find_singles(
None
}

impl fmt::Debug for AlternateLiterals {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut strings = vec![];
for lit in &self.literals {
strings.push(String::from_utf8_lossy(lit).into_owned());
}
f.debug_struct("AlternateLiterals")
.field("at_match", &self.at_match)
.field("literals", &strings)
.finish()
}
}

impl fmt::Debug for Literals {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::LiteralMatcher::*;
@@ -601,14 +655,15 @@ impl fmt::Debug for Literals {
write!(f, "alternate single bytes: {}", chars.join(", "))
}
Single(ref searcher) => write!(f, "{:?}", searcher),
FullAutomaton(ref aut) => write!(f, "{:?}", aut),
AC(ref aut) => write!(f, "{:?}", aut),
}
}
}

#[cfg(test)]
mod tests {
use program::ProgramBuilder;
use super::AlternateLiterals;

macro_rules! prog {
($re:expr) => { ProgramBuilder::new($re).compile().unwrap() }
@@ -702,6 +757,46 @@ mod tests {
assert_eq!(prefixes_complete!("☃"), vec!["☃"]);
}

macro_rules! alts {
($($s:expr),*) => {{
AlternateLiterals {
at_match: false,
literals: vec![$($s.as_bytes().to_owned()),*],
}
}}
}

#[test]
fn unambiguous() {
let given = alts!("z", "azb");
let expected = alts!("a", "z");
assert_eq!(expected, given.unambiguous());

let given = alts!("zaaaaaaaaaa", "aa");
let expected = alts!("aa", "z");
assert_eq!(expected, given.unambiguous());

let given = alts!("Sherlock", "Watson");
let expected = alts!("Sherlock", "Watson");
assert_eq!(expected, given.unambiguous());

let given = alts!("abc", "bc");
let expected = alts!("a", "bc");
assert_eq!(expected, given.unambiguous());

let given = alts!("bc", "abc");
let expected = alts!("a", "bc");
assert_eq!(expected, given.unambiguous());

let given = alts!("a", "aa");
let expected = alts!("a");
assert_eq!(expected, given.unambiguous());

let given = alts!("ab", "a");
let expected = alts!("a");
assert_eq!(expected, given.unambiguous());
}

// That this test fails suggests that the literal finder needs to be
// completely rewritten. Ug. It's not that it is wrong currently, but
// it's not as good at finding literals as it should be.
2 changes: 1 addition & 1 deletion src/program.rs
@@ -134,7 +134,7 @@ impl Program {
/// If this returns true, then it is possible to avoid running any of the
/// NFA or DFA based matching engines entirely.
pub fn is_prefix_match(&self) -> bool {
self.prefixes.at_match() && self.prefixes.preserves_priority()
self.prefixes.at_match()
}

/// Returns true if the underlying program is reversed.
3 changes: 3 additions & 0 deletions tests/tests.rs
@@ -500,6 +500,9 @@ mat!(regression_alt_in_alt2, r"^(.*?)(\n|\r\n?|$)", "ab\rcd", Some((0, 3)));

mat!(one_unicode, r"☃", "☃", Some((0, 3)));

// Regression test for https://github.com/rust-lang-nursery/regex/issues/169
mat!(regression_leftmost_first_prefix, r"z*azb", "azb", Some((0, 3)));

// A whole mess of tests from Glenn Fowler's regex test suite.
// Generated by the 'src/etc/regex-match-tests' program.
#[path = "matches.rs"]
