Skip to content

Commit

Permalink
Merge pull request #262 from rust-lang-nursery/fix-bugs
Browse files Browse the repository at this point in the history
fix several small bugs found from fuzzing
  • Loading branch information
BurntSushi authored Jul 10, 2016
2 parents cf04879 + 84a2bf5 commit 01c92c8
Show file tree
Hide file tree
Showing 14 changed files with 166 additions and 48 deletions.
6 changes: 6 additions & 0 deletions regex-syntax/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1336,6 +1336,9 @@ pub enum ErrorKind {
/// This is never returned if the parser is permitted to allow expressions
/// that match arbitrary bytes.
InvalidUtf8,
/// A character class was constructed such that it is empty.
/// e.g., `[^\d\D]`.
EmptyClass,
/// Hints that destructuring should not be exhaustive.
///
/// This enum may grow additional variants, so this makes sure clients
Expand Down Expand Up @@ -1398,6 +1401,7 @@ impl ErrorKind {
FlagNotAllowed(_) => "flag not allowed",
UnicodeNotAllowed => "Unicode features not allowed",
InvalidUtf8 => "matching arbitrary bytes is not allowed",
EmptyClass => "empty character class",
__Nonexhaustive => unreachable!(),
}
}
Expand Down Expand Up @@ -1507,6 +1511,8 @@ impl fmt::Display for ErrorKind {
(u) flag is not set."),
InvalidUtf8 =>
write!(f, "Matching arbitrary bytes is not allowed."),
EmptyClass =>
write!(f, "Empty character classes are not allowed."),
__Nonexhaustive => unreachable!(),
}
}
Expand Down
46 changes: 34 additions & 12 deletions regex-syntax/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -581,12 +581,18 @@ impl Parser {
_ => unreachable!(),
},
start => {
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(start));
}
self.bump();
try!(self.parse_class_range(&mut class, start));
}
}
}
class = self.class_transform(negated, class).canonicalize();
if class.is_empty() {
return Err(self.err(ErrorKind::EmptyClass));
}
Ok(Build::Expr(if self.flags.unicode {
Expr::Class(class)
} else {
Expand Down Expand Up @@ -639,7 +645,13 @@ impl Parser {
// Because `parse_escape` can never return `LeftParen`.
_ => unreachable!(),
},
_ => self.bump(),
_ => {
let c = self.bump();
if !self.flags.unicode {
let _ = try!(self.codepoint_to_one_byte(c));
}
c
}
};
if end < start {
// e.g., [z-a]
Expand Down Expand Up @@ -1277,7 +1289,7 @@ mod tests {
ErrorKind,
};
use unicode::regex::{PERLD, PERLS, PERLW};
use super::{LOWER, UPPER, Flags, Parser, ascii_class};
use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};

static YI: &'static [(char, char)] = &[
('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
Expand Down Expand Up @@ -2002,6 +2014,8 @@ mod tests {

assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
assert_eq!(pb(r"(?-u)[\xFF]"),
Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
assert_eq!(pb("(?-u)[\n]"),
Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
assert_eq!(pb(r"(?-u)[\n]"),
Expand Down Expand Up @@ -2127,10 +2141,10 @@ mod tests {

#[test]
fn class_multiple_class_negate_negate() {
let nperld = class(PERLD).negate();
let nperlw = class(PERLW).negate();
let nyi = class(YI).negate();
let cls = CharClass::empty().merge(nperld).merge(nyi);
assert_eq!(p(r"[^\D\P{Yi}]"), Expr::Class(cls.negate()));
let cls = CharClass::empty().merge(nperlw).merge(nyi);
assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
}

#[test]
Expand All @@ -2149,10 +2163,10 @@ mod tests {

#[test]
fn class_multiple_class_negate_negate_casei() {
let nperld = class(PERLD).negate();
let nperlw = class(PERLW).negate();
let nyi = class(YI).negate();
let class = CharClass::empty().merge(nperld).merge(nyi);
assert_eq!(p(r"(?i)[^\D\P{Yi}]"),
let class = CharClass::empty().merge(nperlw).merge(nyi);
assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
Expr::Class(class.case_fold().negate()));
}

Expand Down Expand Up @@ -2236,10 +2250,10 @@ mod tests {

#[test]
fn ascii_classes_negate_multiple() {
let (nlower, nupper) = (class(LOWER).negate(), class(UPPER).negate());
let cls = CharClass::empty().merge(nlower).merge(nupper);
assert_eq!(p("[[:^lower:][:^upper:]]"), Expr::Class(cls.clone()));
assert_eq!(p("[^[:^lower:][:^upper:]]"), Expr::Class(cls.negate()));
let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
let cls = CharClass::empty().merge(nlower).merge(nword);
assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
}

#[test]
Expand Down Expand Up @@ -2402,6 +2416,13 @@ mod tests {
test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags);
}

// With Unicode mode switched off via `(?-u)`, a non-ASCII literal inside a
// character class (on its own, or as both endpoints of a range) is expected
// to be rejected with `UnicodeNotAllowed`. The offset 6 passed to `test_err!`
// is where the first snowman sits in both patterns.
#[test]
fn unicode_class_literal_not_allowed() {
// `allow_bytes` is enabled so that `(?-u)` itself is accepted by the parser.
// NOTE(review): presumed from the flag name; confirm against `Flags`.
let flags = Flags { allow_bytes: true, .. Flags::default() };
test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
}

#[test]
fn unicode_hex_not_allowed() {
let flags = Flags { allow_bytes: true, .. Flags::default() };
Expand Down Expand Up @@ -2725,6 +2746,7 @@ mod tests {
// Classes that end up with nothing in them must be reported as errors.
fn error_class_empty_range() {
// Brackets with no items at all: the parser hits the end of the pattern
// while still expecting class contents.
test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
// Syntactically complete, but negating `\d\D` leaves no members, so the
// dedicated `EmptyClass` error is expected at the end of the class.
test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
}

#[test]
Expand Down
4 changes: 1 addition & 3 deletions src/backtrack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,9 +242,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
ip = inst.goto1;
}
EmptyLook(ref inst) => {
let prev = self.input.previous_char(at);
let next = self.input.next_char(at);
if inst.matches(prev, next) {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
} else {
return false;
Expand Down
1 change: 1 addition & 0 deletions src/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,7 @@ impl Compiler {
}

fn c_class(&mut self, ranges: &[ClassRange]) -> Result {
assert!(!ranges.is_empty());
if self.compiled.uses_bytes() {
CompileClass {
c: self,
Expand Down
2 changes: 1 addition & 1 deletion src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1847,7 +1847,7 @@ mod tests {
expected == got && state.flags() == StateFlags(flags)
}
QuickCheck::new()
.gen(StdGen::new(self::rand::thread_rng(), 70_000))
.gen(StdGen::new(self::rand::thread_rng(), 10_000))
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
}

Expand Down
10 changes: 7 additions & 3 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,11 @@ impl<'c> ExecNoSync<'c> {
lits.find_start(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AnchoredEnd => self.ro.suffixes.find_end(&text),
AnchoredEnd => {
let lits = &self.ro.suffixes;
lits.find_end(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
}
}

Expand Down Expand Up @@ -917,7 +921,7 @@ impl<'c> ExecNoSync<'c> {
matches,
slots,
quit_after_match,
ByteInput::new(text),
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
} else {
pikevm::Fsm::exec(
Expand Down Expand Up @@ -945,7 +949,7 @@ impl<'c> ExecNoSync<'c> {
&self.cache,
matches,
slots,
ByteInput::new(text),
ByteInput::new(text, self.ro.nfa.only_utf8),
start)
} else {
backtrack::Bounded::exec(
Expand Down
115 changes: 107 additions & 8 deletions src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ use std::u32;

use syntax;

use utf8::{decode_utf8, decode_last_utf8};
use literals::LiteralSearcher;
use prog::InstEmptyLook;
use utf8::{decode_utf8, decode_last_utf8};

/// Represents a location in the input.
#[derive(Clone, Copy, Debug)]
Expand Down Expand Up @@ -83,6 +84,10 @@ pub trait Input {
/// If no such character could be decoded, then `Char` is absent.
fn previous_char(&self, at: InputAt) -> Char;

/// Returns true if the given empty-width instruction matches at the
/// given input position.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;

/// Scan the input for a matching prefix.
fn prefix_at(
&self,
Expand All @@ -104,6 +109,10 @@ impl<'a, T: Input> Input for &'a T {

fn previous_char(&self, at: InputAt) -> Char { (**self).previous_char(at) }

// Forwards to the referenced input's own implementation, so a `&T` can be
// used wherever an `Input` is required.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
(**self).is_empty_match(at, empty)
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand Down Expand Up @@ -155,6 +164,38 @@ impl<'t> Input for CharInput<'t> {
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
}

/// Reports whether the empty-width assertion `empty` holds at position `at`.
///
/// Each arm decodes only the neighbouring character(s) it actually needs.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
    use prog::EmptyLook::*;
    match empty.look {
        // Line anchors: either nothing precedes/follows, or it is a line feed.
        StartLine => {
            let prev = self.previous_char(at);
            prev.is_none() || prev == '\n'
        }
        EndLine => {
            let next = self.next_char(at);
            next.is_none() || next == '\n'
        }
        // Text anchors: no character on the relevant side.
        StartText => self.previous_char(at).is_none(),
        EndText => self.next_char(at).is_none(),
        // Unicode-aware word boundaries compare "wordness" of both sides.
        WordBoundary => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            prev.is_word_char() != next.is_word_char()
        }
        NotWordBoundary => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            prev.is_word_char() == next.is_word_char()
        }
        // ASCII-only variants use the byte-oriented word test instead.
        WordBoundaryAscii => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            prev.is_word_byte() != next.is_word_byte()
        }
        NotWordBoundaryAscii => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            prev.is_word_byte() == next.is_word_byte()
        }
    }
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand All @@ -178,20 +219,26 @@ impl<'t> Input for CharInput<'t> {
/// easy access to necessary Unicode decoding (used for word boundary look
/// ahead/look behind).
#[derive(Clone, Copy, Debug)]
pub struct ByteInput<'t>(&'t [u8]);
pub struct ByteInput<'t> {
// The bytes this input reads from.
text: &'t [u8],
// When true, the ASCII word-boundary assertions refuse to match at a
// position whose neighbouring bytes cannot be decoded as UTF-8.
only_utf8: bool,
}

impl<'t> ByteInput<'t> {
/// Return a new byte-based input reader for the given string.
pub fn new(s: &'t [u8]) -> ByteInput<'t> {
ByteInput(s)
pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
ByteInput {
text: text,
only_utf8: only_utf8,
}
}
}

impl<'t> ops::Deref for ByteInput<'t> {
type Target = [u8];

fn deref(&self) -> &[u8] {
self.0
self.text
}
}

Expand All @@ -213,6 +260,58 @@ impl<'t> Input for ByteInput<'t> {
decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
}

/// Reports whether the empty-width assertion `empty` holds at position `at`.
///
/// Behaves like the character-based reader except for the ASCII word
/// boundary assertions, which additionally honour `only_utf8`.
fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
    use prog::EmptyLook::*;
    match empty.look {
        // Line anchors: either nothing precedes/follows, or it is a line feed.
        StartLine => {
            let prev = self.previous_char(at);
            prev.is_none() || prev == '\n'
        }
        EndLine => {
            let next = self.next_char(at);
            next.is_none() || next == '\n'
        }
        // Text anchors: no character on the relevant side.
        StartText => self.previous_char(at).is_none(),
        EndText => self.next_char(at).is_none(),
        // Unicode-aware word boundaries compare "wordness" of both sides.
        WordBoundary => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            prev.is_word_char() != next.is_word_char()
        }
        NotWordBoundary => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            prev.is_word_char() == next.is_word_char()
        }
        WordBoundaryAscii => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            // When only valid UTF-8 may be matched, a neighbour that failed
            // to decode (while not simply being past either end of the
            // input) means we are next to invalid UTF-8: never match there.
            if self.only_utf8
                && ((prev.is_none() && !at.is_start())
                    || (next.is_none() && !at.is_end()))
            {
                return false;
            }
            prev.is_word_byte() != next.is_word_byte()
        }
        NotWordBoundaryAscii => {
            let prev = self.previous_char(at);
            let next = self.next_char(at);
            // Same invalid-UTF-8 guard as for `WordBoundaryAscii`.
            if self.only_utf8
                && ((prev.is_none() && !at.is_start())
                    || (next.is_none() && !at.is_end()))
            {
                return false;
            }
            prev.is_word_byte() == next.is_word_byte()
        }
    }
}

fn prefix_at(
&self,
prefixes: &LiteralSearcher,
Expand All @@ -222,11 +321,11 @@ impl<'t> Input for ByteInput<'t> {
}

fn len(&self) -> usize {
self.0.len()
self.text.len()
}

fn as_bytes(&self) -> &[u8] {
self.0
&self.text
}
}

Expand Down Expand Up @@ -276,7 +375,7 @@ impl Char {
pub fn is_word_byte(self) -> bool {
match char::from_u32(self.0) {
None => false,
Some(c) if c <= '\u{FF}' => syntax::is_word_byte(c as u8),
Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
Some(_) => false,
}
}
Expand Down
4 changes: 1 addition & 3 deletions src/pikevm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,9 +322,7 @@ impl<'r, I: Input> Fsm<'r, I> {
nlist.set.insert(ip);
match self.prog[ip] {
EmptyLook(ref inst) => {
let prev = self.input.previous_char(at);
let next = self.input.next_char(at);
if inst.matches(prev, next) {
if self.input.is_empty_match(at, inst) {
ip = inst.goto;
}
}
Expand Down
Loading

0 comments on commit 01c92c8

Please sign in to comment.