Skip to content

Commit

Permalink
lite: add special word boundaries to regex-lite
Browse files Browse the repository at this point in the history
This was substantially easier. Coupling, private abstractions and slow
code are so much easier to deal with.

Ref #469
  • Loading branch information
BurntSushi committed Oct 9, 2023
1 parent 2743a7a commit dbc5e6d
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 28 deletions.
42 changes: 42 additions & 0 deletions regex-lite/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,24 @@ pub(crate) enum Look {
Word = 1 << 6,
/// Match an ASCII-only negation of a word boundary.
WordNegate = 1 << 7,
/// Match the start of an ASCII-only word boundary. That is, this matches a
/// position where the following character is a word character and either
/// the position is at the beginning of the haystack or the previous
/// character is not a word character.
WordStart = 1 << 8,
/// Match the end of an ASCII-only word boundary. That is, this matches a
/// position where the previous character is a word character and either
/// the position is at the end of the haystack or the following character
/// is not a word character.
WordEnd = 1 << 9,
/// Match the start half of an ASCII-only word boundary. That is, this
/// matches a position at either the beginning of the haystack or where the
/// previous character is not a word character.
WordStartHalf = 1 << 10,
/// Match the end half of an ASCII-only word boundary. That is, this
/// matches a position at either the end of the haystack or where the
/// following character is not a word character.
WordEndHalf = 1 << 11,
}

impl Look {
Expand Down Expand Up @@ -631,6 +649,30 @@ impl Look {
at < haystack.len() && utf8::is_word_byte(haystack[at]);
word_before == word_after
}
WordStart => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
!word_before && word_after
}
WordEnd => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
word_before && !word_after
}
WordStartHalf => {
let word_before =
at > 0 && utf8::is_word_byte(haystack[at - 1]);
!word_before
}
WordEndHalf => {
let word_after =
at < haystack.len() && utf8::is_word_byte(haystack[at]);
!word_after
}
}
}
}
Expand Down
89 changes: 86 additions & 3 deletions regex-lite/src/hir/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
"character class difference is not supported";
const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
"character class symmetric difference is not supported";
const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
"special word boundary assertion is unclosed or has an invalid character";
const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
"special word boundary assertion is unrecognized";
const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
"found start of special word boundary or repetition without an end";

/// A regular expression parser.
///
Expand Down Expand Up @@ -479,12 +485,86 @@ impl<'a> Parser<'a> {
'v' => special('\x0B'),
'A' => Ok(Hir::look(hir::Look::Start)),
'z' => Ok(Hir::look(hir::Look::End)),
'b' => Ok(Hir::look(hir::Look::Word)),
'b' => {
let mut hir = Hir::look(hir::Look::Word);
if !self.is_done() && self.char() == '{' {
if let Some(special) =
self.maybe_parse_special_word_boundary()?
{
hir = special;
}
}
Ok(hir)
}
'B' => Ok(Hir::look(hir::Look::WordNegate)),
'<' => Ok(Hir::look(hir::Look::WordStart)),
'>' => Ok(Hir::look(hir::Look::WordEnd)),
_ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
}
}

/// Attempt to parse a specialty word boundary. That is, `\b{start}`,
/// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
///
/// This is similar to `maybe_parse_ascii_class` in that, in most cases,
/// if it fails it will just return `None` with no error. This is done
/// because `\b{5}` is a valid expression and we want to let that be parsed
/// by the existing counted repetition parsing code. (I thought about just
/// invoking the counted repetition code from here, but it seemed a little
/// ham-fisted.)
///
/// Unlike `maybe_parse_ascii_class` though, this can return an error.
/// Namely, if we definitely know it isn't a counted repetition, then we
/// return an error specific to the specialty word boundaries.
///
/// This assumes the parser is positioned at a `{` immediately following
/// a `\b`. When `None` is returned, the parser is returned to the position
/// at which it started: pointing at a `{`.
///
/// The position given should correspond to the start of the `\b`.
fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
assert_eq!(self.char(), '{');

let is_valid_char = |c| match c {
'A'..='Z' | 'a'..='z' | '-' => true,
_ => false,
};
let start = self.pos();
if !self.bump_and_bump_space() {
return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
}
// This is one of the critical bits: if the first non-whitespace
// character isn't in [-A-Za-z] (i.e., this can't be a special word
// boundary), then we bail and let the counted repetition parser deal
// with this.
if !is_valid_char(self.char()) {
self.pos.set(start);
self.char.set(Some('{'));
return Ok(None);
}

// Now collect up our chars until we see a '}'.
let mut scratch = String::new();
while !self.is_done() && is_valid_char(self.char()) {
scratch.push(self.char());
self.bump_and_bump_space();
}
if self.is_done() || self.char() != '}' {
return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
}
self.bump();
let kind = match scratch.as_str() {
"start" => hir::Look::WordStart,
"end" => hir::Look::WordEnd,
"start-half" => hir::Look::WordStartHalf,
"end-half" => hir::Look::WordEndHalf,
_ => {
return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
}
};
Ok(Some(Hir::look(kind)))
}

/// Parse a hex representation of a Unicode codepoint. This handles both
/// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
/// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
Expand Down Expand Up @@ -1948,8 +2028,6 @@ bar
assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\<"));
assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\>"));
assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
Expand Down Expand Up @@ -1983,6 +2061,11 @@ bar
assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
}

#[test]
Expand Down
58 changes: 33 additions & 25 deletions regex-lite/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,16 @@ x{n}? exactly n x
### Empty matches
<pre class="rust">
^ the beginning of a haystack (or start-of-line with multi-line mode)
$ the end of a haystack (or end-of-line with multi-line mode)
\A only the beginning of a haystack (even with multi-line mode enabled)
\z only the end of a haystack (even with multi-line mode enabled)
\b an ASCII word boundary (\w on one side and \W, \A, or \z on other)
\B not an ASCII word boundary
^ the beginning of a haystack (or start-of-line with multi-line mode)
$ the end of a haystack (or end-of-line with multi-line mode)
\A only the beginning of a haystack (even with multi-line mode enabled)
\z only the end of a haystack (even with multi-line mode enabled)
\b an ASCII word boundary (\w on one side and \W, \A, or \z on other)
\B not an ASCII word boundary
\b{start} an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
\b{end} an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
\b{start-half} half of an ASCII start-of-word boundary (\W|\A on the left)
\b{end-half} half of an ASCII end-of-word boundary (\W|\z on the right)
</pre>
The empty regex is valid and matches the empty string. For example, the
Expand Down Expand Up @@ -581,25 +585,29 @@ Note that this includes all possible escape sequences, even ones that are
documented elsewhere.
<pre class="rust">
\* literal *, applies to all ASCII except [0-9A-Za-z<>]
\a bell (\x07)
\f form feed (\x0C)
\t horizontal tab
\n new line
\r carriage return
\v vertical tab (\x0B)
\A matches at the beginning of a haystack
\z matches at the end of a haystack
\b word boundary assertion
\B negated word boundary assertion
\x7F hex character code (exactly two digits)
\x{10FFFF} any hex character code corresponding to a Unicode code point
\u007F hex character code (exactly four digits)
\u{7F} any hex character code corresponding to a Unicode code point
\U0000007F hex character code (exactly eight digits)
\U{7F} any hex character code corresponding to a Unicode code point
\d, \s, \w Perl character class
\D, \S, \W negated Perl character class
\* literal *, applies to all ASCII except [0-9A-Za-z<>]
\a bell (\x07)
\f form feed (\x0C)
\t horizontal tab
\n new line
\r carriage return
\v vertical tab (\x0B)
\A matches at the beginning of a haystack
\z matches at the end of a haystack
\b word boundary assertion
\B negated word boundary assertion
\b{start}, \< start-of-word boundary assertion
\b{end}, \> end-of-word boundary assertion
\b{start-half} half of a start-of-word boundary assertion
\b{end-half} half of an end-of-word boundary assertion
\x7F hex character code (exactly two digits)
\x{10FFFF} any hex character code corresponding to a Unicode code point
\u007F hex character code (exactly four digits)
\u{7F} any hex character code corresponding to a Unicode code point
\U0000007F hex character code (exactly eight digits)
\U{7F} any hex character code corresponding to a Unicode code point
\d, \s, \w Perl character class
\D, \S, \W negated Perl character class
</pre>
### Perl character classes (ASCII only)
Expand Down
1 change: 1 addition & 0 deletions regex-lite/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ fn suite() -> anyhow::Result<regex_test::RegexTests> {
load!("unicode");
load!("utf8");
load!("word-boundary");
load!("word-boundary-special");
load!("fowler/basic");
load!("fowler/nullsubexpr");
load!("fowler/repetition");
Expand Down

0 comments on commit dbc5e6d

Please sign in to comment.