From d894c631cb6c9a062c13f015062a56dcf7fc8f46 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 17 Feb 2017 21:39:29 -0500 Subject: [PATCH 1/6] Fix a bug in literal extraction. When doing literal extraction, a non-empty concatenation should always be cut when a `^` (for prefixes) or a `$` (for suffixes) is seen. If a counted repetition is used, e.g., `${2}`, then the cut detection fails. We add in a special case to handle it. Fixes #321 --- regex-syntax/src/literals.rs | 8 +++++--- tests/regression.rs | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/regex-syntax/src/literals.rs b/regex-syntax/src/literals.rs index 0b89ccb250..e3de16732a 100644 --- a/regex-syntax/src/literals.rs +++ b/regex-syntax/src/literals.rs @@ -819,7 +819,7 @@ fn repeat_range_literals( let n = cmp::min(lits.limit_size, min as usize); let es = iter::repeat(e.clone()).take(n).collect(); f(&Concat(es), lits); - if n < min as usize { + if n < min as usize || lits.contains_empty() { lits.cut(); } } @@ -1156,8 +1156,9 @@ mod tests { // Test regexes with empty assertions. test_lit!(pfx_empty1, prefixes, "^a", M("a")); - test_lit!(pfx_empty2, prefixes, "^abc", M("abc")); - test_lit!(pfx_empty3, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); + test_lit!(pfx_empty2, prefixes, "a${2}", C("a")); + test_lit!(pfx_empty3, prefixes, "^abc", M("abc")); + test_lit!(pfx_empty4, prefixes, "(?:^abc)|(?:^z)", M("abc"), M("z")); // Make sure some curious regexes have no prefixes. test_lit!(pfx_nothing1, prefixes, "."); @@ -1306,6 +1307,7 @@ mod tests { // Test regexes with empty assertions. test_lit!(sfx_empty1, suffixes, "a$", M("a")); + test_lit!(sfx_empty2, suffixes, "${2}a", C("a")); // Make sure some curious regexes have no suffixes. 
test_lit!(sfx_nothing1, suffixes, "."); diff --git a/tests/regression.rs b/tests/regression.rs index 3d42df8608..1feeda17cb 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -82,3 +82,7 @@ mat!(endl_or_wb, r"(?m:$)|(?-u:\b)", "\u{6084e}", Some((4, 4))); mat!(zero_or_end, r"(?i-u:\x00)|$", "\u{e682f}", Some((4, 4))); mat!(y_or_endl, r"(?i-u:y)|(?m:$)", "\u{b4331}", Some((4, 4))); mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1))); + +// See: https://github.com/rust-lang/regex/issues/321 +ismatch!(strange_anchor_non_complete_prefix, r"a^{2}", "", false); +ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false); From d813518e2a199884cd38a4e32497a7453db79697 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 17 Feb 2017 21:59:21 -0500 Subject: [PATCH 2/6] Extend ending position of capture search. When searching for captures, we first use the DFA to find the start and end of the match. We then pass just the matched region of text to the NFA engine to find sub-capture locations. This is a key optimization that prevents the NFA engine from searching a lot more text than what is necessary in some cases. One problem with this is that some instructions determine their match state based on whether the engine is at the boundary of the search text. For example, `$` matches if and only if the engine is at EOF. If we only provide the matched text region, then assertions like `\b` might not work, since it needs to examine at least one character past the end of the match. If we provide the matched text region plus one character, then `$` may match when it shouldn't. Therefore, we provide the matched text plus (at most) two characters. 
Fixes #334 --- src/exec.rs | 9 ++++++--- tests/regression.rs | 4 ++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/exec.rs b/src/exec.rs index d5d3bf3f7a..68a9e18f43 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -850,9 +850,12 @@ impl<'c> ExecNoSync<'c> { match_start: usize, match_end: usize, ) -> Option<(usize, usize)> { - // We can't use match_end directly, because we may need to examine - // one "character" after the end of a match for lookahead operators. - let e = cmp::min(next_utf8(text, match_end), text.len()); + // We can't use match_end directly, because we may need to examine one + // "character" after the end of a match for lookahead operators. We + // need to move two characters beyond the end, since some look-around + // operations may falsely assume a premature end of text otherwise. + let e = cmp::min( + next_utf8(text, next_utf8(text, match_end)), text.len()); self.captures_nfa(slots, &text[..e], match_start) } diff --git a/tests/regression.rs b/tests/regression.rs index 1feeda17cb..7a30b1527f 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -86,3 +86,7 @@ mat!(wb_start_x, r"(?u:\b)^(?-u:X)", "X", Some((0, 1))); // See: https://github.com/rust-lang/regex/issues/321 ismatch!(strange_anchor_non_complete_prefix, r"a^{2}", "", false); ismatch!(strange_anchor_non_complete_suffix, r"${2}a", "", false); + +// See: https://github.com/rust-lang/regex/issues/334 +mat!(captures_after_dfa_premature_end, r"a(b*(X|$))?", "abcbX", + Some((0, 1)), None, None); From 204293ebdec143ff360389628e5d97a5ac08991e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 18 Feb 2017 10:44:24 -0500 Subject: [PATCH 3/6] Byte based regexes enable Unicode by default. 
--- src/re_builder.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/re_builder.rs b/src/re_builder.rs index 3849c892d6..12b2515649 100644 --- a/src/re_builder.rs +++ b/src/re_builder.rs @@ -115,8 +115,6 @@ impl RegexBuilder { } /// Set the value for the Unicode (`u`) flag. - /// - /// For byte based regular expressions, this is disabled by default. pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { self.0.unicode = yes; self @@ -228,8 +226,6 @@ impl RegexSetBuilder { } /// Set the value for the Unicode (`u`) flag. - /// - /// For byte based regular expressions, this is disabled by default. pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { self.0.unicode = yes; self From 767f80f3c1f14f24611f04085ba9b54cbcc8bc6c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 18 Feb 2017 10:44:36 -0500 Subject: [PATCH 4/6] Add tip about `x` flag. Fixes #326 --- src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 2dcdad2091..96cdb45c86 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -161,6 +161,10 @@ assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); # } ``` +If you wish to match against whitespace in this mode, you can still use `\s`, +`\n`, `\t`, etc. For escaping a single space character, you can use its hex +character code `\x20` or temporarily disable the `x` flag, e.g., `(?-x: )`. + # Example: match multiple regular expressions simultaneously This demonstrates how to use a `RegexSet` to match multiple (possibly From 3a6138bfb13355ed776f04e273c57a455bd90168 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 18 Feb 2017 11:04:14 -0500 Subject: [PATCH 5/6] Add another replacement string example. This shows how to use curly braces in the replacement string, and more specifically, explains why they are sometimes necessary. 
Fixes #333 --- src/re_bytes.rs | 21 ++++++++++++++++----- src/re_unicode.rs | 18 ++++++++++++++++++ 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 27ed144a62..0a45d6b1e6 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -427,12 +427,23 @@ impl Regex { /// Note that using `$2` instead of `$first` or `$1` instead of `$last` /// would produce the same result. To write a literal `$` use `$$`. /// - /// If `$name` isn't a valid capture group (whether the name doesn't exist - /// or isn't a valid index), then it is replaced with the empty string. + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: /// - /// The longest possible name is used. e.g., `$1a` looks up the capture - /// group named `1a` and not the capture group at index `1`. To exert more - /// precise control over the name, use braces, e.g., `${1}a`. + /// ```rust + /// # extern crate regex; use regex::bytes::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]); + /// assert_eq!(result, &b"deep_fried"[..]); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. /// /// Finally, sometimes you just want to replace a literal string with no /// regard for capturing group expansion. This can be done by wrapping a diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 1e22275f3c..4ff16024c6 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -501,6 +501,24 @@ impl Regex { /// Note that using `$2` instead of `$first` or `$1` instead of `$last` /// would produce the same result. To write a literal `$` use `$$`. 
/// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement and surrounding literal text. + /// For example, if we wanted to join two words together with an + /// underscore: + /// + /// ```rust + /// # extern crate regex; use regex::Regex; + /// # fn main() { + /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); + /// let result = re.replace("deep fried", "${first}_$second"); + /// assert_eq!(result, "deep_fried"); + /// # } + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// /// Finally, sometimes you just want to replace a literal string with no /// regard for capturing group expansion. This can be done by wrapping a /// byte string with `NoExpand`: From 9ae9418c98d7eb2aa875395da070f8138a395d98 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 18 Feb 2017 11:29:53 -0500 Subject: [PATCH 6/6] Add Captures::get example. The example shows how to get the matched text of any capture group while defaulting to the empty string if that particular group didn't participate in the match. Fixes #338 --- src/re_bytes.rs | 16 ++++++++++++++++ src/re_unicode.rs | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 0a45d6b1e6..8a8550eb65 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -789,6 +789,22 @@ impl<'t> Captures<'t> { /// Returns the match associated with the capture group at index `i`. If /// `i` does not correspond to a capture group, or if the capture group /// did not participate in the match, then `None` is returned. 
+ /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::bytes::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures(b"abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); + /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(text1, &b"123"[..]); + /// assert_eq!(text2, &b""[..]); + /// ``` pub fn get(&self, i: usize) -> Option<Match<'t>> { self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) } diff --git a/src/re_unicode.rs b/src/re_unicode.rs index 4ff16024c6..d90be4fb1c 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -934,6 +934,22 @@ impl<'t> Captures<'t> { /// Returns the match associated with the capture group at index `i`. If /// `i` does not correspond to a capture group, or if the capture group /// did not participate in the match, then `None` is returned. + /// + /// # Examples + /// + /// Get the text of the match with a default of an empty string if this + /// group didn't participate in the match: + /// + /// ```rust + /// # use regex::Regex; + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let text1 = caps.get(1).map_or("", |m| m.as_str()); + /// let text2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(text1, "123"); + /// assert_eq!(text2, ""); + /// ``` pub fn get(&self, i: usize) -> Option<Match<'t>> { self.locs.pos(i).map(|(s, e)| Match::new(self.text, s, e)) }