Skip to content

Commit a39e02c

Browse files
committed
api: add new 'Regex::static_captures_len' method
This adds a new routine for computing the static number of capture groups that will appear in every match. If the number of groups is not invariant across all matches, then there is no static capture length. This is meant to help implement higher level convenience APIs for extracting capture groups, such as the one described in #824. We may wind up including such APIs in the regex crate itself, but this commit stops short of that. Instead, we just add this new property which should permit those APIs to exist outside of this crate for now. Closes #908
1 parent 324e904 commit a39e02c

File tree

7 files changed

+207
-0
lines changed

7 files changed

+207
-0
lines changed

regex-syntax/src/hir/mod.rs

+80
Original file line numberDiff line numberDiff line change
@@ -1833,6 +1833,7 @@ struct PropertiesI {
18331833
look_set_suffix: LookSet,
18341834
utf8: bool,
18351835
captures_len: usize,
1836+
static_captures_len: Option<usize>,
18361837
literal: bool,
18371838
alternation_literal: bool,
18381839
}
@@ -1990,6 +1991,44 @@ impl Properties {
19901991
self.0.captures_len
19911992
}
19921993

1994+
/// Returns the total number of explicit capturing groups that appear in
1995+
/// every possible match.
1996+
///
1997+
/// If the number of capture groups can vary depending on the match, then
1998+
/// this returns `None`. That is, a value is only returned when the number
1999+
/// of matching groups is invariant or "static."
2000+
///
2001+
/// Note that this does not include the implicit capturing group
2002+
/// corresponding to the entire match.
2003+
///
2004+
/// # Example
2005+
///
2006+
/// This shows a few cases where a static number of capture groups is
2007+
/// available and a few cases where it is not.
2008+
///
2009+
/// ```
2010+
/// use regex_syntax::parse;
2011+
///
2012+
/// let len = |pattern| {
2013+
/// parse(pattern).map(|h| h.properties().static_captures_len())
2014+
/// };
2015+
///
2016+
/// assert_eq!(Some(0), len("a")?);
2017+
/// assert_eq!(Some(1), len("(a)")?);
2018+
/// assert_eq!(Some(1), len("(a)|(b)")?);
2019+
/// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2020+
/// assert_eq!(None, len("(a)|b")?);
2021+
/// assert_eq!(None, len("a|(b)")?);
2022+
/// assert_eq!(None, len("(b)*")?);
2023+
/// assert_eq!(Some(1), len("(b)+")?);
2024+
///
2025+
/// # Ok::<(), Box<dyn std::error::Error>>(())
2026+
/// ```
2027+
#[inline]
2028+
pub fn static_captures_len(&self) -> Option<usize> {
2029+
self.0.static_captures_len
2030+
}
2031+
19932032
/// Return true if and only if this HIR is a simple literal. This is
19942033
/// only true when this HIR expression is either itself a `Literal` or a
19952034
/// concatenation of only `Literal`s.
@@ -2100,6 +2139,13 @@ impl Properties {
21002139
} else {
21012140
LookSet::full()
21022141
};
2142+
// And also, an empty alternation has no static capture group count (`None`),
2143+
// but we otherwise start with the number corresponding to the first
2144+
// alternate. If any subsequent alternate has a different number of
2145+
// static capture groups, then we overall have a variation and not a
2146+
// static number of groups.
2147+
let static_captures_len =
2148+
it.peek().and_then(|p| p.borrow().static_captures_len());
21032149
// The base case is an empty alternation, which matches nothing.
21042150
// Note though that empty alternations aren't possible, because the
21052151
// Hir::alternation smart constructor rewrites those as empty character
@@ -2112,6 +2158,7 @@ impl Properties {
21122158
look_set_suffix: fix,
21132159
utf8: true,
21142160
captures_len: 0,
2161+
static_captures_len,
21152162
literal: false,
21162163
alternation_literal: true,
21172164
};
@@ -2125,6 +2172,9 @@ impl Properties {
21252172
props.utf8 = props.utf8 && p.is_utf8();
21262173
props.captures_len =
21272174
props.captures_len.saturating_add(p.captures_len());
2175+
if props.static_captures_len != p.static_captures_len() {
2176+
props.static_captures_len = None;
2177+
}
21282178
props.alternation_literal =
21292179
props.alternation_literal && p.is_alternation_literal();
21302180
if !min_poisoned {
@@ -2180,6 +2230,7 @@ impl Properties {
21802230
// since it too can match the empty string.
21812231
utf8: true,
21822232
captures_len: 0,
2233+
static_captures_len: Some(0),
21832234
literal: false,
21842235
alternation_literal: false,
21852236
};
@@ -2196,6 +2247,7 @@ impl Properties {
21962247
look_set_suffix: LookSet::empty(),
21972248
utf8: core::str::from_utf8(&lit.0).is_ok(),
21982249
captures_len: 0,
2250+
static_captures_len: Some(0),
21992251
literal: true,
22002252
alternation_literal: true,
22012253
};
@@ -2212,6 +2264,7 @@ impl Properties {
22122264
look_set_suffix: LookSet::empty(),
22132265
utf8: class.is_utf8(),
22142266
captures_len: 0,
2267+
static_captures_len: Some(0),
22152268
literal: false,
22162269
alternation_literal: false,
22172270
};
@@ -2241,6 +2294,7 @@ impl Properties {
22412294
// property borderline useless.
22422295
utf8: true,
22432296
captures_len: 0,
2297+
static_captures_len: Some(0),
22442298
literal: false,
22452299
alternation_literal: false,
22462300
};
@@ -2268,6 +2322,7 @@ impl Properties {
22682322
look_set_suffix: LookSet::empty(),
22692323
utf8: p.is_utf8(),
22702324
captures_len: p.captures_len(),
2325+
static_captures_len: p.static_captures_len(),
22712326
literal: false,
22722327
alternation_literal: false,
22732328
};
@@ -2278,6 +2333,23 @@ impl Properties {
22782333
inner.look_set_prefix = p.look_set_prefix();
22792334
inner.look_set_suffix = p.look_set_suffix();
22802335
}
2336+
// If the static captures len of the sub-expression is not known or is
2337+
// zero, then it automatically propagates to the repetition, regardless
2338+
// of the repetition's bounds. Otherwise, it might change, but only when the
2339+
// repetition can match 0 times.
2340+
if rep.min == 0
2341+
&& inner.static_captures_len.map_or(false, |len| len > 0)
2342+
{
2343+
// If we require a match 0 times, then our captures len is
2344+
// guaranteed to be zero. Otherwise, if we *can* match zero times,
2345+
// then it's impossible to know how many captures will be
2346+
// in the resulting match.
2347+
if rep.max == Some(0) {
2348+
inner.static_captures_len = Some(0);
2349+
} else {
2350+
inner.static_captures_len = None;
2351+
}
2352+
}
22812353
Properties(Box::new(inner))
22822354
}
22832355

@@ -2286,6 +2358,9 @@ impl Properties {
22862358
let p = capture.sub.properties();
22872359
Properties(Box::new(PropertiesI {
22882360
captures_len: p.captures_len().saturating_add(1),
2361+
static_captures_len: p
2362+
.static_captures_len()
2363+
.map(|len| len.saturating_add(1)),
22892364
literal: false,
22902365
alternation_literal: false,
22912366
..*p.0.clone()
@@ -2306,6 +2381,7 @@ impl Properties {
23062381
look_set_suffix: LookSet::empty(),
23072382
utf8: true,
23082383
captures_len: 0,
2384+
static_captures_len: Some(0),
23092385
literal: true,
23102386
alternation_literal: true,
23112387
};
@@ -2316,6 +2392,10 @@ impl Properties {
23162392
props.utf8 = props.utf8 && p.is_utf8();
23172393
props.captures_len =
23182394
props.captures_len.saturating_add(p.captures_len());
2395+
props.static_captures_len = p
2396+
.static_captures_len()
2397+
.and_then(|len1| Some((len1, props.static_captures_len?)))
2398+
.and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
23192399
props.literal = props.literal && p.is_literal();
23202400
props.alternation_literal =
23212401
props.alternation_literal && p.is_alternation_literal();

regex-syntax/src/hir/translate.rs

+35
Original file line numberDiff line numberDiff line change
@@ -3204,6 +3204,41 @@ mod tests {
32043204
assert_eq!(1, props(r"([a&&b])").captures_len());
32053205
}
32063206

3207+
#[test]
3208+
fn analysis_static_captures_len() {
3209+
let len = |pattern| props(pattern).static_captures_len();
3210+
assert_eq!(Some(0), len(r""));
3211+
assert_eq!(Some(0), len(r"foo|bar"));
3212+
assert_eq!(None, len(r"(foo)|bar"));
3213+
assert_eq!(None, len(r"foo|(bar)"));
3214+
assert_eq!(Some(1), len(r"(foo|bar)"));
3215+
assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
3216+
assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
3217+
assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
3218+
assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
3219+
assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
3220+
assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
3221+
assert_eq!(None, len(r"(a)(b)(extra)?"));
3222+
assert_eq!(Some(1), len(r"(foo)|(bar)"));
3223+
assert_eq!(Some(2), len(r"(foo)(bar)"));
3224+
assert_eq!(Some(2), len(r"(foo)+(bar)"));
3225+
assert_eq!(None, len(r"(foo)*(bar)"));
3226+
assert_eq!(Some(0), len(r"(foo)?{0}"));
3227+
assert_eq!(None, len(r"(foo)?{1}"));
3228+
assert_eq!(Some(1), len(r"(foo){1}"));
3229+
assert_eq!(Some(1), len(r"(foo){1,}"));
3230+
assert_eq!(Some(1), len(r"(foo){1,}?"));
3231+
assert_eq!(None, len(r"(foo){1,}??"));
3232+
assert_eq!(None, len(r"(foo){0,}"));
3233+
assert_eq!(Some(1), len(r"(foo)(?:bar)"));
3234+
assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
3235+
assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)"));
3236+
assert_eq!(
3237+
Some(2),
3238+
len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
3239+
);
3240+
}
3241+
32073242
#[test]
32083243
fn analysis_is_all_assertions() {
32093244
// Positive examples.

src/compile.rs

+2
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ impl Compiler {
161161
self.fill_to_next(patch.hole);
162162
self.compiled.matches = vec![self.insts.len()];
163163
self.push_compiled(Inst::Match(0));
164+
self.compiled.static_captures_len =
165+
expr.properties().static_captures_len();
164166
self.compile_finish()
165167
}
166168

src/exec.rs

+6
Original file line numberDiff line numberDiff line change
@@ -1361,6 +1361,12 @@ impl Exec {
13611361
pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
13621362
&self.ro.nfa.capture_name_idx
13631363
}
1364+
1365+
/// If the number of capture groups in every match is always the same, then
1366+
/// return that number. Otherwise return `None`.
1367+
pub fn static_captures_len(&self) -> Option<usize> {
1368+
self.ro.nfa.static_captures_len
1369+
}
13641370
}
13651371

13661372
impl Clone for Exec {

src/prog.rs

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ pub struct Program {
2727
pub captures: Vec<Option<String>>,
2828
/// Pointers to all named capture groups into `captures`.
2929
pub capture_name_idx: Arc<HashMap<String, usize>>,
30+
/// If the number of capture groups is the same for all possible matches,
31+
/// then this is that number.
32+
pub static_captures_len: Option<usize>,
3033
/// A pointer to the start instruction. This can vary depending on how
3134
/// the program was compiled. For example, programs for use with the DFA
3235
/// engine have a `.*?` inserted at the beginning of unanchored regular
@@ -83,6 +86,7 @@ impl Program {
8386
matches: vec![],
8487
captures: vec![],
8588
capture_name_idx: Arc::new(HashMap::new()),
89+
static_captures_len: None,
8690
start: 0,
8791
byte_classes: vec![0; 256],
8892
only_utf8: true,

src/re_bytes.rs

+40
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,46 @@ impl Regex {
667667
self.0.capture_names().len()
668668
}
669669

670+
/// Returns the total number of capturing groups that appear in every
671+
/// possible match.
672+
///
673+
/// If the number of capture groups can vary depending on the match, then
674+
/// this returns `None`. That is, a value is only returned when the number
675+
/// of matching groups is invariant or "static."
676+
///
677+
/// Note that like [`Regex::captures_len`], this **does** include the
678+
/// implicit capturing group corresponding to the entire match. Therefore,
679+
/// when a non-None value is returned, it is guaranteed to be at least `1`.
680+
/// Stated differently, a return value of `Some(0)` is impossible.
681+
///
682+
/// # Example
683+
///
684+
/// This shows a few cases where a static number of capture groups is
685+
/// available and a few cases where it is not.
686+
///
687+
/// ```
688+
/// use regex::bytes::Regex;
689+
///
690+
/// let len = |pattern| {
691+
/// Regex::new(pattern).map(|re| re.static_captures_len())
692+
/// };
693+
///
694+
/// assert_eq!(Some(1), len("a")?);
695+
/// assert_eq!(Some(2), len("(a)")?);
696+
/// assert_eq!(Some(2), len("(a)|(b)")?);
697+
/// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
698+
/// assert_eq!(None, len("(a)|b")?);
699+
/// assert_eq!(None, len("a|(b)")?);
700+
/// assert_eq!(None, len("(b)*")?);
701+
/// assert_eq!(Some(2), len("(b)+")?);
702+
///
703+
/// # Ok::<(), Box<dyn std::error::Error>>(())
704+
/// ```
705+
#[inline]
706+
pub fn static_captures_len(&self) -> Option<usize> {
707+
self.0.static_captures_len().map(|len| len.saturating_add(1))
708+
}
709+
670710
/// Returns an empty set of capture locations that can be reused in
671711
/// multiple calls to `captures_read` or `captures_read_at`.
672712
pub fn capture_locations(&self) -> CaptureLocations {

src/re_unicode.rs

+40
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,46 @@ impl Regex {
725725
self.0.capture_names().len()
726726
}
727727

728+
/// Returns the total number of capturing groups that appear in every
729+
/// possible match.
730+
///
731+
/// If the number of capture groups can vary depending on the match, then
732+
/// this returns `None`. That is, a value is only returned when the number
733+
/// of matching groups is invariant or "static."
734+
///
735+
/// Note that like [`Regex::captures_len`], this **does** include the
736+
/// implicit capturing group corresponding to the entire match. Therefore,
737+
/// when a non-None value is returned, it is guaranteed to be at least `1`.
738+
/// Stated differently, a return value of `Some(0)` is impossible.
739+
///
740+
/// # Example
741+
///
742+
/// This shows a few cases where a static number of capture groups is
743+
/// available and a few cases where it is not.
744+
///
745+
/// ```
746+
/// use regex::Regex;
747+
///
748+
/// let len = |pattern| {
749+
/// Regex::new(pattern).map(|re| re.static_captures_len())
750+
/// };
751+
///
752+
/// assert_eq!(Some(1), len("a")?);
753+
/// assert_eq!(Some(2), len("(a)")?);
754+
/// assert_eq!(Some(2), len("(a)|(b)")?);
755+
/// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
756+
/// assert_eq!(None, len("(a)|b")?);
757+
/// assert_eq!(None, len("a|(b)")?);
758+
/// assert_eq!(None, len("(b)*")?);
759+
/// assert_eq!(Some(2), len("(b)+")?);
760+
///
761+
/// # Ok::<(), Box<dyn std::error::Error>>(())
762+
/// ```
763+
#[inline]
764+
pub fn static_captures_len(&self) -> Option<usize> {
765+
self.0.static_captures_len().map(|len| len.saturating_add(1))
766+
}
767+
728768
/// Returns an empty set of capture locations that can be reused in
729769
/// multiple calls to `captures_read` or `captures_read_at`.
730770
pub fn capture_locations(&self) -> CaptureLocations {

0 commit comments

Comments
 (0)