@@ -1833,6 +1833,7 @@ struct PropertiesI {
1833
1833
look_set_suffix : LookSet ,
1834
1834
utf8 : bool ,
1835
1835
captures_len : usize ,
1836
+ static_captures_len : Option < usize > ,
1836
1837
literal : bool ,
1837
1838
alternation_literal : bool ,
1838
1839
}
@@ -1990,6 +1991,44 @@ impl Properties {
1990
1991
self . 0 . captures_len
1991
1992
}
1992
1993
1994
+ /// Returns the total number of explicit capturing groups that appear in
1995
+ /// every possible match.
1996
+ ///
1997
+ /// If the number of capture groups can vary depending on the match, then
1998
+ /// this returns `None`. That is, a value is only returned when the number
1999
+ /// of matching groups is invariant or "static."
2000
+ ///
2001
+ /// Note that this does not include the implicit capturing group
2002
+ /// corresponding to the entire match.
2003
+ ///
2004
+ /// # Example
2005
+ ///
2006
+ /// This shows a few cases where a static number of capture groups is
2007
+ /// available and a few cases where it is not.
2008
+ ///
2009
+ /// ```
2010
+ /// use regex_syntax::parse;
2011
+ ///
2012
+ /// let len = |pattern| {
2013
+ /// parse(pattern).map(|h| h.properties().static_captures_len())
2014
+ /// };
2015
+ ///
2016
+ /// assert_eq!(Some(0), len("a")?);
2017
+ /// assert_eq!(Some(1), len("(a)")?);
2018
+ /// assert_eq!(Some(1), len("(a)|(b)")?);
2019
+ /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2020
+ /// assert_eq!(None, len("(a)|b")?);
2021
+ /// assert_eq!(None, len("a|(b)")?);
2022
+ /// assert_eq!(None, len("(b)*")?);
2023
+ /// assert_eq!(Some(1), len("(b)+")?);
2024
+ ///
2025
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
2026
+ /// ```
2027
+ #[ inline]
2028
+ pub fn static_captures_len ( & self ) -> Option < usize > {
2029
+ self . 0 . static_captures_len
2030
+ }
2031
+
1993
2032
/// Return true if and only if this HIR is a simple literal. This is
1994
2033
/// only true when this HIR expression is either itself a `Literal` or a
1995
2034
/// concatenation of only `Literal`s.
@@ -2100,6 +2139,13 @@ impl Properties {
2100
2139
} else {
2101
2140
LookSet :: full ( )
2102
2141
} ;
2142
+ // And also, an empty alternate has no static capture group count (None),
2143
+ // but we otherwise start with the number corresponding to the first
2144
+ // alternate. If any subsequent alternate has a different number of
2145
+ // static capture groups, then we overall have a variation and not a
2146
+ // static number of groups.
2147
+ let static_captures_len =
2148
+ it. peek ( ) . and_then ( |p| p. borrow ( ) . static_captures_len ( ) ) ;
2103
2149
// The base case is an empty alternation, which matches nothing.
2104
2150
// Note though that empty alternations aren't possible, because the
2105
2151
// Hir::alternation smart constructor rewrites those as empty character
@@ -2112,6 +2158,7 @@ impl Properties {
2112
2158
look_set_suffix : fix,
2113
2159
utf8 : true ,
2114
2160
captures_len : 0 ,
2161
+ static_captures_len,
2115
2162
literal : false ,
2116
2163
alternation_literal : true ,
2117
2164
} ;
@@ -2125,6 +2172,9 @@ impl Properties {
2125
2172
props. utf8 = props. utf8 && p. is_utf8 ( ) ;
2126
2173
props. captures_len =
2127
2174
props. captures_len . saturating_add ( p. captures_len ( ) ) ;
2175
+ if props. static_captures_len != p. static_captures_len ( ) {
2176
+ props. static_captures_len = None ;
2177
+ }
2128
2178
props. alternation_literal =
2129
2179
props. alternation_literal && p. is_alternation_literal ( ) ;
2130
2180
if !min_poisoned {
@@ -2180,6 +2230,7 @@ impl Properties {
2180
2230
// since it too can match the empty string.
2181
2231
utf8 : true ,
2182
2232
captures_len : 0 ,
2233
+ static_captures_len : Some ( 0 ) ,
2183
2234
literal : false ,
2184
2235
alternation_literal : false ,
2185
2236
} ;
@@ -2196,6 +2247,7 @@ impl Properties {
2196
2247
look_set_suffix : LookSet :: empty ( ) ,
2197
2248
utf8 : core:: str:: from_utf8 ( & lit. 0 ) . is_ok ( ) ,
2198
2249
captures_len : 0 ,
2250
+ static_captures_len : Some ( 0 ) ,
2199
2251
literal : true ,
2200
2252
alternation_literal : true ,
2201
2253
} ;
@@ -2212,6 +2264,7 @@ impl Properties {
2212
2264
look_set_suffix : LookSet :: empty ( ) ,
2213
2265
utf8 : class. is_utf8 ( ) ,
2214
2266
captures_len : 0 ,
2267
+ static_captures_len : Some ( 0 ) ,
2215
2268
literal : false ,
2216
2269
alternation_literal : false ,
2217
2270
} ;
@@ -2241,6 +2294,7 @@ impl Properties {
2241
2294
// property borderline useless.
2242
2295
utf8 : true ,
2243
2296
captures_len : 0 ,
2297
+ static_captures_len : Some ( 0 ) ,
2244
2298
literal : false ,
2245
2299
alternation_literal : false ,
2246
2300
} ;
@@ -2268,6 +2322,7 @@ impl Properties {
2268
2322
look_set_suffix : LookSet :: empty ( ) ,
2269
2323
utf8 : p. is_utf8 ( ) ,
2270
2324
captures_len : p. captures_len ( ) ,
2325
+ static_captures_len : p. static_captures_len ( ) ,
2271
2326
literal : false ,
2272
2327
alternation_literal : false ,
2273
2328
} ;
@@ -2278,6 +2333,23 @@ impl Properties {
2278
2333
inner. look_set_prefix = p. look_set_prefix ( ) ;
2279
2334
inner. look_set_suffix = p. look_set_suffix ( ) ;
2280
2335
}
2336
+ // If the static captures len of the sub-expression is not known or is
2337
+ // zero, then it automatically propagates to the repetition, regardless
2338
+ // of the repetition. Otherwise, it might change, but only when the
2339
+ // repetition can match 0 times.
2340
+ if rep. min == 0
2341
+ && inner. static_captures_len . map_or ( false , |len| len > 0 )
2342
+ {
2343
+ // If we require a match 0 times, then our captures len is
2344
+ // guaranteed to be zero. Otherwise, if we *can* match the empty
2345
+ // string, then it's impossible to know how many captures will be
2346
+ // in the resulting match.
2347
+ if rep. max == Some ( 0 ) {
2348
+ inner. static_captures_len = Some ( 0 ) ;
2349
+ } else {
2350
+ inner. static_captures_len = None ;
2351
+ }
2352
+ }
2281
2353
Properties ( Box :: new ( inner) )
2282
2354
}
2283
2355
@@ -2286,6 +2358,9 @@ impl Properties {
2286
2358
let p = capture. sub . properties ( ) ;
2287
2359
Properties ( Box :: new ( PropertiesI {
2288
2360
captures_len : p. captures_len ( ) . saturating_add ( 1 ) ,
2361
+ static_captures_len : p
2362
+ . static_captures_len ( )
2363
+ . map ( |len| len. saturating_add ( 1 ) ) ,
2289
2364
literal : false ,
2290
2365
alternation_literal : false ,
2291
2366
..* p. 0 . clone ( )
@@ -2306,6 +2381,7 @@ impl Properties {
2306
2381
look_set_suffix : LookSet :: empty ( ) ,
2307
2382
utf8 : true ,
2308
2383
captures_len : 0 ,
2384
+ static_captures_len : Some ( 0 ) ,
2309
2385
literal : true ,
2310
2386
alternation_literal : true ,
2311
2387
} ;
@@ -2316,6 +2392,10 @@ impl Properties {
2316
2392
props. utf8 = props. utf8 && p. is_utf8 ( ) ;
2317
2393
props. captures_len =
2318
2394
props. captures_len . saturating_add ( p. captures_len ( ) ) ;
2395
+ props. static_captures_len = p
2396
+ . static_captures_len ( )
2397
+ . and_then ( |len1| Some ( ( len1, props. static_captures_len ?) ) )
2398
+ . and_then ( |( len1, len2) | Some ( len1. saturating_add ( len2) ) ) ;
2319
2399
props. literal = props. literal && p. is_literal ( ) ;
2320
2400
props. alternation_literal =
2321
2401
props. alternation_literal && p. is_alternation_literal ( ) ;
0 commit comments