Skip to content

Commit 561b743

Browse files
committed
refactor(regular_expression): Improve initizalizing state (#15045)
Fixes #6358
1 parent 7656e2b commit 561b743

File tree

1 file changed

+107
-85
lines changed
  • crates/oxc_regular_expression/src/parser/pattern_parser

1 file changed

+107
-85
lines changed

crates/oxc_regular_expression/src/parser/pattern_parser/state.rs

Lines changed: 107 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
use oxc_span::Atom;
2-
use rustc_hash::{FxHashMap, FxHashSet};
2+
use rustc_hash::FxHashSet;
33

44
use crate::parser::reader::Reader;
55

6-
/// Currently all of properties are read only from outside of this module.
6+
/// NOTE: Currently all of properties are read-only from outside of this module.
77
/// Even inside of this module, it is not changed after initialized.
88
#[derive(Debug)]
99
pub struct State<'a> {
@@ -49,12 +49,7 @@ impl<'a> State<'a> {
4949
}
5050
}
5151

52-
enum SimpleUnit<'a> {
53-
Open,
54-
Close,
55-
Pipe,
56-
GroupName(Atom<'a>),
57-
}
52+
// ---
5853

5954
/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets>
6055
fn parse_capturing_groups<'a>(
@@ -67,18 +62,12 @@ fn parse_capturing_groups<'a>(
6762
// (?=...), (?!...), (?<=...), (?<!...)
6863
let mut num_of_left_capturing_parens = 0;
6964

70-
// Collect capturing group names
71-
let mut group_names: FxHashMap<Atom<'a>, (u32, u32)> = FxHashMap::default();
72-
// At the same time, check duplicates
73-
// If you want to process this most efficiently:
74-
// - define a scope for each Disjunction
75-
// - then check for duplicates for each `|` while inheriting the parent-child relationship
76-
// ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
77-
// However, duplicates are rare in the first place.
78-
// And as long as it works simply, this may be enough.
79-
let mut may_duplicates: FxHashMap<Atom<'a>, DuplicatedNamedCapturingGroupOffsets> =
80-
FxHashMap::default();
81-
let mut simplified: Vec<SimpleUnit<'a>> = vec![];
65+
// Track all named groups with their depth and alternative path
66+
let mut named_groups: Vec<NamedGroupInfo<'a>> = Vec::new();
67+
let mut group_names: FxHashSet<Atom<'a>> = FxHashSet::default();
68+
69+
// Track alternatives and depth
70+
let mut tracker = AlternativeTracker::new();
8271

8372
let mut in_escape = false;
8473
let mut in_character_class = false;
@@ -92,22 +81,22 @@ fn parse_capturing_groups<'a>(
9281
} else if cp == ']' as u32 {
9382
in_character_class = false;
9483
} else if !in_character_class && cp == '|' as u32 {
95-
simplified.push(SimpleUnit::Pipe);
84+
tracker.mark_alternative();
9685
} else if !in_character_class && cp == ')' as u32 {
97-
simplified.push(SimpleUnit::Close);
86+
tracker.exit_group();
9887
} else if !in_character_class && cp == '(' as u32 {
9988
reader.advance();
89+
tracker.enter_group();
10090

101-
simplified.push(SimpleUnit::Open);
102-
103-
// Skip IgnoreGroup
91+
// Check for non-capturing groups and lookarounds
92+
// Note: these still increase depth but don't count as capturing groups
10493
if reader.eat2('?', ':')
105-
// Skip LookAroundAssertion
10694
|| reader.eat2('?', '=')
10795
|| reader.eat2('?', '!')
10896
|| reader.eat3('?', '<', '=')
10997
|| reader.eat3('?', '<', '!')
11098
{
99+
// Non-capturing group or lookaround - depth already incremented
111100
continue;
112101
}
113102

@@ -127,17 +116,29 @@ fn parse_capturing_groups<'a>(
127116

128117
if reader.eat('>') {
129118
let group_name = reader.atom(span_start, span_end);
130-
131-
simplified.push(SimpleUnit::GroupName(group_name));
132-
// Check duplicates later
133-
if let Some(last_span) = group_names.get(&group_name) {
134-
let entry = may_duplicates.entry(group_name).or_default();
135-
entry.push(*last_span);
136-
entry.push((span_start, span_end));
137-
} else {
138-
group_names.insert(group_name, (span_start, span_end));
119+
let alternative_path = tracker.get_alternative_path();
120+
121+
// Check for duplicates with existing groups
122+
for existing in &named_groups {
123+
if existing.name == group_name {
124+
// Check if they can participate together
125+
if !AlternativeTracker::can_participate(
126+
&existing.alternative_path,
127+
&alternative_path,
128+
) {
129+
// True duplicate - return error
130+
return Err(vec![existing.span, (span_start, span_end)]);
131+
}
132+
}
139133
}
140134

135+
named_groups.push(NamedGroupInfo {
136+
name: group_name,
137+
span: (span_start, span_end),
138+
alternative_path,
139+
});
140+
group_names.insert(group_name);
141+
141142
continue;
142143
}
143144
}
@@ -149,63 +150,72 @@ fn parse_capturing_groups<'a>(
149150
reader.advance();
150151
}
151152

152-
// Check duplicates and emit error if exists
153-
if !may_duplicates.is_empty() {
154-
// Check must be done for each group name
155-
for (group_name, spans) in may_duplicates {
156-
let iter = simplified.iter().clone();
153+
Ok((num_of_left_capturing_parens, group_names))
154+
}
157155

158-
let mut alternative_depth = FxHashSet::default();
159-
let mut depth = 0_u32;
160-
let mut is_first = true;
156+
/// Tracks which alternatives at each depth level have been seen.
157+
/// Used to determine if duplicate named groups are in different alternatives.
158+
#[derive(Debug)]
159+
struct AlternativeTracker {
160+
/// Current nesting depth
161+
depth: u32,
162+
/// Current alternative index at each depth level (stack-based)
163+
/// Each level represents the alternative index at that nesting depth
164+
current_alternative: Vec<u32>,
165+
}
161166

162-
'outer: for token in iter {
163-
match token {
164-
SimpleUnit::Open => {
165-
depth += 1;
166-
}
167-
SimpleUnit::Close => {
168-
// May panic if the pattern has invalid, unbalanced parens
169-
depth = depth.saturating_sub(1);
170-
}
171-
SimpleUnit::Pipe => {
172-
if !is_first {
173-
alternative_depth.insert(depth);
174-
}
175-
}
176-
SimpleUnit::GroupName(name) => {
177-
// Check target group name only
178-
if *name != group_name {
179-
continue;
180-
}
181-
// Skip the first one, because it is not duplicated
182-
if is_first {
183-
is_first = false;
184-
continue;
185-
}
167+
impl AlternativeTracker {
168+
fn new() -> Self {
169+
Self { depth: 0, current_alternative: vec![0] }
170+
}
186171

187-
// If left outer `|` is found, both can participate
188-
// `|(?<n>)`
189-
// ^ ^ depth: 1
190-
// ^ depth: 0
191-
for i in (0..depth).rev() {
192-
if alternative_depth.contains(&i) {
193-
// Remove it, next duplicates requires another `|`
194-
alternative_depth.remove(&i);
195-
continue 'outer;
196-
}
197-
}
172+
fn enter_group(&mut self) {
173+
self.depth += 1;
174+
while self.current_alternative.len() <= self.depth as usize {
175+
self.current_alternative.push(0);
176+
}
177+
}
198178

199-
return Err(spans);
200-
}
201-
}
179+
fn exit_group(&mut self) {
180+
if let Some(alt) = self.current_alternative.get_mut(self.depth as usize) {
181+
*alt = 0;
182+
}
183+
self.depth = self.depth.saturating_sub(1);
184+
}
185+
186+
fn mark_alternative(&mut self) {
187+
if let Some(alt) = self.current_alternative.get_mut(self.depth as usize) {
188+
*alt += 1;
189+
}
190+
}
191+
192+
fn get_alternative_path(&self) -> Vec<u32> {
193+
self.current_alternative.iter().take((self.depth + 1) as usize).copied().collect()
194+
}
195+
196+
fn can_participate(alt1: &[u32], alt2: &[u32]) -> bool {
197+
let min_len = alt1.len().min(alt2.len());
198+
// Check as prefixes, if they differ at any level,
199+
// it means they are in different alternatives, so they can participate together.
200+
for i in 0..min_len {
201+
if alt1[i] != alt2[i] {
202+
return true;
202203
}
203204
}
205+
false
204206
}
207+
}
205208

206-
Ok((num_of_left_capturing_parens, group_names.keys().copied().collect()))
209+
/// Tracks information about a named capturing group
210+
#[derive(Debug, Clone)]
211+
struct NamedGroupInfo<'a> {
212+
name: Atom<'a>,
213+
span: (u32, u32),
214+
alternative_path: Vec<u32>,
207215
}
208216

217+
// ---
218+
209219
#[cfg(test)]
210220
mod tests {
211221
use super::*;
@@ -225,6 +235,12 @@ mod tests {
225235
("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
226236
("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
227237
("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
238+
// Test exit_group reset behavior: consecutive groups at same depth
239+
("((?<a>x))((?<b>y))|(?<c>z)", (5, 3)), // 2 outer groups + 2 inner named + 1 named = 5 total
240+
("((?<a>x))|((?<a>y))", (4, 1)), // 2 outer + 2 inner named = 4 total, 1 unique name
241+
// Nested groups with alternatives
242+
("((?<a>x)|((?<a>y)))", (4, 1)), // 1 outer + 1 named + 1 inner + 1 named = 4 total
243+
("(((?<a>x))|((?<b>y)))|(((?<a>z))|((?<b>w)))", (10, 2)), // Complex nesting
228244
] {
229245
let mut reader = Reader::initialize(source_text, true, false).unwrap();
230246

@@ -238,9 +254,15 @@ mod tests {
238254

239255
#[test]
240256
fn duplicated_named_capturing_groups() {
241-
for source_text in
242-
["(?<n>.)(?<n>..)", "(?<n>.(?<n>..))", "|(?<n>.(?<n>..))", "(?<m>.)|(?<n>.(?<n>..))"]
243-
{
257+
for source_text in [
258+
"(?<n>.)(?<n>..)",
259+
"(?<n>.(?<n>..))",
260+
"|(?<n>.(?<n>..))",
261+
"(?<m>.)|(?<n>.(?<n>..))",
262+
// Test consecutive groups with same name in same alternative (should be error)
263+
"((?<a>x))((?<a>y))((?<a>z))",
264+
"(?<n>a)((?<n>b))",
265+
] {
244266
let mut reader = Reader::initialize(source_text, true, false).unwrap();
245267

246268
assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}");

0 commit comments

Comments
 (0)