Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
192 changes: 107 additions & 85 deletions crates/oxc_regular_expression/src/parser/pattern_parser/state.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use oxc_span::Atom;
use rustc_hash::{FxHashMap, FxHashSet};
use rustc_hash::FxHashSet;

use crate::parser::reader::Reader;

/// Currently all of properties are read only from outside of this module.
/// NOTE: Currently all of properties are read-only from outside of this module.
/// Even inside of this module, it is not changed after initialized.
#[derive(Debug)]
pub struct State<'a> {
Expand Down Expand Up @@ -49,12 +49,7 @@ impl<'a> State<'a> {
}
}

/// Minimal token recorded while scanning a pattern in `parse_capturing_groups`.
/// The recorded sequence is replayed afterwards to decide whether groups that
/// share a name are real duplicates.
enum SimpleUnit<'a> {
/// A `(` seen outside a character class (recorded for every kind of group).
Open,
/// A `)` seen outside a character class.
Close,
/// A `|` seen outside a character class.
Pipe,
/// The name of a named capturing group, recorded once its closing `>` is consumed.
GroupName(Atom<'a>),
}
// ---

/// Returns: Result<(num_of_left_parens, capturing_group_names), duplicated_named_capturing_group_offsets>
fn parse_capturing_groups<'a>(
Expand All @@ -67,18 +62,12 @@ fn parse_capturing_groups<'a>(
// (?=...), (?!...), (?<=...), (?<!...)
let mut num_of_left_capturing_parens = 0;

// Collect capturing group names
let mut group_names: FxHashMap<Atom<'a>, (u32, u32)> = FxHashMap::default();
// At the same time, check duplicates
// If you want to process this most efficiently:
// - define a scope for each Disjunction
// - then check for duplicates for each `|` while inheriting the parent-child relationship
// ref. https://source.chromium.org/chromium/chromium/src/+/main:v8/src/regexp/regexp-parser.cc;l=1644
// However, duplicates are rare in the first place.
// And as long as it works simply, this may be enough.
let mut may_duplicates: FxHashMap<Atom<'a>, DuplicatedNamedCapturingGroupOffsets> =
FxHashMap::default();
let mut simplified: Vec<SimpleUnit<'a>> = vec![];
// Track all named groups with their depth and alternative path
let mut named_groups: Vec<NamedGroupInfo<'a>> = Vec::new();
let mut group_names: FxHashSet<Atom<'a>> = FxHashSet::default();

// Track alternatives and depth
let mut tracker = AlternativeTracker::new();

let mut in_escape = false;
let mut in_character_class = false;
Expand All @@ -92,22 +81,22 @@ fn parse_capturing_groups<'a>(
} else if cp == ']' as u32 {
in_character_class = false;
} else if !in_character_class && cp == '|' as u32 {
simplified.push(SimpleUnit::Pipe);
tracker.mark_alternative();
} else if !in_character_class && cp == ')' as u32 {
simplified.push(SimpleUnit::Close);
tracker.exit_group();
} else if !in_character_class && cp == '(' as u32 {
reader.advance();
tracker.enter_group();

simplified.push(SimpleUnit::Open);

// Skip IgnoreGroup
// Check for non-capturing groups and lookarounds
// Note: these still increase depth but don't count as capturing groups
if reader.eat2('?', ':')
// Skip LookAroundAssertion
|| reader.eat2('?', '=')
|| reader.eat2('?', '!')
|| reader.eat3('?', '<', '=')
|| reader.eat3('?', '<', '!')
{
// Non-capturing group or lookaround - depth already incremented
continue;
}

Expand All @@ -127,17 +116,29 @@ fn parse_capturing_groups<'a>(

if reader.eat('>') {
let group_name = reader.atom(span_start, span_end);

simplified.push(SimpleUnit::GroupName(group_name));
// Check duplicates later
if let Some(last_span) = group_names.get(&group_name) {
let entry = may_duplicates.entry(group_name).or_default();
entry.push(*last_span);
entry.push((span_start, span_end));
} else {
group_names.insert(group_name, (span_start, span_end));
let alternative_path = tracker.get_alternative_path();

// Check for duplicates with existing groups
for existing in &named_groups {
if existing.name == group_name {
// Check if they can participate together
if !AlternativeTracker::can_participate(
&existing.alternative_path,
&alternative_path,
) {
// True duplicate - return error
return Err(vec![existing.span, (span_start, span_end)]);
}
}
}

named_groups.push(NamedGroupInfo {
name: group_name,
span: (span_start, span_end),
alternative_path,
});
group_names.insert(group_name);

continue;
}
}
Expand All @@ -149,63 +150,72 @@ fn parse_capturing_groups<'a>(
reader.advance();
}

// Check duplicates and emit error if exists
if !may_duplicates.is_empty() {
// Check must be done for each group name
for (group_name, spans) in may_duplicates {
let iter = simplified.iter().clone();
Ok((num_of_left_capturing_parens, group_names))
}

let mut alternative_depth = FxHashSet::default();
let mut depth = 0_u32;
let mut is_first = true;
/// Tracks the current group nesting depth and, for each depth level,
/// how many `|` have been seen so far at that level.
/// Used to determine if duplicate named groups are in different alternatives.
///
/// The scanner drives it through `enter_group` (on `(`), `exit_group` (on `)`)
/// and `mark_alternative` (on `|`); `get_alternative_path` snapshots the state.
#[derive(Debug)]
struct AlternativeTracker {
/// Current nesting depth; `0` means outside of any group.
/// Incremented by `enter_group`, decremented (saturating at `0`) by `exit_group`.
depth: u32,
/// Current alternative index at each depth level (stack-based)
/// Each level represents the alternative index at that nesting depth.
/// Indexed by depth; starts as `[0]`, grows in `enter_group` and is never shrunk.
/// `exit_group` resets the entry of the level being left back to `0`.
current_alternative: Vec<u32>,
}

'outer: for token in iter {
match token {
SimpleUnit::Open => {
depth += 1;
}
SimpleUnit::Close => {
// May panic if the pattern has invalid, unbalanced parens
depth = depth.saturating_sub(1);
}
SimpleUnit::Pipe => {
if !is_first {
alternative_depth.insert(depth);
}
}
SimpleUnit::GroupName(name) => {
// Check target group name only
if *name != group_name {
continue;
}
// Skip the first one, because it is not duplicated
if is_first {
is_first = false;
continue;
}
impl AlternativeTracker {
/// Creates a tracker positioned at the top level of a pattern:
/// depth `0`, with the top-level alternative index set to `0`.
fn new() -> Self {
    let depth = 0;
    // One slot for depth 0 so that a top-level `|` can be counted right away.
    let current_alternative = Vec::from([0]);
    Self { depth, current_alternative }
}

// If left outer `|` is found, both can participate
// `|(?<n>)`
// ^ ^ depth: 1
// ^ depth: 0
for i in (0..depth).rev() {
if alternative_depth.contains(&i) {
// Remove it, next duplicates requires another `|`
alternative_depth.remove(&i);
continue 'outer;
}
}
/// Records that a `(` was entered: increases the depth by one and makes sure
/// `current_alternative` has a slot for the new depth.
///
/// Existing slots are left untouched; only missing ones are added as `0`.
fn enter_group(&mut self) {
    self.depth += 1;

    let level = self.depth as usize;
    if self.current_alternative.len() <= level {
        // Grow up to and including index `level`, filling new slots with 0.
        self.current_alternative.resize(level + 1, 0);
    }
}

return Err(spans);
}
}
/// Records that a `)` was reached: resets the alternative index of the level
/// being left to `0`, then decreases the depth by one.
///
/// The depth saturates at `0`, so an unbalanced `)` does not underflow.
fn exit_group(&mut self) {
    let level = self.depth as usize;
    if level < self.current_alternative.len() {
        // The next group opened at this depth starts again from alternative 0.
        self.current_alternative[level] = 0;
    }

    self.depth = self.depth.saturating_sub(1);
}

/// Records that a `|` was seen: bumps the alternative index of the current depth.
///
/// Does nothing if there is no slot for the current depth.
fn mark_alternative(&mut self) {
    let level = self.depth as usize;
    if level < self.current_alternative.len() {
        self.current_alternative[level] += 1;
    }
}

/// Snapshots the alternative indices from the top level down to the current depth.
///
/// Returns a freshly allocated copy of the first `depth + 1` entries of
/// `current_alternative` (or of all entries, if there are fewer than that).
fn get_alternative_path(&self) -> Vec<u32> {
    let levels = (self.depth + 1) as usize;
    let end = levels.min(self.current_alternative.len());
    self.current_alternative[..end].to_vec()
}

/// Decides whether two named groups, described by their alternative paths,
/// are allowed to share the same name.
///
/// Both paths are compared level by level over their common length.
/// Returns `true` as soon as they differ at some level: the groups then sit in
/// different alternatives, so at most one of them can take part in a match and
/// reusing the name is fine. Returns `false` when one path is a prefix of the
/// other (or they are equal, or either is empty); the caller treats that as a
/// true duplicate.
///
/// NOTE(review): a path holds only alternative indices, not the identity of the
/// groups it passes through. Sibling groups at the same depth, as in
/// `(?:(?<n>a)|b)(?:c|(?<n>d))`, therefore appear to compare as "different"
/// although both can take part in one match — confirm, and track group identity
/// if so.
fn can_participate(alt1: &[u32], alt2: &[u32]) -> bool {
    // `zip` stops at the shorter path, which gives the common-prefix comparison
    // without manual indexing.
    alt1.iter().zip(alt2).any(|(a, b)| a != b)
}
}

Ok((num_of_left_capturing_parens, group_names.keys().copied().collect()))
/// Tracks information about a named capturing group
/// encountered while scanning a pattern in `parse_capturing_groups`.
/// Each new named group is compared against all previously recorded ones
/// to detect duplicated names.
#[derive(Debug, Clone)]
struct NamedGroupInfo<'a> {
/// Group name, as the atom produced by the reader for `span`.
name: Atom<'a>,
/// `(start, end)` offsets of the name; returned in the error when a duplicate is found.
span: (u32, u32),
/// Snapshot of `AlternativeTracker::get_alternative_path` taken when the group was seen.
alternative_path: Vec<u32>,
}

// ---

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -225,6 +235,12 @@ mod tests {
("(foo)(?<n>bar)(?<nn>baz)", (3, 2)),
("(?<n>.)(?<m>.)|(?<n>..)|(?<m>.)", (4, 2)),
("(?<n>.)(?<m>.)|(?:..)|(?<m>.)", (3, 2)),
// Test exit_group reset behavior: consecutive groups at same depth
("((?<a>x))((?<b>y))|(?<c>z)", (5, 3)), // 2 outer groups + 2 inner named + 1 named = 5 total
("((?<a>x))|((?<a>y))", (4, 1)), // 2 outer + 2 inner named = 4 total, 1 unique name
// Nested groups with alternatives
("((?<a>x)|((?<a>y)))", (4, 1)), // 1 outer + 1 named + 1 inner + 1 named = 4 total
("(((?<a>x))|((?<b>y)))|(((?<a>z))|((?<b>w)))", (10, 2)), // Complex nesting
] {
let mut reader = Reader::initialize(source_text, true, false).unwrap();

Expand All @@ -238,9 +254,15 @@ mod tests {

#[test]
fn duplicated_named_capturing_groups() {
for source_text in
["(?<n>.)(?<n>..)", "(?<n>.(?<n>..))", "|(?<n>.(?<n>..))", "(?<m>.)|(?<n>.(?<n>..))"]
{
for source_text in [
"(?<n>.)(?<n>..)",
"(?<n>.(?<n>..))",
"|(?<n>.(?<n>..))",
"(?<m>.)|(?<n>.(?<n>..))",
// Test consecutive groups with same name in same alternative (should be error)
"((?<a>x))((?<a>y))((?<a>z))",
"(?<n>a)((?<n>b))",
] {
let mut reader = Reader::initialize(source_text, true, false).unwrap();

assert!(parse_capturing_groups(&mut reader).is_err(), "{source_text}");
Expand Down
Loading