From 5881ac3831a2035dc6c11f214615cd2d0477c7b3 Mon Sep 17 00:00:00 2001
From: Tatsuyuki Ishi
Date: Fri, 17 May 2024 23:09:58 +0900
Subject: [PATCH] automata: Fix broken universal start states with sparse DFA

The state IDs were not remapped, which will usually result in an index
out of range error.

Add a test based on is_special_state's doctest, which will validate the
start state's behavior with a custom searcher.
---
 regex-automata/src/dfa/sparse.rs |  8 +++
 regex-automata/tests/dfa/api.rs  | 93 +++++++++++++++++++++++++++++++-
 2 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs
index 46278c181..22c8640c5 100644
--- a/regex-automata/src/dfa/sparse.rs
+++ b/regex-automata/src/dfa/sparse.rs
@@ -1846,6 +1846,14 @@ impl StartTable<Vec<u8>> {
             let new_start_id = remap[dfa.to_index(old_start_id)];
             sl.set_start(anchored, sty, new_start_id);
         }
+        for ustart in [
+            &mut sl.universal_start_unanchored,
+            &mut sl.universal_start_anchored,
+        ] {
+            if let Some(id) = ustart {
+                *id = remap[dfa.to_index(*id)];
+            }
+        }
         Ok(sl)
     }
 }
diff --git a/regex-automata/tests/dfa/api.rs b/regex-automata/tests/dfa/api.rs
index 96e73af6c..2956fc500 100644
--- a/regex-automata/tests/dfa/api.rs
+++ b/regex-automata/tests/dfa/api.rs
@@ -3,7 +3,7 @@ use std::error::Error;
 use regex_automata::{
     dfa::{dense, Automaton, OverlappingState},
     nfa::thompson,
-    HalfMatch, Input, MatchError,
+    Anchored, HalfMatch, Input, MatchError,
 };
 
 // Tests that quit bytes in the forward direction work correctly.
@@ -67,3 +67,94 @@ fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
     assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
     Ok(())
 }
+
+// A variant of [`Automaton::is_special_state`]'s doctest, but with universal start states.
+#[test]
+fn universal_start_search() -> Result<(), Box<dyn Error>> {
+    fn find<A: Automaton>(
+        dfa: &A,
+        haystack: &[u8],
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        let mut state = dfa
+            .universal_start_state(Anchored::No)
+            .expect("regex should not require lookbehind");
+        assert!(dfa.is_start_state(state));
+        let mut last_match = None;
+        // Walk all the bytes in the haystack. We can quit early if we see
+        // a dead or a quit state. The former means the automaton will
+        // never transition to any other state. The latter means that the
+        // automaton entered a condition in which its search failed.
+        for (i, &b) in haystack.iter().enumerate() {
+            state = dfa.next_state(state, b);
+            if dfa.is_special_state(state) {
+                if dfa.is_match_state(state) {
+                    last_match =
+                        Some(HalfMatch::new(dfa.match_pattern(state, 0), i));
+                } else if dfa.is_dead_state(state) {
+                    return Ok(last_match);
+                } else if dfa.is_quit_state(state) {
+                    // It is possible to enter into a quit state after
+                    // observing a match has occurred. In that case, we
+                    // should return the match instead of an error.
+                    if last_match.is_some() {
+                        return Ok(last_match);
+                    }
+                    return Err(MatchError::quit(b, i));
+                }
+                // Implementors may also want to check for start or accel
+                // states and handle them differently for performance
+                // reasons. But it is not necessary for correctness.
+            }
+        }
+        // Matches are always delayed by 1 byte, so we must explicitly walk
+        // the special "EOI" transition at the end of the search.
+        state = dfa.next_eoi_state(state);
+        if dfa.is_match_state(state) {
+            last_match = Some(HalfMatch::new(
+                dfa.match_pattern(state, 0),
+                haystack.len(),
+            ));
+        }
+        Ok(last_match)
+    }
+
+    fn check_impl(
+        dfa: impl Automaton,
+        haystack: &str,
+        pat: usize,
+        offset: usize,
+    ) -> Result<(), Box<dyn Error>> {
+        let haystack = haystack.as_bytes();
+        let mat = find(&dfa, haystack)?.unwrap();
+        assert_eq!(mat.pattern().as_usize(), pat);
+        assert_eq!(mat.offset(), offset);
+        Ok(())
+    }
+
+    fn check(
+        dfa: &dense::DFA<Vec<u32>>,
+        haystack: &str,
+        pat: usize,
+        offset: usize,
+    ) -> Result<(), Box<dyn Error>> {
+        check_impl(dfa, haystack, pat, offset)?;
+        check_impl(dfa.to_sparse()?, haystack, pat, offset)?;
+        Ok(())
+    }
+
+    let dfa = dense::DFA::new(r"[a-z]+")?;
+    let haystack = "123 foobar 4567";
+    check(&dfa, haystack, 0, 10)?;
+
+    let dfa = dense::DFA::new(r"[0-9]{4}")?;
+    let haystack = "123 foobar 4567";
+    check(&dfa, haystack, 0, 15)?;
+
+    let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+    let haystack = "123 foobar 4567";
+    check(&dfa, haystack, 1, 3)?;
+    check(&dfa, &haystack[3..], 0, 7)?;
+    check(&dfa, &haystack[10..], 1, 5)?;
+
+    Ok(())
+}