automata: improve sparse DFA validation
This rejiggers some code so that we can more reliably check whether
start state IDs are valid or not.

Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=62726
BurntSushi committed Oct 9, 2023
1 parent 3061b4b commit 6521534
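
Not part of the commit: a minimal fuzz-style sketch of the property this change defends, assuming regex-automata's default features (which include the DFA builder and serialization). It serializes a small sparse DFA, flips one byte at a time, and re-runs deserialization. from_bytes performs full validation, so corrupted input should come back as a DeserializeError (or, rarely, still decode to a fully valid DFA); it must never panic or return a DFA whose start state IDs point outside the transition table, which is what the OSS-Fuzz report exercised.

use regex_automata::dfa::sparse::DFA;

fn main() {
    // Build and serialize a small sparse DFA.
    let dfa = DFA::new(r"[a-z]+[0-9]?").unwrap();
    let bytes = dfa.to_bytes_native_endian();
    // Flip each byte in turn and try to deserialize. Corrupted input
    // should be rejected with an error (or, by luck, still decode to
    // something valid), but it must never panic.
    for i in 0..bytes.len() {
        let mut corrupted = bytes.clone();
        corrupted[i] ^= 0xFF;
        let _ = DFA::<&[u8]>::from_bytes(&corrupted);
    }
}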
Showing 2 changed files with 83 additions and 76 deletions.
The second changed file is a binary file, so its diff is not shown.
159 changes: 83 additions & 76 deletions regex-automata/src/dfa/sparse.rs
@@ -992,8 +992,8 @@ impl<'a> DFA<&'a [u8]> {
// (by trying to decode every state) and start state ID list below. If
// either validation fails, then we return an error.
let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
dfa.tt.validate(&dfa.special)?;
dfa.st.validate(&dfa.special, &dfa.tt)?;
let seen = dfa.tt.validate(&dfa.special)?;
dfa.st.validate(&dfa.special, &seen)?;
// N.B. dfa.special doesn't have a way to do unchecked deserialization,
// so it has already been validated.
Ok((dfa, nread))
@@ -1388,63 +1388,8 @@ impl<T: AsRef<[u8]>> Transitions<T> {
///
/// That is, every state ID can be used to correctly index a state in this
/// table.
fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
// In order to validate everything, we not only need to make sure we
// can decode every state, but that every transition in every state
// points to a valid state. There are many duplicative transitions, so
// we record state IDs that we've verified so that we don't redo the
// decoding work.
//
// Except, when in no_std mode, we don't have dynamic memory allocation
// available to us, so we skip this optimization. It's not clear
// whether doing something more clever is worth it just yet. If you're
// profiling this code and need it to run faster, please file an issue.
//
// OK, so we also use this to record the set of valid state IDs, since
// it is possible for a transition to point to an invalid state ID that
// still (somehow) deserializes to a valid state. So we need to make
// sure our transitions are limited to actually correct state IDs.
// The problem is, I'm not sure how to do this verification step in
// no-std no-alloc mode. I think we'd *have* to store the set of valid
// state IDs in the DFA itself. For now, we don't do this verification
// in no-std no-alloc mode. The worst thing that can happen is an
// incorrect result. But no panics or memory safety problems should
// result, because we still do validate that the state itself is
// "valid" in the sense that everything it points to actually exists.
//
// ---AG
struct Seen {
#[cfg(feature = "alloc")]
set: alloc::collections::BTreeSet<StateID>,
#[cfg(not(feature = "alloc"))]
set: core::marker::PhantomData<StateID>,
}

#[cfg(feature = "alloc")]
impl Seen {
fn new() -> Seen {
Seen { set: alloc::collections::BTreeSet::new() }
}
fn insert(&mut self, id: StateID) {
self.set.insert(id);
}
fn contains(&self, id: &StateID) -> bool {
self.set.contains(id)
}
}

#[cfg(not(feature = "alloc"))]
impl Seen {
fn new() -> Seen {
Seen { set: core::marker::PhantomData }
}
fn insert(&mut self, _id: StateID) {}
fn contains(&self, _id: &StateID) -> bool {
false
}
}

let mut verified: Seen = Seen::new();
fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
let mut verified = Seen::new();
// We need to make sure that we decode the correct number of states.
// Otherwise, an empty set of transitions would validate even if the
// recorded state length is non-empty.
@@ -1521,7 +1466,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
"mismatching sparse state length",
));
}
Ok(())
Ok(verified)
}

/// Converts these transitions to a borrowed value.
@@ -1659,7 +1604,7 @@ impl<T: AsRef<[u8]>> Transitions<T> {
let state = &state[nr..];
if npats == 0 {
return Err(DeserializeError::generic(
"state marked as a match, but has no pattern IDs",
"state marked as a match, but pattern length is zero",
));
}

@@ -1681,6 +1626,21 @@ impl<T: AsRef<[u8]>> Transitions<T> {
} else {
(&[][..], state)
};
if is_match && pattern_ids.is_empty() {
return Err(DeserializeError::generic(
"state marked as a match, but has no pattern IDs",
));
}
if sp.is_match_state(id) && pattern_ids.is_empty() {
return Err(DeserializeError::generic(
"state marked special as a match, but has no pattern IDs",
));
}
if sp.is_match_state(id) != is_match {
return Err(DeserializeError::generic(
"whether state is a match or not is inconsistent",
));
}

// Now read this state's accelerator info. The first byte is the length
// of the accelerator, which is typically 0 (for no acceleration) but
@@ -2061,28 +2021,19 @@ impl<T: AsRef<[u8]>> StartTable<T> {
fn validate(
&self,
sp: &Special,
trans: &Transitions<T>,
seen: &Seen,
) -> Result<(), DeserializeError> {
for (id, _, _) in self.iter() {
if !seen.contains(&id) {
return Err(DeserializeError::generic(
"found invalid start state ID",
));
}
if sp.is_match_state(id) {
return Err(DeserializeError::generic(
"start states cannot be match states",
));
}
// Confirm that the start state points to a valid state.
let state = trans.try_state(sp, id)?;
// And like for the transition table, confirm that the transitions
// on all start states themselves point to a valid state.
//
// It'd probably be better to integrate this validation with the
// transition table, or otherwise store a sorted sequence of all
// valid state IDs in the sparse DFA itself. That way, we could
// check that every pointer to a state corresponds precisely to a
// correct and valid state.
for i in 0..state.ntrans {
let to = state.next_at(i);
let _ = trans.try_state(sp, to)?;
}
}
Ok(())
}
@@ -2537,6 +2488,62 @@ impl<'a> fmt::Debug for StateMut<'a> {
}
}

// In order to validate everything, we not only need to make sure we
// can decode every state, but that every transition in every state
// points to a valid state. There are many duplicative transitions, so
// we record state IDs that we've verified so that we don't redo the
// decoding work.
//
// Except, when in no_std mode, we don't have dynamic memory allocation
// available to us, so we skip this optimization. It's not clear
// whether doing something more clever is worth it just yet. If you're
// profiling this code and need it to run faster, please file an issue.
//
// OK, so we also use this to record the set of valid state IDs, since
// it is possible for a transition to point to an invalid state ID that
// still (somehow) deserializes to a valid state. So we need to make
// sure our transitions are limited to actually correct state IDs.
// The problem is, I'm not sure how to do this verification step in
// no-std no-alloc mode. I think we'd *have* to store the set of valid
// state IDs in the DFA itself. For now, we don't do this verification
// in no-std no-alloc mode. The worst thing that can happen is an
// incorrect result. But no panics or memory safety problems should
// result, because we still do validate that the state itself is
// "valid" in the sense that everything it points to actually exists.
//
// ---AG
#[derive(Debug)]
struct Seen {
#[cfg(feature = "alloc")]
set: alloc::collections::BTreeSet<StateID>,
#[cfg(not(feature = "alloc"))]
set: core::marker::PhantomData<StateID>,
}

#[cfg(feature = "alloc")]
impl Seen {
fn new() -> Seen {
Seen { set: alloc::collections::BTreeSet::new() }
}
fn insert(&mut self, id: StateID) {
self.set.insert(id);
}
fn contains(&self, id: &StateID) -> bool {
self.set.contains(id)
}
}

#[cfg(not(feature = "alloc"))]
impl Seen {
fn new() -> Seen {
Seen { set: core::marker::PhantomData }
}
fn insert(&mut self, _id: StateID) {}
fn contains(&self, _id: &StateID) -> bool {
false
}
}

/*
/// A binary search routine specialized specifically to a sparse DFA state's
/// transitions. Specifically, the transitions are defined as a set of pairs

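As a reading aid only (not part of the commit; the helper names and plain usize state IDs below are hypothetical simplifications), here is a sketch of how the two validation passes cooperate through Seen after this change: the transition-table pass records every state ID it successfully decodes and returns that set, and the start-table pass then rejects any start state ID that is not in it. All decoding work stays in the first pass, so the start-table check reduces to a set-membership test.

use std::collections::BTreeSet;

// Simplified stand-ins; the real code uses regex-automata's StateID and
// DeserializeError types.
type StateID = usize;

#[derive(Default)]
struct Seen {
    set: BTreeSet<StateID>,
}

impl Seen {
    fn insert(&mut self, id: StateID) {
        self.set.insert(id);
    }
    fn contains(&self, id: &StateID) -> bool {
        self.set.contains(id)
    }
}

// Pass 1: decode every state in the transition table, recording the IDs
// that were actually verified. (Decoding and error details elided.)
fn validate_transitions(decoded_ids: &[StateID]) -> Result<Seen, String> {
    let mut verified = Seen::default();
    for &id in decoded_ids {
        verified.insert(id);
    }
    Ok(verified)
}

// Pass 2: every start state ID must refer to a state verified in pass 1.
fn validate_starts(start_ids: &[StateID], seen: &Seen) -> Result<(), String> {
    for id in start_ids {
        if !seen.contains(id) {
            return Err("found invalid start state ID".to_string());
        }
    }
    Ok(())
}

fn main() {
    let seen = validate_transitions(&[0, 5, 9]).unwrap();
    assert!(validate_starts(&[5], &seen).is_ok());
    assert!(validate_starts(&[7], &seen).is_err());
}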