-
Notifications
You must be signed in to change notification settings - Fork 183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ZeroTrie: Add cursor type for manual iteration and use it in BlobSchemaV2 #4383
Merged
Merged
Changes from 22 commits
Commits
Show all changes
23 commits
Select commit
Hold shift + click to select a range
47ca9c1
Add debug_unwrap helper macro
sffc 92a4207
Clean up varint functions to not double assert
sffc 512ebc6
Change debug_split_at to return a value in the GIGO case
sffc ed19daa
Refactor helpers to be trait-added functions
sffc 3743661
Add as_borrowed_slice and AsRef impl. Need to bikeshed the name.
sffc 46316a0
Initial implementation of ZeroTrie step function
sffc 2d8d8f9
Use ZeroTrie stepping in BlobSchemaV2 to avoid allocations
sffc f272f0e
Merge branch 'main' into zerotrie-step
sffc f9a614e
Add APIs for ZeroTrieSimpleAsciiCursor and use them in BlobSchemaV2
sffc 929f81f
Move around examples and cargo fmt
sffc 79fc890
Return core::fmt::Error instead of asserting ascii
sffc 3a8efa1
Move the core::fmt::Write impl into the zerotrie crate
sffc 838ad05
Don't assert unreachable anymore
sffc 4e70618
Docs for the new core::fmt::Write impl
sffc 8f754cc
Move cursor impls to their own file
sffc bb4d1e7
fmt, clippy
sffc babaf98
Delete the `.head_value()` function in favor of `.cursor().value()`
sffc d5e9e2e
Change .value() to a mutating function and add .peek_value()
sffc fb05eb0
Update provider/blob/src/blob_schema.rs
sffc 5ce1b16
Remove peek_value and impl Clone instead
sffc eeb3cbe
Merge branch 'main' into zerotrie-step
sffc 7017868
value -> take_value
sffc 09b56c0
Docs for internal functions
sffc File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
// This file is part of ICU4X. For terms of use, please see the file | ||
// called LICENSE at the top level of the ICU4X source tree | ||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
|
||
use crate::reader::*; | ||
use crate::ZeroTrieSimpleAscii; | ||
|
||
use core::fmt; | ||
|
||
impl<Store> ZeroTrieSimpleAscii<Store> | ||
where | ||
Store: AsRef<[u8]> + ?Sized, | ||
{ | ||
/// Gets a cursor into the current trie. | ||
/// | ||
/// Useful to query a trie with data that is not a slice. | ||
/// | ||
/// This is currently supported only on `ZeroTrieSimpleAscii`. | ||
/// | ||
/// # Examples | ||
/// | ||
/// Get a value out of a trie by [writing](fmt::Write) it to the cursor: | ||
/// | ||
/// ``` | ||
/// use core::fmt::Write; | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// // Get out the value for "abc" | ||
/// let mut cursor = trie.cursor(); | ||
/// write!(&mut cursor, "abc"); | ||
/// assert_eq!(cursor.take_value(), Some(0)); | ||
/// ``` | ||
/// | ||
/// Find the longest prefix match: | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// // Find the longest prefix of the string "abcdxy": | ||
/// let query = b"abcdxy"; | ||
/// let mut longest_prefix = 0; | ||
/// let mut cursor = trie.cursor(); | ||
/// for (i, b) in query.iter().enumerate() { | ||
/// // Checking is_empty() is not required, but it is | ||
/// // good for efficiency | ||
/// if cursor.is_empty() { | ||
/// break; | ||
/// } | ||
/// if cursor.take_value().is_some() { | ||
/// longest_prefix = i; | ||
/// } | ||
/// cursor.step(*b); | ||
/// } | ||
/// | ||
/// // The longest prefix is "abc" which is length 3: | ||
/// assert_eq!(longest_prefix, 3); | ||
/// ``` | ||
#[inline] | ||
pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { | ||
ZeroTrieSimpleAsciiCursor { | ||
trie: self.as_borrowed_slice(), | ||
} | ||
} | ||
} | ||
|
||
impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { | ||
/// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid | ||
/// having to doubly anchor the trie to the stack. | ||
#[inline] | ||
pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { | ||
ZeroTrieSimpleAsciiCursor { trie: self } | ||
} | ||
} | ||
|
||
/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. | ||
/// | ||
/// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. | ||
// Clone but not Copy: <https://stackoverflow.com/q/32324251/1407170> | ||
#[derive(Debug, Clone)] | ||
pub struct ZeroTrieSimpleAsciiCursor<'a> { | ||
trie: ZeroTrieSimpleAscii<&'a [u8]>, | ||
} | ||
|
||
impl<'a> ZeroTrieSimpleAsciiCursor<'a> { | ||
/// Steps the cursor one byte into the trie. | ||
/// | ||
/// # Examples | ||
/// | ||
/// Unrolled loop checking for string presence at every step: | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// // Search the trie for the string "abcdxy" | ||
/// let mut cursor = trie.cursor(); | ||
/// assert_eq!(cursor.take_value(), None); // "" | ||
/// cursor.step(b'a'); | ||
/// assert_eq!(cursor.take_value(), None); // "a" | ||
/// cursor.step(b'b'); | ||
/// assert_eq!(cursor.take_value(), None); // "ab" | ||
/// cursor.step(b'c'); | ||
/// assert_eq!(cursor.take_value(), Some(0)); // "abc" | ||
/// cursor.step(b'd'); | ||
/// assert_eq!(cursor.take_value(), None); // "abcd" | ||
/// assert!(!cursor.is_empty()); | ||
/// cursor.step(b'x'); // no strings have the prefix "abcdx" | ||
/// assert!(cursor.is_empty()); | ||
/// assert_eq!(cursor.take_value(), None); // "abcdx" | ||
/// cursor.step(b'y'); | ||
/// assert_eq!(cursor.take_value(), None); // "abcdxy" | ||
/// ``` | ||
#[inline] | ||
pub fn step(&mut self, byte: u8) { | ||
step_bsearch_only(&mut self.trie.store, byte) | ||
} | ||
|
||
/// Takes the value at the current position. | ||
/// | ||
/// Calling this function on a new cursor is equivalent to calling `.get()` | ||
/// with the empty string (except that it can only be called once). | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "" and "abc" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); | ||
/// | ||
/// assert_eq!(Some(0), trie.get("")); | ||
/// let mut cursor = trie.cursor(); | ||
/// assert_eq!(Some(0), cursor.take_value()); | ||
/// assert_eq!(None, cursor.take_value()); | ||
/// ``` | ||
#[inline] | ||
pub fn take_value(&mut self) -> Option<usize> { | ||
take_value(&mut self.trie.store) | ||
} | ||
|
||
/// Checks whether the cursor points to an empty trie. | ||
/// | ||
/// Use this to determine when to stop iterating. | ||
#[inline] | ||
pub fn is_empty(&self) -> bool { | ||
self.trie.is_empty() | ||
} | ||
} | ||
|
||
impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { | ||
/// Steps the cursor through each ASCII byte of the string. | ||
/// | ||
/// If the string contains non-ASCII chars, an error is returned. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use core::fmt::Write; | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// let mut cursor = trie.cursor(); | ||
/// cursor.write_str("abcdxy").expect("all ASCII"); | ||
/// cursor.write_str("🚂").expect_err("non-ASCII"); | ||
/// ``` | ||
fn write_str(&mut self, s: &str) -> fmt::Result { | ||
for b in s.bytes() { | ||
if !b.is_ascii() { | ||
return Err(fmt::Error); | ||
} | ||
self.step(b); | ||
} | ||
Ok(()) | ||
} | ||
|
||
/// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns | ||
/// an error if the char is non-ASCII. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use core::fmt::Write; | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// let mut cursor = trie.cursor(); | ||
/// cursor.write_char('a').expect("ASCII"); | ||
/// cursor.write_char('x').expect("ASCII"); | ||
/// cursor.write_char('🚂').expect_err("non-ASCII"); | ||
/// ``` | ||
fn write_char(&mut self, c: char) -> fmt::Result { | ||
if !c.is_ascii() { | ||
return Err(fmt::Error); | ||
} | ||
self.step(c as u8); | ||
Ok(()) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -504,6 +504,80 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option<usize> { | |
} | ||
} | ||
|
||
pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: docs |
||
let (mut b, x, search); | ||
loop { | ||
(b, *trie) = match trie.split_first() { | ||
Some(v) => v, | ||
None => { | ||
// Empty trie or only a value node | ||
return; | ||
} | ||
}; | ||
match byte_type(*b) { | ||
NodeType::Ascii if *b == c => { | ||
// Matched a byte | ||
return; | ||
} | ||
NodeType::Ascii => { | ||
// Byte that doesn't match | ||
*trie = &[]; | ||
return; | ||
} | ||
NodeType::Branch => { | ||
// Proceed to the branch node logic below | ||
(x, *trie) = read_varint_meta2(*b, trie); | ||
break; | ||
} | ||
NodeType::Span => { | ||
// Question: Should we put the trie back into a valid state? | ||
// Currently this code is unreachable so let's not worry about it. | ||
debug_assert!(false, "span nodes not supported in stepping"); | ||
return; | ||
} | ||
NodeType::Value => { | ||
// Skip the value node and go to the next node | ||
(_, *trie) = read_varint_meta3(*b, trie); | ||
continue; | ||
} | ||
}; | ||
} | ||
// Branch node | ||
let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; | ||
// See comment above regarding this assertion | ||
debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); | ||
let w = w & 0x3; | ||
let x = if x == 0 { 256 } else { x }; | ||
// Always use binary search | ||
(search, *trie) = trie.debug_split_at(x); | ||
match search.binary_search(&c) { | ||
Ok(i) => { | ||
// Matched a byte | ||
*trie = if w == 0 { | ||
get_branch_w0(trie, i, x) | ||
} else { | ||
get_branch(trie, i, x, w) | ||
}; | ||
} | ||
Err(_) => { | ||
// Byte that doesn't match | ||
*trie = &[] | ||
} | ||
}; | ||
} | ||
|
||
pub(crate) fn take_value(trie: &mut &[u8]) -> Option<usize> { | ||
let (b, new_trie) = trie.split_first()?; | ||
match byte_type(*b) { | ||
NodeType::Ascii | NodeType::Span | NodeType::Branch => None, | ||
NodeType::Value => { | ||
let x; | ||
(x, *trie) = read_varint_meta3(*b, new_trie); | ||
Some(x) | ||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "alloc")] | ||
use alloc::vec::Vec; | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how does this handle non-ASCII?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#4395