From 438b0e1f534fced3c2c1b45bce90f3326638ceab Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 30 Jun 2023 17:11:00 +0200 Subject: [PATCH 01/31] Squashed version of df11a8d74d7f60e91b2467d0b25328cc739c2426 --- Cargo.lock | 178 +- Cargo.toml | 1 + experimental/zerotrie/Cargo.toml | 70 + experimental/zerotrie/LICENSE | 51 + experimental/zerotrie/README.md | 33 + experimental/zerotrie/benches/overview.rs | 228 ++ experimental/zerotrie/examples/byteph.rs | 70 + .../examples/first_weekday_for_region.rs | 227 ++ .../zerotrie/src/builder/branch_meta.rs | 27 + experimental/zerotrie/src/builder/bytestr.rs | 117 + .../zerotrie/src/builder/konst/builder.rs | 296 +++ .../zerotrie/src/builder/konst/mod.rs | 9 + .../zerotrie/src/builder/konst/store.rs | 294 +++ experimental/zerotrie/src/builder/litemap.rs | 54 + experimental/zerotrie/src/builder/mod.rs | 140 ++ .../zerotrie/src/builder/nonconst/builder.rs | 362 +++ .../zerotrie/src/builder/nonconst/mod.rs | 33 + .../zerotrie/src/builder/nonconst/store.rs | 158 ++ experimental/zerotrie/src/byte_phf/builder.rs | 115 + .../zerotrie/src/byte_phf/cached_owned.rs | 37 + experimental/zerotrie/src/byte_phf/mod.rs | 355 +++ experimental/zerotrie/src/error.rs | 18 + experimental/zerotrie/src/lib.rs | 52 + experimental/zerotrie/src/reader.rs | 454 ++++ experimental/zerotrie/src/serde.rs | 547 ++++ experimental/zerotrie/src/varint.rs | 405 +++ experimental/zerotrie/src/zerotrie.rs | 570 +++++ experimental/zerotrie/tests/asciitrie_test.rs | 73 + experimental/zerotrie/tests/builder_test.rs | 836 +++++++ experimental/zerotrie/tests/data.rs | 2204 +++++++++++++++++ utils/litemap/src/map.rs | 364 ++- utils/litemap/src/store/mod.rs | 9 +- utils/litemap/src/store/slice_impl.rs | 8 + utils/litemap/src/store/vec_impl.rs | 8 + utils/zerovec/src/zerovec/mod.rs | 18 + 35 files changed, 8398 insertions(+), 23 deletions(-) create mode 100644 experimental/zerotrie/Cargo.toml create mode 100644 experimental/zerotrie/LICENSE create mode 100644 experimental/zerotrie/README.md create mode 100644 experimental/zerotrie/benches/overview.rs create mode 100644 experimental/zerotrie/examples/byteph.rs create mode 100644 experimental/zerotrie/examples/first_weekday_for_region.rs create mode 100644 experimental/zerotrie/src/builder/branch_meta.rs create mode 100644 experimental/zerotrie/src/builder/bytestr.rs create mode 100644 experimental/zerotrie/src/builder/konst/builder.rs create mode 100644 experimental/zerotrie/src/builder/konst/mod.rs create mode 100644 experimental/zerotrie/src/builder/konst/store.rs create mode 100644 experimental/zerotrie/src/builder/litemap.rs create mode 100644 experimental/zerotrie/src/builder/mod.rs create mode 100644 experimental/zerotrie/src/builder/nonconst/builder.rs create mode 100644 experimental/zerotrie/src/builder/nonconst/mod.rs create mode 100644 experimental/zerotrie/src/builder/nonconst/store.rs create mode 100644 experimental/zerotrie/src/byte_phf/builder.rs create mode 100644 experimental/zerotrie/src/byte_phf/cached_owned.rs create mode 100644 experimental/zerotrie/src/byte_phf/mod.rs create mode 100644 experimental/zerotrie/src/error.rs create mode 100644 experimental/zerotrie/src/lib.rs create mode 100644 experimental/zerotrie/src/reader.rs create mode 100644 experimental/zerotrie/src/serde.rs create mode 100644 experimental/zerotrie/src/varint.rs create mode 100644 experimental/zerotrie/src/zerotrie.rs create mode 100644 experimental/zerotrie/tests/asciitrie_test.rs create mode 100644 experimental/zerotrie/tests/builder_test.rs 
create mode 100644 experimental/zerotrie/tests/data.rs diff --git a/Cargo.lock b/Cargo.lock index c3b033a2a04..e756699b1f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -151,7 +151,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" name = "bies" version = "0.2.1" dependencies = [ - "criterion", + "criterion 0.4.0", "itertools", "num-traits", "partial-min-max", @@ -360,6 +360,17 @@ dependencies = [ "generic-array", ] +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "bitflags", + "textwrap 0.11.0", + "unicode-width", +] + [[package]] name = "clap" version = "3.2.23" @@ -369,7 +380,7 @@ dependencies = [ "bitflags", "clap_lex 0.2.4", "indexmap", - "textwrap", + "textwrap 0.16.0", ] [[package]] @@ -572,6 +583,32 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "criterion" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +dependencies = [ + "atty", + "cast", + "clap 2.34.0", + "criterion-plot 0.4.5", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + [[package]] name = "criterion" version = "0.4.0" @@ -583,7 +620,7 @@ dependencies = [ "cast", "ciborium", "clap 3.2.23", - "criterion-plot", + "criterion-plot 0.5.0", "itertools", "lazy_static", "num-traits", @@ -598,6 +635,16 @@ dependencies = [ "walkdir", ] +[[package]] +name = "criterion-plot" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +dependencies = [ + "cast", + "itertools", +] + [[package]] name = "criterion-plot" version = "0.5.0" @@ -671,6 +718,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "cxx" version = "1.0.94" @@ -1025,7 +1093,7 @@ dependencies = [ name = "fixed_decimal" version = "0.5.3" dependencies = [ - "criterion", + "criterion 0.4.0", "displaydoc", "getrandom", "icu_benchmark_macros", @@ -1365,7 +1433,7 @@ dependencies = [ name = "icu_calendar" version = "1.2.0" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "icu", @@ -1473,7 +1541,7 @@ version = "1.2.0" dependencies = [ "arraystring", "atoi", - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "icu", @@ -1498,7 +1566,7 @@ version = "0.0.0" name = "icu_collections" version = "1.2.0" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "iai", @@ -1596,7 +1664,7 @@ name = "icu_datetime" version = "1.2.1" dependencies = [ "bincode", - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "either", @@ -1628,7 +1696,7 @@ version = "0.0.0" name = "icu_decimal" version = "1.2.0" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", 
"displaydoc", "fixed_decimal", @@ -1734,7 +1802,7 @@ version = "0.0.0" name = "icu_locid" version = "1.2.0" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "iai", @@ -1753,7 +1821,7 @@ dependencies = [ name = "icu_locid_transform" version = "1.2.1" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "icu", @@ -1779,7 +1847,7 @@ dependencies = [ "arraystring", "arrayvec", "atoi", - "criterion", + "criterion 0.4.0", "databake", "detone", "displaydoc", @@ -1825,7 +1893,7 @@ dependencies = [ name = "icu_plurals" version = "1.2.0" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "fixed_decimal", @@ -1923,7 +1991,7 @@ name = "icu_provider_fs" version = "1.2.1" dependencies = [ "bincode", - "criterion", + "criterion 0.4.0", "crlify", "displaydoc", "icu_benchmark_macros", @@ -1974,7 +2042,7 @@ version = "0.0.0" name = "icu_segmenter" version = "1.2.1" dependencies = [ - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "icu", @@ -2000,7 +2068,7 @@ version = "0.0.0" name = "icu_testdata" version = "1.2.0" dependencies = [ - "criterion", + "criterion 0.4.0", "icu", "icu_calendar", "icu_casemapping", @@ -2150,7 +2218,7 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" name = "ixdtf" version = "0.1.0" dependencies = [ - "criterion", + "criterion 0.4.0", "icu_testdata", "serde-json-core", ] @@ -2228,7 +2296,7 @@ version = "0.7.0" dependencies = [ "bincode", "bytecheck", - "criterion", + "criterion 0.4.0", "icu_benchmark_macros", "icu_locid", "postcard", @@ -2771,6 +2839,26 @@ dependencies = [ "bitflags", ] +[[package]] +name = "ref-cast" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f43faa91b1c8b36841ee70e97188a869d37ae21759da6846d4be66de5bf7b12c" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d2275aab483050ab2a7364c1a46604865ee7d6906684e08db0f090acf74f9e7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.15", +] + [[package]] name = "regex" version = "1.7.3" @@ -3067,6 +3155,16 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + [[package]] name = "serde_derive" version = "1.0.160" @@ -3385,6 +3483,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + [[package]] name = "textwrap" version = "0.16.0" @@ -3489,7 +3596,7 @@ name = "tinystr" version = "0.7.1" dependencies = [ "bincode", - "criterion", + "criterion 0.4.0", "databake", "displaydoc", "postcard", @@ -4350,11 +4457,20 @@ dependencies = [ name = "writeable" version = "0.5.2" dependencies = [ - "criterion", + "criterion 0.4.0", "icu_benchmark_macros", "rand", ] +[[package]] +name = "wyhash" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf6e163c25e3fac820b4b453185ea2dea3b6a3e0a721d4d23d75bd33734c295" +dependencies = [ + "rand_core", +] + [[package]] name = "yoke" version = "0.7.1" @@ -4397,12 +4513,32 @@ dependencies = [ "zerovec", ] 
+[[package]] +name = "zerotrie" +version = "0.1.0" +dependencies = [ + "bincode", + "criterion 0.3.6", + "displaydoc", + "icu_benchmark_macros", + "litemap", + "postcard", + "rand", + "rand_pcg", + "ref-cast", + "serde", + "serde_json", + "t1ha", + "wyhash", + "zerovec", +] + [[package]] name = "zerovec" version = "0.9.4" dependencies = [ "bincode", - "criterion", + "criterion 0.4.0", "databake", "getrandom", "iai", diff --git a/Cargo.toml b/Cargo.toml index 6d031f1a568..97b306f87dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,6 +42,7 @@ members = [ "experimental/ixdtf", "experimental/relativetime", "experimental/relativetime/data", + "experimental/zerotrie", "ffi/capi_cdylib", "ffi/capi_staticlib", "ffi/diplomat", diff --git a/experimental/zerotrie/Cargo.toml b/experimental/zerotrie/Cargo.toml new file mode 100644 index 00000000000..265b28c33a2 --- /dev/null +++ b/experimental/zerotrie/Cargo.toml @@ -0,0 +1,70 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +[package] +name = "zerotrie" +description = "A data structure that efficiently maps strings to integers" +version = "0.1.0" +authors = ["The ICU4X Project Developers"] +edition = "2021" +readme = "README.md" +repository = "https://github.com/unicode-org/icu4x" +license = "Unicode-DFS-2016" +# Keep this in sync with other crates unless there are exceptions +include = [ + "src/**/*", + "examples/**/*", + "benches/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md" +] + +[package.metadata.docs.rs] +all-features = true + +[package.metadata.cargo-all-features] +# Bench feature gets tested separately and is only relevant for CI +denylist = ["bench"] + +[dependencies] +zerovec = { path = "../../utils/zerovec", optional = true } +litemap = { path = "../../utils/litemap", default-features = false, features = ["alloc"], optional = true } +ref-cast = { version = "1.0.12" } +serde = { version = "1.0", optional = true } +t1ha = "0.1" +wyhash = "0.5" +displaydoc = { version = "0.2.3", default-features = false } + +[dev-dependencies] +postcard = { version = "1.0", default-features = false, features = ["alloc"] } +serde = { version = "1.0", default-features = false } +zerovec = { path = "../../utils/zerovec", features = ["serde", "hashmap"] } +litemap = { path = "../../utils/litemap" } +criterion = "0.3" +icu_benchmark_macros = { path = "../../tools/benchmark/macros" } +serde_json = "1.0" +bincode = "1.0" +rand = "0.8" +rand_pcg = "0.3" + +[lib] +bench = false # This option is required for Benchmark CI +path = "src/lib.rs" + +[features] +default = [] +bench = [] +alloc = [] +litemap = ["dep:litemap", "alloc"] +serde = ["dep:serde", "alloc", "litemap/serde", "zerovec?/serde"] + +[[bench]] +name = "overview" +harness = false + +[[test]] +name = "builder_test" +required-features = ["alloc", "litemap"] diff --git a/experimental/zerotrie/LICENSE b/experimental/zerotrie/LICENSE new file mode 100644 index 00000000000..9858d01abf5 --- /dev/null +++ b/experimental/zerotrie/LICENSE @@ -0,0 +1,51 @@ +UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE + +See Terms of Use +for definitions of Unicode Inc.’s Data Files and Software. + +NOTICE TO USER: Carefully read the following legal agreement. 
+BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
+DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
+YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT.
+IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
+THE DATA FILES OR SOFTWARE.
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 1991-2022 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+—
+
+Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
+ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
diff --git a/experimental/zerotrie/README.md b/experimental/zerotrie/README.md
new file mode 100644
index 00000000000..9f75b0602ad
--- /dev/null
+++ b/experimental/zerotrie/README.md
@@ -0,0 +1,33 @@
+# zerotrie [![crates.io](https://img.shields.io/crates/v/zerotrie)](https://crates.io/crates/zerotrie)
+
+A data structure offering zero-copy storage and retrieval of byte strings, with a focus
+on the efficient storage of ASCII strings. Strings are mapped to `usize` values.
+
+[`ZeroTrie`] does not support mutation because doing so would require recomputing the entire
+data structure. Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`].
+
+There are multiple variants of [`ZeroTrie`] optimized for different use cases.
+
+## Examples
+
+```rust
+use zerotrie::ZeroTrie;
+
+let data: &[(&str, usize)] = &[
+    ("abc", 11),
+    ("xyz", 22),
+    ("axyb", 33),
+];
+
+let trie: ZeroTrie<Vec<u8>> = data.iter().copied().collect();
+
+assert_eq!(trie.get("axyb"), Some(33));
+assert_eq!(trie.byte_len(), 18);
+```
+
+[`LiteMap`]: litemap::LiteMap
+[`BTreeMap`]: alloc::collections::BTreeMap
+
+## More Information
+
+For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x).
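+
+## Conversion from `LiteMap`
+
+Because a [`ZeroTrie`] cannot be mutated in place, the usual way to assemble one at
+runtime is to build up a map first and then convert it. Below is a minimal sketch of the
+fallible [`LiteMap`] conversion; it assumes the `litemap` Cargo feature is enabled, and
+the keys must be byte strings (anything implementing `Borrow<[u8]>`):
+
+```rust
+use litemap::LiteMap;
+use zerotrie::ZeroTrie;
+
+let mut map: LiteMap<&[u8], usize> = LiteMap::new_vec();
+map.insert("abc".as_bytes(), 11);
+map.insert("xyz".as_bytes(), 22);
+
+// The conversion returns an error if the trie cannot be built from the given data.
+let trie = ZeroTrie::<Vec<u8>>::try_from(&map).expect("valid data");
+assert_eq!(trie.get("abc"), Some(11));
+```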
diff --git a/experimental/zerotrie/benches/overview.rs b/experimental/zerotrie/benches/overview.rs new file mode 100644 index 00000000000..3b66bc73ccd --- /dev/null +++ b/experimental/zerotrie/benches/overview.rs @@ -0,0 +1,228 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use litemap::LiteMap; +use std::collections::HashMap; +use zerotrie::ZeroTrieExtendedCapacity; +use zerotrie::ZeroTriePerfectHash; +use zerotrie::ZeroTrieSimpleAscii; +use zerovec::ZeroHashMap; +use zerovec::ZeroMap; + +mod testdata { + include!("../tests/data.rs"); +} + +fn get_basic_bench(c: &mut Criterion) { + let mut g = c.benchmark_group("get/basic"); + + // NOTE: All the trie data are the same for basic data + let trie = testdata::basic::TRIE_ASCII; + let data = testdata::basic::DATA_ASCII; + + g.bench_function("SimpleAscii", |b| { + let trie = ZeroTrieSimpleAscii::from_bytes(trie); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&trie).get(key); + assert_eq!(Some(*expected), actual); + } + }); + }); + + g.bench_function("PerfectHash", |b| { + let trie = ZeroTriePerfectHash::from_bytes(trie); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&trie).get(key); + assert_eq!(Some(*expected), actual); + } + }); + }); + + g.bench_function("ExtendedCapacity", |b| { + let trie = ZeroTrieExtendedCapacity::from_bytes(trie); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&trie).get(key); + assert_eq!(Some(*expected), actual); + } + }); + }); + + g.bench_function("ZeroMap/usize", |b| { + let zm: ZeroMap<[u8], usize> = data.iter().copied().collect(); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&zm).get_copied(key); + assert_eq!(Some(*expected), actual); + } + }); + }); + + g.bench_function("ZeroMap/u8", |b| { + let zm: ZeroMap<[u8], u8> = data.iter().map(|(k, v)| (*k, *v as u8)).collect(); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&zm).get_copied(key); + assert_eq!(Some(*expected as u8), actual); + } + }); + }); + + g.bench_function("HashMap", |b| { + let hm: HashMap<&[u8], usize> = data.iter().copied().map(|(a, b)| (a, b)).collect(); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&hm).get(key); + assert_eq!(Some(expected), actual); + } + }); + }); + + g.bench_function("ZeroHashMap/usize", |b| { + let zhm: ZeroHashMap<[u8], usize> = data + .iter() + .copied() + .collect(); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&zhm).get(key); + // No get_copied on ZHM so we need to do it manually + let actual = actual.map(|x| >::zvl_get_as_t(x, |y| *y)); + assert_eq!(Some(*expected), actual); + } + }); + }); + + g.bench_function("ZeroHashMap/u8", |b| { + let zhm: ZeroHashMap<[u8], u8> = data.iter().map(|(k, v)| (*k, *v as u8)).collect(); + b.iter(|| { + for (key, expected) in black_box(data) { + let actual = black_box(&zhm).get(key).copied(); + assert_eq!(Some(*expected as u8), actual); + } + }); + }); +} + +fn get_subtags_bench_medium(c: &mut Criterion) { + let g = c.benchmark_group("get/subtags_10pct"); + + let strings = testdata::short_subtags_10pct::STRINGS; + let litemap = testdata::strings_to_litemap(strings); + + get_subtags_bench_helper(g, strings, 
litemap); +} + +fn get_subtags_bench_large(c: &mut Criterion) { + let g = c.benchmark_group("get/subtags_full"); + + let strings = testdata::short_subtags::STRINGS; + let litemap = testdata::strings_to_litemap(strings); + + get_subtags_bench_helper(g, strings, litemap); +} + +fn get_subtags_bench_helper( + mut g: criterion::BenchmarkGroup, + strings: &[&str], + litemap: LiteMap<&[u8], usize>, +) { + g.bench_function("SimpleAscii", |b| { + let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&trie).get(key); + assert_eq!(Some(i), actual); + } + }); + }); + + g.bench_function("PerfectHash", |b| { + let trie = ZeroTriePerfectHash::try_from(&litemap).unwrap(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&trie).get(key); + assert_eq!(Some(i), actual); + } + }); + }); + + g.bench_function("ExtendedCapacity", |b| { + let trie = ZeroTrieExtendedCapacity::try_from(&litemap).unwrap(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&trie).get(key); + assert_eq!(Some(i), actual); + } + }); + }); + + g.bench_function("ZeroMap/usize", |b| { + let zm: ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&zm).get_copied(key.as_bytes()); + assert_eq!(Some(i), actual); + } + }); + }); + + g.bench_function("ZeroMap/u8", |b| { + let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&zm).get_copied(key.as_bytes()); + assert_eq!(Some(i as u8), actual); + } + }); + }); + + g.bench_function("HashMap", |b| { + let hm: HashMap<&[u8], usize> = litemap.iter().map(|(a, b)| (*a, *b)).collect(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&hm).get(key.as_bytes()); + assert_eq!(Some(&i), actual); + } + }); + }); + + g.bench_function("ZeroHashMap/usize", |b| { + let zhm: ZeroHashMap<[u8], usize> = litemap + .iter() + .map(|(a, b)| (*a, b)) + .collect(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&zhm).get(key.as_bytes()); + // No get_copied on ZHM so we need to do it manually + let actual = actual.map(|x| >::zvl_get_as_t(x, |y| *y)); + assert_eq!(Some(i), actual); + } + }); + }); + + g.bench_function("ZeroHashMap/u8", |b| { + let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect(); + b.iter(|| { + for (i, key) in black_box(strings).iter().enumerate() { + let actual = black_box(&zhm).get(key.as_bytes()).copied(); + assert_eq!(Some(i as u8), actual); + } + }); + }); + + g.finish(); +} + +criterion_group!( + benches, + get_basic_bench, + get_subtags_bench_medium, + get_subtags_bench_large +); +criterion_main!(benches); diff --git a/experimental/zerotrie/examples/byteph.rs b/experimental/zerotrie/examples/byteph.rs new file mode 100644 index 00000000000..79d09741e5f --- /dev/null +++ b/experimental/zerotrie/examples/byteph.rs @@ -0,0 +1,70 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// This example demonstrates the use of AsciiTrie to look up data based on a region code. 
+ +#![no_main] // https://github.com/unicode-org/icu4x/issues/395 +#![allow(unused_labels)] +#![allow(dead_code)] + +icu_benchmark_macros::static_setup!(); + +use zerotrie::byte_phf::*; + +fn print_byte_to_stdout(byte: u8) { + if let Ok(c) = char::try_from(byte) { + if c.is_ascii_alphanumeric() { + print!("'{c}'"); + return; + } + } + print!("0x{byte:X}"); +} + +fn random_alphanums(seed: u64, len: usize) -> Vec { + use rand::seq::SliceRandom; + use rand::SeedableRng; + const BYTES: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed); + BYTES.choose_multiple(&mut rng, len).copied().collect() +} + +#[no_mangle] +fn main(_argc: isize, _argv: *const *const u8) -> isize { + icu_benchmark_macros::main_setup!(); + + // let bytes = b"abdeghi"; + // let bytes = b"abdeghklmopuvxz"; + // let bytes = b"qwertyuiopasdfgh"; + // let bytes = b"qwrtuipadgklzxcbmQWRUOPADHKZVM"; + + let mut p_distr = vec![0; 256]; + for len in 0..256 { + for seed in 0..100 { + let bytes = random_alphanums(seed, len); + // println!("{len} {seed}"); + let (p, _) = find(bytes.as_slice()).unwrap(); + p_distr[p as usize] += 1; + } + } + println!("p_distr: {p_distr:?}"); + + let bytes = random_alphanums(0, 16); + + #[allow(non_snake_case)] + let N = bytes.len(); + + let (p, qq) = find(bytes.as_slice()).unwrap(); + + println!("Results:"); + for byte in bytes.iter() { + print_byte_to_stdout(*byte); + let l1 = f1(*byte, p, N); + let q = qq[l1]; + let l2 = f2(*byte, q, N); + println!(" => l1 {l1} => q {q} => l2 {l2}"); + } + + 0 +} diff --git a/experimental/zerotrie/examples/first_weekday_for_region.rs b/experimental/zerotrie/examples/first_weekday_for_region.rs new file mode 100644 index 00000000000..fc4027be03c --- /dev/null +++ b/experimental/zerotrie/examples/first_weekday_for_region.rs @@ -0,0 +1,227 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// This example demonstrates the use of ZeroTrieSimpleAscii to look up data based on a region code. + +#![no_main] // https://github.com/unicode-org/icu4x/issues/395 +#![allow(dead_code)] + +use zerotrie::ZeroTriePerfectHash; +use zerotrie::ZeroTrieSimpleAscii; + +icu_benchmark_macros::static_setup!(); + +mod weekday { + pub const MON: usize = 1; + pub const FRI: usize = 5; + pub const SAT: usize = 6; + pub const SUN: usize = 7; +} + +// This data originated from CLDR 41. 
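+// Values use the `weekday` constants above: 1 = Monday, 5 = Friday, 6 = Saturday,
+// 7 = Sunday.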
+static DATA: &[(&str, usize)] = &[ + ("001", weekday::MON), + ("AD", weekday::MON), + ("AE", weekday::SAT), + ("AF", weekday::SAT), + ("AG", weekday::SUN), + ("AI", weekday::MON), + ("AL", weekday::MON), + ("AM", weekday::MON), + ("AN", weekday::MON), + ("AR", weekday::MON), + ("AS", weekday::SUN), + ("AT", weekday::MON), + ("AU", weekday::MON), + ("AX", weekday::MON), + ("AZ", weekday::MON), + ("BA", weekday::MON), + ("BD", weekday::SUN), + ("BE", weekday::MON), + ("BG", weekday::MON), + ("BH", weekday::SAT), + ("BM", weekday::MON), + ("BN", weekday::MON), + ("BR", weekday::SUN), + ("BS", weekday::SUN), + ("BT", weekday::SUN), + ("BW", weekday::SUN), + ("BY", weekday::MON), + ("BZ", weekday::SUN), + ("CA", weekday::SUN), + ("CH", weekday::MON), + ("CL", weekday::MON), + ("CM", weekday::MON), + ("CN", weekday::SUN), + ("CO", weekday::SUN), + ("CR", weekday::MON), + ("CY", weekday::MON), + ("CZ", weekday::MON), + ("DE", weekday::MON), + ("DJ", weekday::SAT), + ("DK", weekday::MON), + ("DM", weekday::SUN), + ("DO", weekday::SUN), + ("DZ", weekday::SAT), + ("EC", weekday::MON), + ("EE", weekday::MON), + ("EG", weekday::SAT), + ("ES", weekday::MON), + ("ET", weekday::SUN), + ("FI", weekday::MON), + ("FJ", weekday::MON), + ("FO", weekday::MON), + ("FR", weekday::MON), + ("GB", weekday::MON), + ("GB-alt-variant", weekday::SUN), + ("GE", weekday::MON), + ("GF", weekday::MON), + ("GP", weekday::MON), + ("GR", weekday::MON), + ("GT", weekday::SUN), + ("GU", weekday::SUN), + ("HK", weekday::SUN), + ("HN", weekday::SUN), + ("HR", weekday::MON), + ("HU", weekday::MON), + ("ID", weekday::SUN), + ("IE", weekday::MON), + ("IL", weekday::SUN), + ("IN", weekday::SUN), + ("IQ", weekday::SAT), + ("IR", weekday::SAT), + ("IS", weekday::MON), + ("IT", weekday::MON), + ("JM", weekday::SUN), + ("JO", weekday::SAT), + ("JP", weekday::SUN), + ("KE", weekday::SUN), + ("KG", weekday::MON), + ("KH", weekday::SUN), + ("KR", weekday::SUN), + ("KW", weekday::SAT), + ("KZ", weekday::MON), + ("LA", weekday::SUN), + ("LB", weekday::MON), + ("LI", weekday::MON), + ("LK", weekday::MON), + ("LT", weekday::MON), + ("LU", weekday::MON), + ("LV", weekday::MON), + ("LY", weekday::SAT), + ("MC", weekday::MON), + ("MD", weekday::MON), + ("ME", weekday::MON), + ("MH", weekday::SUN), + ("MK", weekday::MON), + ("MM", weekday::SUN), + ("MN", weekday::MON), + ("MO", weekday::SUN), + ("MQ", weekday::MON), + ("MT", weekday::SUN), + ("MV", weekday::FRI), + ("MX", weekday::SUN), + ("MY", weekday::MON), + ("MZ", weekday::SUN), + ("NI", weekday::SUN), + ("NL", weekday::MON), + ("NO", weekday::MON), + ("NP", weekday::SUN), + ("NZ", weekday::MON), + ("OM", weekday::SAT), + ("PA", weekday::SUN), + ("PE", weekday::SUN), + ("PH", weekday::SUN), + ("PK", weekday::SUN), + ("PL", weekday::MON), + ("PR", weekday::SUN), + ("PT", weekday::SUN), + ("PY", weekday::SUN), + ("QA", weekday::SAT), + ("RE", weekday::MON), + ("RO", weekday::MON), + ("RS", weekday::MON), + ("RU", weekday::MON), + ("SA", weekday::SUN), + ("SD", weekday::SAT), + ("SE", weekday::MON), + ("SG", weekday::SUN), + ("SI", weekday::MON), + ("SK", weekday::MON), + ("SM", weekday::MON), + ("SV", weekday::SUN), + ("SY", weekday::SAT), + ("TH", weekday::SUN), + ("TJ", weekday::MON), + ("TM", weekday::MON), + ("TR", weekday::MON), + ("TT", weekday::SUN), + ("TW", weekday::SUN), + ("UA", weekday::MON), + ("UM", weekday::SUN), + ("US", weekday::SUN), + ("UY", weekday::MON), + ("UZ", weekday::MON), + ("VA", weekday::MON), + ("VE", weekday::SUN), + ("VI", weekday::SUN), + ("VN", weekday::MON), + 
("WS", weekday::SUN), + ("XK", weekday::MON), + ("YE", weekday::SUN), + ("ZA", weekday::SUN), + ("ZW", weekday::SUN), +]; +static TRIE: ZeroTrieSimpleAscii<[u8; 539]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(DATA); + +static TRIE_PHF: ZeroTriePerfectHash<[u8; 567]> = ZeroTriePerfectHash::from_store([ + 225, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 0, 15, 0, + 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 79, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 74, 48, 76, + 78, 77, 80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, + 14, 41, 59, 74, 86, 88, 90, 92, 98, 100, 142, 181, 208, 226, 241, 253, 31, 43, 67, 85, 94, 97, + 121, 136, 178, 65, 134, 196, 69, 79, 83, 85, 1, 2, 3, 129, 129, 129, 129, 201, 65, 68, 69, 71, + 73, 75, 77, 86, 89, 1, 2, 3, 4, 5, 6, 7, 8, 135, 134, 129, 135, 129, 129, 129, 135, 134, 198, + 72, 74, 77, 82, 84, 87, 1, 2, 3, 4, 5, 135, 129, 129, 129, 135, 135, 197, 65, 77, 83, 89, 90, + 1, 2, 3, 4, 129, 135, 135, 129, 129, 196, 65, 69, 73, 78, 1, 2, 3, 129, 135, 135, 129, 83, 135, + 75, 129, 69, 135, 194, 65, 87, 1, 135, 135, 77, 134, 206, 68, 69, 70, 71, 73, 76, 77, 78, 82, + 83, 84, 85, 88, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 129, 134, 134, 135, 129, 129, + 129, 129, 129, 135, 129, 129, 129, 129, 205, 65, 68, 69, 71, 72, 77, 78, 82, 83, 84, 87, 89, + 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 129, 135, 129, 129, 134, 129, 129, 135, 135, 135, + 135, 129, 135, 201, 65, 72, 76, 77, 78, 79, 82, 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 135, 129, 129, + 129, 135, 135, 129, 129, 129, 198, 69, 74, 75, 77, 79, 90, 1, 2, 3, 4, 5, 129, 134, 129, 135, + 135, 134, 197, 67, 69, 71, 83, 84, 1, 2, 3, 4, 129, 129, 134, 129, 135, 196, 73, 74, 79, 82, 1, + 2, 3, 129, 129, 129, 129, 199, 66, 69, 70, 80, 82, 84, 85, 14, 15, 16, 17, 18, 19, 129, 45, 97, + 108, 116, 45, 118, 97, 114, 105, 97, 110, 116, 135, 129, 129, 129, 129, 135, 135, 196, 75, 78, + 82, 85, 1, 2, 3, 135, 135, 129, 129, 200, 68, 69, 76, 78, 81, 82, 83, 84, 1, 2, 3, 4, 5, 6, 7, + 135, 129, 135, 135, 134, 134, 129, 129, 198, 69, 71, 72, 82, 87, 90, 1, 2, 3, 4, 5, 135, 129, + 135, 135, 134, 129, 195, 77, 79, 80, 1, 2, 135, 134, 135, 48, 49, 129, 200, 65, 66, 73, 75, 84, + 85, 86, 89, 1, 2, 3, 4, 5, 6, 7, 135, 129, 129, 129, 129, 129, 129, 134, 197, 73, 76, 79, 80, + 90, 1, 2, 3, 4, 135, 129, 129, 135, 129, 206, 67, 68, 69, 72, 75, 77, 78, 79, 81, 84, 86, 88, + 89, 90, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 129, 129, 129, 135, 129, 135, 129, 135, 129, + 135, 133, 135, 129, 135, 200, 65, 69, 72, 75, 76, 82, 84, 89, 1, 2, 3, 4, 5, 6, 7, 135, 135, + 135, 135, 129, 135, 135, 135, +]); + +fn black_box(dummy: T) -> T { + unsafe { + let ret = std::ptr::read_volatile(&dummy); + std::mem::forget(dummy); + ret + } +} + +#[no_mangle] +fn main(_argc: isize, _argv: *const *const u8) -> isize { + icu_benchmark_macros::main_setup!(); + + // Un-comment to re-generate the bytes (printed to the terminal) + // let trie_phf = DATA.iter().copied().collect::>>(); + // assert_eq!(trie_phf.as_bytes(), TRIE_PHF.as_bytes()); + + if black_box(TRIE_PHF).get(b"MV") == Some(weekday::FRI) { + 0 + } else { + 1 + } +} diff --git a/experimental/zerotrie/src/builder/branch_meta.rs b/experimental/zerotrie/src/builder/branch_meta.rs new file mode 100644 index 00000000000..03dc3087d70 --- /dev/null +++ b/experimental/zerotrie/src/builder/branch_meta.rs @@ -0,0 +1,27 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +/// Intermediate metadata for a branch node under construction. +#[derive(Debug, Clone, Copy)] +pub(crate) struct BranchMeta { + /// The lead byte for this branch. + pub ascii: u8, + /// The size in bytes of the trie data reachable from this branch. + pub local_length: usize, + /// The size in bytes of this and all later sibling branches. + pub cumulative_length: usize, + /// The number of later sibling branches, including this. + pub count: usize, +} + +impl BranchMeta { + pub const fn const_default() -> Self { + BranchMeta { + ascii: 0, + cumulative_length: 0, + local_length: 0, + count: 0, + } + } +} diff --git a/experimental/zerotrie/src/builder/bytestr.rs b/experimental/zerotrie/src/builder/bytestr.rs new file mode 100644 index 00000000000..11e614573fe --- /dev/null +++ b/experimental/zerotrie/src/builder/bytestr.rs @@ -0,0 +1,117 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::borrow::Borrow; + +#[cfg(feature = "serde")] +use alloc::boxed::Box; + +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct ByteStr([u8]); + +impl ByteStr { + pub const fn from_byte_slice_with_value<'a, 'l>( + input: &'l [(&'a [u8], usize)], + ) -> &'l [(&'a ByteStr, usize)] { + // Safety: [u8] and ByteStr have the same layout and invariants + unsafe { core::mem::transmute(input) } + } + + pub const fn from_str_slice_with_value<'a, 'l>( + input: &'l [(&'a str, usize)], + ) -> &'l [(&'a ByteStr, usize)] { + // Safety: str and ByteStr have the same layout, and ByteStr is less restrictive + unsafe { core::mem::transmute(input) } + } + + pub fn from_bytes(input: &[u8]) -> &Self { + // Safety: [u8] and ByteStr have the same layout and invariants + unsafe { core::mem::transmute(input) } + } + + #[cfg(feature = "serde")] + pub fn from_boxed_bytes(input: Box<[u8]>) -> Box { + // Safety: [u8] and ByteStr have the same layout and invariants + unsafe { core::mem::transmute(input) } + } + + #[allow(dead_code)] // may want this in the future + pub fn from_str(input: &str) -> &Self { + Self::from_bytes(input.as_bytes()) + } + + #[allow(dead_code)] // may want this in the future + pub fn empty() -> &'static Self { + Self::from_bytes(&[]) + } + + #[allow(dead_code)] // not used in all features + pub const fn as_bytes(&self) -> &[u8] { + &self.0 + } + + pub const fn len(&self) -> usize { + self.0.len() + } + + #[allow(dead_code)] // not used in all features + pub fn is_all_ascii(&self) -> bool { + for byte in self.0.iter() { + if !byte.is_ascii() { + return false; + } + } + true + } + + #[allow(dead_code)] // may want this in the future + pub(crate) fn byte_at(&self, index: usize) -> Option { + self.0.get(index).copied() + } + + pub(crate) const fn byte_at_or_panic(&self, index: usize) -> u8 { + self.0[index] + } + + pub(crate) const fn is_less_then(&self, other: &Self) -> bool { + let mut i = 0; + while i < self.len() && i < other.len() { + if self.0[i] < other.0[i] { + return true; + } + if self.0[i] > other.0[i] { + return false; + } + i += 1; + } + self.len() < other.len() + } + + pub(crate) const fn prefix_eq(&self, other: &ByteStr, prefix_len: usize) -> bool { + assert!(prefix_len <= self.len()); + assert!(prefix_len <= other.len()); + let mut i = 
0; + while i < prefix_len { + if self.0[i] != other.0[i] { + return false; + } + i += 1; + } + true + } +} + +impl Borrow<[u8]> for ByteStr { + fn borrow(&self) -> &[u8] { + self.as_bytes() + } +} + +#[cfg(feature = "alloc")] +impl Borrow<[u8]> for alloc::boxed::Box { + fn borrow(&self) -> &[u8] { + self.as_bytes() + } +} diff --git a/experimental/zerotrie/src/builder/konst/builder.rs b/experimental/zerotrie/src/builder/konst/builder.rs new file mode 100644 index 00000000000..89df85ca1c0 --- /dev/null +++ b/experimental/zerotrie/src/builder/konst/builder.rs @@ -0,0 +1,296 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use super::super::branch_meta::BranchMeta; +use super::super::bytestr::ByteStr; +use super::store::const_for_each; +use super::store::ConstArrayBuilder; +use super::store::ConstLengthsStack; +use super::store::ConstSlice; +use crate::error::Error; +use crate::varint; + +/// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts. +pub(crate) struct ZeroTrieBuilderConst { + data: ConstArrayBuilder, +} + +impl ZeroTrieBuilderConst { + #[cfg(feature = "litemap")] + pub fn as_bytes(&self) -> &[u8] { + self.data.as_const_slice().as_slice() + } + + pub const fn take_or_panic(self) -> [u8; N] { + self.data.const_take_or_panic() + } + + pub const fn new() -> Self { + Self { + data: ConstArrayBuilder::new_empty([0; N], N), + } + } + + #[must_use] + const fn prepend_ascii(self, ascii: u8) -> (Self, usize) { + if ascii >= 128 { + panic!("Non-ASCII not supported in ZeroTrieSimpleAscii"); + } + let data = self.data.const_push_front_or_panic(ascii); + (Self { data }, 1) + } + + #[must_use] + const fn prepend_value(self, value: usize) -> (Self, usize) { + let mut data = self.data; + let varint_array = varint::write_extended_varint(value); + data = data.const_extend_front_or_panic(varint_array.as_const_slice()); + data = data.const_bitor_assign(0, 0b10000000); + (Self { data }, varint_array.len()) + } + + #[must_use] + const fn prepend_branch(self, value: usize) -> (Self, usize) { + let mut data = self.data; + let varint_array = varint::write_varint(value); + data = data.const_extend_front_or_panic(varint_array.as_const_slice()); + data = data.const_bitor_assign(0, 0b11000000); + (Self { data }, varint_array.len()) + } + + #[must_use] + const fn prepend_slice(self, s: ConstSlice) -> (Self, usize) { + let mut data = self.data; + let mut i = s.len(); + while i > 0 { + data = data.const_push_front_or_panic(*s.get_or_panic(i - 1)); + i -= 1; + } + (Self { data }, s.len()) + } + + #[must_use] + const fn prepend_n_zeros(self, n: usize) -> Self { + let mut data = self.data; + let mut i = 0; + while i < n { + data = data.const_push_front_or_panic(0); + i += 1; + } + Self { data } + } + + const fn bitor_assign_at(self, index: usize, byte: u8) -> Self { + let mut data = self.data; + data = data.const_bitor_assign(index, byte); + Self { data } + } + + /// Panics if the items are not sorted + pub const fn from_tuple_slice<'a, const K: usize>( + items: &[(&'a ByteStr, usize)], + ) -> Result { + let items = ConstSlice::from_slice(items); + let mut prev: Option<&'a ByteStr> = None; + const_for_each!(items, (ascii_str, _), { + match prev { + None => (), + Some(prev) => { + if !prev.is_less_then(ascii_str) { + panic!("Strings in ByteStr constructor are not sorted"); + } + } + }; + prev = Some(ascii_str) + }); + 
Self::from_sorted_const_tuple_slice::(items) + } + + /// Assumes that the items are sorted + pub const fn from_sorted_const_tuple_slice( + items: ConstSlice<(&ByteStr, usize)>, + ) -> Result { + let mut result = Self::new(); + let total_size; + (result, total_size) = result.create_or_panic::(items); + debug_assert!(total_size == result.data.len()); + Ok(result) + } + + #[must_use] + const fn create_or_panic( + mut self, + all_items: ConstSlice<(&ByteStr, usize)>, + ) -> (Self, usize) { + let mut prefix_len = match all_items.last() { + Some(x) => x.0.len(), + // Empty slice: + None => return (Self::new(), 0), + }; + let mut lengths_stack = ConstLengthsStack::::new(); + let mut i = all_items.len() - 1; + let mut j = all_items.len(); + let mut current_len = 0; + loop { + let item_i = all_items.get_or_panic(i); + let item_j = all_items.get_or_panic(j - 1); + assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); + if item_i.0.len() == prefix_len { + let len; + (self, len) = self.prepend_value(item_i.1); + current_len += len; + } + if prefix_len == 0 { + break; + } + prefix_len -= 1; + let mut new_i = i; + let mut new_j = j; + let mut diff_i = 0; + let mut diff_j = 0; + let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len); + let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len); + assert!(ascii_i == ascii_j); + let key_ascii = ascii_i; + loop { + if new_i == 0 { + break; + } + let candidate = all_items.get_or_panic(new_i - 1).0; + if candidate.len() < prefix_len { + // Too short + break; + } + if item_i.0.prefix_eq(candidate, prefix_len) { + new_i -= 1; + } else { + break; + } + if candidate.len() == prefix_len { + // A string of length prefix_len can't be preceded by another with that prefix + break; + } + let candidate = candidate.byte_at_or_panic(prefix_len); + if candidate != ascii_i { + diff_i += 1; + ascii_i = candidate; + } + } + loop { + if new_j == all_items.len() { + break; + } + let candidate = all_items.get_or_panic(new_j).0; + if candidate.len() < prefix_len { + // Too short + break; + } + if item_j.0.prefix_eq(candidate, prefix_len) { + new_j += 1; + } else { + break; + } + if candidate.len() == prefix_len { + panic!("A shorter string should be earlier in the sequence"); + } + let candidate = candidate.byte_at_or_panic(prefix_len); + if candidate != ascii_j { + diff_j += 1; + ascii_j = candidate; + } + } + if diff_i == 0 && diff_j == 0 { + let len; + (self, len) = self.prepend_ascii(ascii_i); + current_len += len; + assert!(i == new_i || i == new_i + 1); + i = new_i; + assert!(j == new_j); + continue; + } + // Branch + if diff_j == 0 { + lengths_stack = lengths_stack.push_or_panic(BranchMeta { + ascii: key_ascii, + cumulative_length: current_len, + local_length: current_len, + count: 1, + }); + } else { + let BranchMeta { + cumulative_length, + count, + .. + } = lengths_stack.peek_or_panic(); + lengths_stack = lengths_stack.push_or_panic(BranchMeta { + ascii: key_ascii, + cumulative_length: cumulative_length + current_len, + local_length: current_len, + count: count + 1, + }); + } + if diff_i != 0 { + j = i; + i -= 1; + prefix_len = all_items.get_or_panic(i).0.len(); + current_len = 0; + continue; + } + // Branch (first) + let (total_length, total_count) = { + let BranchMeta { + cumulative_length, + count, + .. 
+ } = lengths_stack.peek_or_panic(); + (cumulative_length, count) + }; + let branch_metas; + (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count); + let original_keys = branch_metas.map_to_ascii_bytes(); + // Write out the offset table + current_len = total_length; + const USIZE_BITS: usize = core::mem::size_of::() * 8; + let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8; + if w > 3 { + panic!("ZeroTrie capacity exceeded"); + } + let mut k = 0; + while k <= w { + self = self.prepend_n_zeros(total_count - 1); + current_len += total_count - 1; + let mut l = 0; + let mut length_to_write = 0; + while l < total_count { + let BranchMeta { local_length, .. } = *branch_metas + .as_const_slice() + .get_or_panic(total_count - l - 1); + let mut adjusted_length = length_to_write; + let mut m = 0; + while m < k { + adjusted_length >>= 8; + m += 1; + } + if l > 0 { + self = self.bitor_assign_at(l - 1, adjusted_length as u8); + } + l += 1; + length_to_write += local_length; + } + k += 1; + } + assert!(0 < total_count && total_count <= 256); + let branch_value = (w << 8) + (total_count & 0xff); + let slice_len; + (self, slice_len) = self.prepend_slice(original_keys.as_const_slice()); + let branch_len; + (self, branch_len) = self.prepend_branch(branch_value); + current_len += slice_len + branch_len; + i = new_i; + j = new_j; + } + assert!(lengths_stack.is_empty()); + (self, current_len) + } +} diff --git a/experimental/zerotrie/src/builder/konst/mod.rs b/experimental/zerotrie/src/builder/konst/mod.rs new file mode 100644 index 00000000000..275af4bf671 --- /dev/null +++ b/experimental/zerotrie/src/builder/konst/mod.rs @@ -0,0 +1,9 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +mod builder; +mod store; + +pub(crate) use builder::*; +pub(crate) use store::ConstArrayBuilder; diff --git a/experimental/zerotrie/src/builder/konst/store.rs b/experimental/zerotrie/src/builder/konst/store.rs new file mode 100644 index 00000000000..5b6cde31ec8 --- /dev/null +++ b/experimental/zerotrie/src/builder/konst/store.rs @@ -0,0 +1,294 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! This module contains internal collections for the const builder. + +use super::super::branch_meta::BranchMeta; + +/// A const-friendly slice type. 
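+///
+/// Rather than storing a subslice directly, it keeps a reference to the full backing
+/// slice together with `start` and `limit` indices; the const accessors below index
+/// relative to `start`.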
+#[derive(Debug, Copy, Clone)] +pub(crate) struct ConstSlice<'a, T> { + full_slice: &'a [T], + start: usize, + limit: usize, +} + +impl<'a, T> ConstSlice<'a, T> { + pub const fn from_slice(other: &'a [T]) -> Self { + ConstSlice { + full_slice: other, + start: 0, + limit: other.len(), + } + } + + pub const fn from_manual_slice(full_slice: &'a [T], start: usize, limit: usize) -> Self { + ConstSlice { + full_slice, + start, + limit, + } + } + + pub const fn len(&self) -> usize { + self.limit - self.start + } + + pub const fn get_or_panic(&self, index: usize) -> &T { + &self.full_slice[index + self.start] + } + + #[cfg(test)] + pub const fn first(&self) -> Option<&T> { + if self.len() == 0 { + None + } else { + Some(self.get_or_panic(0)) + } + } + + pub const fn last(&self) -> Option<&T> { + if self.len() == 0 { + None + } else { + Some(self.get_or_panic(self.len() - 1)) + } + } + + #[cfg(test)] + pub const fn get_subslice_or_panic( + &self, + new_start: usize, + new_limit: usize, + ) -> ConstSlice<'a, T> { + assert!(new_start <= new_limit); + assert!(new_limit <= self.len()); + ConstSlice { + full_slice: self.full_slice, + start: self.start + new_start, + limit: self.start + new_limit, + } + } + + #[cfg(any(test, feature = "alloc"))] + pub fn as_slice(&self) -> &'a [T] { + &self.full_slice[self.start..self.limit] + } +} + +impl<'a, T> From<&'a [T]> for ConstSlice<'a, T> { + fn from(other: &'a [T]) -> Self { + Self::from_slice(other) + } +} + +/// A const-friendly mutable data structure backed by an array. +#[derive(Debug, Copy, Clone)] +pub(crate) struct ConstArrayBuilder { + full_array: [T; N], + start: usize, + limit: usize, +} + +impl Default for ConstArrayBuilder { + fn default() -> Self { + Self::new_empty([(); N].map(|_| Default::default()), 0) + } +} + +impl ConstArrayBuilder { + pub const fn new_empty(full_array: [T; N], cursor: usize) -> Self { + assert!(cursor <= N); + Self { + full_array, + start: cursor, + limit: cursor, + } + } + + pub const fn from_manual_slice(full_array: [T; N], start: usize, limit: usize) -> Self { + assert!(start <= limit); + assert!(limit <= N); + Self { + full_array, + start, + limit, + } + } + + pub const fn len(&self) -> usize { + self.limit - self.start + } + + #[allow(dead_code)] + pub const fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub const fn as_const_slice(&self) -> ConstSlice { + ConstSlice::from_manual_slice(&self.full_array, self.start, self.limit) + } + + #[cfg(feature = "alloc")] + pub fn as_slice(&self) -> &[T] { + &self.full_array[self.start..self.limit] + } +} + +impl ConstArrayBuilder { + pub const fn const_bitor_assign(mut self, index: usize, other: u8) -> Self { + self.full_array[self.start + index] |= other; + self + } + // Can't be generic because T has a destructor + pub const fn const_take_or_panic(self) -> [u8; N] { + if self.start != 0 || self.limit != N { + panic!("AsciiTrieBuilder buffer too large"); + } + self.full_array + } + // Can't be generic because T has a destructor + pub const fn const_push_front_or_panic(mut self, value: u8) -> Self { + if self.start == 0 { + panic!("AsciiTrieBuilder buffer too small"); + } + self.start -= 1; + self.full_array[self.start] = value; + self + } + // Can't be generic because T has a destructor + pub const fn const_extend_front_or_panic(mut self, other: ConstSlice) -> Self { + if self.start < other.len() { + panic!("AsciiTrieBuilder buffer too small"); + } + self.start -= other.len(); + let mut i = self.start; + const_for_each!(other, byte, { + self.full_array[i] = *byte; + i += 
1; + }); + self + } +} + +impl ConstArrayBuilder { + pub const fn push_front_or_panic(mut self, value: T) -> Self { + if self.start == 0 { + panic!("AsciiTrieBuilder buffer too small"); + } + self.start -= 1; + self.full_array[self.start] = value; + self + } + #[cfg(feature = "alloc")] + pub fn swap_or_panic(mut self, i: usize, j: usize) -> Self { + self.full_array.swap(self.start + i, self.start + j); + self + } +} + +macro_rules! const_for_each { + ($safe_const_slice:expr, $item:tt, $inner:expr) => {{ + let mut i = 0; + while i < $safe_const_slice.len() { + let $item = $safe_const_slice.get_or_panic(i); + $inner; + i += 1; + } + }}; +} + +pub(crate) use const_for_each; + +pub(crate) struct ConstLengthsStack { + data: [Option; N], + idx: usize, +} + +impl core::fmt::Debug for ConstLengthsStack { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.as_slice().fmt(f) + } +} + +impl ConstLengthsStack { + pub const fn new() -> Self { + Self { + data: [None; N], + idx: 0, + } + } + + pub const fn is_empty(&self) -> bool { + self.idx == 0 + } + + #[must_use] + pub const fn push_or_panic(mut self, meta: BranchMeta) -> Self { + if self.idx >= N { + panic!(concat!( + "AsciiTrie Builder: Need more stack (max ", + stringify!(N), + ")" + )); + } + self.data[self.idx] = Some(meta); + self.idx += 1; + self + } + + pub const fn peek_or_panic(&self) -> BranchMeta { + if self.idx == 0 { + panic!("AsciiTrie Builder: Attempted to peek from an empty stack"); + } + self.get_or_panic(0) + } + + const fn get_or_panic(&self, index: usize) -> BranchMeta { + if self.idx <= index { + panic!("AsciiTrie Builder: Attempted to get too deep in a stack"); + } + match self.data[self.idx - index - 1] { + Some(x) => x, + None => unreachable!(), + } + } + + pub const fn pop_many_or_panic( + mut self, + len: usize, + ) -> (Self, ConstArrayBuilder<256, BranchMeta>) { + debug_assert!(len <= 256); + let mut result = ConstArrayBuilder::new_empty([BranchMeta::const_default(); 256], 256); + let mut ix = 0; + loop { + if ix == len { + break; + } + let i = self.idx - ix - 1; + result = result.push_front_or_panic(match self.data[i] { + Some(x) => x, + None => panic!("Not enough items in the ConstLengthsStack"), + }); + ix += 1; + } + self.idx -= len; + (self, result) + } + + fn as_slice(&self) -> &[Option] { + &self.data[0..self.idx] + } +} + +impl ConstArrayBuilder { + pub const fn map_to_ascii_bytes(&self) -> ConstArrayBuilder { + let mut result = ConstArrayBuilder::new_empty([0; N], N); + let self_as_slice = self.as_const_slice(); + const_for_each!(self_as_slice, value, { + result = result.const_push_front_or_panic(value.ascii); + }); + result + } +} diff --git a/experimental/zerotrie/src/builder/litemap.rs b/experimental/zerotrie/src/builder/litemap.rs new file mode 100644 index 00000000000..64a5915303e --- /dev/null +++ b/experimental/zerotrie/src/builder/litemap.rs @@ -0,0 +1,54 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
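+
+//! Conversions from `LiteMap` to the ZeroTrie types.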
+
+use super::konst::*;
+use crate::builder::bytestr::ByteStr;
+use crate::error::Error;
+use crate::zerotrie::ZeroTrieSimpleAscii;
+use crate::ZeroTrie;
+use alloc::borrow::Borrow;
+use alloc::vec::Vec;
+use litemap::LiteMap;
+
+impl ZeroTrieSimpleAscii<Vec<u8>> {
+    #[doc(hidden)]
+    pub fn try_from_litemap_with_const_builder<'a, S>(
+        items: &LiteMap<&'a [u8], usize, S>,
+    ) -> Result<Self, Error>
+    where
+        S: litemap::store::StoreSlice<&'a [u8], usize, Slice = [(&'a [u8], usize)]>,
+    {
+        let tuples = items.as_slice();
+        let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
+        ZeroTrieBuilderConst::<10000>::from_sorted_const_tuple_slice::<100>(byte_str_slice.into())
+            .map(|s| Self {
+                store: s.as_bytes().to_vec(),
+            })
+    }
+}
+
+impl<'a, K, S> TryFrom<&'a LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
+where
+    // Borrow, not AsRef, because we rely on Ord being the same. Unfortunately
+    // this means `LiteMap<&str, usize>` does not work.
+    K: Borrow<[u8]>,
+    S: litemap::store::StoreSlice<K, usize>,
+{
+    type Error = Error;
+    fn try_from(items: &LiteMap<K, usize, S>) -> Result<Self, Error> {
+        let byte_litemap = items.to_borrowed_keys::<[u8], Vec<_>>();
+        let byte_slice = byte_litemap.as_slice();
+        let byte_str_slice = ByteStr::from_byte_slice_with_value(byte_slice);
+        Self::try_from_tuple_slice(byte_str_slice)
+    }
+}
+
+/// TODO: Once const mut references are allowed, we can make this infallible by
+/// calculating the required length, heap-allocating the required capacity, and pointing
+/// ConstAsciiTrieBuilderStore to the heap buffer.
+/// ```compile_fail
+/// // error[E0658]: mutable references are not allowed in constant functions
+/// const fn write_to_mut_buffer(buf: &mut [u8]) { buf[0] = 0; }
+/// ```
+const _: () = ();
diff --git a/experimental/zerotrie/src/builder/mod.rs b/experimental/zerotrie/src/builder/mod.rs
new file mode 100644
index 00000000000..867fe4b5e2f
--- /dev/null
+++ b/experimental/zerotrie/src/builder/mod.rs
@@ -0,0 +1,140 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+mod branch_meta;
+pub(crate) mod bytestr;
+pub(crate) mod konst;
+#[cfg(feature = "litemap")]
+mod litemap;
+#[cfg(feature = "alloc")]
+pub(crate) mod nonconst;
+
+use bytestr::ByteStr;
+
+use super::ZeroTrieSimpleAscii;
+
+impl<const N: usize> ZeroTrieSimpleAscii<[u8; N]> {
+    /// **Const Constructor:** Creates a [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values.
+    ///
+    /// This function needs to know the exact length of the resulting trie at compile time.
+    ///
+    /// Also see [`Self::from_sorted_str_tuples`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if `items` is not sorted or if `N` is not correct.
+ /// + /// # Examples + /// + /// Create a `const` ZeroTrieSimpleAscii at compile time: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // The required capacity for this trie happens to be 17 bytes + /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ + /// (b"bar", 2), + /// (b"bazzoo", 3), + /// (b"foo", 1), + /// ]); + /// + /// assert_eq!(TRIE.get(b"foo"), Some(1)); + /// assert_eq!(TRIE.get(b"bar"), Some(2)); + /// assert_eq!(TRIE.get(b"bazzoo"), Some(3)); + /// assert_eq!(TRIE.get(b"unknown"), None); + /// ``` + /// + /// Panics if strings are not sorted: + /// + /// ```compile_fail + /// # use zerotrie::ZeroTrieSimpleAscii; + /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ + /// (b"foo", 1), + /// (b"bar", 2), + /// (b"bazzoo", 3), + /// ]); + /// ``` + /// + /// Panics if capacity is too small: + /// + /// ```compile_fail + /// # use zerotrie::ZeroTrieSimpleAscii; + /// const TRIE: ZeroTrieSimpleAscii<[u8; 15]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ + /// (b"bar", 2), + /// (b"bazzoo", 3), + /// (b"foo", 1), + /// ]); + /// ``` + /// + /// Panics if capacity is too large: + /// + /// ```compile_fail + /// # use zerotrie::{ZeroTrieSimpleAscii, AsciiStr}; + /// const TRIE: ZeroTrieSimpleAscii<[u8; 20]> = ZeroTrieSimpleAscii::from_sorted_u8_tuples(&[ + /// (b"bar", 2), + /// (b"bazzoo", 3), + /// (b"foo", 1), + /// ]); + /// ``` + pub const fn from_sorted_u8_tuples(tuples: &[(&[u8], usize)]) -> Self { + use konst::*; + let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples); + let result = ZeroTrieBuilderConst::::from_tuple_slice::<100>(byte_str_slice); + match result { + Ok(s) => Self::from_store(s.take_or_panic()), + Err(_) => panic!("Failed to build ZeroTrie"), + } + } + + /// **Const Constructor:** Creates an [`ZeroTrieSimpleAscii`] from a sorted slice of keys and values. + /// + /// This function needs to know the exact length of the resulting trie at compile time. + /// + /// Also see [`Self::from_sorted_u8_tuples`]. + /// + /// # Panics + /// + /// Panics if `items` is not sorted, if `N` is not correct, or if any of the strings contain + /// non-ASCII characters. 
+ /// + /// # Examples + /// + /// Create a `const` ZeroTrieSimpleAscii at compile time: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // The required capacity for this trie happens to be 17 bytes + /// const TRIE: ZeroTrieSimpleAscii<[u8; 17]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&[ + /// ("bar", 2), + /// ("bazzoo", 3), + /// ("foo", 1), + /// ]); + /// + /// assert_eq!(TRIE.get(b"foo"), Some(1)); + /// assert_eq!(TRIE.get(b"bar"), Some(2)); + /// assert_eq!(TRIE.get(b"bazzoo"), Some(3)); + /// assert_eq!(TRIE.get(b"unknown"), None); + /// ``` + /// + /// Panics if the strings are not ASCII: + /// + /// ```compile_fail + /// # use zerotrie::ZeroTrieSimpleAscii; + /// const TRIE: ZeroTrieSimpleAscii<[u8; 100]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&[ + /// ("bár", 2), + /// ("båzzöo", 3), + /// ("foo", 1), + /// ]); + /// ``` + pub const fn from_sorted_str_tuples(tuples: &[(&str, usize)]) -> Self { + use konst::*; + let byte_str_slice = ByteStr::from_str_slice_with_value(tuples); + let result = ZeroTrieBuilderConst::::from_tuple_slice::<100>(byte_str_slice); + match result { + Ok(s) => Self::from_store(s.take_or_panic()), + Err(_) => panic!("Failed to build ZeroTrie"), + } + } +} diff --git a/experimental/zerotrie/src/builder/nonconst/builder.rs b/experimental/zerotrie/src/builder/nonconst/builder.rs new file mode 100644 index 00000000000..fd1be2b16a7 --- /dev/null +++ b/experimental/zerotrie/src/builder/nonconst/builder.rs @@ -0,0 +1,362 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use super::super::branch_meta::BranchMeta; +use super::store::NonConstLengthsStack; +use super::store::TrieBuilderStore; +use crate::builder::bytestr::ByteStr; +use crate::byte_phf::PerfectByteHashMapCacheOwned; +use crate::error::Error; +use crate::varint; +use alloc::vec::Vec; + +/// Whether to use the perfect hash function in the ZeroTrie. +pub enum PhfMode { + /// Use binary search for all branch nodes. + BinaryOnly, + /// Use the perfect hash function for large branch nodes. + UsePhf, +} + +/// Whether to support non-ASCII data in the ZeroTrie. +pub enum AsciiMode { + /// Support only ASCII, returning an error if non-ASCII is found. + AsciiOnly, + /// Support all data, creating span nodes for non-ASCII bytes. + BinarySpans, +} + +/// Whether to enforce a limit to the capacity of the ZeroTrie. +pub enum CapacityMode { + /// Return an error if the trie requires a branch of more than 2^32 bytes. + Normal, + /// Construct the trie without returning an error. + Extended, +} + +pub struct ZeroTrieBuilderOptions { + pub phf_mode: PhfMode, + pub ascii_mode: AsciiMode, + pub capacity_mode: CapacityMode, +} + +/// A low-level builder for ZeroTrie. Supports all options. 
+pub(crate) struct ZeroTrieBuilder { + data: S, + phf_cache: PerfectByteHashMapCacheOwned, + options: ZeroTrieBuilderOptions, +} + +impl ZeroTrieBuilder { + pub fn to_bytes(&self) -> Vec { + self.data.atbs_to_bytes() + } + + fn prepend_ascii(&mut self, ascii: u8) -> Result { + if ascii <= 127 { + self.data.atbs_push_front(ascii); + Ok(1) + } else if matches!(self.options.ascii_mode, AsciiMode::BinarySpans) { + if let Some(old_front) = self.data.atbs_split_first() { + let old_byte_len = self.data.atbs_len() + 1; + if old_front & 0b11100000 == 0b10100000 { + // Extend an existing span + // Unwrap OK: there is a varint at this location in the buffer + #[allow(clippy::unwrap_used)] + let old_span_size = + varint::try_read_extended_varint_from_tstore(old_front, &mut self.data) + .unwrap(); + self.data.atbs_push_front(ascii); + let varint_array = varint::write_extended_varint(old_span_size + 1); + self.data.atbs_extend_front(varint_array.as_slice()); + self.data.atbs_bitor_assign(0, 0b10100000); + let new_byte_len = self.data.atbs_len(); + return Ok(new_byte_len - old_byte_len); + } else { + self.data.atbs_push_front(old_front); + } + } + // Create a new span + self.data.atbs_push_front(ascii); + self.data.atbs_push_front(0b10100001); + Ok(2) + } else { + Err(Error::NonAsciiError) + } + } + + #[must_use] + fn prepend_value(&mut self, value: usize) -> usize { + let varint_array = varint::write_extended_varint(value); + self.data.atbs_extend_front(varint_array.as_slice()); + self.data.atbs_bitor_assign(0, 0b10000000); + varint_array.len() + } + + #[must_use] + fn prepend_branch(&mut self, value: usize) -> usize { + let varint_array = varint::write_varint(value); + self.data.atbs_extend_front(varint_array.as_slice()); + self.data.atbs_bitor_assign(0, 0b11000000); + varint_array.len() + } + + #[must_use] + fn prepend_slice(&mut self, s: &[u8]) -> usize { + self.data.atbs_extend_front(s); + s.len() + } + + /// Builds a ZeroTrie from an iterator of bytes. It first collects and sorts the iterator. + pub fn from_bytes_iter, I: IntoIterator>( + iter: I, + options: ZeroTrieBuilderOptions, + ) -> Result { + let items = Vec::<(K, usize)>::from_iter(iter); + let mut items = items + .iter() + .map(|(k, v)| (k.as_ref(), *v)) + .collect::>(); + items.sort(); + let ascii_str_slice = items.as_slice(); + let byte_str_slice = ByteStr::from_byte_slice_with_value(ascii_str_slice); + Self::from_sorted_tuple_slice(byte_str_slice, options) + } + + /// Builds a ZeroTrie with the given items and options. Assumes that the items are sorted. + /// + /// # Panics + /// + /// May panic if the items are not sorted. 
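
A concrete illustration of the span nodes that `prepend_ascii` above emits in `BinarySpans` mode, before moving on to `from_sorted_tuple_slice` (a sketch only; the marker and length bits are copied from that code):

```rust
// A single non-ASCII byte (e.g. 0xC3) prepended under AsciiMode::BinarySpans becomes a
// two-byte span node: a lead byte carrying the span marker plus the length, then the raw byte.
let span_node: [u8; 2] = [0b1010_0001, 0xC3];
assert_eq!(span_node[0] & 0b1110_0000, 0b1010_0000); // top bits: span marker
assert_eq!(span_node[0] & 0b0000_1111, 1); // low nibble: span length (extended varint, no extender)
```
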
+ pub fn from_sorted_tuple_slice( + items: &[(&ByteStr, usize)], + options: ZeroTrieBuilderOptions, + ) -> Result { + let mut result = Self { + data: S::atbs_new_empty(), + phf_cache: PerfectByteHashMapCacheOwned::new_empty(), + options, + }; + let total_size = result.create(items)?; + debug_assert!(total_size == result.data.atbs_len()); + Ok(result) + } + + #[allow(clippy::unwrap_used)] // lots of indexing, but all indexes should be in range + fn create(&mut self, all_items: &[(&ByteStr, usize)]) -> Result { + let mut prefix_len = match all_items.last() { + Some(x) => x.0.len(), + // Empty slice: + None => return Ok(0), + }; + let mut lengths_stack = NonConstLengthsStack::new(); + let mut i = all_items.len() - 1; + let mut j = all_items.len(); + let mut current_len = 0; + loop { + let item_i = all_items.get(i).unwrap(); + let item_j = all_items.get(j - 1).unwrap(); + assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); + if item_i.0.len() == prefix_len { + let len = self.prepend_value(item_i.1); + current_len += len; + } + if prefix_len == 0 { + break; + } + prefix_len -= 1; + let mut new_i = i; + let mut new_j = j; + let mut diff_i = 0; + let mut diff_j = 0; + let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len); + let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len); + assert_eq!(ascii_i, ascii_j); + let key_ascii = ascii_i; + loop { + if new_i == 0 { + break; + } + let candidate = all_items.get(new_i - 1).unwrap().0; + if candidate.len() < prefix_len { + // Too short + break; + } + if item_i.0.prefix_eq(candidate, prefix_len) { + new_i -= 1; + } else { + break; + } + if candidate.len() == prefix_len { + // A string of length prefix_len can't be preceded by another with that prefix + break; + } + let candidate = candidate.byte_at_or_panic(prefix_len); + if candidate != ascii_i { + diff_i += 1; + ascii_i = candidate; + } + } + loop { + if new_j == all_items.len() { + break; + } + let candidate = all_items.get(new_j).unwrap().0; + if candidate.len() < prefix_len { + // Too short + break; + } + if item_j.0.prefix_eq(candidate, prefix_len) { + new_j += 1; + } else { + break; + } + if candidate.len() == prefix_len { + panic!("A shorter string should be earlier in the sequence"); + } + let candidate = candidate.byte_at_or_panic(prefix_len); + if candidate != ascii_j { + diff_j += 1; + ascii_j = candidate; + } + } + if diff_i == 0 && diff_j == 0 { + let len = self.prepend_ascii(ascii_i)?; + current_len += len; + assert!(i == new_i || i == new_i + 1); + i = new_i; + assert_eq!(j, new_j); + continue; + } + // Branch + if diff_j == 0 { + lengths_stack.push(BranchMeta { + ascii: key_ascii, + cumulative_length: current_len, + local_length: current_len, + count: 1, + }); + } else { + let BranchMeta { + cumulative_length, + count, + .. + } = lengths_stack.peek_or_panic(); + lengths_stack.push(BranchMeta { + ascii: key_ascii, + cumulative_length: cumulative_length + current_len, + local_length: current_len, + count: count + 1, + }); + } + if diff_i != 0 { + j = i; + i -= 1; + prefix_len = all_items.get(i).unwrap().0.len(); + current_len = 0; + continue; + } + // Branch (first) + // std::println!("lengths_stack: {lengths_stack:?}"); + let (total_length, total_count) = { + let BranchMeta { + cumulative_length, + count, + .. 
+ } = lengths_stack.peek_or_panic(); + (cumulative_length, count) + }; + let mut branch_metas = lengths_stack.pop_many_or_panic(total_count); + let original_keys = branch_metas.map_to_ascii_bytes(); + let use_phf = matches!(self.options.phf_mode, PhfMode::UsePhf); + let opt_phf_vec = if total_count > 15 && use_phf { + let phf_vec = self + .phf_cache + .try_get_or_insert(original_keys.as_const_slice().as_slice().to_vec())?; + // Put everything in order via bubble sort + // Note: branch_metas is stored in reverse order (0 = last element) + loop { + let mut l = total_count - 1; + let mut changes = 0; + let mut start = 0; + while l > 0 { + let a = *branch_metas.as_const_slice().get_or_panic(l); + let b = *branch_metas.as_const_slice().get_or_panic(l - 1); + let a_idx = phf_vec.keys().iter().position(|x| x == &a.ascii).unwrap(); + let b_idx = phf_vec.keys().iter().position(|x| x == &b.ascii).unwrap(); + if a_idx > b_idx { + // std::println!("{a:?} <=> {b:?} ({phf_vec:?})"); + self.data.atbs_swap_ranges( + start, + start + a.local_length, + start + a.local_length + b.local_length, + ); + branch_metas = branch_metas.swap_or_panic(l - 1, l); + start += b.local_length; + changes += 1; + // FIXME: fix the `length` field + } else { + start += a.local_length; + } + l -= 1; + } + if changes == 0 { + break; + } + } + Some(phf_vec) + } else { + None + }; + // Write out the offset table + current_len = total_length; + const USIZE_BITS: usize = core::mem::size_of::() * 8; + let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8; + if w > 3 && matches!(self.options.capacity_mode, CapacityMode::Normal) { + return Err(Error::CapacityExceeded); + } + let mut k = 0; + while k <= w { + self.data.atbs_prepend_n_zeros(total_count - 1); + current_len += total_count - 1; + let mut l = 0; + let mut length_to_write = 0; + while l < total_count { + let BranchMeta { local_length, .. } = *branch_metas + .as_const_slice() + .get_or_panic(total_count - l - 1); + let mut adjusted_length = length_to_write; + let mut m = 0; + while m < k { + adjusted_length >>= 8; + m += 1; + } + if l > 0 { + self.data.atbs_bitor_assign(l - 1, adjusted_length as u8); + } + l += 1; + length_to_write += local_length; + } + k += 1; + } + // Write out the lookup table + assert!(0 < total_count && total_count <= 256); + let branch_value = (w << 8) + (total_count & 0xff); + if let Some(phf_vec) = opt_phf_vec { + self.data.atbs_extend_front(phf_vec.as_bytes()); + let phf_len = phf_vec.as_bytes().len(); + let branch_len = self.prepend_branch(branch_value); + current_len += phf_len + branch_len; + } else { + let search_len = self.prepend_slice(original_keys.as_slice()); + let branch_len = self.prepend_branch(branch_value); + current_len += search_len + branch_len; + } + i = new_i; + j = new_j; + } + assert!(lengths_stack.is_empty()); + Ok(current_len) + } +} diff --git a/experimental/zerotrie/src/builder/nonconst/mod.rs b/experimental/zerotrie/src/builder/nonconst/mod.rs new file mode 100644 index 00000000000..77ccec0a56f --- /dev/null +++ b/experimental/zerotrie/src/builder/nonconst/mod.rs @@ -0,0 +1,33 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
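
Before the per-type presets that follow in this module, a sketch of how the options and builder above might be combined (illustrative only; `ZeroTrieBuilder` is crate-internal, and the bounds on `from_bytes_iter`, `K: AsRef<[u8]>` with a `TrieBuilderStore` backing store, are assumed from its body):

```rust
// Illustrative only: drive the non-const builder with explicit options, backed by a VecDeque.
use alloc::collections::VecDeque;

let options = ZeroTrieBuilderOptions {
    phf_mode: PhfMode::UsePhf,
    ascii_mode: AsciiMode::BinarySpans,
    capacity_mode: CapacityMode::Normal,
};
let builder = ZeroTrieBuilder::<VecDeque<u8>>::from_bytes_iter(
    [(&b"abc"[..], 11usize), (&b"xyz"[..], 22)],
    options,
)
.expect("small tries should build");
let trie_bytes = builder.to_bytes(); // final trie as a Vec<u8>
```
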
+ +mod builder; +mod store; + +pub(crate) use builder::*; +pub(crate) use store::TrieBuilderStore; + +impl crate::ZeroTrieSimpleAscii { + pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::BinaryOnly, + ascii_mode: AsciiMode::AsciiOnly, + capacity_mode: CapacityMode::Normal, + }; +} + +impl crate::ZeroTriePerfectHash { + pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::UsePhf, + ascii_mode: AsciiMode::BinarySpans, + capacity_mode: CapacityMode::Normal, + }; +} + +impl crate::ZeroTrieExtendedCapacity { + pub(crate) const BUILDER_OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions { + phf_mode: PhfMode::UsePhf, + ascii_mode: AsciiMode::BinarySpans, + capacity_mode: CapacityMode::Extended, + }; +} diff --git a/experimental/zerotrie/src/builder/nonconst/store.rs b/experimental/zerotrie/src/builder/nonconst/store.rs new file mode 100644 index 00000000000..d86f96c3967 --- /dev/null +++ b/experimental/zerotrie/src/builder/nonconst/store.rs @@ -0,0 +1,158 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! This module contains internal collections for the non-const builder. + +use super::super::branch_meta::BranchMeta; +use super::super::konst::ConstArrayBuilder; +use alloc::collections::VecDeque; +use alloc::vec::Vec; + +pub trait TrieBuilderStore { + fn atbs_new_empty() -> Self; + fn atbs_len(&self) -> usize; + fn atbs_push_front(&mut self, byte: u8); + fn atbs_extend_front(&mut self, other: &[u8]); + fn atbs_to_bytes(&self) -> Vec; + fn atbs_bitor_assign(&mut self, index: usize, other: u8); + fn atbs_swap_ranges(&mut self, start: usize, mid: usize, limit: usize); + fn atbs_split_first(&mut self) -> Option; + + fn atbs_prepend_n_zeros(&mut self, n: usize) { + let mut i = 0; + while i < n { + self.atbs_push_front(0); + i += 1; + } + } +} + +impl TrieBuilderStore for VecDeque { + fn atbs_new_empty() -> Self { + VecDeque::new() + } + fn atbs_len(&self) -> usize { + self.len() + } + fn atbs_push_front(&mut self, byte: u8) { + self.push_front(byte); + } + fn atbs_extend_front(&mut self, other: &[u8]) { + // TODO: No extend_front on VecDeque? 
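+        // Pushing the bytes in reverse order preserves `other`'s original order
+        // at the front of the deque.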
+ self.reserve(other.len()); + for b in other.iter().rev() { + self.push_front(*b); + } + } + fn atbs_to_bytes(&self) -> Vec { + let mut v = Vec::with_capacity(self.len()); + let (a, b) = self.as_slices(); + v.extend(a); + v.extend(b); + v + } + fn atbs_bitor_assign(&mut self, index: usize, other: u8) { + self[index] |= other; + } + fn atbs_swap_ranges(&mut self, mut start: usize, mut mid: usize, mut limit: usize) { + if start > mid || mid > limit { + panic!("Invalid args to atbs_swap_ranges(): start > mid || mid > limit"); + } + if limit > self.len() { + panic!( + "Invalid args to atbs_swap_ranges(): limit out of range: {limit} > {}", + self.len() + ); + } + loop { + if start == mid || mid == limit { + return; + } + let len0 = mid - start; + let len1 = limit - mid; + let mut i = start; + let mut j = limit - core::cmp::min(len0, len1); + while j < limit { + self.swap(i, j); + i += 1; + j += 1; + } + if len0 < len1 { + mid = start + len0; + limit -= len0; + } else { + start += len1; + mid = limit - len1; + } + } + } + fn atbs_split_first(&mut self) -> Option { + self.pop_front() + } +} + +pub(crate) struct NonConstLengthsStack { + data: Vec, +} + +impl core::fmt::Debug for NonConstLengthsStack { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.as_slice().fmt(f) + } +} + +impl NonConstLengthsStack { + pub const fn new() -> Self { + Self { data: Vec::new() } + } + + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + pub fn push(&mut self, meta: BranchMeta) { + self.data.push(meta); + } + + pub fn peek_or_panic(&self) -> BranchMeta { + *self.data.last().unwrap() + } + + pub fn pop_many_or_panic(&mut self, len: usize) -> ConstArrayBuilder<256, BranchMeta> { + debug_assert!(len <= 256); + let mut result = ConstArrayBuilder::new_empty([BranchMeta::const_default(); 256], 256); + let mut ix = 0; + loop { + if ix == len { + break; + } + let i = self.data.len() - ix - 1; + // Won't panic because len <= 256 + result = result.push_front_or_panic(match self.data.get(i) { + Some(x) => *x, + None => panic!("Not enough items in the ConstLengthsStack"), + }); + ix += 1; + } + self.data.truncate(self.data.len() - len); + result + } + + fn as_slice(&self) -> &[BranchMeta] { + &self.data + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_swap_ranges() { + let s = b"..abcdefghijkl="; + let mut s = s.iter().copied().collect::>(); + s.atbs_swap_ranges(2, 7, 14); + assert_eq!(s.atbs_to_bytes(), b"..fghijklabcde="); + } +} diff --git a/experimental/zerotrie/src/byte_phf/builder.rs b/experimental/zerotrie/src/byte_phf/builder.rs new file mode 100644 index 00000000000..c5c01b90104 --- /dev/null +++ b/experimental/zerotrie/src/byte_phf/builder.rs @@ -0,0 +1,115 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use super::*; +use crate::error::Error; +use alloc::vec; +use alloc::vec::Vec; + +/// To speed up the search algorithm, we limit the number of times the level-2 parameter (q) +/// can hit its max value of 255 before we try the next level-1 parameter (p). In practice, +/// this has a small impact on the resulting perfect hash, resulting in about 1 in 10000 +/// hash maps that fall back to the slow path. 
+const MAX_L2_SEARCH_MISSES: usize = 24; + +#[allow(unused_labels)] // for readability +pub fn find(bytes: &[u8]) -> Result<(u8, Vec), Error> { + #[allow(non_snake_case)] + let N = bytes.len(); + + let mut p = 0u8; + let mut qq = vec![0u8; N]; + + let mut bqs = vec![0u8; N]; + let mut seen = vec![false; N]; + let mut max_allowable_p = P_FAST_MAX; + let mut max_allowable_q = Q_FAST_MAX; + + 'p_loop: loop { + let mut buckets: Vec<(usize, Vec)> = (0..N).map(|i| (i, vec![])).collect(); + for byte in bytes { + buckets[f1(*byte, p, N)].1.push(*byte); + } + buckets.sort_by_key(|(_, v)| -(v.len() as isize)); + // println!("New P: p={p:?}, buckets={buckets:?}"); + let mut i = 0; + let mut num_max_q = 0; + bqs.fill(0); + seen.fill(false); + 'q_loop: loop { + if i == buckets.len() { + for (local_j, real_j) in buckets.iter().map(|(j, _)| *j).enumerate() { + qq[real_j] = bqs[local_j]; + } + // println!("Success: p={p:?}, num_max_q={num_max_q:?}, bqs={bqs:?}, qq={qq:?}"); + // if num_max_q > 0 { + // println!("num_max_q={num_max_q:?}"); + // } + return Ok((p, qq)); + } + let mut bucket = buckets[i].1.as_slice(); + 'byte_loop: for (j, byte) in bucket.iter().enumerate() { + if seen[f2(*byte, bqs[i], N)] { + // println!("Skipping Q: p={p:?}, i={i:?}, byte={byte:}, q={i:?}, l2={:?}", f2(*byte, bqs[i], N)); + for k_byte in &bucket[0..j] { + assert!(seen[f2(*k_byte, bqs[i], N)]); + seen[f2(*k_byte, bqs[i], N)] = false; + } + 'reset_loop: loop { + if bqs[i] < max_allowable_q { + bqs[i] += 1; + continue 'q_loop; + } + num_max_q += 1; + bqs[i] = 0; + if i == 0 || num_max_q > MAX_L2_SEARCH_MISSES { + if p == max_allowable_p && max_allowable_p != P_REAL_MAX { + max_allowable_p = P_REAL_MAX; + max_allowable_q = Q_REAL_MAX; + p = 0; + continue 'p_loop; + } else if p == P_REAL_MAX { + // println!("Could not solve PHF function"); + return Err(Error::CouldNotSolvePerfectHash); + } else { + p += 1; + continue 'p_loop; + } + } + i -= 1; + bucket = buckets[i].1.as_slice(); + for byte in bucket { + assert!(seen[f2(*byte, bqs[i], N)]); + seen[f2(*byte, bqs[i], N)] = false; + } + } + } else { + // println!("Marking as seen: i={i:?}, byte={byte:}, l2={:?}", f2(*byte, bqs[i], N)); + seen[f2(*byte, bqs[i], N)] = true; + } + } + // println!("Found Q: i={i:?}, q={:?}", bqs[i]); + i += 1; + } + } +} + +impl PerfectByteHashMap> { + pub fn try_new(keys: &[u8]) -> Result { + let n = keys.len(); + let (p, mut qq) = find(keys)?; + let mut keys_permuted = vec![0; n]; + for key in keys { + let l1 = f1(*key, p, n); + let q = qq[l1]; + let l2 = f2(*key, q, n); + keys_permuted[l2] = *key; + } + let mut result = Vec::with_capacity(n * 2 + 1); + result.push(p); + result.append(&mut qq); + result.append(&mut keys_permuted); + Ok(Self(result)) + } +} diff --git a/experimental/zerotrie/src/byte_phf/cached_owned.rs b/experimental/zerotrie/src/byte_phf/cached_owned.rs new file mode 100644 index 00000000000..a4c2732f474 --- /dev/null +++ b/experimental/zerotrie/src/byte_phf/cached_owned.rs @@ -0,0 +1,37 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
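
For orientation, the byte layout written by `try_new` above can be pulled back apart like this (a sketch; `split_phf` is a hypothetical helper, not part of the crate):

```rust
// Layout per `try_new` above: [p][q_0 .. q_{n-1}][permuted key_0 .. key_{n-1}].
fn split_phf(bytes: &[u8]) -> Option<(u8, &[u8], &[u8])> {
    let (p, rest) = bytes.split_first()?;
    let n = rest.len() / 2; // one q byte and one key byte per item
    let (qq, keys) = rest.split_at(n);
    Some((*p, qq, keys))
}
```
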
+ +use super::*; +use crate::error::Error; +use alloc::collections::btree_map::Entry; +use alloc::collections::BTreeMap; +use alloc::vec::Vec; + +pub struct PerfectByteHashMapCacheOwned { + // Note: This should probably be a HashMap but that isn't in `alloc` + data: BTreeMap, PerfectByteHashMap>>, +} + +impl PerfectByteHashMapCacheOwned { + pub fn new_empty() -> Self { + Self { + data: BTreeMap::new(), + } + } + + pub fn try_get_or_insert(&mut self, keys: Vec) -> Result<&PerfectByteHashMap<[u8]>, Error> { + let mut_phf = match self.data.entry(keys) { + Entry::Vacant(entry) => { + let value = PerfectByteHashMap::try_new(entry.key())?; + entry.insert(value) + } + Entry::Occupied(entry) => entry.into_mut(), + }; + Ok(mut_phf.as_borrowed()) + } + + pub fn get(&self, keys: &[u8]) -> Option<&PerfectByteHashMap<[u8]>> { + self.data.get(keys).map(|p| p.as_borrowed()) + } +} diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs new file mode 100644 index 00000000000..fc89b50d0de --- /dev/null +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -0,0 +1,355 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[cfg(feature = "alloc")] +mod builder; +#[cfg(feature = "alloc")] +mod cached_owned; + +#[cfg(feature = "alloc")] +pub use builder::find; +#[cfg(feature = "alloc")] +pub use cached_owned::PerfectByteHashMapCacheOwned; + +use ref_cast::RefCast; + +const P_FAST_MAX: u8 = 11; +const Q_FAST_MAX: u8 = 95; + +#[cfg(feature = "alloc")] // used in the builder code +const P_REAL_MAX: u8 = 15; +#[cfg(feature = "alloc")] // used in the builder code +const Q_REAL_MAX: u8 = 127; + +/// Like slice::split_at but returns an Option instead of panicking +#[inline] +fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if mid > slice.len() { + debug_assert!(false, "debug_split_at: index expected to be in range"); + None + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + Some(slice.split_at(mid)) + } +} + +#[inline] +fn debug_get(slice: &[u8], index: usize) -> Option { + match slice.get(index) { + Some(x) => Some(*x), + None => { + debug_assert!(false, "debug_get: index expected to be in range"); + None + } + } +} + +/// Invariant: n > 0 +#[inline] +pub fn f1(byte: u8, p: u8, n: usize) -> usize { + let n = if n > 0 { + n + } else { + debug_assert!(false, "unreachable by invariant"); + 1 + }; + if p == 0 { + byte as usize % n + } else { + let mut result = byte ^ p ^ byte.wrapping_shr(p as u32); + for _ in P_FAST_MAX..p { + result = result ^ (result << 1) ^ (result >> 1); + } + result as usize % n + } +} + +/// Invariant: n > 0 +#[inline] +pub fn f2(byte: u8, q: u8, n: usize) -> usize { + let n = if n > 0 { + n + } else { + debug_assert!(false, "unreachable by invariant"); + 1 + }; + // ((byte ^ q) as usize) % n + let mut result = byte ^ q; + // if q >= Q_FAST_MAX { + // result = result ^ byte.wrapping_shr(q as u32); + // } + for _ in Q_FAST_MAX..q { + result = result ^ (result << 1) ^ (result >> 1); + } + result as usize % n +} + +// Standard layout: P, N bytes of Q, N bytes of expected keys +#[derive(Debug, PartialEq, Eq, RefCast)] +#[repr(transparent)] +pub struct PerfectByteHashMap(S); + +impl PerfectByteHashMap { + pub fn from_store(store: S) -> Self { + Self(store) + } + + pub fn 
take_store(self) -> S { + self.0 + } +} + +impl PerfectByteHashMap +where + S: AsRef<[u8]> + ?Sized, +{ + pub fn get(&self, key: u8) -> Option { + let (p, buffer) = self.0.as_ref().split_first()?; + let n = buffer.len() / 2; + if n == 0 { + return None; + } + let (qq, eks) = debug_split_at(buffer, n)?; + debug_assert_eq!(qq.len(), eks.len()); + let q = debug_get(qq, f1(key, *p, n))?; + let l2 = f2(key, q, n); + let ek = debug_get(eks, l2)?; + if ek == key { + Some(l2) + } else { + None + } + } + /// This is called `num_items` because `len` is ambiguous: it could refer + /// to the number of items or the number of bytes. + pub fn num_items(&self) -> usize { + self.0.as_ref().len() / 2 + } + pub fn keys(&self) -> &[u8] { + let n = self.num_items(); + debug_split_at(self.0.as_ref(), 1 + n) + .map(|s| s.1) + .unwrap_or(&[]) + } + pub fn p_qmax(&self) -> Option<(u8, u8)> { + let (p, buffer) = self.0.as_ref().split_first()?; + let n = buffer.len() / 2; + if n == 0 { + return None; + } + let (qq, _) = debug_split_at(buffer, n)?; + Some((*p, *qq.iter().max().unwrap())) + } + pub fn as_bytes(&self) -> &[u8] { + self.0.as_ref() + } + #[cfg(all(feature = "alloc", test))] + pub fn check(&self) -> Result<(), (&'static str, u8)> { + use alloc::vec; + let len = self.num_items(); + let mut seen = vec![false; len]; + for b in 0..=255u8 { + let get_result = self.get(b); + if self.keys().contains(&b) { + let i = get_result.ok_or(("expected to find", b))?; + if seen[i] { + return Err(("seen", b)); + } + seen[i] = true; + } else if get_result.is_some() { + return Err(("did not expect to find", b)); + } + } + Ok(()) + } +} + +impl PerfectByteHashMap +where + S: AsRef<[u8]> + ?Sized, +{ + pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> { + PerfectByteHashMap::ref_cast(self.0.as_ref()) + } +} + +#[cfg(all(test, feature = "alloc"))] +mod tests { + use super::*; + use alloc::vec::Vec; + extern crate std; + + fn random_alphanums(seed: u64, len: usize) -> Vec { + use rand::seq::SliceRandom; + use rand::SeedableRng; + const BYTES: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed); + BYTES.choose_multiple(&mut rng, len).copied().collect() + } + + #[test] + fn test_smaller() { + let mut count_by_p = [0; 256]; + let mut count_by_qmax = [0; 256]; + for len in 1..16 { + for seed in 0..150 { + let keys = random_alphanums(seed, len); + let keys_str = core::str::from_utf8(&keys).unwrap(); + let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str); + computed + .check() + .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str))); + let (p, qmax) = computed.p_qmax().unwrap(); + count_by_p[p as usize] += 1; + count_by_qmax[qmax as usize] += 1; + } + } + std::println!("count_by_p (smaller): {count_by_p:?}"); + std::println!("count_by_qmax (smaller): {count_by_qmax:?}"); + let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::(); + let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..] 
+ .iter() + .sum::(); + std::println!("fastq/slowq: {count_fastq}/{count_slowq}"); + // Assert that 99% of cases resolve to the fast hash + assert!(count_fastq >= count_slowq * 100); + } + + #[test] + fn test_larger() { + let mut count_by_p = [0; 256]; + let mut count_by_qmax = [0; 256]; + for len in 16..60 { + for seed in 0..75 { + let keys = random_alphanums(seed, len); + let keys_str = core::str::from_utf8(&keys).unwrap(); + let computed = PerfectByteHashMap::try_new(&keys).expect(keys_str); + computed + .check() + .unwrap_or_else(|_| panic!("{}", std::str::from_utf8(&keys).expect(keys_str))); + let (p, qmax) = computed.p_qmax().unwrap(); + count_by_p[p as usize] += 1; + count_by_qmax[qmax as usize] += 1; + } + } + std::println!("count_by_p (larger): {count_by_p:?}"); + std::println!("count_by_qmax (larger): {count_by_qmax:?}"); + let count_fastq = count_by_qmax[0..=Q_FAST_MAX as usize].iter().sum::(); + let count_slowq = count_by_qmax[Q_FAST_MAX as usize + 1..] + .iter() + .sum::(); + std::println!("fastq/slowq: {count_fastq}/{count_slowq}"); + // Assert that 99% of cases resolve to the fast hash + assert!(count_fastq >= count_slowq * 100); + } + + #[test] + fn test_build_read_small() { + #[derive(Debug)] + struct TestCase<'a> { + keys: &'a str, + expected: &'a [u8], + reordered_keys: &'a str, + } + let cases = [ + TestCase { + keys: "ab", + expected: &[0, 0, 0, b'b', b'a'], + reordered_keys: "ba", + }, + TestCase { + keys: "abc", + expected: &[0, 0, 0, 0, b'c', b'a', b'b'], + reordered_keys: "cab", + }, + TestCase { + // Note: splitting "a" and "c" into different buckets requires the heavier hash + // function because the difference between "a" and "c" is the period (2). + keys: "ac", + expected: &[1, 0, 1, b'c', b'a'], + reordered_keys: "ca", + }, + TestCase { + keys: "abd", + expected: &[0, 0, 1, 3, b'a', b'b', b'd'], + reordered_keys: "abd", + }, + TestCase { + keys: "def", + expected: &[0, 0, 0, 0, b'f', b'd', b'e'], + reordered_keys: "fde", + }, + TestCase { + keys: "fi", + expected: &[0, 0, 0, b'f', b'i'], + reordered_keys: "fi", + }, + TestCase { + keys: "gh", + expected: &[0, 0, 0, b'h', b'g'], + reordered_keys: "hg", + }, + TestCase { + keys: "lm", + expected: &[0, 0, 0, b'l', b'm'], + reordered_keys: "lm", + }, + TestCase { + // Note: "a" and "q" (0x61 and 0x71) are very hard to split; only a handful of + // hash function crates can get them into separate buckets. 
+ keys: "aq", + expected: &[4, 0, 1, b'a', b'q'], + reordered_keys: "aq", + }, + TestCase { + keys: "xy", + expected: &[0, 0, 0, b'x', b'y'], + reordered_keys: "xy", + }, + TestCase { + keys: "xyz", + expected: &[0, 0, 0, 0, b'x', b'y', b'z'], + reordered_keys: "xyz", + }, + TestCase { + keys: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", + expected: &[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 16, + 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 2, 0, 7, 104, 105, 106, 107, 108, 109, 110, 111, 112, 117, 118, 119, 68, 69, + 70, 113, 114, 65, 66, 67, 120, 121, 122, 115, 72, 73, 74, 71, 80, 81, 82, 83, + 84, 85, 86, 87, 88, 89, 90, 75, 76, 77, 78, 79, 103, 97, 98, 99, 116, 100, 102, + 101, + ], + reordered_keys: "hijklmnopuvwDEFqrABCxyzsHIJGPQRSTUVWXYZKLMNOgabctdfe", + }, + TestCase { + keys: "abcdefghij", + expected: &[ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 100, 101, 102, 103, 104, 105, 106, 97, 98, 99, + ], + reordered_keys: "defghijabc", + }, + TestCase { + // This is a small case that resolves to the slow hasher + keys: "Jbej", + expected: &[2, 0, 0, 102, 0, b'j', b'e', b'b', b'J'], + reordered_keys: "jebJ", + }, + TestCase { + // This is another small case that resolves to the slow hasher + keys: "JFNv", + expected: &[1, 98, 0, 2, 0, b'J', b'F', b'N', b'v'], + reordered_keys: "JFNv", + }, + ]; + for cas in cases { + let computed = PerfectByteHashMap::try_new(cas.keys.as_bytes()).expect(cas.keys); + assert_eq!(computed.as_bytes(), cas.expected, "{:?}", cas); + assert_eq!(computed.keys(), cas.reordered_keys.as_bytes(), "{:?}", cas); + computed.check().expect(cas.keys); + } + } +} diff --git a/experimental/zerotrie/src/error.rs b/experimental/zerotrie/src/error.rs new file mode 100644 index 00000000000..256b39a891f --- /dev/null +++ b/experimental/zerotrie/src/error.rs @@ -0,0 +1,18 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use displaydoc::Display; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Display)] +pub enum Error { + /// Non-ASCII data was added to an ASCII-only collection. + #[displaydoc("Non-ASCII cannot be added to an ASCII-only collection")] + NonAsciiError, + /// The collection reached its maximum supported capacity. + #[displaydoc("Reached maximum capacity of collection")] + CapacityExceeded, + /// The builder could not solve the perfect hash function. + #[displaydoc("Failed to solve the perfect hash function. This is rare! Please report your case to the ICU4X team.")] + CouldNotSolvePerfectHash, +} diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs new file mode 100644 index 00000000000..dd960e1e8b0 --- /dev/null +++ b/experimental/zerotrie/src/lib.rs @@ -0,0 +1,52 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! A data structure offering zero-copy storage and retrieval of byte strings, with a focus +//! on the efficient storage of ASCII strings. Strings are mapped to a `usize` values. +//! +//! [`ZeroTrie`] does not support mutation because doing so would require recomputing the entire +//! data structure. Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`]. +//! +//! 
There are multiple variants of [`ZeroTrie`] optimized for different use cases. +//! +//! # Examples +//! +//! ``` +//! use zerotrie::ZeroTrie; +//! +//! let data: &[(&str, usize)] = &[ +//! ("abc", 11), +//! ("xyz", 22), +//! ("axyb", 33), +//! ]; +//! +//! let trie: ZeroTrie> = data.iter().copied().collect(); +//! +//! assert_eq!(trie.get("axyb"), Some(33)); +//! assert_eq!(trie.byte_len(), 18); +//! ``` +//! +//! [`LiteMap`]: litemap::LiteMap +//! [`BTreeMap`]: alloc::collections::BTreeMap + +#![no_std] + +#[cfg(feature = "alloc")] +extern crate alloc; + +mod builder; +#[doc(hidden)] +pub mod byte_phf; +mod error; +mod reader; +#[cfg(feature = "serde")] +mod serde; +mod varint; +mod zerotrie; + +pub use crate::zerotrie::ZeroTrie; +pub use crate::zerotrie::ZeroTrieExtendedCapacity; +pub use crate::zerotrie::ZeroTriePerfectHash; +pub use crate::zerotrie::ZeroTrieSimpleAscii; +pub use error::Error as ZeroTrieError; diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs new file mode 100644 index 00000000000..917927f7b62 --- /dev/null +++ b/experimental/zerotrie/src/reader.rs @@ -0,0 +1,454 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::byte_phf::PerfectByteHashMap; +use crate::varint::read_extended_varint; +use crate::varint::read_varint; +use core::ops::Range; + +#[cfg(feature = "alloc")] +use alloc::string::String; + +/// Like slice::split_at but returns an Option instead of panicking. +/// +/// Debug-panics if `mid` is out of range. +#[inline] +fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if mid > slice.len() { + debug_assert!(false, "debug_split_at: index expected to be in range"); + None + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + Some(slice.split_at(mid)) + } +} + +/// Like slice::split_at but returns an Option instead of panicking. +#[inline] +fn maybe_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if mid > slice.len() { + None + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + Some(slice.split_at(mid)) + } +} + +#[inline] +fn debug_get(slice: &[u8], index: usize) -> Option { + match slice.get(index) { + Some(x) => Some(*x), + None => { + debug_assert!(false, "debug_get: index expected to be in range"); + None + } + } +} + +#[inline] +fn debug_get_range(slice: &[u8], range: Range) -> Option<&[u8]> { + match slice.get(range) { + Some(x) => Some(x), + None => { + debug_assert!(false, "debug_get_range: indices expected to be in range"); + None + } + } +} + +/// Given a slice starting with an offset table, returns the trie for the given index. +/// +/// Arguments: +/// - `trie` = a trie pointing at an offset table (after the branch node and search table) +/// - `i` = the desired index within the offset table +/// - `n` = the number of items in the offset table +/// - `w` = the width of the offset table items minus one +#[inline] +fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> Option<&[u8]> { + let mut p = 0usize; + let mut q = 0usize; + loop { + let indices; + (indices, trie) = debug_split_at(trie, n - 1)?; + p = (p << 8) + + if i == 0 { + 0 + } else { + debug_get(indices, i - 1)? 
as usize + }; + q = match indices.get(i) { + Some(x) => (q << 8) + *x as usize, + None => trie.len(), + }; + if w == 0 { + break; + } + w -= 1; + } + debug_get_range(trie, p..q) +} + +/// Version of [`get_branch()`] specialized for the case `w == 0` for performance +#[inline] +fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> Option<&[u8]> { + let indices; + (indices, trie) = debug_split_at(trie, n - 1)?; + let p = if i == 0 { + 0 + } else { + debug_get(indices, i - 1)? as usize + }; + let q = match indices.get(i) { + Some(x) => *x as usize, + None => trie.len(), + }; + debug_get_range(trie, p..q) +} + +enum ByteType { + Ascii, + Span, + Value, + Match, +} + +impl core::fmt::Debug for ByteType { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use ByteType::*; + f.write_str(match *self { + Ascii => "a", + Span => "s", + Value => "v", + Match => "m", + }) + } +} + +#[inline] +fn byte_type(b: u8) -> ByteType { + match b & 0b11100000 { + 0b10000000 => ByteType::Value, + 0b10100000 => ByteType::Span, + 0b11000000 => ByteType::Match, + 0b11100000 => ByteType::Match, + _ => ByteType::Ascii, + } +} + +// DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w. +// +// | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 | +// |---------------|-------------------|--------------------|-------------------|--------------------| +// | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns | +// | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs | +// | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs | + +/// Query the trie assuming all branch nodes are binary search. +pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { + loop { + let (b, x, i, search); + (b, trie) = trie.split_first()?; + let byte_type = byte_type(*b); + (x, trie) = match byte_type { + ByteType::Ascii => (0, trie), + ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, + ByteType::Match => read_varint(*b, trie)?, + }; + if let Some((c, temp)) = ascii.split_first() { + if matches!(byte_type, ByteType::Ascii) { + if b == c { + // Matched a byte + ascii = temp; + continue; + } else { + // Byte that doesn't match + return None; + } + } + if matches!(byte_type, ByteType::Value) { + // Value node, but not at end of string + continue; + } + if matches!(byte_type, ByteType::Span) { + let (trie_span, ascii_span); + (trie_span, trie) = debug_split_at(trie, x)?; + (ascii_span, ascii) = maybe_split_at(ascii, x)?; + if trie_span == ascii_span { + // Matched a byte span + continue; + } else { + // Byte span that doesn't match + return None; + } + } + // Branch node + let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; + // See comment above regarding this assertion + debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); + let w = w & 0x3; + let x = if x == 0 { 256 } else { x }; + // Always use binary search + (search, trie) = debug_split_at(trie, x)?; + i = search.binary_search(c).ok()?; + trie = if w == 0 { + get_branch_w0(trie, i, x) + } else { + get_branch(trie, i, x, w) + }?; + ascii = temp; + continue; + } else { + if matches!(byte_type, ByteType::Value) { + // Value node at end of string + return Some(x); + } + return None; + } + } +} + +/// Query the trie assuming branch nodes could be either binary search or PHF. 
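
Before `get_phf_limited`, a compact restatement of the lead-byte tags that these readers dispatch on (a sketch mirroring `byte_type` above, not additional API):

```rust
// Top three bits of a node's lead byte select the node type (see `byte_type` above):
//   0b0xxxxxxx -> ASCII literal    0b100xxxxx -> value node
//   0b101xxxxx -> span node        0b110xxxxx / 0b111xxxxx -> branch (match) node
assert!(matches!(byte_type(b'a'), ByteType::Ascii));
assert!(matches!(byte_type(0b1000_0001), ByteType::Value));
assert!(matches!(byte_type(0b1010_0011), ByteType::Span));
assert!(matches!(byte_type(0b1100_0010), ByteType::Match));
```
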
+pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { + loop { + let (b, x, i, search); + (b, trie) = trie.split_first()?; + let byte_type = byte_type(*b); + (x, trie) = match byte_type { + ByteType::Ascii => (0, trie), + ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, + ByteType::Match => read_varint(*b, trie)?, + }; + if let Some((c, temp)) = ascii.split_first() { + if matches!(byte_type, ByteType::Ascii) { + if b == c { + // Matched a byte + ascii = temp; + continue; + } else { + // Byte that doesn't match + return None; + } + } + if matches!(byte_type, ByteType::Value) { + // Value node, but not at end of string + continue; + } + if matches!(byte_type, ByteType::Span) { + let (trie_span, ascii_span); + (trie_span, trie) = debug_split_at(trie, x)?; + (ascii_span, ascii) = maybe_split_at(ascii, x)?; + if trie_span == ascii_span { + // Matched a byte span + continue; + } else { + // Byte span that doesn't match + return None; + } + } + // Branch node + let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; + // See comment above regarding this assertion + debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); + let w = w & 0x3; + let x = if x == 0 { 256 } else { x }; + if x < 16 { + // binary search + (search, trie) = debug_split_at(trie, x)?; + i = search.binary_search(c).ok()?; + } else { + // phf + (search, trie) = debug_split_at(trie, x * 2 + 1)?; + i = PerfectByteHashMap::from_store(search).get(*c)?; + } + trie = if w == 0 { + get_branch_w0(trie, i, x) + } else { + get_branch(trie, i, x, w) + }?; + ascii = temp; + continue; + } else { + if matches!(byte_type, ByteType::Value) { + // Value node at end of string + return Some(x); + } + return None; + } + } +} + +/// Query the trie without the limited capacity assertion. 
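
And before `get_phf_extended`, a small sketch of the branch-header arithmetic shared by the readers above (the helper is hypothetical and local to this example; the constants come from the code):

```rust
// Unpack a branch varint into (count, offset width minus one) exactly as the readers do.
// `x == 0` encodes a full 256-way branch.
fn unpack_branch(varint_value: usize) -> (usize, usize) {
    let (x, w) = if varint_value >= 256 {
        (varint_value & 0xff, varint_value >> 8)
    } else {
        (varint_value, 0)
    };
    (if x == 0 { 256 } else { x }, w)
}

// A 200-way branch with 2-byte offsets is written by the builder as (1 << 8) + 200.
assert_eq!(unpack_branch((1 << 8) + 200), (200, 1));
```
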
+pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { + loop { + let (b, x, i, search); + (b, trie) = trie.split_first()?; + let byte_type = byte_type(*b); + (x, trie) = match byte_type { + ByteType::Ascii => (0, trie), + ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, + ByteType::Match => read_varint(*b, trie)?, + }; + if let Some((c, temp)) = ascii.split_first() { + if matches!(byte_type, ByteType::Ascii) { + if b == c { + // Matched a byte + ascii = temp; + continue; + } else { + // Byte that doesn't match + return None; + } + } + if matches!(byte_type, ByteType::Value) { + // Value node, but not at end of string + continue; + } + if matches!(byte_type, ByteType::Span) { + let (trie_span, ascii_span); + (trie_span, trie) = debug_split_at(trie, x)?; + (ascii_span, ascii) = maybe_split_at(ascii, x)?; + if trie_span == ascii_span { + // Matched a byte span + continue; + } else { + // Byte span that doesn't match + return None; + } + } + // Branch node + let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; + let x = if x == 0 { 256 } else { x }; + if x < 16 { + // binary search + (search, trie) = debug_split_at(trie, x)?; + i = search.binary_search(c).ok()?; + } else { + // phf + (search, trie) = debug_split_at(trie, x * 2 + 1)?; + i = PerfectByteHashMap::from_store(search).get(*c)?; + } + trie = if w == 0 { + get_branch_w0(trie, i, x) + } else { + get_branch(trie, i, x, w) + }?; + ascii = temp; + continue; + } else { + if matches!(byte_type, ByteType::Value) { + // Value node at end of string + return Some(x); + } + return None; + } + } +} + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +#[cfg(feature = "alloc")] +pub(crate) struct ZeroTrieIterator<'a> { + use_phf: bool, + state: Vec<(&'a [u8], Vec, usize)>, +} + +#[cfg(feature = "alloc")] +impl<'a> ZeroTrieIterator<'a> { + pub fn new + ?Sized>(store: &'a S, use_phf: bool) -> Self { + ZeroTrieIterator { + use_phf, + state: alloc::vec![(store.as_ref(), alloc::vec![], 0)], + } + } +} + +#[cfg(feature = "alloc")] +impl<'a> Iterator for ZeroTrieIterator<'a> { + type Item = (Vec, usize); + fn next(&mut self) -> Option { + let (mut trie, mut string, mut branch_idx); + (trie, string, branch_idx) = self.state.pop()?; + loop { + let (b, x, span, search); + let return_trie = trie; + (b, trie) = match trie.split_first() { + Some(tpl) => tpl, + None => { + // At end of current branch; step back to the branch node. + // If there are no more branches, we are finished. 
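+                    // (Each stack entry holds the remaining trie bytes, the key prefix built so
+                    // far, and the index of the next branch child to visit.)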
+ (trie, string, branch_idx) = self.state.pop()?; + continue; + } + }; + let byte_type = byte_type(*b); + if matches!(byte_type, ByteType::Ascii) { + string.push(*b); + continue; + } + (x, trie) = match byte_type { + ByteType::Ascii => (0, trie), + ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, + ByteType::Match => read_varint(*b, trie)?, + }; + if matches!(byte_type, ByteType::Span) { + (span, trie) = debug_split_at(trie, x)?; + string.extend(span); + continue; + } + if matches!(byte_type, ByteType::Value) { + let retval = string.clone(); + // Return to this position on the next step + self.state.push((trie, string, 0)); + return Some((retval, x)); + } + // Match node + let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; + let x = if x == 0 { 256 } else { x }; + if branch_idx + 1 < x { + // Return to this branch node at the next index + self.state + .push((return_trie, string.clone(), branch_idx + 1)); + } + let byte = if x < 16 || !self.use_phf { + // binary search + (search, trie) = debug_split_at(trie, x)?; + debug_get(search, branch_idx)? + } else { + // phf + (search, trie) = debug_split_at(trie, x * 2 + 1)?; + debug_get(search, branch_idx + x + 1)? + }; + string.push(byte); + trie = if w == 0 { + get_branch_w0(trie, branch_idx, x) + } else { + get_branch(trie, branch_idx, x, w) + }?; + branch_idx = 0; + } + } +} + +#[cfg(feature = "alloc")] +pub(crate) fn get_iter_phf + ?Sized>( + store: &S, +) -> impl Iterator, usize)> + '_ { + ZeroTrieIterator::new(store, true) +} + +/// # Panics +/// Panics if the trie contains non-ASCII items. +#[cfg(feature = "alloc")] +pub(crate) fn get_iter_ascii_or_panic + ?Sized>( + store: &S, +) -> impl Iterator + '_ { + ZeroTrieIterator::new(store, false).map(|(k, v)| { + #[allow(clippy::unwrap_used)] // in signature of function + let ascii_str = String::from_utf8(k).unwrap(); + (ascii_str, v) + }) +} diff --git a/experimental/zerotrie/src/serde.rs b/experimental/zerotrie/src/serde.rs new file mode 100644 index 00000000000..6fd87f6b325 --- /dev/null +++ b/experimental/zerotrie/src/serde.rs @@ -0,0 +1,547 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::builder::bytestr::ByteStr; +use crate::zerotrie::ZeroTrieInner; +use crate::ZeroTrie; +use crate::ZeroTrieExtendedCapacity; +use crate::ZeroTriePerfectHash; +use crate::ZeroTrieSimpleAscii; +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::fmt; +use litemap::LiteMap; +use serde::de::Error; +use serde::de::Visitor; +use serde::Deserialize; +use serde::Deserializer; +use serde::Serialize; +use serde::Serializer; + +struct ByteStrVisitor; +impl<'de> Visitor<'de> for ByteStrVisitor { + type Value = Box<[u8]>; + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "a slice of borrowed bytes or a string") + } + fn visit_bytes(self, v: &[u8]) -> Result { + Ok(Box::from(v)) + } + fn visit_str(self, v: &str) -> Result { + Ok(Box::from(v.as_bytes())) + } + fn visit_seq(self, mut v: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let mut result = Vec::with_capacity(v.size_hint().unwrap_or(0)); + while let Some(x) = v.next_element::()? 
{ + result.push(x); + } + Ok(Box::from(result)) + } +} + +impl<'de, 'data> Deserialize<'de> for &'data ByteStr +where + 'de: 'data, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = <&'data [u8]>::deserialize(deserializer)?; + Ok(ByteStr::from_bytes(s)) + } +} + +impl<'de, 'data> Deserialize<'de> for Box +where + 'de: 'data, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + if deserializer.is_human_readable() { + let s = deserializer.deserialize_any(ByteStrVisitor)?; + Ok(ByteStr::from_boxed_bytes(s)) + } else { + let s = Vec::::deserialize(deserializer)?; + Ok(ByteStr::from_boxed_bytes(s.into_boxed_slice())) + } + } +} + +impl<'data> Serialize for &'data ByteStr { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let bytes = self.as_bytes(); + if serializer.is_human_readable() { + match core::str::from_utf8(bytes) { + Ok(s) => serializer.serialize_str(s), + Err(_) => serializer.serialize_bytes(bytes), + } + } else { + serializer.serialize_bytes(bytes) + } + } +} + +impl<'de, 'data, X> Deserialize<'de> for ZeroTrieSimpleAscii +where + 'de: 'data, + // DISCUSS: There are several possibilities for the bounds here that would + // get the job done. I could look for Deserialize, but this would require + // creating a custom Deserializer for the map case. I also considered + // introducing a new trait instead of relying on From. + X: From<&'data [u8]> + From> + 'data, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + if deserializer.is_human_readable() { + let lm = LiteMap::, usize>::deserialize(deserializer)?; + ZeroTrieSimpleAscii::try_from_serde_litemap(&lm) + .map_err(D::Error::custom) + .map(|trie| trie.map_store(From::from)) + } else { + // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes + <&[u8]>::deserialize(deserializer) + .map(ZeroTrieSimpleAscii::from_store) + .map(|x| x.map_store(From::from)) + } + } +} + +impl Serialize for ZeroTrieSimpleAscii +where + X: AsRef<[u8]>, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if serializer.is_human_readable() { + let lm = self.to_litemap(); + lm.serialize(serializer) + } else { + let bytes = self.as_bytes(); + bytes.serialize(serializer) + } + } +} + +impl<'de, 'data, X> Deserialize<'de> for ZeroTriePerfectHash +where + 'de: 'data, + X: From<&'data [u8]> + From> + 'data, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + if deserializer.is_human_readable() { + let lm = LiteMap::, usize>::deserialize(deserializer)?; + ZeroTriePerfectHash::try_from_serde_litemap(&lm) + .map_err(D::Error::custom) + .map(|trie| trie.map_store(From::from)) + } else { + // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes + <&[u8]>::deserialize(deserializer) + .map(ZeroTriePerfectHash::from_store) + .map(|x| x.map_store(From::from)) + } + } +} + +impl Serialize for ZeroTriePerfectHash +where + X: AsRef<[u8]>, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if serializer.is_human_readable() { + let lm = self.to_litemap(); + let lm = lm + .iter() + .map(|(k, v)| (ByteStr::from_bytes(k), v)) + .collect::>(); + lm.serialize(serializer) + } else { + let bytes = self.as_bytes(); + bytes.serialize(serializer) + } + } +} + +impl<'de, 'data, X> Deserialize<'de> for ZeroTrieExtendedCapacity +where + 'de: 'data, + X: From<&'data [u8]> + From> + 'data, +{ + fn deserialize(deserializer: D) -> Result + where + D: 
Deserializer<'de>, + { + if deserializer.is_human_readable() { + let lm = LiteMap::, usize>::deserialize(deserializer)?; + ZeroTrieExtendedCapacity::try_from_serde_litemap(&lm) + .map_err(D::Error::custom) + .map(|trie| trie.map_store(From::from)) + } else { + // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes + <&[u8]>::deserialize(deserializer) + .map(ZeroTrieExtendedCapacity::from_store) + .map(|x| x.map_store(From::from)) + } + } +} + +impl Serialize for ZeroTrieExtendedCapacity +where + X: AsRef<[u8]>, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if serializer.is_human_readable() { + let lm = self.to_litemap(); + let lm = lm + .iter() + .map(|(k, v)| (ByteStr::from_bytes(k), v)) + .collect::>(); + lm.serialize(serializer) + } else { + let bytes = self.as_bytes(); + bytes.serialize(serializer) + } + } +} + +mod tags { + const USE_PHF: u8 = 0x1; + const BINARY_SPANS: u8 = 0x2; + const EXTENDED: u8 = 0x4; + + pub(crate) const SIMPLE_ASCII: u8 = 0; + pub(crate) const PERFECT_HASH: u8 = USE_PHF | BINARY_SPANS; + pub(crate) const EXTENDED_CAPACITY: u8 = USE_PHF | BINARY_SPANS | EXTENDED; +} + +impl<'de, 'data, X> Deserialize<'de> for ZeroTrie +where + 'de: 'data, + X: From<&'data [u8]> + From> + 'data, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + if deserializer.is_human_readable() { + let lm = LiteMap::, usize>::deserialize(deserializer)?; + ZeroTrie::>::try_from(&lm) + .map_err(D::Error::custom) + .map(|trie| trie.map_store(From::from)) + } else { + // Note: `impl Deserialize for &[u8]` uses visit_borrowed_bytes + let bytes = <&[u8]>::deserialize(deserializer)?; + let (tag, trie_bytes) = bytes + .split_first() + .ok_or(D::Error::custom("expected at least 1 byte for ZeroTrie"))?; + let zerotrie = + match *tag { + tags::SIMPLE_ASCII => ZeroTrieSimpleAscii::from_store(trie_bytes) + .map_store_into_zerotrie(From::from), + tags::PERFECT_HASH => ZeroTriePerfectHash::from_store(trie_bytes) + .map_store_into_zerotrie(From::from), + tags::EXTENDED_CAPACITY => ZeroTrieExtendedCapacity::from_store(trie_bytes) + .map_store_into_zerotrie(From::from), + _ => return Err(D::Error::custom("invalid ZeroTrie tag")), + }; + Ok(zerotrie) + } + } +} + +impl Serialize for ZeroTrie +where + X: AsRef<[u8]>, +{ + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + if serializer.is_human_readable() { + let lm = self.to_litemap(); + let lm = lm + .iter() + .map(|(k, v)| (ByteStr::from_bytes(k), v)) + .collect::>(); + lm.serialize(serializer) + } else { + let (tag, bytes) = match &self.0 { + ZeroTrieInner::SimpleAscii(t) => (tags::SIMPLE_ASCII, t.as_bytes()), + ZeroTrieInner::PerfectHash(t) => (tags::PERFECT_HASH, t.as_bytes()), + ZeroTrieInner::ExtendedCapacity(t) => (tags::EXTENDED_CAPACITY, t.as_bytes()), + }; + let mut all_in_one_vec = Vec::with_capacity(bytes.len() + 1); + all_in_one_vec.push(tag); + all_in_one_vec.extend(bytes); + all_in_one_vec.serialize(serializer) + } + } +} + +#[cfg(test)] +mod testdata { + include!("../tests/data.rs"); +} + +#[cfg(test)] +mod tests { + use super::*; + use alloc::borrow::Cow; + + #[derive(Serialize, Deserialize)] + pub struct ZeroTrieSimpleAsciiCow<'a> { + #[serde(borrow)] + trie: ZeroTrieSimpleAscii>, + } + + #[test] + pub fn test_serde_simpleascii_cow() { + let trie = ZeroTrieSimpleAscii::from_store(Cow::from(testdata::basic::TRIE_ASCII)); + let original = ZeroTrieSimpleAsciiCow { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let 
bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_ASCII); + assert_eq!(bincode_bytes, testdata::basic::BINCODE_BYTES_ASCII); + + let json_recovered: ZeroTrieSimpleAsciiCow = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTrieSimpleAsciiCow = + bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(matches!(json_recovered.trie.take_store(), Cow::Owned(_))); + assert!(matches!( + bincode_recovered.trie.take_store(), + Cow::Borrowed(_) + )); + } + + #[derive(Serialize, Deserialize)] + pub struct ZeroTriePerfectHashCow<'a> { + #[serde(borrow)] + trie: ZeroTriePerfectHash>, + } + + #[test] + pub fn test_serde_perfecthash_cow() { + let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_ASCII)); + let original = ZeroTriePerfectHashCow { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_ASCII); + assert_eq!(bincode_bytes, testdata::basic::BINCODE_BYTES_ASCII); + + let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTriePerfectHashCow = + bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(matches!(json_recovered.trie.take_store(), Cow::Owned(_))); + assert!(matches!( + bincode_recovered.trie.take_store(), + Cow::Borrowed(_) + )); + } + + #[test] + pub fn test_serde_perfecthash_cow_u() { + let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_UNICODE)); + let original = ZeroTriePerfectHashCow { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_UNICODE); + assert_eq!(bincode_bytes, testdata::basic::BINCODE_BYTES_UNICODE); + + let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTriePerfectHashCow = + bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(matches!(json_recovered.trie.take_store(), Cow::Owned(_))); + assert!(matches!( + bincode_recovered.trie.take_store(), + Cow::Borrowed(_) + )); + } + + #[test] + pub fn test_serde_perfecthash_cow_bin() { + let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_BINARY)); + let original = ZeroTriePerfectHashCow { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_BINARY); + assert_eq!(bincode_bytes, testdata::basic::BINCODE_BYTES_BINARY); + + let json_recovered: ZeroTriePerfectHashCow = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTriePerfectHashCow = + bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(matches!(json_recovered.trie.take_store(), Cow::Owned(_))); + assert!(matches!( + bincode_recovered.trie.take_store(), + Cow::Borrowed(_) + )); + } + + #[derive(Serialize, Deserialize)] + pub struct ZeroTrieAnyCow<'a> { + #[serde(borrow)] + 
trie: ZeroTrie>, + } + + #[test] + pub fn test_serde_any_cow() { + let trie = + ZeroTrieSimpleAscii::from_store(Cow::from(testdata::basic::TRIE_ASCII)).into_zerotrie(); + let original = ZeroTrieAnyCow { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_ASCII); + // Note: ZeroTrie adds an extra byte to the start of the trie bytes + assert_eq!(&bincode_bytes[0..9], &[27, 0, 0, 0, 0, 0, 0, 0, 0]); + assert_eq!( + &bincode_bytes[9..], + &testdata::basic::BINCODE_BYTES_ASCII[8..] + ); + + let json_recovered: ZeroTrieAnyCow = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTrieAnyCow = bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(matches!(json_recovered.trie.take_store(), Cow::Owned(_))); + assert!(matches!( + bincode_recovered.trie.take_store(), + Cow::Borrowed(_) + )); + } + + #[test] + pub fn test_serde_any_cow_u() { + let trie = ZeroTriePerfectHash::from_store(Cow::from(testdata::basic::TRIE_UNICODE)) + .into_zerotrie(); + let original = ZeroTrieAnyCow { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_UNICODE); + // Note: ZeroTrie adds an extra byte to the start of the trie bytes + assert_eq!(&bincode_bytes[0..9], &[40, 0, 0, 0, 0, 0, 0, 0, 3]); + assert_eq!( + &bincode_bytes[9..], + &testdata::basic::BINCODE_BYTES_UNICODE[8..] + ); + + let json_recovered: ZeroTrieAnyCow = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTrieAnyCow = bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(matches!(json_recovered.trie.take_store(), Cow::Owned(_))); + assert!(matches!( + bincode_recovered.trie.take_store(), + Cow::Borrowed(_) + )); + } +} + +#[cfg(test)] +#[cfg(feature = "zerovec")] +mod tests_zerovec { + use super::*; + use zerovec::ZeroVec; + + #[derive(Serialize, Deserialize)] + pub struct ZeroTrieSimpleAsciiZeroVec<'a> { + #[serde(borrow)] + trie: ZeroTrieSimpleAscii>, + } + + #[test] + pub fn test_serde_simpleascii_zerovec() { + let trie = + ZeroTrieSimpleAscii::from_store(ZeroVec::new_borrowed(testdata::basic::TRIE_ASCII)); + let original = ZeroTrieSimpleAsciiZeroVec { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_ASCII); + assert_eq!(bincode_bytes, testdata::basic::BINCODE_BYTES_ASCII); + + let json_recovered: ZeroTrieSimpleAsciiZeroVec = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTrieSimpleAsciiZeroVec = + bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(json_recovered.trie.take_store().is_owned()); + assert!(!bincode_recovered.trie.take_store().is_owned()); + } + + #[derive(Serialize, Deserialize)] + pub struct ZeroTriePerfectHashZeroVec<'a> { + #[serde(borrow)] + trie: ZeroTriePerfectHash>, + } + + #[test] + pub fn test_serde_perfecthash_zerovec() { + let trie = + ZeroTriePerfectHash::from_store(ZeroVec::new_borrowed(testdata::basic::TRIE_ASCII)); + let original = 
ZeroTriePerfectHashZeroVec { trie }; + let json_str = serde_json::to_string(&original).unwrap(); + let bincode_bytes = bincode::serialize(&original).unwrap(); + + assert_eq!(json_str, testdata::basic::JSON_STR_ASCII); + assert_eq!(bincode_bytes, testdata::basic::BINCODE_BYTES_ASCII); + + let json_recovered: ZeroTriePerfectHashZeroVec = serde_json::from_str(&json_str).unwrap(); + let bincode_recovered: ZeroTriePerfectHashZeroVec = + bincode::deserialize(&bincode_bytes).unwrap(); + + assert_eq!(original.trie, json_recovered.trie); + assert_eq!(original.trie, bincode_recovered.trie); + + assert!(json_recovered.trie.take_store().is_owned()); + assert!(!bincode_recovered.trie.take_store().is_owned()); + } +} diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs new file mode 100644 index 00000000000..503c4b7874a --- /dev/null +++ b/experimental/zerotrie/src/varint.rs @@ -0,0 +1,405 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Varint spec for ZeroTrie: +//! +//! - Lead byte: top 2 bits are trie metadata; third is varint extender; rest is value +//! - Trail bytes: top bit is varint extender; add rest to current value * 2^7 +//! - Add the "latent value" to the final result: (1<<5) + (1<<7) + (1<<14) + ... + +use crate::builder::konst::ConstArrayBuilder; + +#[cfg(feature = "alloc")] +use crate::builder::nonconst::TrieBuilderStore; + +/// Reads a varint with 2 bits of metadata in the lead byte. +pub const fn read_varint(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { + let mut value = (start & 0b00011111) as usize; + let mut remainder = remainder; + if (start & 0b00100000) != 0 { + loop { + let next; + (next, remainder) = match remainder.split_first() { + Some(t) => t, + None => return None, + }; + // Note: value << 7 could drop high bits. The first addition can't overflow. + // The second addition could overflow; in such a case we just inform the + // developer via the debug assertion. + value = (value << 7) + ((*next & 0b01111111) as usize) + 32; + if (*next & 0b10000000) == 0 { + break; + } + } + } + Some((value, remainder)) +} + +/// Reads a varint with 3 bits of metadata in the lead byte. +pub const fn read_extended_varint(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { + let mut value = (start & 0b00001111) as usize; + let mut remainder = remainder; + if (start & 0b00010000) != 0 { + loop { + let next; + (next, remainder) = match remainder.split_first() { + Some(t) => t, + None => return None, + }; + // Note: value << 7 could drop high bits. The first addition can't overflow. + // The second addition could overflow; in such a case we just inform the + // developer via the debug assertion. + value = (value << 7) + ((*next & 0b01111111) as usize) + 16; + if (*next & 0b10000000) == 0 { + break; + } + } + } + Some((value, remainder)) +} + +#[cfg(feature = "alloc")] +pub(crate) fn try_read_extended_varint_from_tstore( + start: u8, + remainder: &mut S, +) -> Option { + let mut value = (start & 0b00001111) as usize; + if (start & 0b00010000) != 0 { + loop { + let next = remainder.atbs_split_first()?; + // Note: value << 7 could drop high bits. The first addition can't overflow. + // The second addition could overflow; in such a case we just inform the + // developer via the debug assertion. 
+ value = (value << 7) + ((next & 0b01111111) as usize) + 16; + if (next & 0b10000000) == 0 { + break; + } + } + } + Some(value) +} + +#[cfg(test)] +const MAX_VARINT: usize = usize::MAX; + +// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value. +// Add an extra 1 since the lead byte holds only 5 bits of data. +const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::() * 8 / 7; + +pub(crate) const fn write_varint(value: usize) -> ConstArrayBuilder { + let mut result = [0; MAX_VARINT_LENGTH]; + let mut i = MAX_VARINT_LENGTH - 1; + let mut value = value; + let mut last = true; + loop { + if value < 32 { + result[i] = value as u8; + if !last { + result[i] |= 0b00100000; + } + break; + } + value -= 32; + result[i] = (value as u8) & 0b01111111; + if !last { + result[i] |= 0b10000000; + } else { + last = false; + } + value >>= 7; + i -= 1; + } + // The bytes are from i to the end. + ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) +} + +pub(crate) const fn write_extended_varint( + value: usize, +) -> ConstArrayBuilder { + let mut result = [0; MAX_VARINT_LENGTH]; + let mut i = MAX_VARINT_LENGTH - 1; + let mut value = value; + let mut last = true; + loop { + if value < 16 { + result[i] = value as u8; + if !last { + result[i] |= 0b00010000; + } + break; + } + value -= 16; + result[i] = (value as u8) & 0b01111111; + if !last { + result[i] |= 0b10000000; + } else { + last = false; + } + value >>= 7; + i -= 1; + } + // The bytes are from i to the end. + ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH) +} + +/// A secondary implementation that separates the latent value while computing the varint. +#[cfg(test)] +pub(crate) const fn write_varint_reference( + value: usize, +) -> ConstArrayBuilder { + let mut result = [0; MAX_VARINT_LENGTH]; + if value < 32 { + result[0] = value as u8; + return ConstArrayBuilder::from_manual_slice(result, 0, 1); + } + result[0] = 32; + let mut latent = 32; + let mut steps = 2; + loop { + let next_latent = (latent << 7) + 32; + if value < next_latent || next_latent == latent { + break; + } + latent = next_latent; + steps += 1; + } + let mut value = value - latent; + let mut i = steps; + while i > 0 { + i -= 1; + result[i] |= (value as u8) & 0b01111111; + value >>= 7; + if i > 0 && i < steps - 1 { + result[i] |= 0b10000000; + } + } + // The bytes are from 0 to `steps`. 
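+    // Worked illustration of the encoding (matches the CASES test data below):
+    //   value 100: the latent value is 32, so we store 100 - 32 = 68;
+    //     lead 0b00100000 (extender bit set, 5 value bits = 0),
+    //     trail 0b01000100 (= 68)  =>  [0x20, 0x44]
+    //   value 500: store 500 - 32 = 468 = (3 << 7) + 84;
+    //     lead 0b00100011 (extender bit set, value bits = 3),
+    //     trail 0b01010100 (= 84)  =>  [0x23, 0x54]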
+ ConstArrayBuilder::from_manual_slice(result, 0, steps) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Debug)] + struct TestCase<'a> { + bytes: &'a [u8], + remainder: &'a [u8], + value: usize, + } + static CASES: &[TestCase] = &[ + TestCase { + bytes: &[0b00000000], + remainder: &[], + value: 0, + }, + TestCase { + bytes: &[0b00001010], + remainder: &[], + value: 10, + }, + TestCase { + bytes: &[0b00011111], + remainder: &[], + value: 31, + }, + TestCase { + bytes: &[0b00011111, 0b10101010], + remainder: &[0b10101010], + value: 31, + }, + TestCase { + bytes: &[0b00100000, 0b00000000], + remainder: &[], + value: 32, + }, + TestCase { + bytes: &[0b00100000, 0b00000001], + remainder: &[], + value: 33, + }, + TestCase { + bytes: &[0b00100000, 0b00100000], + remainder: &[], + value: 64, + }, + TestCase { + bytes: &[0x20, 0x44], + remainder: &[], + value: 100, + }, + TestCase { + bytes: &[0b00100000, 0b01111111], + remainder: &[], + value: 159, + }, + TestCase { + bytes: &[0b00100001, 0b00000000], + remainder: &[], + value: 160, + }, + TestCase { + bytes: &[0b00100001, 0b00000001], + remainder: &[], + value: 161, + }, + TestCase { + bytes: &[0x23, 0x54], + remainder: &[], + value: 500, + }, + TestCase { + bytes: &[0b00111111, 0b01111111], + remainder: &[], + value: 4127, // 32 + (1 << 12) - 1 + }, + TestCase { + bytes: &[0b00100000, 0b10000000, 0b00000000], + remainder: &[], + value: 4128, // 32 + (1 << 12) + }, + TestCase { + bytes: &[0b00100000, 0b10000000, 0b00000001], + remainder: &[], + value: 4129, // 32 + (1 << 12) + 1 + }, + TestCase { + bytes: &[0b00100000, 0b10000000, 0b01111111], + remainder: &[], + value: 4255, // 32 + (1 << 12) + 127 + }, + TestCase { + bytes: &[0b00100000, 0b10000001, 0b00000000], + remainder: &[], + value: 4256, // 32 + (1 << 12) + 128 + }, + TestCase { + bytes: &[0b00100000, 0b10000001, 0b00000001], + remainder: &[], + value: 4257, // 32 + (1 << 12) + 129 + }, + TestCase { + bytes: &[0x20, 0x86, 0x68], + remainder: &[], + value: 5000, + }, + TestCase { + bytes: &[0b00100000, 0b11111111, 0b01111111], + remainder: &[], + value: 20511, // 32 + (1 << 12) + (1 << 14) - 1 + }, + TestCase { + bytes: &[0b00100001, 0b10000000, 0b00000000], + remainder: &[], + value: 20512, // 32 + (1 << 12) + (1 << 14) + }, + TestCase { + bytes: &[0b00111111, 0b11111111, 0b01111111], + remainder: &[], + value: 528415, // 32 + (1 << 12) + (1 << 19) - 1 + }, + TestCase { + bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000], + remainder: &[], + value: 528416, // 32 + (1 << 12) + (1 << 19) + }, + TestCase { + bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001], + remainder: &[], + value: 528417, // 32 + (1 << 12) + (1 << 19) + 1 + }, + TestCase { + bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111], + remainder: &[], + value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1 + }, + TestCase { + bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000], + remainder: &[], + value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26) + }, + ]; + + #[test] + fn test_read() { + for cas in CASES { + let recovered = read_varint(cas.bytes[0], &cas.bytes[1..]).unwrap(); + assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); + } + } + + #[test] + fn test_read_write() { + for cas in CASES { + let reference_bytes = write_varint_reference(cas.value); + assert_eq!( + reference_bytes.len(), + cas.bytes.len() - cas.remainder.len(), + "{:?}", + cas + ); + assert_eq!( + reference_bytes.as_slice(), + &cas.bytes[0..reference_bytes.len()], + "{:?}", + cas 
+ ); + let recovered = read_varint(cas.bytes[0], &cas.bytes[1..]).unwrap(); + assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); + let write_bytes = write_varint(cas.value); + assert_eq!( + reference_bytes.as_slice(), + write_bytes.as_slice(), + "{:?}", + cas + ); + } + } + + #[test] + fn test_roundtrip() { + let mut i = 0usize; + while i < MAX_VARINT { + let bytes = write_varint(i); + let recovered = read_varint(bytes.as_slice()[0], &bytes.as_slice()[1..]); + assert!(recovered.is_some(), "{:?}", i); + assert_eq!(i, recovered.unwrap().0, "{:?}", bytes.as_slice()); + i <<= 1; + i += 1; + } + } + + #[test] + fn test_max() { + let reference_bytes = write_varint_reference(MAX_VARINT); + let write_bytes = write_varint(MAX_VARINT); + assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH); + assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice()); + let subarray = write_bytes + .as_const_slice() + .get_subslice_or_panic(1, write_bytes.len()); + let (recovered_value, remainder) = read_varint( + *write_bytes.as_const_slice().first().unwrap(), + subarray.as_slice(), + ) + .unwrap(); + assert!(remainder.is_empty()); + assert_eq!(recovered_value, MAX_VARINT); + assert_eq!( + write_bytes.as_slice(), + &[ + 0b00100001, // + 0b11011111, // + 0b11011111, // + 0b11011111, // + 0b11011111, // + 0b11011111, // + 0b11011111, // + 0b11011111, // + 0b11011111, // + 0b01011111, // + ] + ); + } +} diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs new file mode 100644 index 00000000000..3e83e78b022 --- /dev/null +++ b/experimental/zerotrie/src/zerotrie.rs @@ -0,0 +1,570 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::reader::*; + +use core::borrow::Borrow; +use ref_cast::RefCast; + +#[cfg(feature = "alloc")] +use crate::{builder::bytestr::ByteStr, builder::nonconst::ZeroTrieBuilder, error::Error}; +#[cfg(feature = "alloc")] +use alloc::{boxed::Box, collections::BTreeMap, collections::VecDeque, string::String, vec::Vec}; +#[cfg(feature = "litemap")] +use litemap::LiteMap; + +/// A data structure that compactly maps from byte sequences to integers. +/// +/// There are several variants of `ZeroTrie` which are very similar but are optimized +/// for different use cases: +/// +/// - [`ZeroTrieSimpleAscii`] is the most compact structure. Very fast for small data. +/// Only stores ASCII-encoded strings. Can be const-constructed! +/// - [`ZeroTriePerfectHash`] is also compact, but it also supports arbitrary binary +/// strings. It also scales better to large data. Cannot be const-constructed. +/// - [`ZeroTrieExtendedCapacity`] can be used if more than 2^32 bytes are required. +/// +/// You can create a `ZeroTrie` directly, in which case the most appropriate +/// backing implementation will be chosen. 
+/// +/// # Examples +/// +/// ``` +/// use zerotrie::ZeroTrie; +/// use litemap::LiteMap; +/// +/// let mut map = LiteMap::<&[u8], usize>::new_vec(); +/// map.insert("foo".as_bytes(), 1); +/// map.insert("bar".as_bytes(), 2); +/// map.insert("bazzoo".as_bytes(), 3); +/// +/// let trie = ZeroTrie::try_from(&map)?; +/// +/// assert_eq!(trie.get("foo"), Some(1)); +/// assert_eq!(trie.get("bar"), Some(2)); +/// assert_eq!(trie.get("bazzoo"), Some(3)); +/// assert_eq!(trie.get("unknown"), None); +/// +/// # Ok::<_, zerotrie::ZeroTrieError>(()) +/// ``` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ZeroTrie(pub(crate) ZeroTrieInner); + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum ZeroTrieInner { + SimpleAscii(ZeroTrieSimpleAscii), + PerfectHash(ZeroTriePerfectHash), + ExtendedCapacity(ZeroTrieExtendedCapacity), +} + +/// A data structure that compactly maps from ASCII strings to integers. +/// +/// # Examples +/// +/// ``` +/// use zerotrie::ZeroTrieSimpleAscii; +/// use litemap::LiteMap; +/// +/// let mut map = LiteMap::new_vec(); +/// map.insert(&b"foo"[..], 1); +/// map.insert(b"bar", 2); +/// map.insert(b"bazzoo", 3); +/// +/// let trie = ZeroTrieSimpleAscii::try_from(&map)?; +/// +/// assert_eq!(trie.get(b"foo"), Some(1)); +/// assert_eq!(trie.get(b"bar"), Some(2)); +/// assert_eq!(trie.get(b"bazzoo"), Some(3)); +/// assert_eq!(trie.get(b"unknown"), None); +/// +/// # Ok::<_, zerotrie::ZeroTrieError>(()) +/// ``` +#[repr(transparent)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] +pub struct ZeroTrieSimpleAscii { + pub(crate) store: S, +} + +/// A data structure that compactly maps from byte strings to integers. +/// +/// # Examples +/// +/// ``` +/// use zerotrie::ZeroTriePerfectHash; +/// use litemap::LiteMap; +/// +/// let mut map = LiteMap::<&[u8], usize>::new_vec(); +/// map.insert("foo".as_bytes(), 1); +/// map.insert("bår".as_bytes(), 2); +/// map.insert("båzzøø".as_bytes(), 3); +/// +/// let trie = ZeroTriePerfectHash::try_from(&map)?; +/// +/// assert_eq!(trie.get("foo".as_bytes()), Some(1)); +/// assert_eq!(trie.get("bår".as_bytes()), Some(2)); +/// assert_eq!(trie.get("båzzøø".as_bytes()), Some(3)); +/// assert_eq!(trie.get("bazzoo".as_bytes()), None); +/// +/// # Ok::<_, zerotrie::ZeroTrieError>(()) +/// ``` +#[repr(transparent)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] +pub struct ZeroTriePerfectHash { + pub(crate) store: S, +} + +/// A data structure that maps from a large number of byte strings to integers. +#[repr(transparent)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] +pub struct ZeroTrieExtendedCapacity { + pub(crate) store: S, +} + +macro_rules! impl_zerotrie_subtype { + ($name:ident, $variant:ident, $getter_fn:path, $iter_ty:ty, $iter_fn:path, $cnv_fn:path) => { + impl $name { + /// Wrap this specific ZeroTrie variant into a ZeroTrie. + pub const fn into_zerotrie(self) -> ZeroTrie { + ZeroTrie(ZeroTrieInner::$variant(self)) + } + /// Create a trie directly from a store. + /// + /// If the store does not contain valid bytes, unexpected behavior may occur. + pub const fn from_store(store: S) -> Self { + Self { store } + } + /// Takes the byte store from this trie. + pub fn take_store(self) -> S { + self.store + } + /// Maps the store into another type. 
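+        ///
+        /// # Examples
+        ///
+        /// A minimal sketch, reusing the 4-byte example trie used by the other
+        /// doc examples in this file ("abc" mapping to 5):
+        ///
+        /// ```
+        #[doc = concat!("use zerotrie::", stringify!($name), ";")]
+        ///
+        #[doc = concat!("let borrowed: ", stringify!($name), "<&[u8]> = ", stringify!($name), "::from_store(&b\"abc\\x85\"[..]);")]
+        #[doc = concat!("let owned: ", stringify!($name), "<Vec<u8>> = borrowed.map_store(|s| s.to_vec());")]
+        ///
+        /// assert_eq!(owned.get(b"abc"), Some(5));
+        /// ```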
+ pub fn map_store(self, f: impl FnOnce(S) -> X) -> $name { + $name::::from_store(f(self.store)) + } + pub(crate) fn map_store_into_zerotrie(self, f: impl FnOnce(S) -> X) -> ZeroTrie { + $name::::from_store(f(self.store)).into_zerotrie() + } + } + impl $name + where + S: AsRef<[u8]> + ?Sized, + { + /// Queries the trie for a string. + pub fn get(&self, key: K) -> Option where K: AsRef<[u8]> { + // TODO: Should this be AsRef or Borrow? + $getter_fn(self.store.as_ref(), key.as_ref()) + } + /// Returns `true` if the trie is empty. + pub fn is_empty(&self) -> bool { + self.store.as_ref().is_empty() + } + /// Returns the size of the trie in number of bytes. + /// + /// To get the number of keys in the trie, use `.iter().count()`: + /// + /// ``` + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// + /// // A trie with two values: "abc" and "abcdef" + #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x80def\\x81\");")] + /// + /// assert_eq!(8, trie.byte_len()); + /// assert_eq!(2, trie.iter().count()); + /// ``` + pub fn byte_len(&self) -> usize { + self.store.as_ref().len() + } + /// Returns the bytes contained in the underlying store. + pub fn as_bytes(&self) -> &[u8] { + self.store.as_ref() + } + /// Returns this trie as a reference transparent over a byte slice. + pub fn as_borrowed(&self) -> &$name<[u8]> { + $name::from_bytes(self.store.as_ref()) + } + } + #[cfg(feature = "alloc")] + impl $name + where + S: AsRef<[u8]> + ?Sized, + { + /// Converts a possibly-borrowed $name to an owned one. + /// + /// ***Enable this impl with the `"alloc"` feature.*** + /// + /// # Examples + /// + /// ``` + /// use std::borrow::Cow; + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// + #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x85\");")] + #[doc = concat!("let owned: ", stringify!($name), "> = trie.to_owned();")] + /// + /// assert_eq!(trie.get(b"abc"), Some(5)); + /// assert_eq!(owned.get(b"abc"), Some(5)); + /// ``` + pub fn to_owned(&self) -> $name> { + $name::from_store( + Vec::from(self.store.as_ref()), + ) + } + pub fn iter(&self) -> impl Iterator + '_ { + $iter_fn(self.as_bytes()) + } + } + impl $name<[u8]> { + /// Casts from a byte slice to a reference to a trie with the same lifetime. + /// + /// If the bytes are not a valid trie, unexpected behavior may occur. 
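+        ///
+        /// # Examples
+        ///
+        /// A small sketch using the same 4-byte trie as the examples above
+        /// ("abc" mapping to 5):
+        ///
+        /// ```
+        #[doc = concat!("use zerotrie::", stringify!($name), ";")]
+        ///
+        #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"abc\\x85\");")]
+        ///
+        /// assert_eq!(trie.get(b"abc"), Some(5));
+        /// assert_eq!(trie.get(b"abd"), None);
+        /// ```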
+ pub fn from_bytes(trie: &[u8]) -> &Self { + Self::ref_cast(trie) + } + } + #[cfg(feature = "alloc")] + impl $name> { + pub(crate) fn try_from_tuple_slice(items: &[(&ByteStr, usize)]) -> Result { + ZeroTrieBuilder::>::from_sorted_tuple_slice( + items, + Self::BUILDER_OPTIONS, + ) + .map(|s| Self { + store: s.to_bytes(), + }) + } + } + #[cfg(feature = "alloc")] + impl<'a, K> TryFrom<&'a BTreeMap> for $name> + where + K: Borrow<[u8]> + { + type Error = crate::error::Error; + fn try_from(map: &'a BTreeMap) -> Result { + let tuples: Vec<(&[u8], usize)> = map + .iter() + .map(|(k, v)| (k.borrow(), *v)) + .collect(); + let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples); + Self::try_from_tuple_slice(byte_str_slice) + } + } + #[cfg(feature = "litemap")] + impl<'a, K, S> TryFrom<&'a LiteMap> for $name> + where + K: Borrow<[u8]>, + S: litemap::store::StoreIterable<'a, K, usize>, + { + type Error = crate::error::Error; + fn try_from(map: &'a LiteMap) -> Result { + let tuples: Vec<(&[u8], usize)> = map + .iter() + .map(|(k, v)| (k.borrow(), *v)) + .collect(); + let byte_str_slice = ByteStr::from_byte_slice_with_value(&tuples); + Self::try_from_tuple_slice(byte_str_slice) + } + } + #[cfg(feature = "alloc")] + impl $name + where + S: AsRef<[u8]> + ?Sized + { + /// Exports the data from this ZeroTrie type into a BTreeMap. + /// + /// ***Enable this impl with the `"alloc"` feature.*** + /// + /// # Examples + /// + /// ``` + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// use std::collections::BTreeMap; + /// + #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")] + /// let items = trie.to_btreemap(); + /// + /// assert_eq!(items.len(), 2); + /// + #[doc = concat!("let recovered_trie: ", stringify!($name), "> = items")] + /// .into_iter() + /// .collect(); + /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes()); + /// ``` + pub fn to_btreemap(&self) -> BTreeMap<$iter_ty, usize> { + self.iter().collect() + } + pub(crate) fn to_btreemap_bytes(&self) -> BTreeMap, usize> { + self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect() + } + } + // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. + impl Borrow<$name<[u8]>> for $name<&[u8]> { + fn borrow(&self) -> &$name<[u8]> { + self.as_borrowed() + } + } + // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. + #[cfg(feature = "alloc")] + impl Borrow<$name<[u8]>> for $name> { + fn borrow(&self) -> &$name<[u8]> { + self.as_borrowed() + } + } + // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. 
+ #[cfg(feature = "alloc")] + impl Borrow<$name<[u8]>> for $name> { + fn borrow(&self) -> &$name<[u8]> { + self.as_borrowed() + } + } + #[cfg(feature = "alloc")] + impl alloc::borrow::ToOwned for $name<[u8]> { + type Owned = $name>; + #[doc = concat!("This impl allows [`", stringify!($name), "`] to be used inside of a [`Cow`](alloc::borrow::Cow).")] + /// + #[doc = concat!("Note that it is also possible to use `", stringify!($name), ">` for a similar result.")] + /// + /// ***Enable this impl with the `"alloc"` feature.*** + /// + /// # Examples + /// + /// ``` + /// use std::borrow::Cow; + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// + #[doc = concat!("let trie: Cow<", stringify!($name), "<[u8]>> = Cow::Borrowed(", stringify!($name), "::from_bytes(b\"abc\\x85\"));")] + /// assert_eq!(trie.get(b"abc"), Some(5)); + /// ``` + fn to_owned(&self) -> Self::Owned { + let bytes: &[u8] = self.store.as_ref(); + $name::from_store( + Vec::from(bytes).into_boxed_slice(), + ) + } + } + #[cfg(feature = "litemap")] + impl $name + where + S: AsRef<[u8]> + ?Sized, + { + /// Exports the data from this ZeroTrie type into a LiteMap. + /// + /// ***Enable this function with the `"litemap"` feature.*** + /// + /// # Examples + /// + /// ``` + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// use litemap::LiteMap; + /// + #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")] + /// + /// let items = trie.to_litemap(); + /// assert_eq!(items.len(), 2); + /// + #[doc = concat!("let recovered_trie: ", stringify!($name), "> = items")] + /// .iter() + /// .map(|(k, v)| (k, *v)) + /// .collect(); + /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes()); + /// ``` + pub fn to_litemap(&self) -> LiteMap<$iter_ty, usize> { + self.iter().collect() + } + pub(crate) fn to_litemap_bytes(&self) -> LiteMap, usize> { + self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect() + } + } + #[cfg(feature = "litemap")] + impl $name> + { + #[cfg(feature = "serde")] + pub(crate) fn try_from_serde_litemap(items: &LiteMap, usize>) -> Result { + let lm_borrowed: LiteMap<&ByteStr, usize> = items.to_borrowed_keys(); + Self::try_from_tuple_slice(lm_borrowed.as_slice()) + } + } + #[cfg(feature = "alloc")] + impl<'a, K> FromIterator<(K, usize)> for $name> + where + K: AsRef<[u8]> + { + fn from_iter>(iter: T) -> Self { + use crate::builder::nonconst::ZeroTrieBuilder; + ZeroTrieBuilder::>::from_bytes_iter( + iter, + Self::BUILDER_OPTIONS + ) + .map(|s| Self { + store: s.to_bytes(), + }) + .unwrap() + } + } + // TODO(#2778): Auto-derive these impls based on the repr(transparent). 
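+        // Note: having a VarULE impl is what lets the borrowed form of these tries
+        // (over a `[u8]` store) be held by reference inside zerovec containers,
+        // e.g. as the element type of a `VarZeroVec`.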
+ // Safety: $name is repr(transparent) over S, a VarULE + #[cfg(feature = "zerovec")] + unsafe impl zerovec::ule::VarULE for $name + where + S: zerovec::ule::VarULE, + { + #[inline] + fn validate_byte_slice(bytes: &[u8]) -> Result<(), zerovec::ZeroVecError> { + S::validate_byte_slice(bytes) + } + #[inline] + unsafe fn from_byte_slice_unchecked(bytes: &[u8]) -> &Self { + core::mem::transmute(S::from_byte_slice_unchecked(bytes)) + } + } + }; +} + +#[cfg(feature = "alloc")] +fn vec_u8_to_box_u8(input: Vec) -> Box<[u8]> { + input.into_boxed_slice() +} + +#[cfg(feature = "alloc")] +fn string_to_box_u8(input: String) -> Box<[u8]> { + input.into_boxed_str().into_boxed_bytes() +} + +impl_zerotrie_subtype!( + ZeroTrieSimpleAscii, + SimpleAscii, + get_bsearch_only, + String, + get_iter_ascii_or_panic, + string_to_box_u8 +); +impl_zerotrie_subtype!( + ZeroTriePerfectHash, + PerfectHash, + get_phf_limited, + Vec, + get_iter_phf, + vec_u8_to_box_u8 +); +impl_zerotrie_subtype!( + ZeroTrieExtendedCapacity, + ExtendedCapacity, + get_phf_extended, + Vec, + get_iter_phf, + vec_u8_to_box_u8 +); + +macro_rules! impl_dispatch { + ($self:ident, $inner_fn:ident()) => { + match $self.0 { + ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn(), + ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn(), + ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn(), + } + }; + (&$self:ident, $inner_fn:ident()) => { + match &$self.0 { + ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn(), + ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn(), + ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn(), + } + }; + ($self:ident, $inner_fn:ident($arg:ident)) => { + match $self.0 { + ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn($arg), + ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn($arg), + ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn($arg), + } + }; + (&$self:ident, $inner_fn:ident($arg:ident)) => { + match &$self.0 { + ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn($arg), + ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn($arg), + ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn($arg), + } + }; +} + +impl ZeroTrie { + /// Takes the byte store from this trie. + pub fn take_store(self) -> S { + impl_dispatch!(self, take_store()) + } + /// Maps the store into another type. + pub fn map_store(self, f: impl FnOnce(S) -> X) -> ZeroTrie { + impl_dispatch!(self, map_store_into_zerotrie(f)) + } +} + +impl ZeroTrie +where + S: AsRef<[u8]>, +{ + /// Queries the trie for a string. + pub fn get(&self, key: K) -> Option + where + K: AsRef<[u8]>, + { + impl_dispatch!(&self, get(key)) + } + /// Returns `true` if the trie is empty. + pub fn is_empty(&self) -> bool { + impl_dispatch!(&self, is_empty()) + } + /// Returns the size of the trie in number of bytes. + /// + /// To get the number of keys in the trie, use `.iter().count()`. + pub fn byte_len(&self) -> usize { + impl_dispatch!(&self, byte_len()) + } +} + +#[cfg(feature = "alloc")] +impl ZeroTrie +where + S: AsRef<[u8]>, +{ + /// Exports the data from this ZeroTrie into a BTreeMap. + pub fn to_btreemap(&self) -> BTreeMap, usize> { + impl_dispatch!(&self, to_btreemap_bytes()) + } +} + +#[cfg(feature = "litemap")] +impl ZeroTrie +where + S: AsRef<[u8]>, +{ + /// Exports the data from this ZeroTrie into a LiteMap. 
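+    ///
+    /// # Examples
+    ///
+    /// A short sketch (building the trie via `FromIterator` assumes the `alloc`
+    /// feature is enabled in addition to `litemap`):
+    ///
+    /// ```
+    /// use zerotrie::ZeroTrie;
+    ///
+    /// let trie: ZeroTrie<Vec<u8>> = [(&b"abc"[..], 11), (&b"abcdef"[..], 22)]
+    ///     .into_iter()
+    ///     .collect();
+    /// let items = trie.to_litemap();
+    ///
+    /// assert_eq!(items.len(), 2);
+    /// assert_eq!(trie.get(b"abcdef"), Some(22));
+    /// ```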
+ pub fn to_litemap(&self) -> LiteMap, usize> { + impl_dispatch!(&self, to_litemap_bytes()) + } +} + +#[cfg(feature = "alloc")] +impl ZeroTrie> { + pub(crate) fn try_from_tuple_slice(items: &[(&ByteStr, usize)]) -> Result { + let is_all_ascii = items.iter().all(|(s, _)| s.is_all_ascii()); + if is_all_ascii && items.len() < 512 { + ZeroTrieSimpleAscii::try_from_tuple_slice(items).map(|x| x.into_zerotrie()) + } else { + ZeroTriePerfectHash::try_from_tuple_slice(items).map(|x| x.into_zerotrie()) + } + } +} + +#[cfg(feature = "alloc")] +impl FromIterator<(K, usize)> for ZeroTrie> +where + K: AsRef<[u8]>, +{ + fn from_iter>(iter: T) -> Self { + let items = Vec::from_iter(iter); + let mut items: Vec<(&[u8], usize)> = items.iter().map(|(k, v)| (k.as_ref(), *v)).collect(); + items.sort(); + let byte_str_slice = ByteStr::from_byte_slice_with_value(&items); + #[allow(clippy::unwrap_used)] // FromIterator is panicky + Self::try_from_tuple_slice(byte_str_slice).unwrap() + } +} diff --git a/experimental/zerotrie/tests/asciitrie_test.rs b/experimental/zerotrie/tests/asciitrie_test.rs new file mode 100644 index 00000000000..2511eeace33 --- /dev/null +++ b/experimental/zerotrie/tests/asciitrie_test.rs @@ -0,0 +1,73 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use postcard::ser_flavors::{AllocVec, Flavor}; +use serde::Serialize; +use zerotrie::ZeroTriePerfectHash; +use zerotrie::ZeroTrieSimpleAscii; +use zerovec::ZeroMap; + +mod testdata { + include!("data.rs"); +} + +#[test] +fn test_basic() { + let bytes_ascii = testdata::basic::TRIE_ASCII; + let data_ascii = testdata::basic::DATA_ASCII; + let trie_ascii = ZeroTrieSimpleAscii::from_bytes(bytes_ascii); + let trie_phf_ascii = ZeroTriePerfectHash::from_bytes(bytes_ascii); + + let bytes_unicode = testdata::basic::TRIE_UNICODE; + let data_unicode = testdata::basic::DATA_UNICODE; + let trie_phf_unicode = ZeroTriePerfectHash::from_bytes(bytes_unicode); + + let bytes_binary = testdata::basic::TRIE_BINARY; + let data_binary = testdata::basic::DATA_BINARY; + let trie_phf_binary = ZeroTriePerfectHash::from_bytes(bytes_binary); + + // Check that the getter works + for (key, expected) in data_ascii { + let actual = match trie_ascii.get(key) { + Some(v) => v, + None => panic!("value should be in trie: {:?} => {}", key, expected), + }; + assert_eq!(*expected, actual); + let actual = match trie_phf_ascii.get(key) { + Some(v) => v, + None => panic!("value should be in trie6: {:?} => {}", key, expected), + }; + assert_eq!(*expected, actual); + } + + for (key, expected) in data_unicode { + let actual_unicode = match trie_phf_unicode.get(key) { + Some(v) => v, + None => panic!("value should be in trie6: {:?} => {}", key, expected), + }; + assert_eq!(*expected, actual_unicode); + } + + for (key, expected) in data_binary { + let actual_bin6 = match trie_phf_binary.get(key) { + Some(v) => v, + None => panic!("value should be in trie6: {:?} => {}", key, expected), + }; + assert_eq!(*expected, actual_bin6); + } + + // Compare the size to a postcard ZeroMap + let zm: ZeroMap<[u8], usize> = data_ascii.iter().copied().collect(); + let mut serializer = postcard::Serializer { + output: AllocVec::new(), + }; + Serialize::serialize(&zm, &mut serializer).unwrap(); + let zeromap_bytes = serializer + .output + .finalize() + .expect("Failed to finalize serializer output"); + + assert_eq!(26, bytes_ascii.len()); + 
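+    // For this small ASCII data set, the trie encoding (26 bytes) is less than
+    // half the size of the postcard-serialized ZeroMap (61 bytes) built from
+    // the same keys and values.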
assert_eq!(61, zeromap_bytes.len()); +} diff --git a/experimental/zerotrie/tests/builder_test.rs b/experimental/zerotrie/tests/builder_test.rs new file mode 100644 index 00000000000..2d7920fa377 --- /dev/null +++ b/experimental/zerotrie/tests/builder_test.rs @@ -0,0 +1,836 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use litemap::LiteMap; +use zerotrie::ZeroTriePerfectHash; +use zerotrie::ZeroTrieSimpleAscii; + +mod testdata { + include!("data.rs"); +} + +use testdata::strings_to_litemap; + +const NON_EXISTENT_STRINGS: &[&str] = &[ + "a9PS", "ahsY", "ahBO", "a8IN", "xk8o", "xv1l", "xI2S", "618y", "d6My", "uszy", +]; + +macro_rules! assert_bytes_eq { + ($len:literal, $a:expr, $b:expr) => { + assert_eq!($len, $a.len()); + assert_eq!($a, $b); + }; +} + +fn check_simple_ascii_trie(items: &LiteMap<&[u8], usize>, trie: &ZeroTrieSimpleAscii) +where + S: AsRef<[u8]> + ?Sized, +{ + // Check that each item is in the trie + for (k, v) in items.iter() { + assert_eq!(trie.get(k), Some(*v)); + } + // Check that some items are not in the trie + for s in NON_EXISTENT_STRINGS.iter() { + assert_eq!(trie.get(s.as_bytes()), None); + } + // Check that the iterator returns items in the same order as the LiteMap + assert!(items + .iter() + .map(|(s, v)| (String::from_utf8(s.to_vec()).unwrap(), *v)) + .eq(trie.iter())); + // Check that the const builder works + let const_trie = ZeroTrieSimpleAscii::try_from_litemap_with_const_builder(items).unwrap(); + assert_eq!(trie.as_bytes(), const_trie.as_bytes()); +} + +fn check_phf_ascii_trie(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash) +where + S: AsRef<[u8]> + ?Sized, +{ + // Check that each item is in the trie + for (k, v) in items.iter() { + assert_eq!(trie.get(k), Some(*v)); + } + // Check that some items are not in the trie + for s in NON_EXISTENT_STRINGS.iter() { + assert_eq!(trie.get(s.as_bytes()), None); + } + // Check that the iterator returns the contents of the LiteMap + // Note: Since the items might not be in order, we collect them into a new LiteMap + let recovered_items: LiteMap<_, _> = trie.iter().collect(); + assert_eq!( + items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(), + recovered_items.to_borrowed_keys_values() + ); +} + +fn check_phf_bytes_trie(items: &LiteMap<&[u8], usize>, trie: &ZeroTriePerfectHash) +where + S: AsRef<[u8]> + ?Sized, +{ + // Check that each item is in the trie + for (k, v) in items.iter() { + assert_eq!(trie.get(k), Some(*v)); + } + // Check that some items are not in the trie + for s in NON_EXISTENT_STRINGS.iter() { + assert_eq!(trie.get(s.as_bytes()), None); + } + // Check that the iterator returns the contents of the LiteMap + // Note: Since the items might not be in order, we collect them into a new LiteMap + let recovered_items: LiteMap<_, _> = trie.iter().collect(); + assert_eq!( + items.to_borrowed_keys_values::<[u8], usize, Vec<_>>(), + recovered_items.to_borrowed_keys_values() + ); +} + +#[test] +fn test_basic() { + let lm1a: LiteMap<&[u8], usize> = testdata::basic::DATA_ASCII.iter().copied().collect(); + let lm1b: LiteMap<&[u8], usize> = lm1a.to_borrowed_keys(); + let lm2: LiteMap<&[u8], usize> = testdata::basic::DATA_UNICODE.iter().copied().collect(); + let lm3: LiteMap<&[u8], usize> = testdata::basic::DATA_BINARY.iter().copied().collect(); + + let expected_bytes = testdata::basic::TRIE_ASCII; + let trie = 
ZeroTrieSimpleAscii::try_from(&lm1a).unwrap(); + assert_bytes_eq!(26, trie.as_bytes(), expected_bytes); + check_simple_ascii_trie(&lm1a, &trie); + + let trie = ZeroTriePerfectHash::try_from(&lm1b).unwrap(); + assert_bytes_eq!(26, trie.as_bytes(), expected_bytes); + check_phf_ascii_trie(&lm1a, &trie); + + let expected_bytes = testdata::basic::TRIE_UNICODE; + let trie = ZeroTriePerfectHash::try_from(&lm2).unwrap(); + assert_bytes_eq!(39, trie.as_bytes(), expected_bytes); + check_phf_bytes_trie(&lm2, &trie); + + let expected_bytes = testdata::basic::TRIE_BINARY; + let trie = ZeroTriePerfectHash::try_from(&lm3).unwrap(); + assert_bytes_eq!(26, trie.as_bytes(), expected_bytes); + check_phf_bytes_trie(&lm3, &trie); +} + +#[test] +fn test_empty() { + let trie = ZeroTrieSimpleAscii::try_from(&LiteMap::<&[u8], usize>::new_vec()).unwrap(); + assert_eq!(trie.byte_len(), 0); + assert!(trie.is_empty()); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.as_bytes(), &[]); +} + +#[test] +fn test_single_empty_value() { + let litemap: LiteMap<&[u8], usize> = [ + (&b""[..], 10), // + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), Some(10)); + assert_eq!(trie.get(b"x"), None); + let expected_bytes = &[0b10001010]; + assert_eq!(trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(1, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_single_byte_string() { + let litemap: LiteMap<&[u8], usize> = [ + (&b"x"[..], 10), // + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"xy"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[b'x', 0b10001010]; + assert_bytes_eq!(2, trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(2, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_single_string() { + let litemap: LiteMap<&[u8], usize> = [ + (&b"xyz"[..], 10), // + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"x"), None); + assert_eq!(trie.get(b"xy"), None); + assert_eq!(trie.get(b"xyzz"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[b'x', b'y', b'z', 0b10001010]; + assert_bytes_eq!(4, trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(4, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_prefix_strings() { + let litemap: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"xy", 1)].into_iter().collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"xyz"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[b'x', 0b10000000, b'y', 0b10000001]; + assert_bytes_eq!(4, trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + 
let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(4, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_single_byte_branch() { + let litemap: LiteMap<&[u8], usize> = [(&b"x"[..], 0), (b"y", 1)].into_iter().collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"xy"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[0b11000010, b'x', b'y', 1, 0b10000000, 0b10000001]; + assert_bytes_eq!(6, trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(6, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_multi_byte_branch() { + let litemap: LiteMap<&[u8], usize> = [(&b"axb"[..], 0), (b"ayc", 1)].into_iter().collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"a"), None); + assert_eq!(trie.get(b"ax"), None); + assert_eq!(trie.get(b"ay"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[ + b'a', 0b11000010, b'x', b'y', 2, b'b', 0b10000000, b'c', 0b10000001, + ]; + assert_bytes_eq!(9, trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(9, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_linear_varint_values() { + let litemap: LiteMap<&[u8], usize> = [(&b""[..], 100), (b"x", 500), (b"xyz", 5000)] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b"xy"), None); + assert_eq!(trie.get(b"xz"), None); + assert_eq!(trie.get(b"xyzz"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[0x90, 0x54, b'x', 0x93, 0x64, b'y', b'z', 0x90, 0x96, 0x78]; + assert_bytes_eq!(10, trie.as_bytes(), expected_bytes); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(10, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_varint_branch() { + let chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + let litemap: LiteMap<&[u8], usize> = (0..chars.len()) + .map(|i| (chars.get(i..i + 1).unwrap().as_bytes(), i)) + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"ax"), None); + assert_eq!(trie.get(b"ay"), None); + check_simple_ascii_trie(&litemap, &trie); + #[rustfmt::skip] + let expected_bytes = &[ + 0b11100000, // branch varint lead + 0x14, // branch varint trail + // search array: + b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', + b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', + b'U', b'V', b'W', b'X', b'Y', b'Z', + b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', + b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', + b'u', b'v', b'w', b'x', b'y', b'z', + // offset array: + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, + 54, 56, 
58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, + 86, + // single-byte values: + 0x80, (0x80 | 1), (0x80 | 2), (0x80 | 3), (0x80 | 4), + (0x80 | 5), (0x80 | 6), (0x80 | 7), (0x80 | 8), (0x80 | 9), + (0x80 | 10), (0x80 | 11), (0x80 | 12), (0x80 | 13), (0x80 | 14), + (0x80 | 15), + // multi-byte values: + 0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5, + 0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x90, 10, 0x90, 11, + 0x90, 12, 0x90, 13, 0x90, 14, 0x90, 15, 0x90, 16, 0x90, 17, + 0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23, + 0x90, 24, 0x90, 25, 0x90, 26, 0x90, 27, 0x90, 28, 0x90, 29, + 0x90, 30, 0x90, 31, 0x90, 32, 0x90, 33, 0x90, 34, 0x90, 35, + ]; + assert_bytes_eq!(193, trie.as_bytes(), expected_bytes); + + #[rustfmt::skip] + let expected_bytes = &[ + 0b11100000, // branch varint lead + 0x14, // branch varint trail + // PHF metadata: + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 10, 12, 16, 4, 4, 4, 4, 4, 4, 8, + 4, 4, 4, 16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 7, + // search array: + b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', + b'p', b'u', b'v', b'w', b'D', b'E', b'F', b'q', + b'r', b'A', b'B', b'C', b'x', b'y', b'z', b's', + b'H', b'I', b'J', b'G', b'P', b'Q', b'R', b'S', + b'T', b'U', b'V', b'W', b'X', b'Y', b'Z', b'K', + b'L', b'M', b'N', b'O', b'g', b'a', b'b', b'c', + b't', b'd', b'f', b'e', + // offset array: + 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 25, 26, 27, + 29, 31, 32, 33, 34, 36, 38, 40, + 42, 43, 44, 45, 46, 47, 49, 51, + 53, 55, 57, 59, 61, 63, 65, 67, + 68, 69, 70, 71, 72, 74, 76, 78, + 80, 82, 84, 86, + // values: + 0x90, 17, 0x90, 18, 0x90, 19, 0x90, 20, 0x90, 21, 0x90, 22, 0x90, 23, + 0x90, 24, 0x90, 25, 0x90, 30, 0x90, 31, 0x90, 32, 0x80 | 3, 0x80 | 4, + 0x80 | 5, 0x90, 26, 0x90, 27, 0x80, 0x80 | 1, 0x80 | 2, 0x90, 33, + 0x90, 34, 0x90, 35, 0x90, 28, 0x80 | 7, 0x80 | 8, 0x80 | 9, 0x80 | 6, + 0x80 | 15, 0x90, 0, 0x90, 1, 0x90, 2, 0x90, 3, 0x90, 4, 0x90, 5, + 0x90, 6, 0x90, 7, 0x90, 8, 0x90, 9, 0x80 | 10, 0x80 | 11, 0x80 | 12, + 0x80 | 13, 0x80 | 14, 0x90, 16, 0x90, 10, 0x90, 11, 0x90, 12, 0x90, 29, + 0x90, 13, 0x90, 15, 0x90, 14, + ]; + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(246, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); +} + +#[test] +fn test_below_wide() { + let litemap: LiteMap<&[u8], usize> = [ + (&b"abcdefghijklmnopqrstuvwxyz"[..], 1), + (b"bcdefghijklmnopqrstuvwxyza", 2), + (b"cdefghijklmnopqrstuvwxyzab", 3), + (b"defghijklmnopqrstuvwxyzabc", 4), + (b"efghijklmnopqrstuvwxyzabcd", 5), + (b"fghijklmnopqrstuvwxyzabcde", 6), + (b"ghijklmnopqrstuvwxyzabcdef", 7), + (b"hijklmnopqrstuvwxyzabcdefg", 8), + (b"ijklmnopqrstuvwxyzabcdefgh", 9), + (b"jklmnopqrstuvwxyzabcd", 10), + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"abc"), None); + check_simple_ascii_trie(&litemap, &trie); + #[rustfmt::skip] + let expected_bytes = &[ + 0b11001010, // branch + // search array: + b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', + // offset array: + 26, 52, 78, 104, 130, 156, 182, 208, 234, + // offset data: + b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', + b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', + 0x81, + b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', 
b'm', b'n', b'o', + b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', + 0x82, + b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', + b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', + 0x83, + b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', + b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', + 0x84, + b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', + b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', + 0x85, + b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', + b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', + 0x86, + b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', + b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', + 0x87, + b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', + b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', + 0x88, + b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', + b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', + 0x89, + b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', + b'x', b'y', b'z', b'a', b'b', b'c', b'd', + 0x8A, + ]; + assert_bytes_eq!(275, trie.as_bytes(), expected_bytes); +} + +#[test] +fn test_at_wide() { + let litemap: LiteMap<&[u8], usize> = [ + (&b"abcdefghijklmnopqrstuvwxyz"[..], 1), + (b"bcdefghijklmnopqrstuvwxyza", 2), + (b"cdefghijklmnopqrstuvwxyzab", 3), + (b"defghijklmnopqrstuvwxyzabc", 4), + (b"efghijklmnopqrstuvwxyzabcd", 5), + (b"fghijklmnopqrstuvwxyzabcde", 6), + (b"ghijklmnopqrstuvwxyzabcdef", 7), + (b"hijklmnopqrstuvwxyzabcdefg", 8), + (b"ijklmnopqrstuvwxyzabcdefgh", 9), + (b"jklmnopqrstuvwxyzabcde", 10), + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"abc"), None); + check_simple_ascii_trie(&litemap, &trie); + #[rustfmt::skip] + let expected_bytes = &[ + 0b11100001, // branch lead + 0x6A, // branch trail + // search array: + b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', + // offset array (wide): + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 26, 52, 78, 104, 130, 156, 182, 208, 234, + // offset data: + b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', + b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', + 0x81, + b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', + b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', + 0x82, + b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', + b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', + 0x83, + b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', + b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', + 0x84, + b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', + b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', + 0x85, + b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', + b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', + 0x86, + b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', + b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', + 0x87, + b'i', 
b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', + b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', + 0x88, + b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', + b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', + 0x89, + b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', + b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', + 0x8A, + ]; + assert_bytes_eq!(286, trie.as_bytes(), expected_bytes); +} + +#[test] +fn test_at_wide_plus() { + let litemap: LiteMap<&[u8], usize> = [ + (&b"abcdefghijklmnopqrstuvwxyz"[..], 1), + (b"bcdefghijklmnopqrstuvwxyza", 2), + (b"cdefghijklmnopqrstuvwxyzab", 3), + (b"defghijklmnopqrstuvwxyzabc", 4), + (b"efghijklmnopqrstuvwxyzabcd", 5), + (b"fghijklmnopqrstuvwxyzabcde", 6), + (b"ghijklmnopqrstuvwxyzabcdef", 7), + (b"hijklmnopqrstuvwxyzabcdefg", 8), + (b"ijklmnopqrstuvwxyzabcdefgh", 9), + (b"jklmnopqrstuvwxyzabcdef", 10), + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), None); + assert_eq!(trie.get(b"abc"), None); + check_simple_ascii_trie(&litemap, &trie); + #[rustfmt::skip] + let expected_bytes = &[ + 0b11100001, // branch lead + 0x6A, // branch trail + // search array: + b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', + // offset array (wide): + 0, 0, 0, 0, 0, 0, 0, 0, 0, + 26, 52, 78, 104, 130, 156, 182, 208, 234, + // offset data: + b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', + b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', + 0x81, + b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', + b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', + 0x82, + b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', + b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', + 0x83, + b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', + b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', + 0x84, + b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', + b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', + 0x85, + b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', + b't', b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', + 0x86, + b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', + b'u', b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', + 0x87, + b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', + b'v', b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', + 0x88, + b'j', b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', + b'w', b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', + 0x89, + b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', + b'x', b'y', b'z', b'a', b'b', b'c', b'd', b'e', b'f', + 0x8A, + ]; + assert_bytes_eq!(287, trie.as_bytes(), expected_bytes); +} + +#[test] +fn test_everything() { + let litemap: LiteMap<&[u8], usize> = [ + (&b""[..], 0), + (b"axb", 100), + (b"ayc", 2), + (b"azd", 3), + (b"bxe", 4), + (b"bxefg", 500), + (b"bxefh", 6), + (b"bxei", 7), + (b"bxeikl", 8), + ] + .into_iter() + .collect(); + let trie = ZeroTrieSimpleAscii::try_from(&litemap.as_sliced()).unwrap(); + assert_eq!(trie.get(b""), Some(0)); + 
assert_eq!(trie.get(b"a"), None); + assert_eq!(trie.get(b"ax"), None); + assert_eq!(trie.get(b"ay"), None); + check_simple_ascii_trie(&litemap, &trie); + let expected_bytes = &[ + 0b10000000, // value 0 + 0b11000010, // branch of 2 + b'a', // + b'b', // + 13, // + 0b11000011, // branch of 3 + b'x', // + b'y', // + b'z', // + 3, // + 5, // + b'b', // + 0b10010000, // value 100 (lead) + 0x54, // value 100 (trail) + b'c', // + 0b10000010, // value 2 + b'd', // + 0b10000011, // value 3 + b'x', // + b'e', // + 0b10000100, // value 4 + 0b11000010, // branch of 2 + b'f', // + b'i', // + 7, // + 0b11000010, // branch of 2 + b'g', // + b'h', // + 2, // + 0b10010011, // value 500 (lead) + 0x64, // value 500 (trail) + 0b10000110, // value 6 + 0b10000111, // value 7 + b'k', // + b'l', // + 0b10001000, // value 8 + ]; + assert_bytes_eq!(36, trie.as_bytes(), expected_bytes); + + #[rustfmt::skip] + let expected_bytes = &[ + 0b10000000, // value 0 + 0b11000010, // branch of 2 + b'a', // + b'b', // + 13, // + 0b11000011, // start of 'a' subtree: branch of 3 + b'x', // + b'y', // + b'z', // + 3, // + 5, // + b'b', // + 0b10010000, // value 100 (lead) + 0x54, // value 100 (trail) + b'c', // + 0b10000010, // value 2 + b'd', // + 0b10000011, // value 3 + b'x', // start of 'b' subtree + b'e', // + 0b10000100, // value 4 + 0b11000010, // branch of 2 + b'f', // + b'i', // + 7, // + 0b11000010, // branch of 2 + b'g', // + b'h', // + 2, // + 0b10010011, // value 500 (lead) + 0x64, // value 500 (trail) + 0b10000110, // value 6 + 0b10000111, // value 7 + b'k', // + b'l', // + 0b10001000, // value 8 + ]; + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_bytes_eq!(36, trie_phf.as_bytes(), expected_bytes); + check_phf_ascii_trie(&litemap, &trie_phf); + + let zhm: zerovec::ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 75); + + let zhm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 65); + + let zhm: zerovec::ZeroHashMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 148); + + let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 138); +} + +macro_rules! 
utf8_byte { + ($ch:expr, $i:literal) => {{ + let mut utf8_encoder_buf = [0u8; 4]; + $ch.encode_utf8(&mut utf8_encoder_buf); + utf8_encoder_buf[$i] + }}; +} + +#[test] +fn test_non_ascii() { + let litemap: LiteMap<&[u8], usize> = [ + ("".as_bytes(), 0), + ("axb".as_bytes(), 100), + ("ayc".as_bytes(), 2), + ("azd".as_bytes(), 3), + ("bxe".as_bytes(), 4), + ("bxefg".as_bytes(), 500), + ("bxefh".as_bytes(), 6), + ("bxei".as_bytes(), 7), + ("bxeikl".as_bytes(), 8), + ("bxeiklmΚαλημέρααα".as_bytes(), 9), + ("bxeiklmαnλo".as_bytes(), 10), + ("bxeiklmη".as_bytes(), 11), + ] + .into_iter() + .collect(); + + #[rustfmt::skip] + let expected_bytes = &[ + 0b10000000, // value 0 + 0b11000010, // branch of 2 + b'a', // + b'b', // + 13, // + 0b11000011, // start of 'a' subtree: branch of 3 + b'x', // + b'y', // + b'z', // + 3, // + 5, // + b'b', // + 0b10010000, // value 100 (lead) + 0x54, // value 100 (trail) + b'c', // + 0b10000010, // value 2 + b'd', // + 0b10000011, // value 3 + b'x', // start of 'b' subtree + b'e', // + 0b10000100, // value 4 + 0b11000010, // branch of 2 + b'f', // + b'i', // + 7, // + 0b11000010, // branch of 2 + b'g', // + b'h', // + 2, // + 0b10010011, // value 500 (lead) + 0x64, // value 500 (trail) + 0b10000110, // value 6 + 0b10000111, // value 7 + b'k', // + b'l', // + 0b10001000, // value 8 + b'm', // + 0b10100001, // span of length 1 + utf8_byte!('Κ', 0), // NOTE: all three letters have the same lead byte + 0b11000011, // branch of 3 + utf8_byte!('Κ', 1), + utf8_byte!('α', 1), + utf8_byte!('η', 1), + 21, + 27, + 0b10110000, // span of length 18 (lead) + 0b00000010, // span of length 18 (trail) + utf8_byte!('α', 0), + utf8_byte!('α', 1), + utf8_byte!('λ', 0), + utf8_byte!('λ', 1), + utf8_byte!('η', 0), + utf8_byte!('η', 1), + utf8_byte!('μ', 0), + utf8_byte!('μ', 1), + utf8_byte!('έ', 0), + utf8_byte!('έ', 1), + utf8_byte!('ρ', 0), + utf8_byte!('ρ', 1), + utf8_byte!('α', 0), + utf8_byte!('α', 1), + utf8_byte!('α', 0), + utf8_byte!('α', 1), + utf8_byte!('α', 0), + utf8_byte!('α', 1), + 0b10001001, // value 9 + b'n', + 0b10100010, // span of length 2 + utf8_byte!('λ', 0), + utf8_byte!('λ', 1), + b'o', + 0b10001010, // value 10 + 0b10001011, // value 11 + ]; + let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap(); + assert_bytes_eq!(73, trie_phf.as_bytes(), expected_bytes); + check_phf_bytes_trie(&litemap, &trie_phf); +} + +#[test] +fn test_max_branch() { + // Evaluate a branch with all 256 possible children + let mut litemap: LiteMap<&[u8], usize> = LiteMap::new_vec(); + let all_bytes: Vec = (u8::MIN..=u8::MAX).collect(); + assert_eq!(all_bytes.len(), 256); + let all_bytes_prefixed: Vec<[u8; 2]> = (u8::MIN..=u8::MAX).map(|x| [b'\0', x]).collect(); + for b in all_bytes.iter() { + litemap.insert(core::slice::from_ref(b), *b as usize); + } + for s in all_bytes_prefixed.iter() { + litemap.insert(s, s[1] as usize); + } + let trie_phf = ZeroTriePerfectHash::try_from(&litemap).unwrap(); + assert_eq!(trie_phf.byte_len(), 3042); + check_phf_bytes_trie(&litemap, &trie_phf); +} + +#[test] +fn test_short_subtags_10pct() { + let litemap = strings_to_litemap(testdata::short_subtags_10pct::STRINGS); + + let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap(); + assert_eq!(trie.byte_len(), 1050); + check_simple_ascii_trie(&litemap, &trie); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_eq!(trie_phf.byte_len(), 1100); + check_phf_ascii_trie(&litemap, &trie_phf); + + let zhm: 
zerovec::ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 1331); + + let zhm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 1330); + + let zhm: zerovec::ZeroHashMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 2837); + + let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 2836); +} + +#[test] +fn test_short_subtags() { + let litemap = strings_to_litemap(testdata::short_subtags::STRINGS); + + let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap(); + assert_eq!(trie.byte_len(), 8793); + check_simple_ascii_trie(&litemap, &trie); + + let litemap_bytes = litemap.to_borrowed_keys::<[u8], Vec<_>>(); + let trie_phf = ZeroTriePerfectHash::try_from(&litemap_bytes).unwrap(); + assert_eq!(trie_phf.byte_len(), 9400); + check_phf_ascii_trie(&litemap, &trie_phf); + + let zm: zerovec::ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + let zhm_buf = postcard::to_allocvec(&zm).unwrap(); + assert_eq!(zhm_buf.len(), 15182); + + let zm: zerovec::ZeroMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect(); + let zhm_buf = postcard::to_allocvec(&zm).unwrap(); + assert_eq!(zhm_buf.len(), 13304); + + let zhm: zerovec::ZeroHashMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 30200); + + let zhm: zerovec::ZeroHashMap<[u8], u8> = litemap.iter().map(|(a, b)| (*a, *b as u8)).collect(); + let zhm_buf = postcard::to_allocvec(&zhm).unwrap(); + assert_eq!(zhm_buf.len(), 28322); +} diff --git a/experimental/zerotrie/tests/data.rs b/experimental/zerotrie/tests/data.rs new file mode 100644 index 00000000000..9de102251b0 --- /dev/null +++ b/experimental/zerotrie/tests/data.rs @@ -0,0 +1,2204 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
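The test-data module that follows builds its hand-written tries from two const helpers for single-byte nodes. A small illustrative mirror of them, tied to byte values that appear in the annotated tries above (the names here are not the crate's):

```
// Illustrative only: mirrors the const helpers defined just below in this module.
const fn single_byte_value(x: u8) -> u8 {
    debug_assert!(x <= 0b0000_1111);
    x | 0b1000_0000 // value node: 0b1000xxxx holds a final value 0..=15
}
const fn single_byte_branch(x: u8) -> u8 {
    debug_assert!(x <= 0b0000_1111);
    x | 0b1100_0000 // branch node: 0b1100xxxx is followed by x search bytes
}

#[test]
fn node_byte_sketch() {
    // The "value 2" and "branch of 3" bytes seen in the annotated expected tries:
    assert_eq!(single_byte_value(2), 0b1000_0010);
    assert_eq!(single_byte_branch(3), 0b1100_0011);
}
```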
+ +use litemap::LiteMap; + +const fn single_byte_intermediate_value(x: u8) -> u8 { + debug_assert!(x <= 0b00001111); + x | 0b10000000 +} + +use single_byte_intermediate_value as single_byte_short_value; + +const fn single_byte_branch_equal(x: u8) -> u8 { + debug_assert!(x <= 0b00001111); + x | 0b11000000 +} + +use single_byte_branch_equal as single_byte_short_match; + +#[allow(dead_code)] +pub fn strings_to_litemap<'a>(strings: &[&'a str]) -> LiteMap<&'a [u8], usize> { + strings + .iter() + .copied() + .map(|x| x.as_bytes()) + .enumerate() + .map(|(i, s)| (s, i)) + .collect() +} + +#[allow(dead_code)] +pub mod basic { + use super::*; + pub static TRIE_ASCII: &[u8] = &[ + b'a', + b'b', + single_byte_short_value(1), + b'c', + single_byte_short_value(2), + // Begin Match Node + single_byte_short_match(3), + b'd', + b'e', + b'f', + 5, + 8, + // End Match Node + // subslice @ 0 + single_byte_short_value(3), + b'g', + b'h', + b'i', + single_byte_short_value(4), + // subslice @ 5 + b'j', + b'k', + single_byte_short_value(5), + // subslice @ 8 + // Begin Match Node + single_byte_short_match(2), + b'l', + b'm', + 1, + // End Match Node + // subsubslice @ 0 + single_byte_short_value(6), + // subsubslice @ 1 + b'n', + single_byte_short_value(7), + ]; + pub static DATA_ASCII: &[(&[u8], usize)] = &[ + (b"ab", 1), + (b"abc", 2), + (b"abcd", 3), + (b"abcdghi", 4), + (b"abcejk", 5), + (b"abcfl", 6), + (b"abcfmn", 7), + ]; + + pub static TRIE_UNICODE: &[u8] = &[ + 196, 100, 102, 103, 107, 12, 17, 23, 195, 97, 101, 105, 2, 4, 115, 129, 114, 130, 101, 131, + 162, 195, 188, 114, 132, 114, 111, 162, 195, 159, 133, 162, 195, 182, 110, 110, 101, 110, + 134, + ]; + pub static DATA_UNICODE: &[(&[u8], usize)] = &[ + ("das".as_bytes(), 1), + ("der".as_bytes(), 2), + ("die".as_bytes(), 3), + ("für".as_bytes(), 4), + ("groß".as_bytes(), 5), + ("können".as_bytes(), 6), + ]; + + pub static TRIE_BINARY: &[u8] = &[ + 196, 0, 129, 144, 240, 3, 9, 12, 161, 144, 131, 194, 130, 131, 1, 129, 130, 161, 144, 132, + 161, 255, 133, 161, 255, 134, + ]; + pub static DATA_BINARY: &[(&[u8], usize)] = &[ + (b"\0\x90", 3), + (b"\x81\x82", 1), + (b"\x81\x83", 2), + (b"\x90\x90", 4), + (b"\xF0\xFF", 5), + (b"\xF0\xFF\xFF", 6), + ]; + + // Note: Cow and ZeroVec have the same serialized form + pub static JSON_STR_ASCII: &str = "{\"trie\":{\"ab\":1,\"abc\":2,\"abcd\":3,\"abcdghi\":4,\"abcejk\":5,\"abcfl\":6,\"abcfmn\":7}}"; + pub static JSON_STR_UNICODE: &str = + "{\"trie\":{\"das\":1,\"der\":2,\"die\":3,\"für\":4,\"groß\":5,\"können\":6}}"; + pub static JSON_STR_BINARY: &str = "{\"trie\":[[[0,144],3],[[129,130],1],[[129,131],2],[[144,144],4],[[240,255],5],[[240,255,255],6]]}"; + pub static BINCODE_BYTES_ASCII: &[u8] = &[ + 26, 0, 0, 0, 0, 0, 0, 0, 97, 98, 129, 99, 130, 195, 100, 101, 102, 5, 8, 131, 103, 104, + 105, 132, 106, 107, 133, 194, 108, 109, 1, 134, 110, 135, + ]; + pub static BINCODE_BYTES_UNICODE: &[u8] = &[ + 39, 0, 0, 0, 0, 0, 0, 0, 196, 100, 102, 103, 107, 12, 17, 23, 195, 97, 101, 105, 2, 4, 115, + 129, 114, 130, 101, 131, 162, 195, 188, 114, 132, 114, 111, 162, 195, 159, 133, 162, 195, + 182, 110, 110, 101, 110, 134, + ]; + pub static BINCODE_BYTES_BINARY: &[u8] = &[ + 26, 0, 0, 0, 0, 0, 0, 0, 196, 0, 129, 144, 240, 3, 9, 12, 161, 144, 131, 194, 130, 131, 1, + 129, 130, 161, 144, 132, 161, 255, 133, 161, 255, 134, + ]; +} + +#[allow(dead_code)] +pub mod short_subtags { + pub static STRINGS: &[&str] = &[ + "aa", + "aai", + "aak", + "aau", + "ab", + "abi", + "abq", + "abr", + "abt", + "aby", + "acd", + "ace", + "ach", + "ada", 
+ "ade", + "adj", + "adp", + "ady", + "adz", + "ae", + "aeb", + "aey", + "af", + "agc", + "agd", + "agg", + "agm", + "ago", + "agq", + "aha", + "ahl", + "aho", + "ajg", + "ak", + "akk", + "ala", + "ali", + "aln", + "alt", + "am", + "amm", + "amn", + "amo", + "amp", + "an", + "anc", + "ank", + "ann", + "any", + "aoj", + "aom", + "aoz", + "apc", + "apd", + "ape", + "apr", + "aps", + "apz", + "ar", + "arc", + "arc-Nbat", + "arc-Palm", + "arh", + "arn", + "aro", + "arq", + "ars", + "ary", + "arz", + "as", + "asa", + "ase", + "asg", + "aso", + "ast", + "ata", + "atg", + "atj", + "auy", + "av", + "avl", + "avn", + "avt", + "avu", + "awa", + "awb", + "awo", + "awx", + "ay", + "ayb", + "az", + "az-Arab", + "az-IQ", + "az-IR", + "az-RU", + "ba", + "bal", + "ban", + "bap", + "bar", + "bas", + "bav", + "bax", + "bba", + "bbb", + "bbc", + "bbd", + "bbj", + "bbp", + "bbr", + "bcf", + "bch", + "bci", + "bcm", + "bcn", + "bco", + "bcq", + "bcu", + "bdd", + "be", + "bef", + "beh", + "bej", + "bem", + "bet", + "bew", + "bex", + "bez", + "bfd", + "bfq", + "bft", + "bfy", + "bg", + "bgc", + "bgn", + "bgx", + "bhb", + "bhg", + "bhi", + "bhl", + "bho", + "bhy", + "bi", + "bib", + "big", + "bik", + "bim", + "bin", + "bio", + "biq", + "bjh", + "bji", + "bjj", + "bjn", + "bjo", + "bjr", + "bjt", + "bjz", + "bkc", + "bkm", + "bkq", + "bku", + "bkv", + "bla", + "blg", + "blt", + "bm", + "bmh", + "bmk", + "bmq", + "bmu", + "bn", + "bng", + "bnm", + "bnp", + "bo", + "boj", + "bom", + "bon", + "bpy", + "bqc", + "bqi", + "bqp", + "bqv", + "br", + "bra", + "brh", + "brx", + "brz", + "bs", + "bsj", + "bsq", + "bss", + "bst", + "bto", + "btt", + "btv", + "bua", + "buc", + "bud", + "bug", + "buk", + "bum", + "buo", + "bus", + "buu", + "bvb", + "bwd", + "bwr", + "bxh", + "bye", + "byn", + "byr", + "bys", + "byv", + "byx", + "bza", + "bze", + "bzf", + "bzh", + "bzw", + "ca", + "cad", + "can", + "cbj", + "cch", + "ccp", + "ce", + "ceb", + "cfa", + "cgg", + "ch", + "chk", + "chm", + "cho", + "chp", + "chr", + "cic", + "cja", + "cjm", + "cjv", + "ckb", + "ckl", + "cko", + "cky", + "cla", + "clc", + "cme", + "cmg", + "co", + "cop", + "cps", + "cr", + "crg", + "crh", + "crk", + "crl", + "crs", + "cs", + "csb", + "csw", + "ctd", + "cu", + "cu-Glag", + "cv", + "cy", + "da", + "dad", + "daf", + "dag", + "dah", + "dak", + "dar", + "dav", + "dbd", + "dbq", + "dcc", + "ddn", + "de", + "ded", + "den", + "dga", + "dgh", + "dgi", + "dgl", + "dgr", + "dgz", + "dia", + "dje", + "dmf", + "dnj", + "dob", + "doi", + "dop", + "dow", + "drh", + "dri", + "drs", + "dsb", + "dtm", + "dtp", + "dts", + "dty", + "dua", + "duc", + "dud", + "dug", + "dv", + "dva", + "dww", + "dyo", + "dyu", + "dz", + "dzg", + "ebu", + "ee", + "efi", + "egl", + "egy", + "eka", + "eky", + "el", + "ema", + "emi", + "en", + "en-Shaw", + "enn", + "enq", + "eo", + "eri", + "es", + "esg", + "esu", + "et", + "etr", + "ett", + "etu", + "etx", + "eu", + "ewo", + "ext", + "eza", + "fa", + "faa", + "fab", + "fag", + "fai", + "fan", + "ff", + "ff-Adlm", + "ffi", + "ffm", + "fi", + "fia", + "fil", + "fit", + "fj", + "flr", + "fmp", + "fo", + "fod", + "fon", + "for", + "fpe", + "fqs", + "fr", + "frc", + "frp", + "frr", + "frs", + "fub", + "fud", + "fue", + "fuf", + "fuh", + "fuq", + "fur", + "fuv", + "fuy", + "fvr", + "fy", + "ga", + "gaa", + "gaf", + "gag", + "gah", + "gaj", + "gam", + "gan", + "gaw", + "gay", + "gba", + "gbf", + "gbm", + "gby", + "gbz", + "gcr", + "gd", + "gde", + "gdn", + "gdr", + "geb", + "gej", + "gel", + "gez", + "gfk", + "ggn", + "ghs", + "gil", + "gim", + 
"gjk", + "gjn", + "gju", + "gkn", + "gkp", + "gl", + "glk", + "gmm", + "gmv", + "gn", + "gnd", + "gng", + "god", + "gof", + "goi", + "gom", + "gon", + "gor", + "gos", + "got", + "grb", + "grc", + "grc-Linb", + "grt", + "grw", + "gsw", + "gu", + "gub", + "guc", + "gud", + "gur", + "guw", + "gux", + "guz", + "gv", + "gvf", + "gvr", + "gvs", + "gwc", + "gwi", + "gwt", + "gyi", + "ha", + "ha-CM", + "ha-SD", + "hag", + "hak", + "ham", + "haw", + "haz", + "hbb", + "hdy", + "he", + "hhy", + "hi", + "hi-Latn", + "hia", + "hif", + "hig", + "hih", + "hil", + "hla", + "hlu", + "hmd", + "hmt", + "hnd", + "hne", + "hnj", + "hnn", + "hno", + "ho", + "hoc", + "hoj", + "hot", + "hr", + "hsb", + "hsn", + "ht", + "hu", + "hui", + "hur", + "hy", + "hz", + "ia", + "ian", + "iar", + "iba", + "ibb", + "iby", + "ica", + "ich", + "id", + "idd", + "idi", + "idu", + "ife", + "ig", + "igb", + "ige", + "ii", + "ijj", + "ik", + "ikk", + "ikw", + "ikx", + "ilo", + "imo", + "in", + "inh", + "io", + "iou", + "iri", + "is", + "it", + "iu", + "iw", + "iwm", + "iws", + "izh", + "izi", + "ja", + "jab", + "jam", + "jar", + "jbo", + "jbu", + "jen", + "jgk", + "jgo", + "ji", + "jib", + "jmc", + "jml", + "jra", + "jut", + "jv", + "jw", + "ka", + "kaa", + "kab", + "kac", + "kad", + "kai", + "kaj", + "kam", + "kao", + "kaw", + "kbd", + "kbm", + "kbp", + "kbq", + "kbx", + "kby", + "kcg", + "kck", + "kcl", + "kct", + "kde", + "kdh", + "kdl", + "kdt", + "kea", + "ken", + "kez", + "kfo", + "kfr", + "kfy", + "kg", + "kge", + "kgf", + "kgp", + "kha", + "khb", + "khn", + "khq", + "khs", + "kht", + "khw", + "khz", + "ki", + "kij", + "kiu", + "kiw", + "kj", + "kjd", + "kjg", + "kjs", + "kjy", + "kk", + "kk-AF", + "kk-Arab", + "kk-CN", + "kk-IR", + "kk-MN", + "kkc", + "kkj", + "kl", + "kln", + "klq", + "klt", + "klx", + "km", + "kmb", + "kmh", + "kmo", + "kms", + "kmu", + "kmw", + "kn", + "knf", + "knp", + "ko", + "koi", + "kok", + "kol", + "kos", + "koz", + "kpe", + "kpf", + "kpo", + "kpr", + "kpx", + "kqb", + "kqf", + "kqs", + "kqy", + "kr", + "krc", + "kri", + "krj", + "krl", + "krs", + "kru", + "ks", + "ksb", + "ksd", + "ksf", + "ksh", + "ksj", + "ksr", + "ktb", + "ktm", + "kto", + "ktr", + "ku", + "ku-Arab", + "ku-LB", + "ku-Yezi", + "kub", + "kud", + "kue", + "kuj", + "kum", + "kun", + "kup", + "kus", + "kv", + "kvg", + "kvr", + "kvx", + "kw", + "kwj", + "kwk", + "kwo", + "kwq", + "kxa", + "kxc", + "kxe", + "kxl", + "kxm", + "kxp", + "kxw", + "kxz", + "ky", + "ky-Arab", + "ky-CN", + "ky-Latn", + "ky-TR", + "kye", + "kyx", + "kzh", + "kzj", + "kzr", + "kzt", + "la", + "lab", + "lad", + "lag", + "lah", + "laj", + "las", + "lb", + "lbe", + "lbu", + "lbw", + "lcm", + "lcp", + "ldb", + "led", + "lee", + "lem", + "lep", + "leq", + "leu", + "lez", + "lg", + "lgg", + "li", + "lia", + "lid", + "lif", + "lif-Limb", + "lig", + "lih", + "lij", + "lil", + "lis", + "ljp", + "lki", + "lkt", + "lle", + "lln", + "lmn", + "lmo", + "lmp", + "ln", + "lns", + "lnu", + "lo", + "loj", + "lok", + "lol", + "lor", + "los", + "loz", + "lrc", + "lt", + "ltg", + "lu", + "lua", + "luo", + "luy", + "luz", + "lv", + "lwl", + "lzh", + "lzz", + "mad", + "maf", + "mag", + "mai", + "mak", + "man", + "man-GN", + "man-Nkoo", + "mas", + "maw", + "maz", + "mbh", + "mbo", + "mbq", + "mbu", + "mbw", + "mci", + "mcp", + "mcq", + "mcr", + "mcu", + "mda", + "mde", + "mdf", + "mdh", + "mdj", + "mdr", + "mdx", + "med", + "mee", + "mek", + "men", + "mer", + "met", + "meu", + "mfa", + "mfe", + "mfn", + "mfo", + "mfq", + "mg", + "mgh", + "mgl", + "mgo", + "mgp", + "mgy", + "mh", + 
"mhi", + "mhl", + "mi", + "mic", + "mif", + "min", + "miw", + "mk", + "mki", + "mkl", + "mkp", + "mkw", + "ml", + "mle", + "mlp", + "mls", + "mmo", + "mmu", + "mmx", + "mn", + "mn-CN", + "mn-Mong", + "mna", + "mnf", + "mni", + "mnw", + "mo", + "moa", + "moe", + "moh", + "mos", + "mox", + "mpp", + "mps", + "mpt", + "mpx", + "mql", + "mr", + "mrd", + "mrj", + "mro", + "ms", + "ms-CC", + "mt", + "mtc", + "mtf", + "mti", + "mtr", + "mua", + "mur", + "mus", + "mva", + "mvn", + "mvy", + "mwk", + "mwr", + "mwv", + "mww", + "mxc", + "mxm", + "my", + "myk", + "mym", + "myv", + "myw", + "myx", + "myz", + "mzk", + "mzm", + "mzn", + "mzp", + "mzw", + "mzz", + "na", + "nac", + "naf", + "nak", + "nan", + "nap", + "naq", + "nas", + "nb", + "nca", + "nce", + "ncf", + "nch", + "nco", + "ncu", + "nd", + "ndc", + "nds", + "ne", + "neb", + "new", + "nex", + "nfr", + "ng", + "nga", + "ngb", + "ngl", + "nhb", + "nhe", + "nhw", + "nif", + "nii", + "nij", + "nin", + "niu", + "niy", + "niz", + "njo", + "nkg", + "nko", + "nl", + "nmg", + "nmz", + "nn", + "nnf", + "nnh", + "nnk", + "nnm", + "nnp", + "no", + "nod", + "noe", + "non", + "nop", + "nou", + "nqo", + "nr", + "nrb", + "nsk", + "nsn", + "nso", + "nss", + "nst", + "ntm", + "ntr", + "nui", + "nup", + "nus", + "nuv", + "nux", + "nv", + "nwb", + "nxq", + "nxr", + "ny", + "nym", + "nyn", + "nzi", + "oc", + "ogc", + "oj", + "ojs", + "oka", + "okr", + "okv", + "om", + "ong", + "onn", + "ons", + "opm", + "or", + "oro", + "oru", + "os", + "osa", + "ota", + "otk", + "oui", + "ozm", + "pa", + "pa-Arab", + "pa-PK", + "pag", + "pal", + "pal-Phlp", + "pam", + "pap", + "pau", + "pbi", + "pcd", + "pcm", + "pdc", + "pdt", + "ped", + "peo", + "pex", + "pfl", + "phl", + "phn", + "pil", + "pip", + "pka", + "pko", + "pl", + "pla", + "pms", + "png", + "pnn", + "pnt", + "pon", + "ppa", + "ppo", + "pqm", + "pra", + "prd", + "prg", + "ps", + "pss", + "pt", + "ptp", + "puu", + "pwa", + "qu", + "quc", + "qug", + "rai", + "raj", + "rao", + "rcf", + "rej", + "rel", + "res", + "rgn", + "rhg", + "ria", + "rif", + "rif-NL", + "rjs", + "rkt", + "rm", + "rmf", + "rmo", + "rmt", + "rmu", + "rn", + "rna", + "rng", + "ro", + "rob", + "rof", + "roo", + "rro", + "rtm", + "ru", + "rue", + "rug", + "rw", + "rwk", + "rwo", + "ryu", + "sa", + "saf", + "sah", + "saq", + "sas", + "sat", + "sav", + "saz", + "sba", + "sbe", + "sbp", + "sc", + "sck", + "scl", + "scn", + "sco", + "sd", + "sd-Deva", + "sd-IN", + "sd-Khoj", + "sd-Sind", + "sdc", + "sdh", + "se", + "sef", + "seh", + "sei", + "ses", + "sg", + "sga", + "sgs", + "sgw", + "sgz", + "shi", + "shk", + "shn", + "shu", + "si", + "sid", + "sig", + "sil", + "sim", + "sjr", + "sk", + "skc", + "skr", + "sks", + "sl", + "sld", + "sli", + "sll", + "sly", + "sm", + "sma", + "smj", + "smn", + "smp", + "smq", + "sms", + "sn", + "snc", + "snk", + "snp", + "snx", + "sny", + "so", + "sog", + "sok", + "soq", + "sou", + "soy", + "spd", + "spl", + "sps", + "sq", + "sr", + "sr-ME", + "sr-RO", + "sr-RU", + "sr-TR", + "srb", + "srn", + "srr", + "srx", + "ss", + "ssd", + "ssg", + "ssy", + "st", + "stk", + "stq", + "su", + "sua", + "sue", + "suk", + "sur", + "sus", + "sv", + "sw", + "swb", + "swc", + "swg", + "swp", + "swv", + "sxn", + "sxw", + "syl", + "syr", + "szl", + "ta", + "taj", + "tal", + "tan", + "taq", + "tbc", + "tbd", + "tbf", + "tbg", + "tbo", + "tbw", + "tbz", + "tci", + "tcy", + "tdd", + "tdg", + "tdh", + "tdu", + "te", + "ted", + "tem", + "teo", + "tet", + "tfi", + "tg", + "tg-Arab", + "tg-PK", + "tgc", + "tgo", + "tgu", + "th", + "thl", + "thq", + "thr", 
+ "ti", + "tif", + "tig", + "tik", + "tim", + "tio", + "tiv", + "tk", + "tkl", + "tkr", + "tkt", + "tl", + "tlf", + "tlx", + "tly", + "tmh", + "tmy", + "tn", + "tnh", + "to", + "tof", + "tog", + "toq", + "tpi", + "tpm", + "tpz", + "tqo", + "tr", + "tru", + "trv", + "trw", + "ts", + "tsd", + "tsf", + "tsg", + "tsj", + "tsw", + "tt", + "ttd", + "tte", + "ttj", + "ttr", + "tts", + "ttt", + "tuh", + "tul", + "tum", + "tuq", + "tvd", + "tvl", + "tvu", + "twh", + "twq", + "txg", + "txo", + "ty", + "tya", + "tyv", + "tzm", + "ubu", + "udi", + "udm", + "ug", + "ug-Cyrl", + "ug-KZ", + "ug-MN", + "uga", + "uk", + "uli", + "umb", + "und", + "und-002", + "und-003", + "und-005", + "und-009", + "und-011", + "und-013", + "und-014", + "und-015", + "und-017", + "und-018", + "und-019", + "und-021", + "und-029", + "und-030", + "und-034", + "und-035", + "und-039", + "und-053", + "und-054", + "und-057", + "und-061", + "und-142", + "und-143", + "und-145", + "und-150", + "und-151", + "und-154", + "und-155", + "und-202", + "und-419", + "und-AD", + "und-Adlm", + "und-AE", + "und-AF", + "und-Aghb", + "und-Ahom", + "und-AL", + "und-AM", + "und-AO", + "und-AQ", + "und-AR", + "und-Arab", + "und-Arab-CC", + "und-Arab-CN", + "und-Arab-GB", + "und-Arab-ID", + "und-Arab-IN", + "und-Arab-KH", + "und-Arab-MM", + "und-Arab-MN", + "und-Arab-MU", + "und-Arab-NG", + "und-Arab-PK", + "und-Arab-TG", + "und-Arab-TH", + "und-Arab-TJ", + "und-Arab-TR", + "und-Arab-YT", + "und-Armi", + "und-Armn", + "und-AS", + "und-AT", + "und-Avst", + "und-AW", + "und-AX", + "und-AZ", + "und-BA", + "und-Bali", + "und-Bamu", + "und-Bass", + "und-Batk", + "und-BD", + "und-BE", + "und-Beng", + "und-BF", + "und-BG", + "und-BH", + "und-Bhks", + "und-BI", + "und-BJ", + "und-BL", + "und-BN", + "und-BO", + "und-Bopo", + "und-BQ", + "und-BR", + "und-Brah", + "und-Brai", + "und-BT", + "und-Bugi", + "und-Buhd", + "und-BV", + "und-BY", + "und-Cakm", + "und-Cans", + "und-Cari", + "und-CD", + "und-CF", + "und-CG", + "und-CH", + "und-Cham", + "und-Cher", + "und-Chrs", + "und-CI", + "und-CL", + "und-CM", + "und-CN", + "und-CO", + "und-Copt", + "und-CP", + "und-Cpmn", + "und-Cpmn-CY", + "und-Cprt", + "und-CR", + "und-CU", + "und-CV", + "und-CW", + "und-CY", + "und-Cyrl", + "und-Cyrl-AL", + "und-Cyrl-BA", + "und-Cyrl-GE", + "und-Cyrl-GR", + "und-Cyrl-MD", + "und-Cyrl-RO", + "und-Cyrl-SK", + "und-Cyrl-TR", + "und-Cyrl-XK", + "und-CZ", + "und-DE", + "und-Deva", + "und-Deva-BT", + "und-Deva-FJ", + "und-Deva-MU", + "und-Deva-PK", + "und-Diak", + "und-DJ", + "und-DK", + "und-DO", + "und-Dogr", + "und-Dupl", + "und-DZ", + "und-EA", + "und-EC", + "und-EE", + "und-EG", + "und-Egyp", + "und-EH", + "und-Elba", + "und-Elym", + "und-ER", + "und-ES", + "und-ET", + "und-Ethi", + "und-EU", + "und-EZ", + "und-FI", + "und-FO", + "und-FR", + "und-GA", + "und-GE", + "und-Geor", + "und-GF", + "und-GH", + "und-GL", + "und-Glag", + "und-GN", + "und-Gong", + "und-Gonm", + "und-Goth", + "und-GP", + "und-GQ", + "und-GR", + "und-Gran", + "und-Grek", + "und-Grek-TR", + "und-GS", + "und-GT", + "und-Gujr", + "und-Guru", + "und-GW", + "und-Hanb", + "und-Hang", + "und-Hani", + "und-Hano", + "und-Hans", + "und-Hant", + "und-Hant-CA", + "und-Hebr", + "und-Hebr-SE", + "und-Hebr-UA", + "und-Hebr-US", + "und-Hira", + "und-HK", + "und-Hluw", + "und-HM", + "und-Hmng", + "und-Hmnp", + "und-HN", + "und-HR", + "und-HT", + "und-HU", + "und-Hung", + "und-IC", + "und-ID", + "und-IL", + "und-IN", + "und-IQ", + "und-IR", + "und-IS", + "und-IT", + "und-Ital", + "und-Jamo", + "und-Java", + "und-JO", + "und-JP", + 
"und-Jpan", + "und-Kali", + "und-Kana", + "und-Kawi", + "und-KE", + "und-KG", + "und-KH", + "und-Khar", + "und-Khmr", + "und-Khoj", + "und-Kits", + "und-KM", + "und-Knda", + "und-Kore", + "und-KP", + "und-KR", + "und-Kthi", + "und-KW", + "und-KZ", + "und-LA", + "und-Lana", + "und-Laoo", + "und-Latn-AF", + "und-Latn-AM", + "und-Latn-CN", + "und-Latn-CY", + "und-Latn-DZ", + "und-Latn-ET", + "und-Latn-GE", + "und-Latn-IR", + "und-Latn-KM", + "und-Latn-MA", + "und-Latn-MK", + "und-Latn-MM", + "und-Latn-MO", + "und-Latn-MR", + "und-Latn-RU", + "und-Latn-SY", + "und-Latn-TN", + "und-Latn-TW", + "und-Latn-UA", + "und-LB", + "und-Lepc", + "und-LI", + "und-Limb", + "und-Lina", + "und-Linb", + "und-Lisu", + "und-LK", + "und-LS", + "und-LT", + "und-LU", + "und-LV", + "und-LY", + "und-Lyci", + "und-Lydi", + "und-MA", + "und-Mahj", + "und-Maka", + "und-Mand", + "und-Mani", + "und-Marc", + "und-MC", + "und-MD", + "und-ME", + "und-Medf", + "und-Mend", + "und-Merc", + "und-Mero", + "und-MF", + "und-MG", + "und-MK", + "und-ML", + "und-Mlym", + "und-MM", + "und-MN", + "und-MO", + "und-Modi", + "und-Mong", + "und-MQ", + "und-MR", + "und-Mroo", + "und-MT", + "und-Mtei", + "und-MU", + "und-Mult", + "und-MV", + "und-MX", + "und-MY", + "und-Mymr", + "und-Mymr-IN", + "und-Mymr-TH", + "und-MZ", + "und-NA", + "und-Nagm", + "und-Nand", + "und-Narb", + "und-Nbat", + "und-NC", + "und-NE", + "und-Newa", + "und-NI", + "und-Nkoo", + "und-NL", + "und-NO", + "und-NP", + "und-Nshu", + "und-Ogam", + "und-Olck", + "und-OM", + "und-Orkh", + "und-Orya", + "und-Osge", + "und-Osma", + "und-Ougr", + "und-PA", + "und-Palm", + "und-Pauc", + "und-PE", + "und-Perm", + "und-PF", + "und-PG", + "und-PH", + "und-Phag", + "und-Phli", + "und-Phlp", + "und-Phnx", + "und-PK", + "und-PL", + "und-Plrd", + "und-PM", + "und-PR", + "und-Prti", + "und-PS", + "und-PT", + "und-PW", + "und-PY", + "und-QA", + "und-QO", + "und-RE", + "und-Rjng", + "und-RO", + "und-Rohg", + "und-RS", + "und-RU", + "und-Runr", + "und-RW", + "und-SA", + "und-Samr", + "und-Sarb", + "und-Saur", + "und-SC", + "und-SD", + "und-SE", + "und-Sgnw", + "und-Shaw", + "und-Shrd", + "und-SI", + "und-Sidd", + "und-Sind", + "und-Sinh", + "und-SJ", + "und-SK", + "und-SM", + "und-SN", + "und-SO", + "und-Sogd", + "und-Sogo", + "und-Sora", + "und-Soyo", + "und-SR", + "und-ST", + "und-Sund", + "und-SV", + "und-SY", + "und-Sylo", + "und-Syrc", + "und-Tagb", + "und-Takr", + "und-Tale", + "und-Talu", + "und-Taml", + "und-Tang", + "und-Tavt", + "und-TD", + "und-Telu", + "und-TF", + "und-Tfng", + "und-TG", + "und-Tglg", + "und-TH", + "und-Thaa", + "und-Thai", + "und-Thai-CN", + "und-Thai-KH", + "und-Thai-LA", + "und-Tibt", + "und-Tirh", + "und-TJ", + "und-TK", + "und-TL", + "und-TM", + "und-TN", + "und-Tnsa", + "und-TO", + "und-Toto", + "und-TR", + "und-TV", + "und-TW", + "und-TZ", + "und-UA", + "und-UG", + "und-Ugar", + "und-UY", + "und-UZ", + "und-VA", + "und-Vaii", + "und-VE", + "und-Vith", + "und-VN", + "und-VU", + "und-Wara", + "und-Wcho", + "und-WF", + "und-WS", + "und-XK", + "und-Xpeo", + "und-Xsux", + "und-YE", + "und-Yezi", + "und-Yiii", + "und-YT", + "und-Zanb", + "und-ZW", + "unr", + "unr-Deva", + "unr-NP", + "unx", + "uok", + "ur", + "uri", + "urt", + "urw", + "usa", + "uth", + "utr", + "uvh", + "uvl", + "uz", + "uz-AF", + "uz-Arab", + "uz-CN", + "vag", + "vai", + "van", + "ve", + "vec", + "vep", + "vi", + "vic", + "viv", + "vls", + "vmf", + "vmw", + "vo", + "vot", + "vro", + "vun", + "vut", + "wa", + "wae", + "waj", + "wal", + "wan", + "war", + "wbp", + "wbq", + "wbr", + "wci", + 
"wer", + "wgi", + "whg", + "wib", + "wiu", + "wiv", + "wja", + "wji", + "wls", + "wmo", + "wnc", + "wni", + "wnu", + "wo", + "wob", + "wos", + "wrs", + "wsg", + "wsk", + "wtm", + "wuu", + "wuv", + "wwa", + "xav", + "xbi", + "xco", + "xcr", + "xes", + "xh", + "xla", + "xlc", + "xld", + "xmf", + "xmn", + "xmr", + "xna", + "xnr", + "xog", + "xon", + "xpr", + "xrb", + "xsa", + "xsi", + "xsm", + "xsr", + "xwe", + "yam", + "yao", + "yap", + "yas", + "yat", + "yav", + "yay", + "yaz", + "yba", + "ybb", + "yby", + "yer", + "ygr", + "ygw", + "yi", + "yko", + "yle", + "ylg", + "yll", + "yml", + "yo", + "yon", + "yrb", + "yre", + "yrl", + "yss", + "yua", + "yue", + "yue-CN", + "yue-Hans", + "yuj", + "yut", + "yuw", + "za", + "zag", + "zdj", + "zea", + "zgh", + "zh", + "zh-AU", + "zh-BN", + "zh-Bopo", + "zh-GB", + "zh-GF", + "zh-Hanb", + "zh-Hant", + "zh-HK", + "zh-ID", + "zh-MO", + "zh-PA", + "zh-PF", + "zh-PH", + "zh-SR", + "zh-TH", + "zh-TW", + "zh-US", + "zh-VN", + "zhx", + "zia", + "zkt", + "zlm", + "zmi", + "zne", + "zu", + "zza", + ]; +} + +#[allow(dead_code)] +pub mod short_subtags_10pct { + pub static STRINGS: &[&str] = &[ + "aa", + "acd", + "aeb", + "ahl", + "amm", + "aom", + "arc-Nbat", + "asa", + "avl", + "az", + "bas", + "bcf", + "bef", + "bft", + "bho", + "bjh", + "bkq", + "bmu", + "bqc", + "bsj", + "bug", + "bye", + "bzw", + "cgg", + "cjv", + "cop", + "csw", + "dah", + "den", + "dnj", + "dtp", + "dyo", + "eky", + "es", + "ext", + "ffi", + "fod", + "fub", + "fy", + "gay", + "gdr", + "gjk", + "gnd", + "grb", + "gur", + "gwt", + "hbb", + "hil", + "ho", + "hur", + "ich", + "ijj", + "iou", + "ja", + "jib", + "kac", + "kbq", + "kdt", + "kgp", + "kij", + "kk-Arab", + "klx", + "knp", + "kpr", + "krl", + "ktb", + "kue", + "kw", + "kxp", + "kzh", + "las", + "lem", + "lif", + "lle", + "lok", + "luo", + "mai", + "mbq", + "mdf", + "met", + "mgo", + "miw", + "mmo", + "mo", + "mql", + "mti", + "mwv", + "myz", + "nak", + "nco", + "ng", + "nin", + "nn", + "nop", + "ntm", + "nxr", + "okr", + "os", + "pal", + "ped", + "pl", + "pra", + "quc", + "rhg", + "rmu", + "ru", + "saq", + "scl", + "se", + "shi", + "sk", + "sma", + "snx", + "sps", + "srx", + "sue", + "swv", + "taq", + "tdd", + "tg", + "ti", + "tkt", + "tof", + "trw", + "ttj", + "tvu", + "udi", + "und", + "und-018", + "und-057", + "und-419", + "und-AQ", + "und-Arab-MN", + "und-Armn", + "und-Bass", + "und-BJ", + "und-Bugi", + "und-CH", + "und-CP", + "und-Cyrl-AL", + "und-DE", + "und-Dogr", + "und-Elym", + "und-GA", + "und-Goth", + "und-Guru", + "und-Hebr-SE", + "und-HR", + "und-IS", + "und-Kawi", + "und-Kore", + "und-Latn-AM", + "und-Latn-MM", + "und-LI", + "und-LY", + "und-MD", + "und-Mlym", + "und-Mtei", + "und-NA", + "und-NL", + "und-Osma", + "und-Phag", + "und-PS", + "und-RS", + "und-SE", + "und-SM", + "und-SV", + "und-Tavt", + "und-Thai-CN", + "und-Tnsa", + "und-UY", + "und-WF", + "und-ZW", + "usa", + "vai", + "vmw", + "wan", + "wiu", + "wob", + "xbi", + "xmr", + "xsr", + "yba", + "yll", + "yue-CN", + "zh", + "zh-MO", + "zia", + ]; +} diff --git a/utils/litemap/src/map.rs b/utils/litemap/src/map.rs index 9264cbd3e04..6b1bc2e7780 100644 --- a/utils/litemap/src/map.rs +++ b/utils/litemap/src/map.rs @@ -4,12 +4,13 @@ use crate::store::*; use alloc::borrow::Borrow; +use alloc::boxed::Box; use alloc::vec::Vec; use core::cmp::Ordering; use core::iter::FromIterator; use core::marker::PhantomData; use core::mem; -use core::ops::{Index, IndexMut}; +use core::ops::{Index, IndexMut, Range}; /// A simple "flat" map based on a sorted vector /// @@ -125,6 
+126,118 @@ where
     pub fn last(&self) -> Option<(&K, &V)> {
         self.values.lm_get(self.len() - 1).map(|(k, v)| (k, v))
     }
+
+    /// Returns a new [`LiteMap`] with owned keys and values.
+    ///
+    /// The trait bounds allow transforming most slice and string types.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map: LiteMap<&str, &str> = LiteMap::new_vec();
+    /// map.insert("one", "uno");
+    /// map.insert("two", "dos");
+    ///
+    /// let boxed_map: LiteMap<Box<str>, Box<str>> = map.to_boxed_keys_values();
+    ///
+    /// assert_eq!(boxed_map.get("one"), Some(&Box::from("uno")));
+    /// ```
+    pub fn to_boxed_keys_values<KB: ?Sized, VB: ?Sized, SB>(&self) -> LiteMap<Box<KB>, Box<VB>, SB>
+    where
+        SB: StoreMut<Box<KB>, Box<VB>>,
+        K: Borrow<KB>,
+        V: Borrow<VB>,
+        Box<KB>: for<'a> From<&'a KB>,
+        Box<VB>: for<'a> From<&'a VB>,
+    {
+        let mut values = SB::lm_with_capacity(self.len());
+        for i in 0..self.len() {
+            #[allow(clippy::unwrap_used)] // iterating over our own length
+            let (k, v) = self.values.lm_get(i).unwrap();
+            values.lm_push(Box::from(k.borrow()), Box::from(v.borrow()))
+        }
+        LiteMap {
+            values,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
+
+    /// Returns a new [`LiteMap`] with owned keys and cloned values.
+    ///
+    /// The trait bounds allow transforming most slice and string types.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map: LiteMap<&str, usize> = LiteMap::new_vec();
+    /// map.insert("one", 11);
+    /// map.insert("two", 22);
+    ///
+    /// let boxed_map: LiteMap<Box<str>, usize> = map.to_boxed_keys();
+    ///
+    /// assert_eq!(boxed_map.get("one"), Some(&11));
+    /// ```
+    pub fn to_boxed_keys<KB: ?Sized, SB>(&self) -> LiteMap<Box<KB>, V, SB>
+    where
+        V: Clone,
+        SB: StoreMut<Box<KB>, V>,
+        K: Borrow<KB>,
+        Box<KB>: for<'a> From<&'a KB>,
+    {
+        let mut values = SB::lm_with_capacity(self.len());
+        for i in 0..self.len() {
+            #[allow(clippy::unwrap_used)] // iterating over our own length
+            let (k, v) = self.values.lm_get(i).unwrap();
+            values.lm_push(Box::from(k.borrow()), v.clone())
+        }
+        LiteMap {
+            values,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
+
+    /// Returns a new [`LiteMap`] with cloned keys and owned values.
+    ///
+    /// The trait bounds allow transforming most slice and string types.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map: LiteMap<usize, &str> = LiteMap::new_vec();
+    /// map.insert(11, "uno");
+    /// map.insert(22, "dos");
+    ///
+    /// let boxed_map: LiteMap<usize, Box<str>> = map.to_boxed_values();
+    ///
+    /// assert_eq!(boxed_map.get(&11), Some(&Box::from("uno")));
+    /// ```
+    pub fn to_boxed_values<VB: ?Sized, SB>(&self) -> LiteMap<K, Box<VB>, SB>
+    where
+        K: Clone,
+        SB: StoreMut<K, Box<VB>>,
+        V: Borrow<VB>,
+        Box<VB>: for<'a> From<&'a VB>,
+    {
+        let mut values = SB::lm_with_capacity(self.len());
+        for i in 0..self.len() {
+            #[allow(clippy::unwrap_used)] // iterating over our own length
+            let (k, v) = self.values.lm_get(i).unwrap();
+            values.lm_push(k.clone(), Box::from(v.borrow()))
+        }
+        LiteMap {
+            values,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
 }
 
 impl<K, V, S> LiteMap<K, V, S>
 where
@@ -197,6 +310,197 @@ where
     }
 }
 
+impl<K, V, S> LiteMap<K, V, S>
+where
+    S: StoreSlice<K, V>,
+{
+    /// Creates a new [`LiteMap`] from a range of the current [`LiteMap`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map = LiteMap::new_vec();
+    /// map.insert(1, "one");
+    /// map.insert(2, "two");
+    /// map.insert(3, "three");
+    ///
+    /// let mut sub_map = map.get_indexed_range(1..3).expect("valid range");
+    /// assert_eq!(sub_map.get(&1), None);
+    /// assert_eq!(sub_map.get(&2), Some(&"two"));
+    /// assert_eq!(sub_map.get(&3), Some(&"three"));
+    /// ```
+    pub fn get_indexed_range(&self, range: Range<usize>) -> Option<LiteMap<K, V, &S::Slice>> {
+        let subslice = self.values.lm_get_range(range)?;
+        Some(LiteMap {
+            values: subslice,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        })
+    }
+
+    /// Borrows this [`LiteMap`] as one of its slice type.
+    ///
+    /// This can be useful in situations where you need a `LiteMap` by value but do not want
+    /// to clone the owned version.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map = LiteMap::new_vec();
+    /// map.insert(1, "one");
+    /// map.insert(2, "two");
+    ///
+    /// let borrowed_map = map.as_sliced();
+    /// assert_eq!(borrowed_map.get(&1), Some(&"one"));
+    /// assert_eq!(borrowed_map.get(&2), Some(&"two"));
+    /// ```
+    pub fn as_sliced(&self) -> LiteMap<K, V, &S::Slice> {
+        // Won't panic: 0..self.len() is within range
+        #[allow(clippy::unwrap_used)]
+        let subslice = self.values.lm_get_range(0..self.len()).unwrap();
+        LiteMap {
+            values: subslice,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
+
+    /// Borrows the backing buffer of this [`LiteMap`] as its slice type.
+    ///
+    /// The slice will be sorted.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map = LiteMap::new_vec();
+    /// map.insert(1, "one");
+    /// map.insert(2, "two");
+    ///
+    /// let slice = map.as_slice();
+    /// assert_eq!(slice, &[(1, "one"), (2, "two")]);
+    /// ```
+    pub fn as_slice(&self) -> &S::Slice {
+        // Won't panic: 0..self.len() is within range
+        #[allow(clippy::unwrap_used)]
+        self.values.lm_get_range(0..self.len()).unwrap()
+    }
+}
+
+impl<'a, K: 'a, V: 'a, S> LiteMap<K, V, S>
+where
+    S: Store<K, V>,
+{
+    /// Returns a new [`LiteMap`] with keys and values borrowed from this one.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map: LiteMap<Box<usize>, String> = LiteMap::new_vec();
+    /// map.insert(Box::new(1), "one".to_string());
+    /// map.insert(Box::new(2), "two".to_string());
+    ///
+    /// let borrowed_map: LiteMap<&usize, &str> = map.to_borrowed_keys_values();
+    ///
+    /// assert_eq!(borrowed_map.get(&1), Some(&"one"));
+    /// ```
+    pub fn to_borrowed_keys_values<KB: ?Sized, VB: ?Sized, SB>(
+        &'a self,
+    ) -> LiteMap<&'a KB, &'a VB, SB>
+    where
+        K: Borrow<KB>,
+        V: Borrow<VB>,
+        SB: StoreMut<&'a KB, &'a VB>,
+    {
+        let mut values = SB::lm_with_capacity(self.len());
+        for i in 0..self.len() {
+            #[allow(clippy::unwrap_used)] // iterating over our own length
+            let (k, v) = self.values.lm_get(i).unwrap();
+            values.lm_push(k.borrow(), v.borrow())
+        }
+        LiteMap {
+            values,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
+
+    /// Returns a new [`LiteMap`] with keys borrowed from this one and cloned values.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map: LiteMap<Box<usize>, String> = LiteMap::new_vec();
+    /// map.insert(Box::new(1), "one".to_string());
+    /// map.insert(Box::new(2), "two".to_string());
+    ///
+    /// let borrowed_map: LiteMap<&usize, String> = map.to_borrowed_keys();
+    ///
+    /// assert_eq!(borrowed_map.get(&1), Some(&"one".to_string()));
+    /// ```
+    pub fn to_borrowed_keys<KB: ?Sized, SB>(&'a self) -> LiteMap<&'a KB, V, SB>
+    where
+        K: Borrow<KB>,
+        V: Clone,
+        SB: StoreMut<&'a KB, V>,
+    {
+        let mut values = SB::lm_with_capacity(self.len());
+        for i in 0..self.len() {
+            #[allow(clippy::unwrap_used)] // iterating over our own length
+            let (k, v) = self.values.lm_get(i).unwrap();
+            values.lm_push(k.borrow(), v.clone())
+        }
+        LiteMap {
+            values,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
+
+    /// Returns a new [`LiteMap`] with values borrowed from this one and cloned keys.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// let mut map: LiteMap<Box<usize>, String> = LiteMap::new_vec();
+    /// map.insert(Box::new(1), "one".to_string());
+    /// map.insert(Box::new(2), "two".to_string());
+    ///
+    /// let borrowed_map: LiteMap<Box<usize>, &str> = map.to_borrowed_values();
+    ///
+    /// assert_eq!(borrowed_map.get(&1), Some(&"one"));
+    /// ```
+    pub fn to_borrowed_values<VB: ?Sized, SB>(&'a self) -> LiteMap<K, &'a VB, SB>
+    where
+        K: Clone,
+        V: Borrow<VB>,
+        SB: StoreMut<K, &'a VB>,
+    {
+        let mut values = SB::lm_with_capacity(self.len());
+        for i in 0..self.len() {
+            #[allow(clippy::unwrap_used)] // iterating over our own length
+            let (k, v) = self.values.lm_get(i).unwrap();
+            values.lm_push(k.clone(), v.borrow())
+        }
+        LiteMap {
+            values,
+            _key_type: PhantomData,
+            _value_type: PhantomData,
+        }
+    }
+}
+
 impl<K, V, S> LiteMap<K, V, S>
 where
     S: StoreMut<K, V>,
@@ -359,6 +663,64 @@ where
         }
     }
 
+    /// Attempts to insert a unique entry into the map.
+    ///
+    /// If `key` is not already in the map, invokes the closure to compute `value`, inserts
+    /// the pair into the map, and returns a reference to the value. The closure is passed
+    /// a reference to the `key` argument.
+    ///
+    /// If `key` is already in the map, a reference to the existing value is returned.
+    ///
+    /// Additionally, the index of the value in the map is returned. If it is not desirable
+    /// to hold on to the mutable reference's lifetime, the index can be used to access the
+    /// element via [`LiteMap::get_indexed()`].
+    ///
+    /// The closure returns a `Result` to allow for a fallible insertion function. If the
+    /// creation of `value` is infallible, you can use [`core::convert::Infallible`].
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    ///
+    /// /// Helper function to unwrap an `Infallible` result from the insertion function
+    /// fn unwrap_infallible<T>(result: Result<T, core::convert::Infallible>) -> T {
+    ///     result.unwrap_or_else(|never| match never {})
+    /// }
+    ///
+    /// let mut map = LiteMap::new_vec();
+    /// map.insert(1, "one");
+    /// map.insert(3, "three");
+    ///
+    /// // 2 is not yet in the map...
+    /// let result1 = unwrap_infallible(
+    ///     map.try_get_or_insert(2, |_| Ok("two"))
+    /// );
+    /// assert_eq!(result1.1, &"two");
+    /// assert_eq!(map.len(), 3);
+    ///
+    /// // ...but now it is.
+    /// let result1 = unwrap_infallible(
+    ///     map.try_get_or_insert(2, |_| Ok("TWO"))
+    /// );
+    /// assert_eq!(result1.1, &"two");
+    /// assert_eq!(map.len(), 3);
+    /// ```
+    pub fn try_get_or_insert<E>(
+        &mut self,
+        key: K,
+        value: impl FnOnce(&K) -> Result<V, E>,
+    ) -> Result<(usize, &V), E> {
+        let idx = match self.values.lm_binary_search_by(|k| k.cmp(&key)) {
+            Ok(idx) => idx,
+            Err(idx) => {
+                let value = value(&key)?;
+                self.values.lm_insert(idx, key, value);
+                idx
+            }
+        };
+        #[allow(clippy::unwrap_used)] // item at idx found or inserted above
+        Ok((idx, self.values.lm_get(idx).unwrap().1))
+    }
+
     /// Remove the value at `key`, returning it if it exists.
     ///
     /// ```rust
diff --git a/utils/litemap/src/store/mod.rs b/utils/litemap/src/store/mod.rs
index 3468ebb97f4..f41e1e3ffb2 100644
--- a/utils/litemap/src/store/mod.rs
+++ b/utils/litemap/src/store/mod.rs
@@ -30,6 +30,7 @@ use core::cmp::Ordering;
 use core::iter::DoubleEndedIterator;
 use core::iter::FromIterator;
 use core::iter::Iterator;
+use core::ops::Range;
 
 /// Trait to enable const construction of empty store.
 pub trait StoreConstEmpty<K: ?Sized, V: ?Sized> {
@@ -76,6 +77,12 @@ pub trait StoreFromIterable<K, V>: Store<K, V> {
     fn lm_sort_from_iter<I: IntoIterator<Item = (K, V)>>(iter: I) -> Self;
 }
 
+pub trait StoreSlice<K: ?Sized, V: ?Sized>: Store<K, V> {
+    type Slice: ?Sized;
+
+    fn lm_get_range(&self, range: Range<usize>) -> Option<&Self::Slice>;
+}
+
 pub trait StoreMut<K, V>: Store<K, V> {
     /// Creates a new store with the specified capacity hint.
     ///
@@ -129,7 +136,7 @@ pub trait StoreMut<K, V>: Store<K, V> {
 }
 
 /// Iterator methods for the LiteMap store.
-pub trait StoreIterable<'a, K: 'a, V: 'a>: Store<K, V> {
+pub trait StoreIterable<'a, K: 'a + ?Sized, V: 'a + ?Sized>: Store<K, V> {
     type KeyValueIter: Iterator<Item = (&'a K, &'a V)> + DoubleEndedIterator + 'a;
 
     /// Returns an iterator over key/value pairs.
diff --git a/utils/litemap/src/store/slice_impl.rs b/utils/litemap/src/store/slice_impl.rs
index 4afb4fac268..48f6ca40cf9 100644
--- a/utils/litemap/src/store/slice_impl.rs
+++ b/utils/litemap/src/store/slice_impl.rs
@@ -45,6 +45,14 @@ impl<'a, K: 'a, V: 'a> Store<K, V> for &'a [(K, V)] {
     }
 }
 
+impl<'a, K, V> StoreSlice<K, V> for &'a [(K, V)] {
+    type Slice = [(K, V)];
+
+    fn lm_get_range(&self, range: Range<usize>) -> Option<&Self::Slice> {
+        self.get(range)
+    }
+}
+
 impl<'a, K: 'a, V: 'a> StoreIterable<'a, K, V> for &'a [(K, V)] {
     type KeyValueIter = core::iter::Map<core::slice::Iter<'a, (K, V)>, MapF<K, V>>;
 
diff --git a/utils/litemap/src/store/vec_impl.rs b/utils/litemap/src/store/vec_impl.rs
index 361b926c31c..2205e8e8ff1 100644
--- a/utils/litemap/src/store/vec_impl.rs
+++ b/utils/litemap/src/store/vec_impl.rs
@@ -53,6 +53,14 @@ impl<K, V> Store<K, V> for Vec<(K, V)> {
     }
 }
 
+impl<K, V> StoreSlice<K, V> for Vec<(K, V)> {
+    type Slice = [(K, V)];
+
+    fn lm_get_range(&self, range: Range<usize>) -> Option<&Self::Slice> {
+        self.get(range)
+    }
+}
+
 impl<K, V> StoreMut<K, V> for Vec<(K, V)> {
     #[inline]
     fn lm_with_capacity(capacity: usize) -> Self {
diff --git a/utils/zerovec/src/zerovec/mod.rs b/utils/zerovec/src/zerovec/mod.rs
index 634fe05d393..7a3ea0e5380 100644
--- a/utils/zerovec/src/zerovec/mod.rs
+++ b/utils/zerovec/src/zerovec/mod.rs
@@ -257,6 +257,24 @@ impl<'a, T: AsULE + Ord> Ord for ZeroVec<'a, T> {
     }
 }
 
+impl<'a, T: AsULE> AsRef<[T::ULE]> for ZeroVec<'a, T> {
+    fn as_ref(&self) -> &[T::ULE] {
+        self.as_ule_slice()
+    }
+}
+
+impl<'a, T: AsULE> From<&'a [T::ULE]> for ZeroVec<'a, T> {
+    fn from(other: &'a [T::ULE]) -> Self {
+        ZeroVec::new_borrowed(other)
+    }
+}
+
+impl<'a, T: AsULE> From<Vec<T::ULE>> for ZeroVec<'a, T> {
+    fn from(other: Vec<T::ULE>) -> Self {
+        ZeroVec::new_owned(other)
+    }
+}
+
 impl<'a, T> ZeroVec<'a, T>
 where
     T: AsULE + ?Sized,

From
9c8242f2cb85aed1797e4a090a1499f44deeb134 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 30 Jun 2023 16:50:57 +0200 Subject: [PATCH 02/31] deps --- Cargo.lock | 11 ----------- experimental/zerotrie/Cargo.toml | 2 -- 2 files changed, 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 23490edef06..d91ead1336f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4487,15 +4487,6 @@ dependencies = [ "rand", ] -[[package]] -name = "wyhash" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf6e163c25e3fac820b4b453185ea2dea3b6a3e0a721d4d23d75bd33734c295" -dependencies = [ - "rand_core", -] - [[package]] name = "yoke" version = "0.7.1" @@ -4553,8 +4544,6 @@ dependencies = [ "ref-cast", "serde", "serde_json", - "t1ha", - "wyhash", "zerovec", ] diff --git a/experimental/zerotrie/Cargo.toml b/experimental/zerotrie/Cargo.toml index 265b28c33a2..7b3eb53cdc2 100644 --- a/experimental/zerotrie/Cargo.toml +++ b/experimental/zerotrie/Cargo.toml @@ -34,8 +34,6 @@ zerovec = { path = "../../utils/zerovec", optional = true } litemap = { path = "../../utils/litemap", default-features = false, features = ["alloc"], optional = true } ref-cast = { version = "1.0.12" } serde = { version = "1.0", optional = true } -t1ha = "0.1" -wyhash = "0.5" displaydoc = { version = "0.2.3", default-features = false } [dev-dependencies] From 7829adbddc83af008a1dd44ca194266ef1ee3800 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 30 Jun 2023 16:51:18 +0200 Subject: [PATCH 03/31] inline --- experimental/zerotrie/src/zerotrie.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 3e83e78b022..1ff93358d15 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -125,23 +125,28 @@ macro_rules! impl_zerotrie_subtype { ($name:ident, $variant:ident, $getter_fn:path, $iter_ty:ty, $iter_fn:path, $cnv_fn:path) => { impl $name { /// Wrap this specific ZeroTrie variant into a ZeroTrie. + #[inline] pub const fn into_zerotrie(self) -> ZeroTrie { ZeroTrie(ZeroTrieInner::$variant(self)) } /// Create a trie directly from a store. /// /// If the store does not contain valid bytes, unexpected behavior may occur. + #[inline] pub const fn from_store(store: S) -> Self { Self { store } } /// Takes the byte store from this trie. + #[inline] pub fn take_store(self) -> S { self.store } /// Maps the store into another type. + #[inline] pub fn map_store(self, f: impl FnOnce(S) -> X) -> $name { $name::::from_store(f(self.store)) } + #[inline] pub(crate) fn map_store_into_zerotrie(self, f: impl FnOnce(S) -> X) -> ZeroTrie { $name::::from_store(f(self.store)).into_zerotrie() } @@ -151,11 +156,13 @@ macro_rules! impl_zerotrie_subtype { S: AsRef<[u8]> + ?Sized, { /// Queries the trie for a string. + #[inline] pub fn get(&self, key: K) -> Option where K: AsRef<[u8]> { // TODO: Should this be AsRef or Borrow? $getter_fn(self.store.as_ref(), key.as_ref()) } /// Returns `true` if the trie is empty. + #[inline] pub fn is_empty(&self) -> bool { self.store.as_ref().is_empty() } @@ -172,14 +179,17 @@ macro_rules! impl_zerotrie_subtype { /// assert_eq!(8, trie.byte_len()); /// assert_eq!(2, trie.iter().count()); /// ``` + #[inline] pub fn byte_len(&self) -> usize { self.store.as_ref().len() } /// Returns the bytes contained in the underlying store. 
+ #[inline] pub fn as_bytes(&self) -> &[u8] { self.store.as_ref() } /// Returns this trie as a reference transparent over a byte slice. + #[inline] pub fn as_borrowed(&self) -> &$name<[u8]> { $name::from_bytes(self.store.as_ref()) } @@ -205,11 +215,13 @@ macro_rules! impl_zerotrie_subtype { /// assert_eq!(trie.get(b"abc"), Some(5)); /// assert_eq!(owned.get(b"abc"), Some(5)); /// ``` + #[inline] pub fn to_owned(&self) -> $name> { $name::from_store( Vec::from(self.store.as_ref()), ) } + #[inline] pub fn iter(&self) -> impl Iterator + '_ { $iter_fn(self.as_bytes()) } @@ -218,6 +230,7 @@ macro_rules! impl_zerotrie_subtype { /// Casts from a byte slice to a reference to a trie with the same lifetime. /// /// If the bytes are not a valid trie, unexpected behavior may occur. + #[inline] pub fn from_bytes(trie: &[u8]) -> &Self { Self::ref_cast(trie) } @@ -299,6 +312,7 @@ macro_rules! impl_zerotrie_subtype { } // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. impl Borrow<$name<[u8]>> for $name<&[u8]> { + #[inline] fn borrow(&self) -> &$name<[u8]> { self.as_borrowed() } @@ -306,6 +320,7 @@ macro_rules! impl_zerotrie_subtype { // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. #[cfg(feature = "alloc")] impl Borrow<$name<[u8]>> for $name> { + #[inline] fn borrow(&self) -> &$name<[u8]> { self.as_borrowed() } @@ -313,6 +328,7 @@ macro_rules! impl_zerotrie_subtype { // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. #[cfg(feature = "alloc")] impl Borrow<$name<[u8]>> for $name> { + #[inline] fn borrow(&self) -> &$name<[u8]> { self.as_borrowed() } From c36537a5407e42cd875a2c13df5e621d777fd762 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 30 Jun 2023 16:51:45 +0200 Subject: [PATCH 04/31] comments and docs --- experimental/zerotrie/examples/byteph.rs | 6 ------ experimental/zerotrie/src/lib.rs | 2 +- experimental/zerotrie/src/zerotrie.rs | 19 +++++++++++++++++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/experimental/zerotrie/examples/byteph.rs b/experimental/zerotrie/examples/byteph.rs index 79d09741e5f..ffbe4279490 100644 --- a/experimental/zerotrie/examples/byteph.rs +++ b/experimental/zerotrie/examples/byteph.rs @@ -34,16 +34,10 @@ fn random_alphanums(seed: u64, len: usize) -> Vec { fn main(_argc: isize, _argv: *const *const u8) -> isize { icu_benchmark_macros::main_setup!(); - // let bytes = b"abdeghi"; - // let bytes = b"abdeghklmopuvxz"; - // let bytes = b"qwertyuiopasdfgh"; - // let bytes = b"qwrtuipadgklzxcbmQWRUOPADHKZVM"; - let mut p_distr = vec![0; 256]; for len in 0..256 { for seed in 0..100 { let bytes = random_alphanums(seed, len); - // println!("{len} {seed}"); let (p, _) = find(bytes.as_slice()).unwrap(); p_distr[p as usize] += 1; } diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index dd960e1e8b0..4e606c169db 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -3,7 +3,7 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). //! A data structure offering zero-copy storage and retrieval of byte strings, with a focus -//! on the efficient storage of ASCII strings. Strings are mapped to a `usize` values. +//! on the efficient storage of ASCII strings. Strings are mapped to `usize` values. //! //! [`ZeroTrie`] does not support mutation because doing so would require recomputing the entire //! data structure. 
Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`]. diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 1ff93358d15..adbe690ccdd 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -28,6 +28,19 @@ use litemap::LiteMap; /// You can create a `ZeroTrie` directly, in which case the most appropriate /// backing implementation will be chosen. /// +/// # Backing Store +/// +/// The data structure has a flexible backing data store. The only requirement for most +/// functionality is that it implement `AsRef<[u8]>`. All of the following are valid +/// ZeroTrie types: +/// +/// - `ZeroTrie<[u8]>` (dynamically sized type: must be stored in a reference or Box) +/// - `ZeroTrie<&[u8]>` (borrows its data from a u8 buffer) +/// - `ZeroTrie>` (fully owned data) +/// - `ZeroTrie>` (the recommended borrowed-or-owned signature) +/// - `Cow>` (another borrowed-or-owned signature) +/// - `ZeroTrie>` (another borrowed-or-owned signature) +/// /// # Examples /// /// ``` @@ -60,6 +73,8 @@ pub(crate) enum ZeroTrieInner { /// A data structure that compactly maps from ASCII strings to integers. /// +/// For more information, see [`ZeroTrie`]. +/// /// # Examples /// /// ``` @@ -88,6 +103,8 @@ pub struct ZeroTrieSimpleAscii { /// A data structure that compactly maps from byte strings to integers. /// +/// For more information, see [`ZeroTrie`]. +/// /// # Examples /// /// ``` @@ -115,6 +132,8 @@ pub struct ZeroTriePerfectHash { } /// A data structure that maps from a large number of byte strings to integers. +/// +/// For more information, see [`ZeroTrie`]. #[repr(transparent)] #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] pub struct ZeroTrieExtendedCapacity { From 52517fb0314dbf5643068ca8f482209b889bde3d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 30 Jun 2023 16:51:55 +0200 Subject: [PATCH 05/31] ZeroTrieFlavor --- experimental/zerotrie/src/serde.rs | 8 +++---- experimental/zerotrie/src/zerotrie.rs | 30 +++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/experimental/zerotrie/src/serde.rs b/experimental/zerotrie/src/serde.rs index 6fd87f6b325..793aa9595ff 100644 --- a/experimental/zerotrie/src/serde.rs +++ b/experimental/zerotrie/src/serde.rs @@ -3,7 +3,7 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
use crate::builder::bytestr::ByteStr; -use crate::zerotrie::ZeroTrieInner; +use crate::zerotrie::ZeroTrieFlavor; use crate::ZeroTrie; use crate::ZeroTrieExtendedCapacity; use crate::ZeroTriePerfectHash; @@ -288,9 +288,9 @@ where lm.serialize(serializer) } else { let (tag, bytes) = match &self.0 { - ZeroTrieInner::SimpleAscii(t) => (tags::SIMPLE_ASCII, t.as_bytes()), - ZeroTrieInner::PerfectHash(t) => (tags::PERFECT_HASH, t.as_bytes()), - ZeroTrieInner::ExtendedCapacity(t) => (tags::EXTENDED_CAPACITY, t.as_bytes()), + ZeroTrieFlavor::SimpleAscii(t) => (tags::SIMPLE_ASCII, t.as_bytes()), + ZeroTrieFlavor::PerfectHash(t) => (tags::PERFECT_HASH, t.as_bytes()), + ZeroTrieFlavor::ExtendedCapacity(t) => (tags::EXTENDED_CAPACITY, t.as_bytes()), }; let mut all_in_one_vec = Vec::with_capacity(bytes.len() + 1); all_in_one_vec.push(tag); diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index adbe690ccdd..dc3c24bd8fe 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -62,10 +62,10 @@ use litemap::LiteMap; /// # Ok::<_, zerotrie::ZeroTrieError>(()) /// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct ZeroTrie(pub(crate) ZeroTrieInner); +pub struct ZeroTrie(pub(crate) ZeroTrieFlavor); #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(crate) enum ZeroTrieInner { +pub(crate) enum ZeroTrieFlavor { SimpleAscii(ZeroTrieSimpleAscii), PerfectHash(ZeroTriePerfectHash), ExtendedCapacity(ZeroTrieExtendedCapacity), @@ -146,7 +146,7 @@ macro_rules! impl_zerotrie_subtype { /// Wrap this specific ZeroTrie variant into a ZeroTrie. #[inline] pub const fn into_zerotrie(self) -> ZeroTrie { - ZeroTrie(ZeroTrieInner::$variant(self)) + ZeroTrie(ZeroTrieFlavor::$variant(self)) } /// Create a trie directly from a store. /// @@ -493,30 +493,30 @@ impl_zerotrie_subtype!( macro_rules! 
impl_dispatch { ($self:ident, $inner_fn:ident()) => { match $self.0 { - ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn(), - ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn(), - ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn(), + ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn(), + ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn(), + ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn(), } }; (&$self:ident, $inner_fn:ident()) => { match &$self.0 { - ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn(), - ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn(), - ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn(), + ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn(), + ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn(), + ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn(), } }; ($self:ident, $inner_fn:ident($arg:ident)) => { match $self.0 { - ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn($arg), - ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn($arg), - ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn($arg), + ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn($arg), + ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn($arg), + ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn($arg), } }; (&$self:ident, $inner_fn:ident($arg:ident)) => { match &$self.0 { - ZeroTrieInner::SimpleAscii(subtype) => subtype.$inner_fn($arg), - ZeroTrieInner::PerfectHash(subtype) => subtype.$inner_fn($arg), - ZeroTrieInner::ExtendedCapacity(subtype) => subtype.$inner_fn($arg), + ZeroTrieFlavor::SimpleAscii(subtype) => subtype.$inner_fn($arg), + ZeroTrieFlavor::PerfectHash(subtype) => subtype.$inner_fn($arg), + ZeroTrieFlavor::ExtendedCapacity(subtype) => subtype.$inner_fn($arg), } }; } From 993f0c2f8a652949a29f67e643d4f8d3af63f94a Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 30 Jun 2023 17:06:29 +0200 Subject: [PATCH 06/31] Name the type parameter `Store` --- experimental/zerotrie/src/serde.rs | 32 +++++------ experimental/zerotrie/src/zerotrie.rs | 76 +++++++++++++-------------- 2 files changed, 54 insertions(+), 54 deletions(-) diff --git a/experimental/zerotrie/src/serde.rs b/experimental/zerotrie/src/serde.rs index 793aa9595ff..ee688c8cb35 100644 --- a/experimental/zerotrie/src/serde.rs +++ b/experimental/zerotrie/src/serde.rs @@ -91,14 +91,14 @@ impl<'data> Serialize for &'data ByteStr { } } -impl<'de, 'data, X> Deserialize<'de> for ZeroTrieSimpleAscii +impl<'de, 'data, Store> Deserialize<'de> for ZeroTrieSimpleAscii where 'de: 'data, // DISCUSS: There are several possibilities for the bounds here that would // get the job done. I could look for Deserialize, but this would require // creating a custom Deserializer for the map case. I also considered // introducing a new trait instead of relying on From. 
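A minimal illustration of the bound discussed in this comment and restated on the next lines: a store that can be built `From<&[u8]>` serves the borrowing (binary) deserialization path, while `From<Vec<u8>>` serves the owned (human-readable) path. The `check` helper is hypothetical and only demonstrates which store types satisfy both conversions:

```
// Illustrative sketch; `check` is not part of the crate.
fn check<'data, Store>()
where
    Store: From<&'data [u8]> + From<Vec<u8>> + 'data,
{
}

fn stores_that_satisfy_the_bound() {
    check::<Vec<u8>>(); // always owns the trie bytes
    check::<std::borrow::Cow<'static, [u8]>>(); // borrows or owns, depending on the format
}
```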
- X: From<&'data [u8]> + From> + 'data, + Store: From<&'data [u8]> + From> + 'data, { fn deserialize(deserializer: D) -> Result where @@ -118,9 +118,9 @@ where } } -impl Serialize for ZeroTrieSimpleAscii +impl Serialize for ZeroTrieSimpleAscii where - X: AsRef<[u8]>, +Store: AsRef<[u8]>, { fn serialize(&self, serializer: S) -> Result where @@ -136,10 +136,10 @@ where } } -impl<'de, 'data, X> Deserialize<'de> for ZeroTriePerfectHash +impl<'de, 'data, Store> Deserialize<'de> for ZeroTriePerfectHash where 'de: 'data, - X: From<&'data [u8]> + From> + 'data, + Store: From<&'data [u8]> + From> + 'data, { fn deserialize(deserializer: D) -> Result where @@ -159,9 +159,9 @@ where } } -impl Serialize for ZeroTriePerfectHash +impl Serialize for ZeroTriePerfectHash where - X: AsRef<[u8]>, + Store: AsRef<[u8]>, { fn serialize(&self, serializer: S) -> Result where @@ -181,10 +181,10 @@ where } } -impl<'de, 'data, X> Deserialize<'de> for ZeroTrieExtendedCapacity +impl<'de, 'data, Store> Deserialize<'de> for ZeroTrieExtendedCapacity where 'de: 'data, - X: From<&'data [u8]> + From> + 'data, + Store: From<&'data [u8]> + From> + 'data, { fn deserialize(deserializer: D) -> Result where @@ -204,9 +204,9 @@ where } } -impl Serialize for ZeroTrieExtendedCapacity +impl Serialize for ZeroTrieExtendedCapacity where - X: AsRef<[u8]>, + Store: AsRef<[u8]>, { fn serialize(&self, serializer: S) -> Result where @@ -236,10 +236,10 @@ mod tags { pub(crate) const EXTENDED_CAPACITY: u8 = USE_PHF | BINARY_SPANS | EXTENDED; } -impl<'de, 'data, X> Deserialize<'de> for ZeroTrie +impl<'de, 'data, Store> Deserialize<'de> for ZeroTrie where 'de: 'data, - X: From<&'data [u8]> + From> + 'data, + Store: From<&'data [u8]> + From> + 'data, { fn deserialize(deserializer: D) -> Result where @@ -271,9 +271,9 @@ where } } -impl Serialize for ZeroTrie +impl Serialize for ZeroTrie where - X: AsRef<[u8]>, + Store: AsRef<[u8]>, { fn serialize(&self, serializer: S) -> Result where diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index dc3c24bd8fe..37b584f6d21 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -62,13 +62,13 @@ use litemap::LiteMap; /// # Ok::<_, zerotrie::ZeroTrieError>(()) /// ``` #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct ZeroTrie(pub(crate) ZeroTrieFlavor); +pub struct ZeroTrie(pub(crate) ZeroTrieFlavor); #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(crate) enum ZeroTrieFlavor { - SimpleAscii(ZeroTrieSimpleAscii), - PerfectHash(ZeroTriePerfectHash), - ExtendedCapacity(ZeroTrieExtendedCapacity), +pub(crate) enum ZeroTrieFlavor { + SimpleAscii(ZeroTrieSimpleAscii), + PerfectHash(ZeroTriePerfectHash), + ExtendedCapacity(ZeroTrieExtendedCapacity), } /// A data structure that compactly maps from ASCII strings to integers. @@ -97,8 +97,8 @@ pub(crate) enum ZeroTrieFlavor { /// ``` #[repr(transparent)] #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] -pub struct ZeroTrieSimpleAscii { - pub(crate) store: S, +pub struct ZeroTrieSimpleAscii { + pub(crate) store: Store, } /// A data structure that compactly maps from byte strings to integers. @@ -127,8 +127,8 @@ pub struct ZeroTrieSimpleAscii { /// ``` #[repr(transparent)] #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] -pub struct ZeroTriePerfectHash { - pub(crate) store: S, +pub struct ZeroTriePerfectHash { + pub(crate) store: Store, } /// A data structure that maps from a large number of byte strings to integers. 
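These impls let any byte container serve as the `Store`, provided it can be built from either borrowed or owned bytes on deserialization and viewed as bytes on serialization. A minimal sketch of the round trip they enable, assuming the crate's `serde` feature and the `postcard` crate (with its `alloc` feature) from the dependency list; human-readable formats such as JSON instead go through the `LiteMap` path shown above:

```rust
use zerotrie::ZeroTrieSimpleAscii;

// Build an owned trie via the FromIterator impl.
let trie: ZeroTrieSimpleAscii<Vec<u8>> =
    [("abc", 1usize), ("abcdef", 2)].into_iter().collect();

// Non-human-readable formats carry the raw trie bytes.
let ser = postcard::to_allocvec(&trie).expect("serialize");
let de: ZeroTrieSimpleAscii<Vec<u8>> = postcard::from_bytes(&ser).expect("deserialize");

assert_eq!(de.get(b"abcdef"), Some(2));
assert_eq!(de, trie);
```

For the flavored `ZeroTrie`, the same payload is prefixed with the one-byte tag shown earlier so that the variant can be recovered on deserialization.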
@@ -136,43 +136,43 @@ pub struct ZeroTriePerfectHash { /// For more information, see [`ZeroTrie`]. #[repr(transparent)] #[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] -pub struct ZeroTrieExtendedCapacity { - pub(crate) store: S, +pub struct ZeroTrieExtendedCapacity { + pub(crate) store: Store, } macro_rules! impl_zerotrie_subtype { ($name:ident, $variant:ident, $getter_fn:path, $iter_ty:ty, $iter_fn:path, $cnv_fn:path) => { - impl $name { + impl $name { /// Wrap this specific ZeroTrie variant into a ZeroTrie. #[inline] - pub const fn into_zerotrie(self) -> ZeroTrie { + pub const fn into_zerotrie(self) -> ZeroTrie { ZeroTrie(ZeroTrieFlavor::$variant(self)) } /// Create a trie directly from a store. /// /// If the store does not contain valid bytes, unexpected behavior may occur. #[inline] - pub const fn from_store(store: S) -> Self { + pub const fn from_store(store: Store) -> Self { Self { store } } /// Takes the byte store from this trie. #[inline] - pub fn take_store(self) -> S { + pub fn take_store(self) -> Store { self.store } /// Maps the store into another type. #[inline] - pub fn map_store(self, f: impl FnOnce(S) -> X) -> $name { + pub fn map_store(self, f: impl FnOnce(Store) -> X) -> $name { $name::::from_store(f(self.store)) } #[inline] - pub(crate) fn map_store_into_zerotrie(self, f: impl FnOnce(S) -> X) -> ZeroTrie { + pub(crate) fn map_store_into_zerotrie(self, f: impl FnOnce(Store) -> X) -> ZeroTrie { $name::::from_store(f(self.store)).into_zerotrie() } } - impl $name + impl $name where - S: AsRef<[u8]> + ?Sized, + Store: AsRef<[u8]> + ?Sized, { /// Queries the trie for a string. #[inline] @@ -214,9 +214,9 @@ macro_rules! impl_zerotrie_subtype { } } #[cfg(feature = "alloc")] - impl $name + impl $name where - S: AsRef<[u8]> + ?Sized, + Store: AsRef<[u8]> + ?Sized, { /// Converts a possibly-borrowed $name to an owned one. /// @@ -298,9 +298,9 @@ macro_rules! impl_zerotrie_subtype { } } #[cfg(feature = "alloc")] - impl $name + impl $name where - S: AsRef<[u8]> + ?Sized + Store: AsRef<[u8]> + ?Sized { /// Exports the data from this ZeroTrie type into a BTreeMap. /// @@ -378,9 +378,9 @@ macro_rules! impl_zerotrie_subtype { } } #[cfg(feature = "litemap")] - impl $name + impl $name where - S: AsRef<[u8]> + ?Sized, + Store: AsRef<[u8]> + ?Sized, { /// Exports the data from this ZeroTrie type into a LiteMap. /// @@ -439,17 +439,17 @@ macro_rules! impl_zerotrie_subtype { // TODO(#2778): Auto-derive these impls based on the repr(transparent). // Safety: $name is repr(transparent) over S, a VarULE #[cfg(feature = "zerovec")] - unsafe impl zerovec::ule::VarULE for $name + unsafe impl zerovec::ule::VarULE for $name where - S: zerovec::ule::VarULE, + Store: zerovec::ule::VarULE, { #[inline] fn validate_byte_slice(bytes: &[u8]) -> Result<(), zerovec::ZeroVecError> { - S::validate_byte_slice(bytes) + Store::validate_byte_slice(bytes) } #[inline] unsafe fn from_byte_slice_unchecked(bytes: &[u8]) -> &Self { - core::mem::transmute(S::from_byte_slice_unchecked(bytes)) + core::mem::transmute(Store::from_byte_slice_unchecked(bytes)) } } }; @@ -521,20 +521,20 @@ macro_rules! impl_dispatch { }; } -impl ZeroTrie { +impl ZeroTrie { /// Takes the byte store from this trie. - pub fn take_store(self) -> S { + pub fn take_store(self) -> Store { impl_dispatch!(self, take_store()) } /// Maps the store into another type. 
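Because the wrapper is just a typed view over the byte store, changing the container type never re-encodes the trie. A small illustrative sketch of how `from_store`, `take_store`, and `map_store` compose, assuming the `alloc` feature for the `FromIterator` builder:

```rust
use zerotrie::ZeroTrieSimpleAscii;

// Build with an owned store...
let owned: ZeroTrieSimpleAscii<Vec<u8>> =
    [("bar", 2usize), ("foo", 1)].into_iter().collect();

// ...then swap the container without touching the trie bytes.
let boxed: ZeroTrieSimpleAscii<Box<[u8]>> = owned.map_store(|v| v.into_boxed_slice());
assert_eq!(boxed.get(b"foo"), Some(1));
assert_eq!(boxed.get(b"baz"), None);

// `take_store` and `from_store` unwrap and rewrap the raw store.
let bytes: Box<[u8]> = boxed.take_store();
let again = ZeroTrieSimpleAscii::from_store(bytes);
assert_eq!(again.get(b"bar"), Some(2));
```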
- pub fn map_store(self, f: impl FnOnce(S) -> X) -> ZeroTrie { + pub fn map_store(self, f: impl FnOnce(Store) -> NewStore) -> ZeroTrie { impl_dispatch!(self, map_store_into_zerotrie(f)) } } -impl ZeroTrie +impl ZeroTrie where - S: AsRef<[u8]>, + Store: AsRef<[u8]>, { /// Queries the trie for a string. pub fn get(&self, key: K) -> Option @@ -556,9 +556,9 @@ where } #[cfg(feature = "alloc")] -impl ZeroTrie +impl ZeroTrie where - S: AsRef<[u8]>, + Store: AsRef<[u8]>, { /// Exports the data from this ZeroTrie into a BTreeMap. pub fn to_btreemap(&self) -> BTreeMap, usize> { @@ -567,9 +567,9 @@ where } #[cfg(feature = "litemap")] -impl ZeroTrie +impl ZeroTrie where - S: AsRef<[u8]>, + Store: AsRef<[u8]>, { /// Exports the data from this ZeroTrie into a LiteMap. pub fn to_litemap(&self) -> LiteMap, usize> { From 9290ec5b9db5a47f76c69fe8f5958e448245cd71 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 4 Jul 2023 16:11:44 +0200 Subject: [PATCH 07/31] Rearrange code --- experimental/zerotrie/src/zerotrie.rs | 136 +++++++++++++------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 37b584f6d21..4966a4d3e3f 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -267,6 +267,23 @@ macro_rules! impl_zerotrie_subtype { } } #[cfg(feature = "alloc")] + impl<'a, K> FromIterator<(K, usize)> for $name> + where + K: AsRef<[u8]> + { + fn from_iter>(iter: T) -> Self { + use crate::builder::nonconst::ZeroTrieBuilder; + ZeroTrieBuilder::>::from_bytes_iter( + iter, + Self::BUILDER_OPTIONS + ) + .map(|s| Self { + store: s.to_bytes(), + }) + .unwrap() + } + } + #[cfg(feature = "alloc")] impl<'a, K> TryFrom<&'a BTreeMap> for $name> where K: Borrow<[u8]> @@ -281,6 +298,38 @@ macro_rules! impl_zerotrie_subtype { Self::try_from_tuple_slice(byte_str_slice) } } + #[cfg(feature = "alloc")] + impl $name + where + Store: AsRef<[u8]> + ?Sized + { + /// Exports the data from this ZeroTrie type into a BTreeMap. + /// + /// ***Enable this impl with the `"alloc"` feature.*** + /// + /// # Examples + /// + /// ``` + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// use std::collections::BTreeMap; + /// + #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")] + /// let items = trie.to_btreemap(); + /// + /// assert_eq!(items.len(), 2); + /// + #[doc = concat!("let recovered_trie: ", stringify!($name), "> = items")] + /// .into_iter() + /// .collect(); + /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes()); + /// ``` + pub fn to_btreemap(&self) -> BTreeMap<$iter_ty, usize> { + self.iter().collect() + } + pub(crate) fn to_btreemap_bytes(&self) -> BTreeMap, usize> { + self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect() + } + } #[cfg(feature = "litemap")] impl<'a, K, S> TryFrom<&'a LiteMap> for $name> where @@ -297,38 +346,48 @@ macro_rules! impl_zerotrie_subtype { Self::try_from_tuple_slice(byte_str_slice) } } - #[cfg(feature = "alloc")] + #[cfg(feature = "litemap")] impl $name where - Store: AsRef<[u8]> + ?Sized + Store: AsRef<[u8]> + ?Sized, { - /// Exports the data from this ZeroTrie type into a BTreeMap. + /// Exports the data from this ZeroTrie type into a LiteMap. 
/// - /// ***Enable this impl with the `"alloc"` feature.*** + /// ***Enable this function with the `"litemap"` feature.*** /// /// # Examples /// /// ``` #[doc = concat!("use zerotrie::", stringify!($name), ";")] - /// use std::collections::BTreeMap; + /// use litemap::LiteMap; /// #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")] - /// let items = trie.to_btreemap(); /// + /// let items = trie.to_litemap(); /// assert_eq!(items.len(), 2); /// #[doc = concat!("let recovered_trie: ", stringify!($name), "> = items")] - /// .into_iter() + /// .iter() + /// .map(|(k, v)| (k, *v)) /// .collect(); /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes()); /// ``` - pub fn to_btreemap(&self) -> BTreeMap<$iter_ty, usize> { + pub fn to_litemap(&self) -> LiteMap<$iter_ty, usize> { self.iter().collect() } - pub(crate) fn to_btreemap_bytes(&self) -> BTreeMap, usize> { + pub(crate) fn to_litemap_bytes(&self) -> LiteMap, usize> { self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect() } } + #[cfg(feature = "litemap")] + impl $name> + { + #[cfg(feature = "serde")] + pub(crate) fn try_from_serde_litemap(items: &LiteMap, usize>) -> Result { + let lm_borrowed: LiteMap<&ByteStr, usize> = items.to_borrowed_keys(); + Self::try_from_tuple_slice(lm_borrowed.as_slice()) + } + } // Note: Can't generalize this impl due to the `core::borrow::Borrow` blanket impl. impl Borrow<$name<[u8]>> for $name<&[u8]> { #[inline] @@ -377,65 +436,6 @@ macro_rules! impl_zerotrie_subtype { ) } } - #[cfg(feature = "litemap")] - impl $name - where - Store: AsRef<[u8]> + ?Sized, - { - /// Exports the data from this ZeroTrie type into a LiteMap. - /// - /// ***Enable this function with the `"litemap"` feature.*** - /// - /// # Examples - /// - /// ``` - #[doc = concat!("use zerotrie::", stringify!($name), ";")] - /// use litemap::LiteMap; - /// - #[doc = concat!("let trie = ", stringify!($name), "::from_bytes(b\"abc\\x81def\\x82\");")] - /// - /// let items = trie.to_litemap(); - /// assert_eq!(items.len(), 2); - /// - #[doc = concat!("let recovered_trie: ", stringify!($name), "> = items")] - /// .iter() - /// .map(|(k, v)| (k, *v)) - /// .collect(); - /// assert_eq!(trie.as_bytes(), recovered_trie.as_bytes()); - /// ``` - pub fn to_litemap(&self) -> LiteMap<$iter_ty, usize> { - self.iter().collect() - } - pub(crate) fn to_litemap_bytes(&self) -> LiteMap, usize> { - self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect() - } - } - #[cfg(feature = "litemap")] - impl $name> - { - #[cfg(feature = "serde")] - pub(crate) fn try_from_serde_litemap(items: &LiteMap, usize>) -> Result { - let lm_borrowed: LiteMap<&ByteStr, usize> = items.to_borrowed_keys(); - Self::try_from_tuple_slice(lm_borrowed.as_slice()) - } - } - #[cfg(feature = "alloc")] - impl<'a, K> FromIterator<(K, usize)> for $name> - where - K: AsRef<[u8]> - { - fn from_iter>(iter: T) -> Self { - use crate::builder::nonconst::ZeroTrieBuilder; - ZeroTrieBuilder::>::from_bytes_iter( - iter, - Self::BUILDER_OPTIONS - ) - .map(|s| Self { - store: s.to_bytes(), - }) - .unwrap() - } - } // TODO(#2778): Auto-derive these impls based on the repr(transparent). // Safety: $name is repr(transparent) over S, a VarULE #[cfg(feature = "zerovec")] From 60a93af6d167bd19c78e8ad3cd8bd9cc87d038e7 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Tue, 4 Jul 2023 16:21:05 +0200 Subject: [PATCH 08/31] Add From impls --- experimental/zerotrie/src/zerotrie.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 4966a4d3e3f..f2f8f2e6bd8 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -330,6 +330,16 @@ macro_rules! impl_zerotrie_subtype { self.iter().map(|(k, v)| ($cnv_fn(k), v)).collect() } } + #[cfg(feature = "alloc")] + impl From<&$name> for BTreeMap<$iter_ty, usize> + where + Store: AsRef<[u8]> + ?Sized, + { + #[inline] + fn from(other: &$name) -> Self { + other.to_btreemap() + } + } #[cfg(feature = "litemap")] impl<'a, K, S> TryFrom<&'a LiteMap> for $name> where @@ -380,6 +390,16 @@ macro_rules! impl_zerotrie_subtype { } } #[cfg(feature = "litemap")] + impl From<&$name> for LiteMap<$iter_ty, usize> + where + Store: AsRef<[u8]> + ?Sized, + { + #[inline] + fn from(other: &$name) -> Self { + other.to_litemap() + } + } + #[cfg(feature = "litemap")] impl $name> { #[cfg(feature = "serde")] From 059ca93404c2c96674d3b905d286759af01faf33 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 4 Jul 2023 16:26:33 +0200 Subject: [PATCH 09/31] Don't use ref-cast --- Cargo.lock | 21 --------------------- experimental/zerotrie/Cargo.toml | 1 - experimental/zerotrie/src/byte_phf/mod.rs | 15 +++++++++++---- experimental/zerotrie/src/zerotrie.rs | 10 +++++----- 4 files changed, 16 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d91ead1336f..3b70c259a27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2864,26 +2864,6 @@ dependencies = [ "bitflags", ] -[[package]] -name = "ref-cast" -version = "1.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43faa91b1c8b36841ee70e97188a869d37ae21759da6846d4be66de5bf7b12c" -dependencies = [ - "ref-cast-impl", -] - -[[package]] -name = "ref-cast-impl" -version = "1.0.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d2275aab483050ab2a7364c1a46604865ee7d6906684e08db0f090acf74f9e7" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "regex" version = "1.7.3" @@ -4541,7 +4521,6 @@ dependencies = [ "postcard", "rand", "rand_pcg", - "ref-cast", "serde", "serde_json", "zerovec", diff --git a/experimental/zerotrie/Cargo.toml b/experimental/zerotrie/Cargo.toml index 7b3eb53cdc2..82cf01b5896 100644 --- a/experimental/zerotrie/Cargo.toml +++ b/experimental/zerotrie/Cargo.toml @@ -32,7 +32,6 @@ denylist = ["bench"] [dependencies] zerovec = { path = "../../utils/zerovec", optional = true } litemap = { path = "../../utils/litemap", default-features = false, features = ["alloc"], optional = true } -ref-cast = { version = "1.0.12" } serde = { version = "1.0", optional = true } displaydoc = { version = "0.2.3", default-features = false } diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index fc89b50d0de..e962c2d1f64 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -12,8 +12,6 @@ pub use builder::find; #[cfg(feature = "alloc")] pub use cached_owned::PerfectByteHashMapCacheOwned; -use ref_cast::RefCast; - const P_FAST_MAX: u8 = 11; const Q_FAST_MAX: u8 = 95; @@ -87,7 +85,7 @@ pub fn f2(byte: u8, q: u8, n: usize) -> usize { } // Standard layout: P, N bytes of Q, N bytes of expected keys -#[derive(Debug, PartialEq, Eq, RefCast)] 
+#[derive(Debug, PartialEq, Eq)] #[repr(transparent)] pub struct PerfectByteHashMap(S); @@ -166,12 +164,21 @@ where } } +impl PerfectByteHashMap<[u8]> { + #[inline] + pub fn from_bytes(bytes: &[u8]) -> &Self { + // Safety: Self is repr(transparent) over [u8] + unsafe { core::mem::transmute(bytes) } + } +} + impl PerfectByteHashMap where S: AsRef<[u8]> + ?Sized, { + #[inline] pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> { - PerfectByteHashMap::ref_cast(self.0.as_ref()) + PerfectByteHashMap::from_bytes(self.0.as_ref()) } } diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index f2f8f2e6bd8..acbebe47dc4 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -5,7 +5,6 @@ use crate::reader::*; use core::borrow::Borrow; -use ref_cast::RefCast; #[cfg(feature = "alloc")] use crate::{builder::bytestr::ByteStr, builder::nonconst::ZeroTrieBuilder, error::Error}; @@ -96,7 +95,7 @@ pub(crate) enum ZeroTrieFlavor { /// # Ok::<_, zerotrie::ZeroTrieError>(()) /// ``` #[repr(transparent)] -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] pub struct ZeroTrieSimpleAscii { pub(crate) store: Store, } @@ -126,7 +125,7 @@ pub struct ZeroTrieSimpleAscii { /// # Ok::<_, zerotrie::ZeroTrieError>(()) /// ``` #[repr(transparent)] -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] pub struct ZeroTriePerfectHash { pub(crate) store: Store, } @@ -135,7 +134,7 @@ pub struct ZeroTriePerfectHash { /// /// For more information, see [`ZeroTrie`]. #[repr(transparent)] -#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ref_cast::RefCast)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] pub struct ZeroTrieExtendedCapacity { pub(crate) store: Store, } @@ -251,7 +250,8 @@ macro_rules! impl_zerotrie_subtype { /// If the bytes are not a valid trie, unexpected behavior may occur. #[inline] pub fn from_bytes(trie: &[u8]) -> &Self { - Self::ref_cast(trie) + // Safety: Self is repr(transparent) over [u8] + unsafe { core::mem::transmute(trie) } } } #[cfg(feature = "alloc")] From 2f25994162bcc475ae91b89848df532314c22c45 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 4 Jul 2023 16:27:08 +0200 Subject: [PATCH 10/31] fmt --- experimental/zerotrie/src/serde.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/zerotrie/src/serde.rs b/experimental/zerotrie/src/serde.rs index ee688c8cb35..44918b90b26 100644 --- a/experimental/zerotrie/src/serde.rs +++ b/experimental/zerotrie/src/serde.rs @@ -120,7 +120,7 @@ where impl Serialize for ZeroTrieSimpleAscii where -Store: AsRef<[u8]>, + Store: AsRef<[u8]>, { fn serialize(&self, serializer: S) -> Result where From 9df8cae62cc055ba81a08e7421f80be9aef01527 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 4 Jul 2023 16:27:56 +0200 Subject: [PATCH 11/31] generate-readmes --- experimental/zerotrie/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/zerotrie/README.md b/experimental/zerotrie/README.md index 9f75b0602ad..61794c5672a 100644 --- a/experimental/zerotrie/README.md +++ b/experimental/zerotrie/README.md @@ -1,7 +1,7 @@ # zerotrie [![crates.io](https://img.shields.io/crates/v/zerotrie)](https://crates.io/crates/zerotrie) A data structure offering zero-copy storage and retrieval of byte strings, with a focus -on the efficient storage of ASCII strings. 
Strings are mapped to a `usize` values. +on the efficient storage of ASCII strings. Strings are mapped to `usize` values. [`ZeroTrie`] does not support mutation because doing so would require recomputing the entire data structure. Instead, it supports conversion to and from [`LiteMap`] and [`BTreeMap`]. From 8345f978db503918274e0c2d62e53788b95ab799 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 10 Jul 2023 19:30:36 +0200 Subject: [PATCH 12/31] Docs, tests, and function names for varint --- .../zerotrie/src/builder/konst/builder.rs | 4 +- .../zerotrie/src/builder/nonconst/builder.rs | 8 +- experimental/zerotrie/src/reader.rs | 20 +-- experimental/zerotrie/src/varint.rs | 120 +++++++++++++++--- 4 files changed, 119 insertions(+), 33 deletions(-) diff --git a/experimental/zerotrie/src/builder/konst/builder.rs b/experimental/zerotrie/src/builder/konst/builder.rs index 89df85ca1c0..0cbd5c1ac5f 100644 --- a/experimental/zerotrie/src/builder/konst/builder.rs +++ b/experimental/zerotrie/src/builder/konst/builder.rs @@ -44,7 +44,7 @@ impl ZeroTrieBuilderConst { #[must_use] const fn prepend_value(self, value: usize) -> (Self, usize) { let mut data = self.data; - let varint_array = varint::write_extended_varint(value); + let varint_array = varint::write_varint_meta3(value); data = data.const_extend_front_or_panic(varint_array.as_const_slice()); data = data.const_bitor_assign(0, 0b10000000); (Self { data }, varint_array.len()) @@ -53,7 +53,7 @@ impl ZeroTrieBuilderConst { #[must_use] const fn prepend_branch(self, value: usize) -> (Self, usize) { let mut data = self.data; - let varint_array = varint::write_varint(value); + let varint_array = varint::write_varint_meta2(value); data = data.const_extend_front_or_panic(varint_array.as_const_slice()); data = data.const_bitor_assign(0, 0b11000000); (Self { data }, varint_array.len()) diff --git a/experimental/zerotrie/src/builder/nonconst/builder.rs b/experimental/zerotrie/src/builder/nonconst/builder.rs index fd1be2b16a7..14e6807e21c 100644 --- a/experimental/zerotrie/src/builder/nonconst/builder.rs +++ b/experimental/zerotrie/src/builder/nonconst/builder.rs @@ -65,10 +65,10 @@ impl ZeroTrieBuilder { // Unwrap OK: there is a varint at this location in the buffer #[allow(clippy::unwrap_used)] let old_span_size = - varint::try_read_extended_varint_from_tstore(old_front, &mut self.data) + varint::try_read_varint_meta3_from_tstore(old_front, &mut self.data) .unwrap(); self.data.atbs_push_front(ascii); - let varint_array = varint::write_extended_varint(old_span_size + 1); + let varint_array = varint::write_varint_meta3(old_span_size + 1); self.data.atbs_extend_front(varint_array.as_slice()); self.data.atbs_bitor_assign(0, 0b10100000); let new_byte_len = self.data.atbs_len(); @@ -88,7 +88,7 @@ impl ZeroTrieBuilder { #[must_use] fn prepend_value(&mut self, value: usize) -> usize { - let varint_array = varint::write_extended_varint(value); + let varint_array = varint::write_varint_meta3(value); self.data.atbs_extend_front(varint_array.as_slice()); self.data.atbs_bitor_assign(0, 0b10000000); varint_array.len() @@ -96,7 +96,7 @@ impl ZeroTrieBuilder { #[must_use] fn prepend_branch(&mut self, value: usize) -> usize { - let varint_array = varint::write_varint(value); + let varint_array = varint::write_varint_meta2(value); self.data.atbs_extend_front(varint_array.as_slice()); self.data.atbs_bitor_assign(0, 0b11000000); varint_array.len() diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 917927f7b62..96e016a06b5 
100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -3,8 +3,8 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use crate::byte_phf::PerfectByteHashMap; -use crate::varint::read_extended_varint; -use crate::varint::read_varint; +use crate::varint::read_varint_meta2; +use crate::varint::read_varint_meta3; use core::ops::Range; #[cfg(feature = "alloc")] @@ -154,8 +154,8 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { let byte_type = byte_type(*b); (x, trie) = match byte_type { ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, - ByteType::Match => read_varint(*b, trie)?, + ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, + ByteType::Match => read_varint_meta2(*b, trie)?, }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, ByteType::Ascii) { @@ -218,8 +218,8 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { let byte_type = byte_type(*b); (x, trie) = match byte_type { ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, - ByteType::Match => read_varint(*b, trie)?, + ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, + ByteType::Match => read_varint_meta2(*b, trie)?, }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, ByteType::Ascii) { @@ -288,8 +288,8 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { let byte_type = byte_type(*b); (x, trie) = match byte_type { ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, - ByteType::Match => read_varint(*b, trie)?, + ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, + ByteType::Match => read_varint_meta2(*b, trie)?, }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, ByteType::Ascii) { @@ -391,8 +391,8 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { } (x, trie) = match byte_type { ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_extended_varint(*b, trie)?, - ByteType::Match => read_varint(*b, trie)?, + ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, + ByteType::Match => read_varint_meta2(*b, trie)?, }; if matches!(byte_type, ByteType::Span) { (span, trie) = debug_split_at(trie, x)?; diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs index 503c4b7874a..c2df67c019d 100644 --- a/experimental/zerotrie/src/varint.rs +++ b/experimental/zerotrie/src/varint.rs @@ -4,9 +4,40 @@ //! Varint spec for ZeroTrie: //! -//! - Lead byte: top 2 bits are trie metadata; third is varint extender; rest is value -//! - Trail bytes: top bit is varint extender; add rest to current value * 2^7 -//! - Add the "latent value" to the final result: (1<<5) + (1<<7) + (1<<14) + ... +//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value +//! - Trail bytes: top bit is varint extender; rest are low bits of value +//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte +//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize` +//! +//! This is best shown by examples. +//! +//! ```txt +//! xxx0'1010 = 10 +//! xxx0'1111 = 15 (largest single-byte value with M=3) +//! xxx1'0000 0000'0000 = 16 (smallest two-byte value with M=3) +//! xxx1'0000 0000'0001 = 17 +//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3) +//! 
xxx1'0000 1000'0000 0000'0000 = 2064 (smallest three-byte value with M=3) +//! xxx1'0000 1000'0000 0000'0001 = 2065 +//! ``` +//! +//! The latent values by number of bytes for M=3 are: +//! +//! - 1 byte: 0 +//! - 2 bytes: 16 = 0x10 = 0b10000 +//! - 3 bytes: 2064 = 0x810 = 0b100000010000 +//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000 +//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000 +//! - … +//! +//! For M=2, the latent values are: +//! +//! - 1 byte: 0 +//! - 2 bytes: 32 = 0x20 = 0b100000 +//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000 +//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000 +//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000 +//! - … use crate::builder::konst::ConstArrayBuilder; @@ -14,7 +45,7 @@ use crate::builder::konst::ConstArrayBuilder; use crate::builder::nonconst::TrieBuilderStore; /// Reads a varint with 2 bits of metadata in the lead byte. -pub const fn read_varint(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { +pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { let mut value = (start & 0b00011111) as usize; let mut remainder = remainder; if (start & 0b00100000) != 0 { @@ -37,7 +68,7 @@ pub const fn read_varint(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> } /// Reads a varint with 3 bits of metadata in the lead byte. -pub const fn read_extended_varint(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { +pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { let mut value = (start & 0b00001111) as usize; let mut remainder = remainder; if (start & 0b00010000) != 0 { @@ -60,7 +91,7 @@ pub const fn read_extended_varint(start: u8, remainder: &[u8]) -> Option<(usize, } #[cfg(feature = "alloc")] -pub(crate) fn try_read_extended_varint_from_tstore( +pub(crate) fn try_read_varint_meta3_from_tstore( start: u8, remainder: &mut S, ) -> Option { @@ -87,7 +118,7 @@ const MAX_VARINT: usize = usize::MAX; // Add an extra 1 since the lead byte holds only 5 bits of data. 
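The latent-value scheme is easiest to verify with a stand-alone decoder for the M=3 case. The helper below is illustrative only: the name `decode_varint_meta3` is not part of the crate, and unlike the real `read_varint_meta3` it ignores overflow and does not return the remaining slice. It reproduces the example encodings and latent values listed above:

```rust
/// Decodes an M=3 varint: the lead byte holds 4 value bits plus an extender
/// bit, each trail byte holds 7 value bits plus an extender bit, and one
/// latent increment is folded in per trail byte so that every length encodes
/// a distinct range of values.
fn decode_varint_meta3(lead: u8, trail: &[u8]) -> Option<usize> {
    let mut value = (lead & 0b0000_1111) as usize;
    let mut latent = 0usize;
    let mut more = (lead & 0b0001_0000) != 0;
    let mut iter = trail.iter();
    while more {
        let byte = *iter.next()?;
        value = (value << 7) | (byte & 0b0111_1111) as usize;
        latent = (latent << 7) + (1 << 4); // 16, 2064, 264208, ...
        more = (byte & 0b1000_0000) != 0;
    }
    Some(value + latent)
}

// Single byte: just the low 4 bits.
assert_eq!(decode_varint_meta3(0b0000_1010, &[]), Some(10));
assert_eq!(decode_varint_meta3(0b0000_1111, &[]), Some(15));
// Two bytes: latent value 16.
assert_eq!(decode_varint_meta3(0b0001_0000, &[0b0000_0000]), Some(16));
assert_eq!(decode_varint_meta3(0b0001_1111, &[0b0111_1111]), Some(2063));
// Three bytes: latent value 2064.
assert_eq!(decode_varint_meta3(0b0001_0000, &[0b1000_0000, 0b0000_0000]), Some(2064));
```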
const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::() * 8 / 7; -pub(crate) const fn write_varint(value: usize) -> ConstArrayBuilder { +pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder { let mut result = [0; MAX_VARINT_LENGTH]; let mut i = MAX_VARINT_LENGTH - 1; let mut value = value; @@ -114,9 +145,7 @@ pub(crate) const fn write_varint(value: usize) -> ConstArrayBuilder ConstArrayBuilder { +pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder { let mut result = [0; MAX_VARINT_LENGTH]; let mut i = MAX_VARINT_LENGTH - 1; let mut value = value; @@ -324,7 +353,7 @@ mod tests { #[test] fn test_read() { for cas in CASES { - let recovered = read_varint(cas.bytes[0], &cas.bytes[1..]).unwrap(); + let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]).unwrap(); assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); } } @@ -345,9 +374,9 @@ mod tests { "{:?}", cas ); - let recovered = read_varint(cas.bytes[0], &cas.bytes[1..]).unwrap(); + let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]).unwrap(); assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); - let write_bytes = write_varint(cas.value); + let write_bytes = write_varint_meta2(cas.value); assert_eq!( reference_bytes.as_slice(), write_bytes.as_slice(), @@ -361,8 +390,21 @@ mod tests { fn test_roundtrip() { let mut i = 0usize; while i < MAX_VARINT { - let bytes = write_varint(i); - let recovered = read_varint(bytes.as_slice()[0], &bytes.as_slice()[1..]); + let bytes = write_varint_meta2(i); + let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]); + assert!(recovered.is_some(), "{:?}", i); + assert_eq!(i, recovered.unwrap().0, "{:?}", bytes.as_slice()); + i <<= 1; + i += 1; + } + } + + #[test] + fn test_extended_roundtrip() { + let mut i = 0usize; + while i < MAX_VARINT { + let bytes = write_varint_meta3(i); + let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]); assert!(recovered.is_some(), "{:?}", i); assert_eq!(i, recovered.unwrap().0, "{:?}", bytes.as_slice()); i <<= 1; @@ -373,13 +415,13 @@ mod tests { #[test] fn test_max() { let reference_bytes = write_varint_reference(MAX_VARINT); - let write_bytes = write_varint(MAX_VARINT); + let write_bytes = write_varint_meta2(MAX_VARINT); assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH); assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice()); let subarray = write_bytes .as_const_slice() .get_subslice_or_panic(1, write_bytes.len()); - let (recovered_value, remainder) = read_varint( + let (recovered_value, remainder) = read_varint_meta2( *write_bytes.as_const_slice().first().unwrap(), subarray.as_slice(), ) @@ -402,4 +444,48 @@ mod tests { ] ); } + + #[test] + fn text_extended_max() { + let write_bytes = write_varint_meta3(MAX_VARINT); + assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH); + let (lead, trailing) = write_bytes.as_slice().split_first().unwrap(); + let (recovered_value, remainder) = read_varint_meta3(*lead, trailing).unwrap(); + assert!(remainder.is_empty()); + assert_eq!(recovered_value, MAX_VARINT); + assert_eq!( + write_bytes.as_slice(), + &[ + 0b00010001, // + 0b11101111, // + 0b11101111, // + 0b11101111, // + 0b11101111, // + 0b11101111, // + 0b11101111, // + 0b11101111, // + 0b11101111, // + 0b01101111, // + ] + ); + } + + #[test] + fn test_latent_values() { + // Same values documented in the module docs: M=2 + let m2 = read_varint_meta2; + assert_eq!(m2(0, &[]).unwrap().0, 0); + assert_eq!(m2(0x20, &[0x00]).unwrap().0, 32); + 
assert_eq!(m2(0x20, &[0x80, 0x00]).unwrap().0, 4128); + assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).unwrap().0, 528416); + assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).unwrap().0, 67637280); + + // Same values documented in the module docs: M=3 + let m3 = read_varint_meta3; + assert_eq!(m3(0, &[]).unwrap().0, 0); + assert_eq!(m3(0x10, &[0x00]).unwrap().0, 16); + assert_eq!(m3(0x10, &[0x80, 0x00]).unwrap().0, 2064); + assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).unwrap().0, 264208); + assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).unwrap().0, 33818640); + } } From 25e71be7af2b314fb2167474c9fb6f2c20f829b1 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 11 Jul 2023 00:55:16 +0200 Subject: [PATCH 13/31] Start writing layout docs --- experimental/zerotrie/src/reader.rs | 43 +++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 96e016a06b5..f817cb627ae 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -2,6 +2,49 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +//! # Internal layout of ZeroTrie +//! +//! A ZeroTrie is composed of a series of nodes stored in sequence in a byte slice. +//! +//! There are 4 types of nodes: +//! +//! 1. ASCII (`0xxxxxxx`): matches a literal ASCII byte. +//! 2. Span (`101xxxxx`): matches a span of non-ASCII bytes. +//! 3. Value (`100xxxxx`): associates a value with a string +//! 4. Branch (`11xxxxxx`): matches one of a set of bytes. +//! +//! Span, Value, and Branch nodes contain a varint, which has different semantics for each: +//! +//! - Span varint: length of the span +//! - Value varint: value associated with the string +//! - Branch varint: number of edges in the branch and width of the offset table +//! +//! The exact structure of the Branch node is what varies between ZeroTrie types. +//! +//! Here is an example ZeroTrie without branch nodes: +//! +//! ``` +//! use zerotrie::ZeroTrieSimpleAscii; +//! +//! let bytes = [ +//! b'a', // ASCII literal +//! 0b10001010, // value 10 +//! b'b', // ASCII literal +//! 0b10100010, // span of 3 +//! 0x81, // first byte in span +//! 0x91, // second byte in span +//! 0xA1, // third and final byte in span +//! 0b1000100, // value 4 +//! ]; +//! +//! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes); +//! +//! assert_eq!(trie.get(b"a"), Some(10)); +//! assert_eq!(trie.get(b"ab"), None); +//! assert_eq!(trie.get(b"b"), None); +//! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4)); +//! ``` + use crate::byte_phf::PerfectByteHashMap; use crate::varint::read_varint_meta2; use crate::varint::read_varint_meta3; From 9739ce16be698ed7f9aff07740bd2fc41aa6f60a Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Tue, 11 Jul 2023 09:29:22 +0200 Subject: [PATCH 14/31] More layout docs --- experimental/zerotrie/src/reader.rs | 44 ++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index f817cb627ae..73ca7cf0de9 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -19,7 +19,12 @@ //! - Value varint: value associated with the string //! - Branch varint: number of edges in the branch and width of the offset table //! -//! The exact structure of the Branch node is what varies between ZeroTrie types. +//! 
If reading an ASCII, Span, or Branch node, one or more bytes are consumed from the input +//! string. If the next byte(s) in the input string do not match the node, we return `None`. +//! If reading a Value node, if the string is empty, return `Some(value)`; otherwise, we skip +//! the Value node and continue on to the next node. +//! +//! When a node is consumed, a shorter, well-formed ZeroTrie remains. //! //! Here is an example ZeroTrie without branch nodes: //! @@ -30,20 +35,51 @@ //! b'a', // ASCII literal //! 0b10001010, // value 10 //! b'b', // ASCII literal -//! 0b10100010, // span of 3 +//! 0b10100011, // span of 3 //! 0x81, // first byte in span //! 0x91, // second byte in span //! 0xA1, // third and final byte in span -//! 0b1000100, // value 4 +//! 0b10000100, // value 4 //! ]; //! //! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes); //! +//! // First value: "a" → 10 //! assert_eq!(trie.get(b"a"), Some(10)); +//! +//! // Second value: "ab\x81\x91\xA1" → 4 +//! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4)); +//! +//! // A few examples of strings that do NOT have values in the trie: //! assert_eq!(trie.get(b"ab"), None); //! assert_eq!(trie.get(b"b"), None); -//! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4)); +//! assert_eq!(trie.get(b"b\x81\x91\xA1"), None); //! ``` +//! +//! ## Branch Nodes +//! +//! There are two types of branch nodes: binary search and perfect hash. `ZeroTrieSimpleAscii` +//! contains only binary search nodes, whereas `ZeroTriePerfectHash` can contain either. +//! +//! The head node of the branch has a varint that encodes two things: +//! +//! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256 +//! - Bits 9 and 10: width of the offset table (`W`) +//! +//! A few examples of the head node of the branch: +//! +//! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0 +//! - `0b11000110`: varint bits `110`: N = 6; W = 0 +//! - `0b11100000 0b00000101`: varint bits `1000101`: N = 69; W = 0 +//! - `0b11100010 0b00000000`: varint bits `101000000`: N = 64; W = 1 +//! +//! In `ZeroTriePerfectHash`, if N <= 15, the branch is assumed to be a binary search, and if +//! N > 15, the branch is assumed to be a perfect hash. +//! +//! ### Binary Search Branch Nodes +//! +//! Here, the head branch node is followed by N sorted bytes and then (W+1)*(N-1) bytes +//! for the offset table. use crate::byte_phf::PerfectByteHashMap; use crate::varint::read_varint_meta2; From 1d5555748b23fd175b7a678e712345343afee57d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 11:50:19 +0200 Subject: [PATCH 15/31] Add some more docs --- experimental/zerotrie/src/byte_phf/mod.rs | 117 ++++++++++++++++++++++ experimental/zerotrie/src/reader.rs | 31 +++++- 2 files changed, 146 insertions(+), 2 deletions(-) diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index e962c2d1f64..dd9dab4406d 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -2,6 +2,41 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +//! # Byte Perfect Hash Function Internals +//! +//! This module contains a perfect hash function (PHF) optimized for... TODO +//! +//! Reading a `key` from the PHF uses the following algorithm: +//! +//! 1. Let `t`, the bucket index, be `f1(key, p)`. +//! 2. Let `i`, the key index, be `f2(key, q_t)`. +//! 3. 
If `key == k_i`, return `Some(i)`; else return `None`. +//! +//! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across +//! serialization versions of `ZeroTrie`. +//! +//! ``` +//! let phf_example_bytes = [ +//! // `p` parameter +//! 1, +//! // `q` parameters, one for each of the N buckets +//! 0, 0, 1, 1, +//! // Exact keys to be compared with the input +//! b'e', b'a', b'c', b'g' +//! ]; +//! +//! let phf = zerotrie::byte_phf::PerfectByteHashMap::from_bytes(&phf_example_bytes); +//! +//! // The PHF returns the index of the key or `None` if not found. +//! assert_eq!(phf.get(b'a'), Some(1)); +//! assert_eq!(phf.get(b'b'), None); +//! assert_eq!(phf.get(b'c'), Some(2)); +//! assert_eq!(phf.get(b'd'), None); +//! assert_eq!(phf.get(b'e'), Some(0)); +//! assert_eq!(phf.get(b'f'), None); +//! assert_eq!(phf.get(b'g'), Some(3)); +//! ``` + #[cfg(feature = "alloc")] mod builder; #[cfg(feature = "alloc")] @@ -12,11 +47,17 @@ pub use builder::find; #[cfg(feature = "alloc")] pub use cached_owned::PerfectByteHashMapCacheOwned; +/// The cutoff for the fast version of [`f1`]. const P_FAST_MAX: u8 = 11; + +/// The cutoff for the fast version of [`f2`]. const Q_FAST_MAX: u8 = 95; +/// The maximum allowable value of `p`. This could be raised if found to be necessary. #[cfg(feature = "alloc")] // used in the builder code const P_REAL_MAX: u8 = 15; + +/// The maximum allowable value of `q`. This could be raised if found to be necessary. #[cfg(feature = "alloc")] // used in the builder code const Q_REAL_MAX: u8 = 127; @@ -44,7 +85,41 @@ fn debug_get(slice: &[u8], index: usize) -> Option { } } +/// Calculates the function `f1` for the PHF. For the exact formula, please read the code. +/// +/// When `p == 0`, the operation is a simple modulus. +/// +/// The argument `n` is used only for taking the modulus so that the return value is +/// in the range `[0, n)`. +/// /// Invariant: n > 0 +/// +/// # Examples +/// +/// ``` +/// use zerotrie::byte_phf::f1; +/// const N: usize = 10; +/// +/// // With p = 0: +/// assert_eq!(0, f1(0, 0, N)); +/// assert_eq!(1, f1(1, 0, N)); +/// assert_eq!(2, f1(2, 0, N)); +/// assert_eq!(9, f1(9, 0, N)); +/// assert_eq!(0, f1(10, 0, N)); +/// assert_eq!(1, f1(11, 0, N)); +/// assert_eq!(2, f1(12, 0, N)); +/// assert_eq!(9, f1(19, 0, N)); +/// +/// // With p = 1: +/// assert_eq!(1, f1(0, 1, N)); +/// assert_eq!(0, f1(1, 1, N)); +/// assert_eq!(2, f1(2, 1, N)); +/// assert_eq!(2, f1(9, 1, N)); +/// assert_eq!(4, f1(10, 1, N)); +/// assert_eq!(5, f1(11, 1, N)); +/// assert_eq!(1, f1(12, 1, N)); +/// assert_eq!(7, f1(19, 1, N)); +/// ``` #[inline] pub fn f1(byte: u8, p: u8, n: usize) -> usize { let n = if n > 0 { @@ -57,6 +132,9 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize { byte as usize % n } else { let mut result = byte ^ p ^ byte.wrapping_shr(p as u32); + // In almost all cases, the PHF works with the above constant-time operation. + // However, to crack a few difficult cases, we fall back to the linear-time + // operation shown below. for _ in P_FAST_MAX..p { result = result ^ (result << 1) ^ (result >> 1); } @@ -64,7 +142,41 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize { } } +/// Calculates the function `f2` for the PHF. For the exact formula, please read the code. +/// +/// When `q == 0`, the operation is a simple modulus. +/// +/// The argument `n` is used only for taking the modulus so that the return value is +/// in the range `[0, n)`. 
+/// /// Invariant: n > 0 +/// +/// # Examples +/// +/// ``` +/// use zerotrie::byte_phf::f2; +/// const N: usize = 10; +/// +/// // With q = 0: +/// assert_eq!(0, f2(0, 0, N)); +/// assert_eq!(1, f2(1, 0, N)); +/// assert_eq!(2, f2(2, 0, N)); +/// assert_eq!(9, f2(9, 0, N)); +/// assert_eq!(0, f2(10, 0, N)); +/// assert_eq!(1, f2(11, 0, N)); +/// assert_eq!(2, f2(12, 0, N)); +/// assert_eq!(9, f2(19, 0, N)); +/// +/// // With q = 1: +/// assert_eq!(1, f2(0, 1, N)); +/// assert_eq!(0, f2(1, 1, N)); +/// assert_eq!(3, f2(2, 1, N)); +/// assert_eq!(8, f2(9, 1, N)); +/// assert_eq!(1, f2(10, 1, N)); +/// assert_eq!(0, f2(11, 1, N)); +/// assert_eq!(3, f2(12, 1, N)); +/// assert_eq!(8, f2(19, 1, N)); +/// ``` #[inline] pub fn f2(byte: u8, q: u8, n: usize) -> usize { let n = if n > 0 { @@ -278,6 +390,11 @@ mod tests { expected: &[1, 0, 1, b'c', b'a'], reordered_keys: "ca", }, + TestCase { + keys: "aceg", + expected: &[1, 0, 0, 1, 1, b'e', b'a', b'c', b'g'], + reordered_keys: "eacg", + }, TestCase { keys: "abd", expected: &[0, 0, 1, 3, b'a', b'b', b'd'], diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 73ca7cf0de9..1da8e7b9cfd 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -66,6 +66,9 @@ //! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256 //! - Bits 9 and 10: width of the offset table (`W`) //! +//! Note that N is always in the range [1, 256]. There can't be more than 256 edges because +//! there are only 256 unique u8 values. +//! //! A few examples of the head node of the branch: //! //! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0 @@ -78,8 +81,32 @@ //! //! ### Binary Search Branch Nodes //! -//! Here, the head branch node is followed by N sorted bytes and then (W+1)*(N-1) bytes -//! for the offset table. +//! A binary search branch node is used when: +//! +//! 1. The trie is a `ZeroTrieSimpleAscii`, OR +//! 2. There are 15 or fewer items in the branch. +//! +//! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte +//! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search), +//! the index `i` of the byte within the list is used to index into the offset table (described +//! below). If the byte is not in the list, the string is not in the trie, so return `None`. +//! +//! ### Perfect Hash Branch Nodes +//! +//! A perfect hash branch node is used when: +//! +//! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND +//! 2. There are 16 or more items in the branch. +//! +//! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing +//! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an +//! index within the hash table `i` is resolved and used as input to index into the offset +//! table (described below), or the value is determined to not be present and `None` is +//! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`]. +//! +//! ### Offset Tables +//! +//! Both types of branch node are followed by an offset table. use crate::byte_phf::PerfectByteHashMap; use crate::varint::read_varint_meta2; From 9327015317340456869daefc48f08e43c5216d15 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Sun, 16 Jul 2023 15:29:15 +0200 Subject: [PATCH 16/31] More docs --- experimental/zerotrie/examples/byteph.rs | 7 ++++++- experimental/zerotrie/src/byte_phf/builder.rs | 4 ++++ experimental/zerotrie/src/byte_phf/mod.rs | 15 +++++++++++++-- experimental/zerotrie/src/lib.rs | 11 +++++++++-- 4 files changed, 32 insertions(+), 5 deletions(-) diff --git a/experimental/zerotrie/examples/byteph.rs b/experimental/zerotrie/examples/byteph.rs index ffbe4279490..e294dc82b07 100644 --- a/experimental/zerotrie/examples/byteph.rs +++ b/experimental/zerotrie/examples/byteph.rs @@ -35,14 +35,19 @@ fn main(_argc: isize, _argv: *const *const u8) -> isize { icu_benchmark_macros::main_setup!(); let mut p_distr = vec![0; 256]; + let mut q_distr = vec![0; 256]; for len in 0..256 { for seed in 0..100 { let bytes = random_alphanums(seed, len); - let (p, _) = find(bytes.as_slice()).unwrap(); + let (p, qq) = find(bytes.as_slice()).unwrap(); p_distr[p as usize] += 1; + for q in qq { + q_distr[q as usize] += 1; + } } } println!("p_distr: {p_distr:?}"); + println!("q_distr: {q_distr:?}"); let bytes = random_alphanums(0, 16); diff --git a/experimental/zerotrie/src/byte_phf/builder.rs b/experimental/zerotrie/src/byte_phf/builder.rs index c5c01b90104..6db5ac29264 100644 --- a/experimental/zerotrie/src/byte_phf/builder.rs +++ b/experimental/zerotrie/src/byte_phf/builder.rs @@ -13,6 +13,9 @@ use alloc::vec::Vec; /// hash maps that fall back to the slow path. const MAX_L2_SEARCH_MISSES: usize = 24; +/// Directly compute the perfect hash function. +/// +/// Returns `(p, [q_0, q_1, ..., q_(N-1)])`, or an error if the PHF could not be computed. #[allow(unused_labels)] // for readability pub fn find(bytes: &[u8]) -> Result<(u8, Vec), Error> { #[allow(non_snake_case)] @@ -65,6 +68,7 @@ pub fn find(bytes: &[u8]) -> Result<(u8, Vec), Error> { bqs[i] = 0; if i == 0 || num_max_q > MAX_L2_SEARCH_MISSES { if p == max_allowable_p && max_allowable_p != P_REAL_MAX { + // println!("Could not solve fast function: trying again: {bytes:?}"); max_allowable_p = P_REAL_MAX; max_allowable_q = Q_REAL_MAX; p = 0; diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index dd9dab4406d..4c1a34dc7a8 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -2,9 +2,18 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +#![allow(rustdoc::private_intra_doc_links)] // doc(hidden) module + //! # Byte Perfect Hash Function Internals //! -//! This module contains a perfect hash function (PHF) optimized for... TODO +//! This module contains a perfect hash function (PHF) designed for a fast, compact perfect +//! hash over 1 to 256 nodes (bytes). +//! +//! The PHF uses the following variables: +//! +//! 1. A single parameter `p`, which is 0 in about 98% of cases. +//! 2. A list of `N` parameters `q_t`, one per _bucket_ +//! 3. The `N` keys in an arbitrary order determined by the PHF //! //! Reading a `key` from the PHF uses the following algorithm: //! @@ -13,7 +22,9 @@ //! 3. If `key == k_i`, return `Some(i)`; else return `None`. //! //! The functions [`f1`] and [`f2`] are internal to the PHF but should remain stable across -//! serialization versions of `ZeroTrie`. +//! serialization versions of `ZeroTrie`. They are very fast, constant-time operations as long +//! as `p` <= [`P_FAST_MAX`] and `q` <= [`Q_FAST_MAX`]. In practice, nearly 100% of parameter +//! 
values are in the fast range. //! //! ``` //! let phf_example_bytes = [ diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index 4e606c169db..f7eca038bf3 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -27,6 +27,14 @@ //! assert_eq!(trie.byte_len(), 18); //! ``` //! +//! # Internal Structure +//! +//! To read about the internal structure of [`ZeroTrie`], build the docs with private modules: +//! +//! ```bash +//! cargo doc --document-private-items --all-features --no-deps --open +//! ``` +//! //! [`LiteMap`]: litemap::LiteMap //! [`BTreeMap`]: alloc::collections::BTreeMap @@ -36,8 +44,7 @@ extern crate alloc; mod builder; -#[doc(hidden)] -pub mod byte_phf; +mod byte_phf; mod error; mod reader; #[cfg(feature = "serde")] From b8e06dd15c83d77913f28ef8f76ac8643a08768e Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 15:50:43 +0200 Subject: [PATCH 17/31] Move example byteph to a unit test and refactor exports of byte_phf --- experimental/zerotrie/examples/byteph.rs | 69 ------------------ experimental/zerotrie/src/byte_phf/builder.rs | 71 +++++++++++++++++++ .../zerotrie/src/byte_phf/cached_owned.rs | 4 -- experimental/zerotrie/src/byte_phf/mod.rs | 13 ++-- experimental/zerotrie/src/lib.rs | 7 ++ 5 files changed, 84 insertions(+), 80 deletions(-) delete mode 100644 experimental/zerotrie/examples/byteph.rs diff --git a/experimental/zerotrie/examples/byteph.rs b/experimental/zerotrie/examples/byteph.rs deleted file mode 100644 index e294dc82b07..00000000000 --- a/experimental/zerotrie/examples/byteph.rs +++ /dev/null @@ -1,69 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -// This example demonstrates the use of AsciiTrie to look up data based on a region code. 
- -#![no_main] // https://github.com/unicode-org/icu4x/issues/395 -#![allow(unused_labels)] -#![allow(dead_code)] - -icu_benchmark_macros::static_setup!(); - -use zerotrie::byte_phf::*; - -fn print_byte_to_stdout(byte: u8) { - if let Ok(c) = char::try_from(byte) { - if c.is_ascii_alphanumeric() { - print!("'{c}'"); - return; - } - } - print!("0x{byte:X}"); -} - -fn random_alphanums(seed: u64, len: usize) -> Vec { - use rand::seq::SliceRandom; - use rand::SeedableRng; - const BYTES: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; - let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed); - BYTES.choose_multiple(&mut rng, len).copied().collect() -} - -#[no_mangle] -fn main(_argc: isize, _argv: *const *const u8) -> isize { - icu_benchmark_macros::main_setup!(); - - let mut p_distr = vec![0; 256]; - let mut q_distr = vec![0; 256]; - for len in 0..256 { - for seed in 0..100 { - let bytes = random_alphanums(seed, len); - let (p, qq) = find(bytes.as_slice()).unwrap(); - p_distr[p as usize] += 1; - for q in qq { - q_distr[q as usize] += 1; - } - } - } - println!("p_distr: {p_distr:?}"); - println!("q_distr: {q_distr:?}"); - - let bytes = random_alphanums(0, 16); - - #[allow(non_snake_case)] - let N = bytes.len(); - - let (p, qq) = find(bytes.as_slice()).unwrap(); - - println!("Results:"); - for byte in bytes.iter() { - print_byte_to_stdout(*byte); - let l1 = f1(*byte, p, N); - let q = qq[l1]; - let l2 = f2(*byte, q, N); - println!(" => l1 {l1} => q {q} => l2 {l2}"); - } - - 0 -} diff --git a/experimental/zerotrie/src/byte_phf/builder.rs b/experimental/zerotrie/src/byte_phf/builder.rs index 6db5ac29264..0a846eb285a 100644 --- a/experimental/zerotrie/src/byte_phf/builder.rs +++ b/experimental/zerotrie/src/byte_phf/builder.rs @@ -117,3 +117,74 @@ impl PerfectByteHashMap> { Ok(Self(result)) } } + +#[cfg(test)] +mod tests { + use super::*; + + extern crate std; + use std::print; + use std::println; + + fn print_byte_to_stdout(byte: u8) { + if let Ok(c) = char::try_from(byte) { + if c.is_ascii_alphanumeric() { + print!("'{c}'"); + return; + } + } + print!("0x{byte:X}"); + } + + fn random_alphanums(seed: u64, len: usize) -> Vec { + use rand::seq::SliceRandom; + use rand::SeedableRng; + const BYTES: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(seed); + BYTES.choose_multiple(&mut rng, len).copied().collect() + } + + #[test] + fn test_random_distributions() { + let mut p_distr = vec![0; 256]; + let mut q_distr = vec![0; 256]; + for len in 0..50 { + for seed in 0..50 { + let bytes = random_alphanums(seed, len); + let (p, qq) = find(bytes.as_slice()).unwrap(); + p_distr[p as usize] += 1; + for q in qq { + q_distr[q as usize] += 1; + } + } + } + println!("p_distr: {p_distr:?}"); + println!("q_distr: {q_distr:?}"); + + let fast_p = p_distr[0..=P_FAST_MAX as usize].iter().sum::(); + let slow_p = p_distr[(P_FAST_MAX + 1) as usize..].iter().sum::(); + let fast_q = q_distr[0..=Q_FAST_MAX as usize].iter().sum::(); + let slow_q = q_distr[(Q_FAST_MAX + 1) as usize..].iter().sum::(); + + assert_eq!(2500, fast_p); + assert_eq!(0, slow_p); + assert_eq!(61247, fast_q); + assert_eq!(3, slow_q); + + let bytes = random_alphanums(0, 16); + + #[allow(non_snake_case)] + let N = bytes.len(); + + let (p, qq) = find(bytes.as_slice()).unwrap(); + + println!("Results:"); + for byte in bytes.iter() { + print_byte_to_stdout(*byte); + let l1 = f1(*byte, p, N); + let q = qq[l1]; + let l2 = f2(*byte, q, N); + println!(" => l1 {l1} 
=> q {q} => l2 {l2}"); + } + } +} diff --git a/experimental/zerotrie/src/byte_phf/cached_owned.rs b/experimental/zerotrie/src/byte_phf/cached_owned.rs index a4c2732f474..5cacdf7c9ef 100644 --- a/experimental/zerotrie/src/byte_phf/cached_owned.rs +++ b/experimental/zerotrie/src/byte_phf/cached_owned.rs @@ -30,8 +30,4 @@ impl PerfectByteHashMapCacheOwned { }; Ok(mut_phf.as_borrowed()) } - - pub fn get(&self, keys: &[u8]) -> Option<&PerfectByteHashMap<[u8]>> { - self.data.get(keys).map(|p| p.as_borrowed()) - } } diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index 4c1a34dc7a8..9abc0a9a120 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -27,6 +27,8 @@ //! values are in the fast range. //! //! ``` +//! use zerotrie::_internal::PerfectByteHashMap; +//! //! let phf_example_bytes = [ //! // `p` parameter //! 1, @@ -36,7 +38,7 @@ //! b'e', b'a', b'c', b'g' //! ]; //! -//! let phf = zerotrie::byte_phf::PerfectByteHashMap::from_bytes(&phf_example_bytes); +//! let phf = PerfectByteHashMap::from_bytes(&phf_example_bytes); //! //! // The PHF returns the index of the key or `None` if not found. //! assert_eq!(phf.get(b'a'), Some(1)); @@ -108,7 +110,7 @@ fn debug_get(slice: &[u8], index: usize) -> Option { /// # Examples /// /// ``` -/// use zerotrie::byte_phf::f1; +/// use zerotrie::_internal::f1; /// const N: usize = 10; /// /// // With p = 0: @@ -165,7 +167,7 @@ pub fn f1(byte: u8, p: u8, n: usize) -> usize { /// # Examples /// /// ``` -/// use zerotrie::byte_phf::f2; +/// use zerotrie::_internal::f2; /// const N: usize = 10; /// /// // With q = 0: @@ -216,10 +218,6 @@ impl PerfectByteHashMap { pub fn from_store(store: S) -> Self { Self(store) } - - pub fn take_store(self) -> S { - self.0 - } } impl PerfectByteHashMap @@ -254,6 +252,7 @@ where .map(|s| s.1) .unwrap_or(&[]) } + #[cfg(test)] pub fn p_qmax(&self) -> Option<(u8, u8)> { let (p, buffer) = self.0.as_ref().split_first()?; let n = buffer.len() / 2; diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index f7eca038bf3..2609b20afbc 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -57,3 +57,10 @@ pub use crate::zerotrie::ZeroTrieExtendedCapacity; pub use crate::zerotrie::ZeroTriePerfectHash; pub use crate::zerotrie::ZeroTrieSimpleAscii; pub use error::Error as ZeroTrieError; + +#[doc(hidden)] +pub mod _internal { + pub use crate::byte_phf::f1; + pub use crate::byte_phf::f2; + pub use crate::byte_phf::PerfectByteHashMap; +} From f4ce8c2c66e9c05ad8ce7ae07b41b0dbe4cb5928 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 16:24:05 +0200 Subject: [PATCH 18/31] More docs --- .../zerotrie/src/byte_phf/cached_owned.rs | 3 + experimental/zerotrie/src/byte_phf/mod.rs | 28 ++++-- experimental/zerotrie/src/reader.rs | 97 ++++++++++++++++++- 3 files changed, 119 insertions(+), 9 deletions(-) diff --git a/experimental/zerotrie/src/byte_phf/cached_owned.rs b/experimental/zerotrie/src/byte_phf/cached_owned.rs index 5cacdf7c9ef..e6e17e35f26 100644 --- a/experimental/zerotrie/src/byte_phf/cached_owned.rs +++ b/experimental/zerotrie/src/byte_phf/cached_owned.rs @@ -8,18 +8,21 @@ use alloc::collections::btree_map::Entry; use alloc::collections::BTreeMap; use alloc::vec::Vec; +/// Helper class for caching the results of multiple [`PerfectByteHashMap`] calculations. 
pub struct PerfectByteHashMapCacheOwned { // Note: This should probably be a HashMap but that isn't in `alloc` data: BTreeMap, PerfectByteHashMap>>, } impl PerfectByteHashMapCacheOwned { + /// Creates a new empty instance. pub fn new_empty() -> Self { Self { data: BTreeMap::new(), } } + /// Gets the [`PerfectByteHashMap`] for the given bytes, calculating it if necessary. pub fn try_get_or_insert(&mut self, keys: Vec) -> Result<&PerfectByteHashMap<[u8]>, Error> { let mut_phf = match self.data.entry(keys) { Entry::Vacant(entry) => { diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index 9abc0a9a120..9d748753fa3 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -209,21 +209,28 @@ pub fn f2(byte: u8, q: u8, n: usize) -> usize { result as usize % n } -// Standard layout: P, N bytes of Q, N bytes of expected keys +/// A constant-time map from bytes to unique indices. +/// +/// Uses a perfect hash function (see module-level documentation). Does not support mutation. +/// +/// Standard layout: P, N bytes of Q, N bytes of expected keys #[derive(Debug, PartialEq, Eq)] #[repr(transparent)] -pub struct PerfectByteHashMap(S); +pub struct PerfectByteHashMap(Store); -impl PerfectByteHashMap { - pub fn from_store(store: S) -> Self { +impl PerfectByteHashMap { + /// Creates an instance from a pre-existing store. See [`Self::as_bytes`]. + #[inline] + pub fn from_store(store: Store) -> Self { Self(store) } } -impl PerfectByteHashMap +impl PerfectByteHashMap where - S: AsRef<[u8]> + ?Sized, + Store: AsRef<[u8]> + ?Sized, { + /// Gets the usize for the given byte, or `None` if it is not in the map. pub fn get(&self, key: u8) -> Option { let (p, buffer) = self.0.as_ref().split_first()?; let n = buffer.len() / 2; @@ -246,6 +253,7 @@ where pub fn num_items(&self) -> usize { self.0.as_ref().len() / 2 } + /// Get an iterator over the keys in the order in which they are stored in the map. pub fn keys(&self) -> &[u8] { let n = self.num_items(); debug_split_at(self.0.as_ref(), 1 + n) @@ -262,6 +270,8 @@ where let (qq, _) = debug_split_at(buffer, n)?; Some((*p, *qq.iter().max().unwrap())) } + /// Returns the map as bytes. The map can be recovered with [`Self::from_store`] + /// or [`Self::from_bytes`]. pub fn as_bytes(&self) -> &[u8] { self.0.as_ref() } @@ -287,6 +297,7 @@ where } impl PerfectByteHashMap<[u8]> { + /// Creates an instance from pre-existing bytes. See [`Self::as_bytes`]. #[inline] pub fn from_bytes(bytes: &[u8]) -> &Self { // Safety: Self is repr(transparent) over [u8] @@ -294,10 +305,11 @@ impl PerfectByteHashMap<[u8]> { } } -impl PerfectByteHashMap +impl PerfectByteHashMap where - S: AsRef<[u8]> + ?Sized, + Store: AsRef<[u8]> + ?Sized, { + /// Converts from `PerfectByteHashMap>` to `&PerfectByteHashMap<[u8]>` #[inline] pub fn as_borrowed(&self) -> &PerfectByteHashMap<[u8]> { PerfectByteHashMap::from_bytes(self.0.as_ref()) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 1da8e7b9cfd..6287442bfa1 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -26,6 +26,8 @@ //! //! When a node is consumed, a shorter, well-formed ZeroTrie remains. //! +//! ### Basic Example +//! //! Here is an example ZeroTrie without branch nodes: //! //! ``` @@ -106,7 +108,100 @@ //! //! ### Offset Tables //! -//! Both types of branch node are followed by an offset table. +//! 
The _offset table_ encodes the range of the remaining buffer containing the trie reachable
+//! from the byte matched in the branch node. Both types of branch node include an offset
+//! table following the key lookup. Given the index `i` from the first step, the range
+//! `[s_i, s_(i+1))` brackets the next step in the trie.
+//!
+//! Offset tables utilize the `W` parameter stored in the branch head node. The special case
+//! when `W == 0`, with `N - 1` bytes, is easiest to understand:
+//!
+//! **Offset table, W = 0:** `[s_1, s_2, ..., s_(N-1)]`
+//!
+//! Note that `s_0` is always 0 and `s_N` is always the length of the remaining slice, so those
+//! values are not explicitly included in the offset table.
+//!
+//! When W > 0, the high and low bits of the offsets are in separate bytes, arranged as follows:
+//!
+//! **Generalized offset table:** `[a_1, a_2, ..., a_(N-1), b_1, b_2, ..., b_(N-1), c_1, ...]`
+//!
+//! where `s_i = (((a_i << 8) + b_i) << 8) + c_i ...` (high bits first, low bits last)
+//!
+//! ### Advanced Example
+//!
+//! The following trie encodes the map below. It has multiple varints and branch nodes, all of
+//! which use binary search with W = 0. Note that there is a value for the empty string.
+//!
+//! - "" → 0
+//! - "axb" → 100
+//! - "ayc" → 2
+//! - "azd" → 3
+//! - "bxe" → 4
+//! - "bxefg" → 500
+//! - "bxefh" → 6
+//! - "bxei" → 7
+//! - "bxeikl" → 8
+//!
+//! ```
+//! use zerotrie::ZeroTrieSimpleAscii;
+//!
+//! let bytes = [
+//!     0b10000000, // value 0
+//!     0b11000010, // branch of 2
+//!     b'a', //
+//!     b'b', //
+//!     13, //
+//!     0b11000011, // start of 'a' subtree: branch of 3
+//!     b'x', //
+//!     b'y', //
+//!     b'z', //
+//!     3, //
+//!     5, //
+//!     b'b', //
+//!     0b10010000, // value 100 (lead)
+//!     0x54, // value 100 (trail)
+//!     b'c', //
+//!     0b10000010, // value 2
+//!     b'd', //
+//!     0b10000011, // value 3
+//!     b'x', // start of 'b' subtree
+//!     b'e', //
+//!     0b10000100, // value 4
+//!     0b11000010, // branch of 2
+//!     b'f', //
+//!     b'i', //
+//!     7, //
+//!     0b11000010, // branch of 2
+//!     b'g', //
+//!     b'h', //
+//!     2, //
+//!     0b10010011, // value 500 (lead)
+//!     0x64, // value 500 (trail)
+//!     0b10000110, // value 6
+//!     0b10000111, // value 7
+//!     b'k', //
+//!     b'l', //
+//!     0b10001000, // value 8
+//! ];
+//!
+//! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes);
+//!
+//! // Assert that the specified items are in the map
+//! assert_eq!(trie.get(b""), Some(0));
+//! assert_eq!(trie.get(b"axb"), Some(100));
+//! assert_eq!(trie.get(b"ayc"), Some(2));
+//! assert_eq!(trie.get(b"azd"), Some(3));
+//! assert_eq!(trie.get(b"bxe"), Some(4));
+//! assert_eq!(trie.get(b"bxefg"), Some(500));
+//! assert_eq!(trie.get(b"bxefh"), Some(6));
+//! assert_eq!(trie.get(b"bxei"), Some(7));
+//! assert_eq!(trie.get(b"bxeikl"), Some(8));
+//!
+//! // Assert that some other items are not in the map
+//! assert_eq!(trie.get(b"a"), None);
+//! assert_eq!(trie.get(b"bx"), None);
+//! assert_eq!(trie.get(b"xba"), None);
+//! ```
 use crate::byte_phf::PerfectByteHashMap;
 use crate::varint::read_varint_meta2;
 use crate::varint::read_varint_meta3;
From a8de97d63b63cfe60dfdc00fa86b2c251acfe4f3 Mon Sep 17 00:00:00 2001
From: "Shane F. 
Carr" Date: Sun, 16 Jul 2023 17:02:34 +0200 Subject: [PATCH 19/31] atbs_split_first --> atbs_pop_front --- experimental/zerotrie/src/builder/nonconst/builder.rs | 2 +- experimental/zerotrie/src/builder/nonconst/store.rs | 4 ++-- experimental/zerotrie/src/varint.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/experimental/zerotrie/src/builder/nonconst/builder.rs b/experimental/zerotrie/src/builder/nonconst/builder.rs index 14e6807e21c..c63020cc1a4 100644 --- a/experimental/zerotrie/src/builder/nonconst/builder.rs +++ b/experimental/zerotrie/src/builder/nonconst/builder.rs @@ -58,7 +58,7 @@ impl ZeroTrieBuilder { self.data.atbs_push_front(ascii); Ok(1) } else if matches!(self.options.ascii_mode, AsciiMode::BinarySpans) { - if let Some(old_front) = self.data.atbs_split_first() { + if let Some(old_front) = self.data.atbs_pop_front() { let old_byte_len = self.data.atbs_len() + 1; if old_front & 0b11100000 == 0b10100000 { // Extend an existing span diff --git a/experimental/zerotrie/src/builder/nonconst/store.rs b/experimental/zerotrie/src/builder/nonconst/store.rs index d86f96c3967..cb6e8fd482b 100644 --- a/experimental/zerotrie/src/builder/nonconst/store.rs +++ b/experimental/zerotrie/src/builder/nonconst/store.rs @@ -17,7 +17,7 @@ pub trait TrieBuilderStore { fn atbs_to_bytes(&self) -> Vec; fn atbs_bitor_assign(&mut self, index: usize, other: u8); fn atbs_swap_ranges(&mut self, start: usize, mid: usize, limit: usize); - fn atbs_split_first(&mut self) -> Option; + fn atbs_pop_front(&mut self) -> Option; fn atbs_prepend_n_zeros(&mut self, n: usize) { let mut i = 0; @@ -87,7 +87,7 @@ impl TrieBuilderStore for VecDeque { } } } - fn atbs_split_first(&mut self) -> Option { + fn atbs_pop_front(&mut self) -> Option { self.pop_front() } } diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs index c2df67c019d..5ce14088f86 100644 --- a/experimental/zerotrie/src/varint.rs +++ b/experimental/zerotrie/src/varint.rs @@ -98,7 +98,7 @@ pub(crate) fn try_read_varint_meta3_from_tstore( let mut value = (start & 0b00001111) as usize; if (start & 0b00010000) != 0 { loop { - let next = remainder.atbs_split_first()?; + let next = remainder.atbs_pop_front()?; // Note: value << 7 could drop high bits. The first addition can't overflow. // The second addition could overflow; in such a case we just inform the // developer via the debug assertion. From ed65138d9572269d23a1714b6239fd91f469eea6 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 17:21:58 +0200 Subject: [PATCH 20/31] NodeType refactor and docs --- experimental/zerotrie/src/reader.rs | 81 +++++++++++++++-------------- experimental/zerotrie/src/varint.rs | 9 ++++ 2 files changed, 52 insertions(+), 38 deletions(-) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 6287442bfa1..b966a584a1a 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -309,33 +309,38 @@ fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> Option<&[u8]> { debug_get_range(trie, p..q) } -enum ByteType { +/// The node type. See the module-level docs for more explanation of the four node types. +enum NodeType { + /// An ASCII node. Contains a single literal ASCII byte and no varint. Ascii, + /// A span node. Contains a varint indicating how big the span is. Span, + /// A value node. Contains a varint representing the value. Value, - Match, + /// A branch node. Contains a varint of the number of output nodes, plus W in the high bits. 
+ Branch, } -impl core::fmt::Debug for ByteType { +impl core::fmt::Debug for NodeType { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - use ByteType::*; + use NodeType::*; f.write_str(match *self { Ascii => "a", Span => "s", Value => "v", - Match => "m", + Branch => "m", }) } } #[inline] -fn byte_type(b: u8) -> ByteType { +fn byte_type(b: u8) -> NodeType { match b & 0b11100000 { - 0b10000000 => ByteType::Value, - 0b10100000 => ByteType::Span, - 0b11000000 => ByteType::Match, - 0b11100000 => ByteType::Match, - _ => ByteType::Ascii, + 0b10000000 => NodeType::Value, + 0b10100000 => NodeType::Span, + 0b11000000 => NodeType::Branch, + 0b11100000 => NodeType::Branch, + _ => NodeType::Ascii, } } @@ -354,12 +359,12 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { (b, trie) = trie.split_first()?; let byte_type = byte_type(*b); (x, trie) = match byte_type { - ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, - ByteType::Match => read_varint_meta2(*b, trie)?, + NodeType::Ascii => (0, trie), + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, + NodeType::Branch => read_varint_meta2(*b, trie)?, }; if let Some((c, temp)) = ascii.split_first() { - if matches!(byte_type, ByteType::Ascii) { + if matches!(byte_type, NodeType::Ascii) { if b == c { // Matched a byte ascii = temp; @@ -369,11 +374,11 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { return None; } } - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { // Value node, but not at end of string continue; } - if matches!(byte_type, ByteType::Span) { + if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); (trie_span, trie) = debug_split_at(trie, x)?; (ascii_span, ascii) = maybe_split_at(ascii, x)?; @@ -402,7 +407,7 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { ascii = temp; continue; } else { - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { // Value node at end of string return Some(x); } @@ -418,12 +423,12 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { (b, trie) = trie.split_first()?; let byte_type = byte_type(*b); (x, trie) = match byte_type { - ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, - ByteType::Match => read_varint_meta2(*b, trie)?, + NodeType::Ascii => (0, trie), + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, + NodeType::Branch => read_varint_meta2(*b, trie)?, }; if let Some((c, temp)) = ascii.split_first() { - if matches!(byte_type, ByteType::Ascii) { + if matches!(byte_type, NodeType::Ascii) { if b == c { // Matched a byte ascii = temp; @@ -433,11 +438,11 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { return None; } } - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { // Value node, but not at end of string continue; } - if matches!(byte_type, ByteType::Span) { + if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); (trie_span, trie) = debug_split_at(trie, x)?; (ascii_span, ascii) = maybe_split_at(ascii, x)?; @@ -472,7 +477,7 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { ascii = temp; continue; } else { - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { // Value node at end of string return Some(x); } @@ -488,12 +493,12 @@ pub fn get_phf_extended(mut 
trie: &[u8], mut ascii: &[u8]) -> Option { (b, trie) = trie.split_first()?; let byte_type = byte_type(*b); (x, trie) = match byte_type { - ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, - ByteType::Match => read_varint_meta2(*b, trie)?, + NodeType::Ascii => (0, trie), + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, + NodeType::Branch => read_varint_meta2(*b, trie)?, }; if let Some((c, temp)) = ascii.split_first() { - if matches!(byte_type, ByteType::Ascii) { + if matches!(byte_type, NodeType::Ascii) { if b == c { // Matched a byte ascii = temp; @@ -503,11 +508,11 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { return None; } } - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { // Value node, but not at end of string continue; } - if matches!(byte_type, ByteType::Span) { + if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); (trie_span, trie) = debug_split_at(trie, x)?; (ascii_span, ascii) = maybe_split_at(ascii, x)?; @@ -539,7 +544,7 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { ascii = temp; continue; } else { - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { // Value node at end of string return Some(x); } @@ -586,21 +591,21 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { } }; let byte_type = byte_type(*b); - if matches!(byte_type, ByteType::Ascii) { + if matches!(byte_type, NodeType::Ascii) { string.push(*b); continue; } (x, trie) = match byte_type { - ByteType::Ascii => (0, trie), - ByteType::Span | ByteType::Value => read_varint_meta3(*b, trie)?, - ByteType::Match => read_varint_meta2(*b, trie)?, + NodeType::Ascii => (0, trie), + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, + NodeType::Branch => read_varint_meta2(*b, trie)?, }; - if matches!(byte_type, ByteType::Span) { + if matches!(byte_type, NodeType::Span) { (span, trie) = debug_split_at(trie, x)?; string.extend(span); continue; } - if matches!(byte_type, ByteType::Value) { + if matches!(byte_type, NodeType::Value) { let retval = string.clone(); // Return to this position on the next step self.state.push((trie, string, 0)); diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs index 5ce14088f86..ca8d52fefd7 100644 --- a/experimental/zerotrie/src/varint.rs +++ b/experimental/zerotrie/src/varint.rs @@ -45,6 +45,8 @@ use crate::builder::konst::ConstArrayBuilder; use crate::builder::nonconst::TrieBuilderStore; /// Reads a varint with 2 bits of metadata in the lead byte. +/// +/// Returns the varint value and a subslice of `remainder` with the varint bytes removed. pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { let mut value = (start & 0b00011111) as usize; let mut remainder = remainder; @@ -68,6 +70,8 @@ pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> Option<(usize, &[ } /// Reads a varint with 3 bits of metadata in the lead byte. +/// +/// Returns the varint value and a subslice of `remainder` with the varint bytes removed. pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { let mut value = (start & 0b00001111) as usize; let mut remainder = remainder; @@ -90,6 +94,9 @@ pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[ Some((value, remainder)) } +/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`]. 
+/// +/// Returns the varint value. #[cfg(feature = "alloc")] pub(crate) fn try_read_varint_meta3_from_tstore( start: u8, @@ -118,6 +125,7 @@ const MAX_VARINT: usize = usize::MAX; // Add an extra 1 since the lead byte holds only 5 bits of data. const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::() * 8 / 7; +/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata. pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder { let mut result = [0; MAX_VARINT_LENGTH]; let mut i = MAX_VARINT_LENGTH - 1; @@ -145,6 +153,7 @@ pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder ConstArrayBuilder { let mut result = [0; MAX_VARINT_LENGTH]; let mut i = MAX_VARINT_LENGTH - 1; From 4896988ff9e52c38e588781e9535bf6ee828dcdd Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 17:27:13 +0200 Subject: [PATCH 21/31] Docs for ZeroTrieIterator --- experimental/zerotrie/src/reader.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index b966a584a1a..38192ae443a 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -556,9 +556,15 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { #[cfg(feature = "alloc")] use alloc::vec::Vec; +/// Internal iterator type for walking the strings contained in a ZeroTrie. #[cfg(feature = "alloc")] pub(crate) struct ZeroTrieIterator<'a> { + /// Whether the PHF is enabled on this trie. use_phf: bool, + /// Intermediate state during iteration: + /// 1. A trie (usually a slice of the original, bigger trie) + /// 2. The string that leads to the trie + /// 3. If the trie's lead node is a branch node, the current index being evaluated state: Vec<(&'a [u8], Vec, usize)>, } From b484dc1cbeb217a28da7e0dcbd495e5ff0dd0a7d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 17:41:13 +0200 Subject: [PATCH 22/31] Move helper functions to helpers.rs --- experimental/zerotrie/src/byte_phf/mod.rs | 26 +---------- experimental/zerotrie/src/helpers.rs | 56 +++++++++++++++++++++++ experimental/zerotrie/src/lib.rs | 1 + experimental/zerotrie/src/reader.rs | 51 +-------------------- 4 files changed, 60 insertions(+), 74 deletions(-) create mode 100644 experimental/zerotrie/src/helpers.rs diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index 9d748753fa3..b12d9721f19 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -50,6 +50,8 @@ //! assert_eq!(phf.get(b'g'), Some(3)); //! 
``` +use crate::helpers::*; + #[cfg(feature = "alloc")] mod builder; #[cfg(feature = "alloc")] @@ -74,30 +76,6 @@ const P_REAL_MAX: u8 = 15; #[cfg(feature = "alloc")] // used in the builder code const Q_REAL_MAX: u8 = 127; -/// Like slice::split_at but returns an Option instead of panicking -#[inline] -fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if mid > slice.len() { - debug_assert!(false, "debug_split_at: index expected to be in range"); - None - } else { - // Note: We're trusting the compiler to inline this and remove the assertion - // hiding on the top of slice::split_at: `assert(mid <= self.len())` - Some(slice.split_at(mid)) - } -} - -#[inline] -fn debug_get(slice: &[u8], index: usize) -> Option { - match slice.get(index) { - Some(x) => Some(*x), - None => { - debug_assert!(false, "debug_get: index expected to be in range"); - None - } - } -} - /// Calculates the function `f1` for the PHF. For the exact formula, please read the code. /// /// When `p == 0`, the operation is a simple modulus. diff --git a/experimental/zerotrie/src/helpers.rs b/experimental/zerotrie/src/helpers.rs new file mode 100644 index 00000000000..e7983b490e1 --- /dev/null +++ b/experimental/zerotrie/src/helpers.rs @@ -0,0 +1,56 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::ops::Range; + +/// Like slice::split_at but returns an Option instead of panicking. +/// +/// Debug-panics if `mid` is out of range. +#[inline] +pub(crate) fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if mid > slice.len() { + debug_assert!(false, "debug_split_at: index expected to be in range"); + None + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + Some(slice.split_at(mid)) + } +} + +/// Like slice::split_at but returns an Option instead of panicking. +#[inline] +pub(crate) fn maybe_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if mid > slice.len() { + None + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + Some(slice.split_at(mid)) + } +} + +/// Gets the item at the specified index, panicking in debug mode if it is not there. +#[inline] +pub(crate) fn debug_get(slice: &[u8], index: usize) -> Option { + match slice.get(index) { + Some(x) => Some(*x), + None => { + debug_assert!(false, "debug_get: index expected to be in range"); + None + } + } +} + +/// Gets the range between the specified indices, panicking in debug mode if not in bounds. 
+#[inline] +pub(crate) fn debug_get_range(slice: &[u8], range: Range) -> Option<&[u8]> { + match slice.get(range) { + Some(x) => Some(x), + None => { + debug_assert!(false, "debug_get_range: indices expected to be in range"); + None + } + } +} diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index 2609b20afbc..2067768cc19 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -46,6 +46,7 @@ extern crate alloc; mod builder; mod byte_phf; mod error; +mod helpers; mod reader; #[cfg(feature = "serde")] mod serde; diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 38192ae443a..21445246c7e 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -206,60 +206,11 @@ use crate::byte_phf::PerfectByteHashMap; use crate::varint::read_varint_meta2; use crate::varint::read_varint_meta3; -use core::ops::Range; +use crate::helpers::*; #[cfg(feature = "alloc")] use alloc::string::String; -/// Like slice::split_at but returns an Option instead of panicking. -/// -/// Debug-panics if `mid` is out of range. -#[inline] -fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if mid > slice.len() { - debug_assert!(false, "debug_split_at: index expected to be in range"); - None - } else { - // Note: We're trusting the compiler to inline this and remove the assertion - // hiding on the top of slice::split_at: `assert(mid <= self.len())` - Some(slice.split_at(mid)) - } -} - -/// Like slice::split_at but returns an Option instead of panicking. -#[inline] -fn maybe_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if mid > slice.len() { - None - } else { - // Note: We're trusting the compiler to inline this and remove the assertion - // hiding on the top of slice::split_at: `assert(mid <= self.len())` - Some(slice.split_at(mid)) - } -} - -#[inline] -fn debug_get(slice: &[u8], index: usize) -> Option { - match slice.get(index) { - Some(x) => Some(*x), - None => { - debug_assert!(false, "debug_get: index expected to be in range"); - None - } - } -} - -#[inline] -fn debug_get_range(slice: &[u8], range: Range) -> Option<&[u8]> { - match slice.get(range) { - Some(x) => Some(x), - None => { - debug_assert!(false, "debug_get_range: indices expected to be in range"); - None - } - } -} - /// Given a slice starting with an offset table, returns the trie for the given index. /// /// Arguments: From 0f933fcd7ebab982421febcaf17109007be07851 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 17:41:19 +0200 Subject: [PATCH 23/31] "must be" comment --- experimental/zerotrie/src/varint.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs index ca8d52fefd7..420bc4d9c13 100644 --- a/experimental/zerotrie/src/varint.rs +++ b/experimental/zerotrie/src/varint.rs @@ -14,10 +14,10 @@ //! ```txt //! xxx0'1010 = 10 //! xxx0'1111 = 15 (largest single-byte value with M=3) -//! xxx1'0000 0000'0000 = 16 (smallest two-byte value with M=3) +//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3) //! xxx1'0000 0000'0001 = 17 //! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3) -//! xxx1'0000 1000'0000 0000'0000 = 2064 (smallest three-byte value with M=3) +//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3) //! xxx1'0000 1000'0000 0000'0001 = 2065 //! ``` //! 
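As an illustrative aside on the varint layout documented in the patch above (this aside is editorial and not part of any patch in the series): the two "must be" anchor points exist because each longer encoding is biased to start one past the largest value of the shorter encoding, so no value has two encodings. The sketch below shows those boundary values as a hypothetical unit test. It relies only on the `read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])>` function shown earlier in this series, assumes the test sits inside the private `varint` module, and uses an illustrative module and test name; the expected values are the ones listed in the doc comment.

```
// Hypothetical test module inside src/varint.rs; names are illustrative only.
#[cfg(test)]
mod meta3_boundary_examples {
    use super::read_varint_meta3;

    #[test]
    fn documented_boundary_values_decode_as_listed() {
        // Single-byte form: the low 4 bits of the lead byte are the value.
        let (value, rest) = read_varint_meta3(0b0000_1111, &[]).unwrap();
        assert_eq!(value, 15); // largest single-byte value with M=3
        assert!(rest.is_empty());

        // Two-byte form: lead `xxx1'0000` plus trail `0000'0000` decodes to 16, not 0,
        // because multi-byte forms are biased past the single-byte range.
        let (value, rest) = read_varint_meta3(0b0001_0000, &[0b0000_0000]).unwrap();
        assert_eq!(value, 16); // smallest two-byte value with M=3
        assert!(rest.is_empty());

        let (value, _rest) = read_varint_meta3(0b0001_1111, &[0b0111_1111]).unwrap();
        assert_eq!(value, 2063); // largest two-byte value with M=3

        // Three-byte form: likewise biased past the two-byte range.
        let (value, _rest) =
            read_varint_meta3(0b0001_0000, &[0b1000_0000, 0b0000_0000]).unwrap();
        assert_eq!(value, 2064); // smallest three-byte value with M=3
    }
}
```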
From 2395d30be8180fde7fac595d230ec760e9a7af71 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 17:43:15 +0200 Subject: [PATCH 24/31] f2 docs --- experimental/zerotrie/src/byte_phf/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index b12d9721f19..4784cb7b0eb 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -176,11 +176,10 @@ pub fn f2(byte: u8, q: u8, n: usize) -> usize { debug_assert!(false, "unreachable by invariant"); 1 }; - // ((byte ^ q) as usize) % n let mut result = byte ^ q; - // if q >= Q_FAST_MAX { - // result = result ^ byte.wrapping_shr(q as u32); - // } + // In almost all cases, the PHF works with the above constant-time operation. + // However, to crack a few difficult cases, we fall back to the linear-time + // operation shown below. for _ in Q_FAST_MAX..q { result = result ^ (result << 1) ^ (result >> 1); } From 9ab08a63c424ff86dd86f6532f6ef00f6639751f Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 20:05:48 +0200 Subject: [PATCH 25/31] More docs on builder utilities --- .../zerotrie/src/builder/branch_meta.rs | 4 +- experimental/zerotrie/src/builder/bytestr.rs | 4 + .../zerotrie/src/builder/konst/store.rs | 77 ++++++++++++++----- .../zerotrie/src/builder/nonconst/store.rs | 29 ++++++- 4 files changed, 92 insertions(+), 22 deletions(-) diff --git a/experimental/zerotrie/src/builder/branch_meta.rs b/experimental/zerotrie/src/builder/branch_meta.rs index 03dc3087d70..59fb881b6ab 100644 --- a/experimental/zerotrie/src/builder/branch_meta.rs +++ b/experimental/zerotrie/src/builder/branch_meta.rs @@ -5,7 +5,8 @@ /// Intermediate metadata for a branch node under construction. #[derive(Debug, Clone, Copy)] pub(crate) struct BranchMeta { - /// The lead byte for this branch. + /// The lead byte for this branch. Formerly it was required to be an ASCII byte, but now + /// it can be any byte. pub ascii: u8, /// The size in bytes of the trie data reachable from this branch. pub local_length: usize, @@ -16,6 +17,7 @@ pub(crate) struct BranchMeta { } impl BranchMeta { + /// Creates a new empty [`BranchMeta`]. pub const fn const_default() -> Self { BranchMeta { ascii: 0, diff --git a/experimental/zerotrie/src/builder/bytestr.rs b/experimental/zerotrie/src/builder/bytestr.rs index 11e614573fe..9910efd7ffd 100644 --- a/experimental/zerotrie/src/builder/bytestr.rs +++ b/experimental/zerotrie/src/builder/bytestr.rs @@ -7,6 +7,7 @@ use core::borrow::Borrow; #[cfg(feature = "serde")] use alloc::boxed::Box; +/// A struct transparent over `[u8]` with convenient helper functions. #[repr(transparent)] #[derive(PartialEq, Eq, PartialOrd, Ord)] pub(crate) struct ByteStr([u8]); @@ -71,10 +72,12 @@ impl ByteStr { self.0.get(index).copied() } + /// Returns the byte at the given index, panicking if out of bounds. pub(crate) const fn byte_at_or_panic(&self, index: usize) -> u8 { self.0[index] } + /// Const function to evaluate `self < other`. 
pub(crate) const fn is_less_then(&self, other: &Self) -> bool { let mut i = 0; while i < self.len() && i < other.len() { @@ -89,6 +92,7 @@ impl ByteStr { self.len() < other.len() } + /// Const function to evaluate `self[..prefix_len] == other[..prefix_len]` pub(crate) const fn prefix_eq(&self, other: &ByteStr, prefix_len: usize) -> bool { assert!(prefix_len <= self.len()); assert!(prefix_len <= other.len()); diff --git a/experimental/zerotrie/src/builder/konst/store.rs b/experimental/zerotrie/src/builder/konst/store.rs index 5b6cde31ec8..d2e6ad43be4 100644 --- a/experimental/zerotrie/src/builder/konst/store.rs +++ b/experimental/zerotrie/src/builder/konst/store.rs @@ -6,15 +6,21 @@ use super::super::branch_meta::BranchMeta; -/// A const-friendly slice type. +/// A const-friendly slice type. It is backed by a full slice but is primarily intended +/// to represent subslices of the full slice. We need this only because we can't take +/// subslices in const Rust. #[derive(Debug, Copy, Clone)] pub(crate) struct ConstSlice<'a, T> { + /// The full slice. full_slice: &'a [T], + /// The start index of the slice represented by this [`ConstSlice`]. start: usize, + /// The non-inclusive end index of the slice represented by this [`ConstSlice`]. limit: usize, } impl<'a, T> ConstSlice<'a, T> { + /// Creates a [`ConstSlice`] representing an entire slice. pub const fn from_slice(other: &'a [T]) -> Self { ConstSlice { full_slice: other, @@ -23,6 +29,7 @@ impl<'a, T> ConstSlice<'a, T> { } } + /// Creates a [`ConstSlice`] with the given start and limit. pub const fn from_manual_slice(full_slice: &'a [T], start: usize, limit: usize) -> Self { ConstSlice { full_slice, @@ -31,14 +38,17 @@ impl<'a, T> ConstSlice<'a, T> { } } + /// Returns the length of the [`ConstSlice`]. pub const fn len(&self) -> usize { self.limit - self.start } + /// Gets the element at `index`, panicking if not present. pub const fn get_or_panic(&self, index: usize) -> &T { &self.full_slice[index + self.start] } + /// Gets the first element or `None` if empty. #[cfg(test)] pub const fn first(&self) -> Option<&T> { if self.len() == 0 { @@ -48,6 +58,7 @@ impl<'a, T> ConstSlice<'a, T> { } } + /// Gets the last element or `None` if empty. pub const fn last(&self) -> Option<&T> { if self.len() == 0 { None @@ -56,6 +67,7 @@ impl<'a, T> ConstSlice<'a, T> { } } + /// Gets a subslice of this slice. #[cfg(test)] pub const fn get_subslice_or_panic( &self, @@ -71,6 +83,7 @@ impl<'a, T> ConstSlice<'a, T> { } } + /// Non-const function that returns this [`ConstSlice`] as a regular slice. #[cfg(any(test, feature = "alloc"))] pub fn as_slice(&self) -> &'a [T] { &self.full_slice[self.start..self.limit] @@ -98,6 +111,9 @@ impl Default for ConstArrayBuilder { } impl ConstArrayBuilder { + /// Creates a new, empty builder of the given size. `cursor` indicates where in the + /// array new elements will be inserted first. Since we use a lot of prepend operations, + /// it is common to set `cursor` to `N`. pub const fn new_empty(full_array: [T; N], cursor: usize) -> Self { assert!(cursor <= N); Self { @@ -107,6 +123,7 @@ impl ConstArrayBuilder { } } + /// Creates a new builder with some initial content in `[start, limit)`. pub const fn from_manual_slice(full_array: [T; N], start: usize, limit: usize) -> Self { assert!(start <= limit); assert!(limit <= N); @@ -117,39 +134,42 @@ impl ConstArrayBuilder { } } + /// Returns the number of initialized elements in the builder. 
pub const fn len(&self) -> usize { self.limit - self.start } + /// Whether there are no initialized elements in the builder. #[allow(dead_code)] pub const fn is_empty(&self) -> bool { self.len() == 0 } + /// Returns the initialized elements as a [`ConstSlice`]. pub const fn as_const_slice(&self) -> ConstSlice { ConstSlice::from_manual_slice(&self.full_array, self.start, self.limit) } + /// Non-const function that returns a slice of the initialized elements. #[cfg(feature = "alloc")] pub fn as_slice(&self) -> &[T] { &self.full_array[self.start..self.limit] } } -impl ConstArrayBuilder { - pub const fn const_bitor_assign(mut self, index: usize, other: u8) -> Self { - self.full_array[self.start + index] |= other; - self - } - // Can't be generic because T has a destructor - pub const fn const_take_or_panic(self) -> [u8; N] { +// Certain functions that involve dropping `T` require that it be `Copy` +impl ConstArrayBuilder { + /// Takes a fully initialized builder as an array. Panics if the builder is not + /// fully initialized. + pub const fn const_take_or_panic(self) -> [T; N] { if self.start != 0 || self.limit != N { panic!("AsciiTrieBuilder buffer too large"); } self.full_array } - // Can't be generic because T has a destructor - pub const fn const_push_front_or_panic(mut self, value: u8) -> Self { + + /// Prepends an element to the front of the builder, panicking if there is no room. + pub const fn const_push_front_or_panic(mut self, value: T) -> Self { if self.start == 0 { panic!("AsciiTrieBuilder buffer too small"); } @@ -157,8 +177,9 @@ impl ConstArrayBuilder { self.full_array[self.start] = value; self } - // Can't be generic because T has a destructor - pub const fn const_extend_front_or_panic(mut self, other: ConstSlice) -> Self { + + /// Prepends multiple elements to the front of the builder, panicking if there is no room. + pub const fn const_extend_front_or_panic(mut self, other: ConstSlice) -> Self { if self.start < other.len() { panic!("AsciiTrieBuilder buffer too small"); } @@ -172,15 +193,16 @@ impl ConstArrayBuilder { } } -impl ConstArrayBuilder { - pub const fn push_front_or_panic(mut self, value: T) -> Self { - if self.start == 0 { - panic!("AsciiTrieBuilder buffer too small"); - } - self.start -= 1; - self.full_array[self.start] = value; +impl ConstArrayBuilder { + /// Specialized function that performs `self[index] |= other` + pub const fn const_bitor_assign(mut self, index: usize, other: u8) -> Self { + self.full_array[self.start + index] |= other; self } +} + +impl ConstArrayBuilder { + /// Swaps the elements at positions `i` and `j`. #[cfg(feature = "alloc")] pub fn swap_or_panic(mut self, i: usize, j: usize) -> Self { self.full_array.swap(self.start + i, self.start + j); @@ -188,6 +210,11 @@ impl ConstArrayBuilder { } } +/// Evaluates a block over each element of a const slice. Takes three arguments: +/// +/// 1. Expression that resolves to the [`ConstSlice`]. +/// 2. Token that will be assigned the value of the element. +/// 3. Block to evaluate for each element. macro_rules! const_for_each { ($safe_const_slice:expr, $item:tt, $inner:expr) => {{ let mut i = 0; @@ -201,6 +228,7 @@ macro_rules! const_for_each { pub(crate) use const_for_each; +/// A data structure that holds up to N [`BranchMeta`] items. pub(crate) struct ConstLengthsStack { data: [Option; N], idx: usize, @@ -213,6 +241,7 @@ impl core::fmt::Debug for ConstLengthsStack { } impl ConstLengthsStack { + /// Creates a new empty [`ConstLengthsStack`]. 
pub const fn new() -> Self { Self { data: [None; N], @@ -220,10 +249,12 @@ impl ConstLengthsStack { } } + /// Returns whether the stack is empty. pub const fn is_empty(&self) -> bool { self.idx == 0 } + /// Adds a [`BranchMeta`] to the stack, panicking if there is no room. #[must_use] pub const fn push_or_panic(mut self, meta: BranchMeta) -> Self { if self.idx >= N { @@ -238,6 +269,8 @@ impl ConstLengthsStack { self } + /// Returns a copy of the [`BranchMeta`] on the top of the stack, panicking if + /// the stack is empty. pub const fn peek_or_panic(&self) -> BranchMeta { if self.idx == 0 { panic!("AsciiTrie Builder: Attempted to peek from an empty stack"); @@ -245,6 +278,7 @@ impl ConstLengthsStack { self.get_or_panic(0) } + /// Returns a copy of the [`BranchMeta`] at the specified index. const fn get_or_panic(&self, index: usize) -> BranchMeta { if self.idx <= index { panic!("AsciiTrie Builder: Attempted to get too deep in a stack"); @@ -255,6 +289,7 @@ impl ConstLengthsStack { } } + /// Removes many [`BranchMeta`]s from the stack, returning them in a [`ConstArrayBuilder`]. pub const fn pop_many_or_panic( mut self, len: usize, @@ -267,7 +302,7 @@ impl ConstLengthsStack { break; } let i = self.idx - ix - 1; - result = result.push_front_or_panic(match self.data[i] { + result = result.const_push_front_or_panic(match self.data[i] { Some(x) => x, None => panic!("Not enough items in the ConstLengthsStack"), }); @@ -277,12 +312,14 @@ impl ConstLengthsStack { (self, result) } + /// Non-const function that returns the initialized elements as a slice. fn as_slice(&self) -> &[Option] { &self.data[0..self.idx] } } impl ConstArrayBuilder { + /// Converts this builder-array of [`BranchMeta`] to one of the `ascii` fields. pub const fn map_to_ascii_bytes(&self) -> ConstArrayBuilder { let mut result = ConstArrayBuilder::new_empty([0; N], N); let self_as_slice = self.as_const_slice(); diff --git a/experimental/zerotrie/src/builder/nonconst/store.rs b/experimental/zerotrie/src/builder/nonconst/store.rs index cb6e8fd482b..3a1acb9c086 100644 --- a/experimental/zerotrie/src/builder/nonconst/store.rs +++ b/experimental/zerotrie/src/builder/nonconst/store.rs @@ -9,16 +9,33 @@ use super::super::konst::ConstArrayBuilder; use alloc::collections::VecDeque; use alloc::vec::Vec; +/// A trait applied to a data structure for building a ZeroTrie. pub trait TrieBuilderStore { + /// Create a new empty store. fn atbs_new_empty() -> Self; + + /// Return the length in bytes of the store. fn atbs_len(&self) -> usize; + + /// Push a byte to the front of the store. fn atbs_push_front(&mut self, byte: u8); + + /// Push multiple bytes to the front of the store. fn atbs_extend_front(&mut self, other: &[u8]); + + /// Read the store into a `Vec`. fn atbs_to_bytes(&self) -> Vec; + + /// Perform the operation `self[index] |= other` fn atbs_bitor_assign(&mut self, index: usize, other: u8); + + /// Swap the adjacent ranges `self[start..mid]` and `self[mid..limit]`. fn atbs_swap_ranges(&mut self, start: usize, mid: usize, limit: usize); + + /// Remove and return the first element in the store, or None if empty. fn atbs_pop_front(&mut self) -> Option; + /// Prepend `n` zeros to the front of the store. fn atbs_prepend_n_zeros(&mut self, n: usize) { let mut i = 0; while i < n { @@ -65,6 +82,8 @@ impl TrieBuilderStore for VecDeque { self.len() ); } + // The following algorithm is an in-place swap of two adjacent ranges of potentially + // different lengths. Would make a good coding interview question. 
loop { if start == mid || mid == limit { return; @@ -92,6 +111,7 @@ impl TrieBuilderStore for VecDeque { } } +/// A data structure that holds any number of [`BranchMeta`] items. pub(crate) struct NonConstLengthsStack { data: Vec, } @@ -103,22 +123,28 @@ impl core::fmt::Debug for NonConstLengthsStack { } impl NonConstLengthsStack { + /// Creates a new empty [`ConstLengthsStack`]. pub const fn new() -> Self { Self { data: Vec::new() } } + /// Returns whether the stack is empty. pub fn is_empty(&self) -> bool { self.data.is_empty() } + /// Adds a [`BranchMeta`] to the stack. pub fn push(&mut self, meta: BranchMeta) { self.data.push(meta); } + /// Returns a copy of the [`BranchMeta`] on the top of the stack, panicking if + /// the stack is empty. pub fn peek_or_panic(&self) -> BranchMeta { *self.data.last().unwrap() } + /// Removes many [`BranchMeta`]s from the stack, returning them in a [`ConstArrayBuilder`]. pub fn pop_many_or_panic(&mut self, len: usize) -> ConstArrayBuilder<256, BranchMeta> { debug_assert!(len <= 256); let mut result = ConstArrayBuilder::new_empty([BranchMeta::const_default(); 256], 256); @@ -129,7 +155,7 @@ impl NonConstLengthsStack { } let i = self.data.len() - ix - 1; // Won't panic because len <= 256 - result = result.push_front_or_panic(match self.data.get(i) { + result = result.const_push_front_or_panic(match self.data.get(i) { Some(x) => *x, None => panic!("Not enough items in the ConstLengthsStack"), }); @@ -139,6 +165,7 @@ impl NonConstLengthsStack { result } + /// Non-const function that returns the initialized elements as a slice. fn as_slice(&self) -> &[BranchMeta] { &self.data } From 7847652fdc8e88db1a99c05f4efa1ef14c96c0b9 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 20:18:42 +0200 Subject: [PATCH 26/31] Some more builder docs --- .../zerotrie/src/builder/konst/builder.rs | 22 ++++++++++++++++++- .../zerotrie/src/builder/nonconst/builder.rs | 12 ++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/experimental/zerotrie/src/builder/konst/builder.rs b/experimental/zerotrie/src/builder/konst/builder.rs index 0cbd5c1ac5f..e0810c9817b 100644 --- a/experimental/zerotrie/src/builder/konst/builder.rs +++ b/experimental/zerotrie/src/builder/konst/builder.rs @@ -17,21 +17,26 @@ pub(crate) struct ZeroTrieBuilderConst { } impl ZeroTrieBuilderConst { + /// Non-const function that returns the current trie data as a slice. #[cfg(feature = "litemap")] pub fn as_bytes(&self) -> &[u8] { self.data.as_const_slice().as_slice() } + /// Returns the trie data, panicking if the buffer is the wrong size. pub const fn take_or_panic(self) -> [u8; N] { self.data.const_take_or_panic() } + /// Creates a new empty builder. pub const fn new() -> Self { Self { data: ConstArrayBuilder::new_empty([0; N], N), } } + /// Prepends an ASCII node to the front of the builder. Returns the new builder + /// and the delta in length, which is always 1. #[must_use] const fn prepend_ascii(self, ascii: u8) -> (Self, usize) { if ascii >= 128 { @@ -41,6 +46,8 @@ impl ZeroTrieBuilderConst { (Self { data }, 1) } + /// Prepends a value node to the front of the builder. Returns the new builder + /// and the delta in length, which depends on the size of the varint. #[must_use] const fn prepend_value(self, value: usize) -> (Self, usize) { let mut data = self.data; @@ -50,6 +57,8 @@ impl ZeroTrieBuilderConst { (Self { data }, varint_array.len()) } + /// Prepends a branch node to the front of the builder. 
Returns the new builder + /// and the delta in length, which depends on the size of the varint. #[must_use] const fn prepend_branch(self, value: usize) -> (Self, usize) { let mut data = self.data; @@ -59,6 +68,8 @@ impl ZeroTrieBuilderConst { (Self { data }, varint_array.len()) } + /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder + /// and the delta in length, which is the length of the slice. #[must_use] const fn prepend_slice(self, s: ConstSlice) -> (Self, usize) { let mut data = self.data; @@ -70,6 +81,7 @@ impl ZeroTrieBuilderConst { (Self { data }, s.len()) } + /// Prepends multiple zeros to the front of the builder. Returns the new builder. #[must_use] const fn prepend_n_zeros(self, n: usize) -> Self { let mut data = self.data; @@ -81,12 +93,17 @@ impl ZeroTrieBuilderConst { Self { data } } + /// Performs the operation `self[index] |= byte` const fn bitor_assign_at(self, index: usize, byte: u8) -> Self { let mut data = self.data; data = data.const_bitor_assign(index, byte); Self { data } } + /// Creates a new builder containing the elements in the given slice of key/value pairs. + /// + /// # Panics + /// /// Panics if the items are not sorted pub const fn from_tuple_slice<'a, const K: usize>( items: &[(&'a ByteStr, usize)], @@ -107,7 +124,9 @@ impl ZeroTrieBuilderConst { Self::from_sorted_const_tuple_slice::(items) } - /// Assumes that the items are sorted + /// Creates a new builder containing the elements in the given slice of key/value pairs. + /// + /// Assumes that the items are sorted. If they are not, unexpected behavior may occur. pub const fn from_sorted_const_tuple_slice( items: ConstSlice<(&ByteStr, usize)>, ) -> Result { @@ -118,6 +137,7 @@ impl ZeroTrieBuilderConst { Ok(result) } + /// The actual builder algorithm. #[must_use] const fn create_or_panic( mut self, diff --git a/experimental/zerotrie/src/builder/nonconst/builder.rs b/experimental/zerotrie/src/builder/nonconst/builder.rs index c63020cc1a4..78945c01f57 100644 --- a/experimental/zerotrie/src/builder/nonconst/builder.rs +++ b/experimental/zerotrie/src/builder/nonconst/builder.rs @@ -49,10 +49,15 @@ pub(crate) struct ZeroTrieBuilder { } impl ZeroTrieBuilder { + /// Returns the trie data as a `Vec`. pub fn to_bytes(&self) -> Vec { self.data.atbs_to_bytes() } + /// Prepends a byte value to the front of the builder. If it is ASCII, an ASCII + /// node is prepended. If it is non-ASCII, if there is already a span node at + /// the front, we modify the span node to add the new byte; otherwise, we create + /// a new span node. Returns the delta in length, which is either 1 or 2. fn prepend_ascii(&mut self, ascii: u8) -> Result { if ascii <= 127 { self.data.atbs_push_front(ascii); @@ -86,6 +91,8 @@ impl ZeroTrieBuilder { } } + /// Prepends a value node to the front of the builder. Returns the + /// delta in length, which depends on the size of the varint. #[must_use] fn prepend_value(&mut self, value: usize) -> usize { let varint_array = varint::write_varint_meta3(value); @@ -94,6 +101,8 @@ impl ZeroTrieBuilder { varint_array.len() } + /// Prepends a branch node to the front of the builder. Returns the + /// delta in length, which depends on the size of the varint. #[must_use] fn prepend_branch(&mut self, value: usize) -> usize { let varint_array = varint::write_varint_meta2(value); @@ -102,6 +111,8 @@ impl ZeroTrieBuilder { varint_array.len() } + /// Prepends multiple arbitrary bytes to the front of the builder. 
Returns the + /// delta in length, which is the length of the slice. #[must_use] fn prepend_slice(&mut self, s: &[u8]) -> usize { self.data.atbs_extend_front(s); @@ -143,6 +154,7 @@ impl ZeroTrieBuilder { Ok(result) } + /// The actual builder algorithm. #[allow(clippy::unwrap_used)] // lots of indexing, but all indexes should be in range fn create(&mut self, all_items: &[(&ByteStr, usize)]) -> Result { let mut prefix_len = match all_items.last() { From 1a7ac1a7d48b3bf0afd860c959ae2c541b095b38 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 20:30:58 +0200 Subject: [PATCH 27/31] More code review comments --- experimental/zerotrie/src/builder/konst/builder.rs | 12 +++++++++--- experimental/zerotrie/src/builder/konst/store.rs | 6 +++--- experimental/zerotrie/src/builder/mod.rs | 2 ++ experimental/zerotrie/src/builder/nonconst/store.rs | 8 ++++---- experimental/zerotrie/src/byte_phf/mod.rs | 2 ++ 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/experimental/zerotrie/src/builder/konst/builder.rs b/experimental/zerotrie/src/builder/konst/builder.rs index e0810c9817b..474d726900b 100644 --- a/experimental/zerotrie/src/builder/konst/builder.rs +++ b/experimental/zerotrie/src/builder/konst/builder.rs @@ -93,15 +93,18 @@ impl ZeroTrieBuilderConst { Self { data } } - /// Performs the operation `self[index] |= byte` - const fn bitor_assign_at(self, index: usize, byte: u8) -> Self { + /// Performs the operation `self[index] |= bits` + const fn bitor_assign_at(self, index: usize, bits: u8) -> Self { let mut data = self.data; - data = data.const_bitor_assign(index, byte); + data = data.const_bitor_assign(index, bits); Self { data } } /// Creates a new builder containing the elements in the given slice of key/value pairs. /// + /// `K` is the stack size of the lengths stack. If you get an error such as + /// "AsciiTrie Builder: Need more stack", try increasing `K`. + /// /// # Panics /// /// Panics if the items are not sorted @@ -127,6 +130,9 @@ impl ZeroTrieBuilderConst { /// Creates a new builder containing the elements in the given slice of key/value pairs. /// /// Assumes that the items are sorted. If they are not, unexpected behavior may occur. + /// + /// `K` is the stack size of the lengths stack. If you get an error such as + /// "AsciiTrie Builder: Need more stack", try increasing `K`. 
pub const fn from_sorted_const_tuple_slice( items: ConstSlice<(&ByteStr, usize)>, ) -> Result { diff --git a/experimental/zerotrie/src/builder/konst/store.rs b/experimental/zerotrie/src/builder/konst/store.rs index d2e6ad43be4..960411dc3da 100644 --- a/experimental/zerotrie/src/builder/konst/store.rs +++ b/experimental/zerotrie/src/builder/konst/store.rs @@ -194,9 +194,9 @@ impl ConstArrayBuilder { } impl ConstArrayBuilder { - /// Specialized function that performs `self[index] |= other` - pub const fn const_bitor_assign(mut self, index: usize, other: u8) -> Self { - self.full_array[self.start + index] |= other; + /// Specialized function that performs `self[index] |= bits` + pub const fn const_bitor_assign(mut self, index: usize, bits: u8) -> Self { + self.full_array[self.start + index] |= bits; self } } diff --git a/experimental/zerotrie/src/builder/mod.rs b/experimental/zerotrie/src/builder/mod.rs index 867fe4b5e2f..e63d2fb86e6 100644 --- a/experimental/zerotrie/src/builder/mod.rs +++ b/experimental/zerotrie/src/builder/mod.rs @@ -131,6 +131,8 @@ impl ZeroTrieSimpleAscii<[u8; N]> { pub const fn from_sorted_str_tuples(tuples: &[(&str, usize)]) -> Self { use konst::*; let byte_str_slice = ByteStr::from_str_slice_with_value(tuples); + // 100 is the value of `K`, the size of the lengths stack. If compile errors are + // encountered, this number may need to be increased. let result = ZeroTrieBuilderConst::::from_tuple_slice::<100>(byte_str_slice); match result { Ok(s) => Self::from_store(s.take_or_panic()), diff --git a/experimental/zerotrie/src/builder/nonconst/store.rs b/experimental/zerotrie/src/builder/nonconst/store.rs index 3a1acb9c086..e1f76b839d8 100644 --- a/experimental/zerotrie/src/builder/nonconst/store.rs +++ b/experimental/zerotrie/src/builder/nonconst/store.rs @@ -26,8 +26,8 @@ pub trait TrieBuilderStore { /// Read the store into a `Vec`. fn atbs_to_bytes(&self) -> Vec; - /// Perform the operation `self[index] |= other` - fn atbs_bitor_assign(&mut self, index: usize, other: u8); + /// Perform the operation `self[index] |= bits` + fn atbs_bitor_assign(&mut self, index: usize, bits: u8); /// Swap the adjacent ranges `self[start..mid]` and `self[mid..limit]`. fn atbs_swap_ranges(&mut self, start: usize, mid: usize, limit: usize); @@ -69,8 +69,8 @@ impl TrieBuilderStore for VecDeque { v.extend(b); v } - fn atbs_bitor_assign(&mut self, index: usize, other: u8) { - self[index] |= other; + fn atbs_bitor_assign(&mut self, index: usize, bits: u8) { + self[index] |= bits; } fn atbs_swap_ranges(&mut self, mut start: usize, mut mid: usize, mut limit: usize) { if start > mid || mid > limit { diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index 4784cb7b0eb..135df77eb04 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -210,6 +210,7 @@ where /// Gets the usize for the given byte, or `None` if it is not in the map. pub fn get(&self, key: u8) -> Option { let (p, buffer) = self.0.as_ref().split_first()?; + // Note: there are N buckets followed by N keys let n = buffer.len() / 2; if n == 0 { return None; @@ -237,6 +238,7 @@ where .map(|s| s.1) .unwrap_or(&[]) } + /// Diagnostic function that returns `p` and the maximum value of `q` #[cfg(test)] pub fn p_qmax(&self) -> Option<(u8, u8)> { let (p, buffer) = self.0.as_ref().split_first()?; From 4973911068ca377fb669a71dc550a3a71d7e82d0 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Sun, 16 Jul 2023 14:20:55 -0500 Subject: [PATCH 28/31] Rob feedback --- experimental/zerotrie/Cargo.toml | 2 +- experimental/zerotrie/benches/overview.rs | 10 ++++++++++ experimental/zerotrie/src/zerotrie.rs | 17 ++++++++--------- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/experimental/zerotrie/Cargo.toml b/experimental/zerotrie/Cargo.toml index 82cf01b5896..ef0f45fe98b 100644 --- a/experimental/zerotrie/Cargo.toml +++ b/experimental/zerotrie/Cargo.toml @@ -56,7 +56,7 @@ default = [] bench = [] alloc = [] litemap = ["dep:litemap", "alloc"] -serde = ["dep:serde", "alloc", "litemap/serde", "zerovec?/serde"] +serde = ["dep:serde", "dep:litemap", "alloc", "litemap/serde", "zerovec?/serde"] [[bench]] name = "overview" diff --git a/experimental/zerotrie/benches/overview.rs b/experimental/zerotrie/benches/overview.rs index 3b66bc73ccd..2c6e37f5463 100644 --- a/experimental/zerotrie/benches/overview.rs +++ b/experimental/zerotrie/benches/overview.rs @@ -52,6 +52,7 @@ fn get_basic_bench(c: &mut Criterion) { }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroMap/usize", |b| { let zm: ZeroMap<[u8], usize> = data.iter().copied().collect(); b.iter(|| { @@ -62,6 +63,7 @@ fn get_basic_bench(c: &mut Criterion) { }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroMap/u8", |b| { let zm: ZeroMap<[u8], u8> = data.iter().map(|(k, v)| (*k, *v as u8)).collect(); b.iter(|| { @@ -72,6 +74,7 @@ fn get_basic_bench(c: &mut Criterion) { }); }); + #[cfg(feature = "bench")] g.bench_function("HashMap", |b| { let hm: HashMap<&[u8], usize> = data.iter().copied().map(|(a, b)| (a, b)).collect(); b.iter(|| { @@ -82,6 +85,7 @@ fn get_basic_bench(c: &mut Criterion) { }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroHashMap/usize", |b| { let zhm: ZeroHashMap<[u8], usize> = data .iter() @@ -97,6 +101,7 @@ fn get_basic_bench(c: &mut Criterion) { }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroHashMap/u8", |b| { let zhm: ZeroHashMap<[u8], u8> = data.iter().map(|(k, v)| (*k, *v as u8)).collect(); b.iter(|| { @@ -161,6 +166,7 @@ fn get_subtags_bench_helper( }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroMap/usize", |b| { let zm: ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect(); b.iter(|| { @@ -171,6 +177,7 @@ fn get_subtags_bench_helper( }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroMap/u8", |b| { let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect(); b.iter(|| { @@ -181,6 +188,7 @@ fn get_subtags_bench_helper( }); }); + #[cfg(feature = "bench")] g.bench_function("HashMap", |b| { let hm: HashMap<&[u8], usize> = litemap.iter().map(|(a, b)| (*a, *b)).collect(); b.iter(|| { @@ -191,6 +199,7 @@ fn get_subtags_bench_helper( }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroHashMap/usize", |b| { let zhm: ZeroHashMap<[u8], usize> = litemap .iter() @@ -206,6 +215,7 @@ fn get_subtags_bench_helper( }); }); + #[cfg(feature = "bench")] g.bench_function("ZeroHashMap/u8", |b| { let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect(); b.iter(|| { diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index acbebe47dc4..8f41481ccd6 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -161,10 +161,12 @@ macro_rules! impl_zerotrie_subtype { } /// Maps the store into another type. 
#[inline] - pub fn map_store(self, f: impl FnOnce(Store) -> X) -> $name { + #[cfg(feature = "serde")] + pub(crate) fn map_store(self, f: impl FnOnce(Store) -> X) -> $name { $name::::from_store(f(self.store)) } #[inline] + #[cfg(feature = "serde")] pub(crate) fn map_store_into_zerotrie(self, f: impl FnOnce(Store) -> X) -> ZeroTrie { $name::::from_store(f(self.store)).into_zerotrie() } @@ -475,11 +477,6 @@ macro_rules! impl_zerotrie_subtype { }; } -#[cfg(feature = "alloc")] -fn vec_u8_to_box_u8(input: Vec) -> Box<[u8]> { - input.into_boxed_slice() -} - #[cfg(feature = "alloc")] fn string_to_box_u8(input: String) -> Box<[u8]> { input.into_boxed_str().into_boxed_bytes() @@ -499,7 +496,7 @@ impl_zerotrie_subtype!( get_phf_limited, Vec, get_iter_phf, - vec_u8_to_box_u8 + Vec::into_boxed_slice ); impl_zerotrie_subtype!( ZeroTrieExtendedCapacity, @@ -507,7 +504,7 @@ impl_zerotrie_subtype!( get_phf_extended, Vec, get_iter_phf, - vec_u8_to_box_u8 + Vec::into_boxed_slice ); macro_rules! impl_dispatch { @@ -547,7 +544,8 @@ impl ZeroTrie { impl_dispatch!(self, take_store()) } /// Maps the store into another type. - pub fn map_store(self, f: impl FnOnce(Store) -> NewStore) -> ZeroTrie { + #[cfg(feature = "serde")] + pub(crate) fn map_store(self, f: impl FnOnce(Store) -> NewStore) -> ZeroTrie { impl_dispatch!(self, map_store_into_zerotrie(f)) } } @@ -615,6 +613,7 @@ where K: AsRef<[u8]>, { fn from_iter>(iter: T) -> Self { + // We need two Vecs because the first one anchors the `K`s that the second one borrows. let items = Vec::from_iter(iter); let mut items: Vec<(&[u8], usize)> = items.iter().map(|(k, v)| (k.as_ref(), *v)).collect(); items.sort(); From 1ccb26ab0c77e77230f36f14cf932c1a31f2e78d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Sun, 16 Jul 2023 17:12:50 -0500 Subject: [PATCH 29/31] Fix criterion dependency --- Cargo.lock | 17 +++++++++++++++++ experimental/zerotrie/Cargo.toml | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f75787ce723..9c57ba36ba5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4197,6 +4197,23 @@ dependencies = [ "zerovec", ] +[[package]] +name = "zerotrie" +version = "0.1.0" +dependencies = [ + "bincode", + "criterion", + "displaydoc", + "icu_benchmark_macros", + "litemap", + "postcard", + "rand", + "rand_pcg", + "serde", + "serde_json", + "zerovec", +] + [[package]] name = "zerovec" version = "0.9.4" diff --git a/experimental/zerotrie/Cargo.toml b/experimental/zerotrie/Cargo.toml index ef0f45fe98b..ae22f1706d5 100644 --- a/experimental/zerotrie/Cargo.toml +++ b/experimental/zerotrie/Cargo.toml @@ -40,7 +40,7 @@ postcard = { version = "1.0", default-features = false, features = ["alloc"] } serde = { version = "1.0", default-features = false } zerovec = { path = "../../utils/zerovec", features = ["serde", "hashmap"] } litemap = { path = "../../utils/litemap" } -criterion = "0.3" +criterion = "0.4" icu_benchmark_macros = { path = "../../tools/benchmark/macros" } serde_json = "1.0" bincode = "1.0" From c687cd8440dad83f271d6a577195d3ff6509790d Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Sun, 16 Jul 2023 17:16:42 -0500 Subject: [PATCH 30/31] fmt & tidy --- experimental/zerotrie/README.md | 8 ++++++++ experimental/zerotrie/src/builder/nonconst/store.rs | 2 +- experimental/zerotrie/src/reader.rs | 2 +- experimental/zerotrie/src/zerotrie.rs | 5 ++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/experimental/zerotrie/README.md b/experimental/zerotrie/README.md index 61794c5672a..e76c7170051 100644 --- a/experimental/zerotrie/README.md +++ b/experimental/zerotrie/README.md @@ -25,6 +25,14 @@ assert_eq!(trie.get("axyb"), Some(33)); assert_eq!(trie.byte_len(), 18); ``` +## Internal Structure + +To read about the internal structure of [`ZeroTrie`], build the docs with private modules: + +```bash +cargo doc --document-private-items --all-features --no-deps --open +``` + [`LiteMap`]: litemap::LiteMap [`BTreeMap`]: alloc::collections::BTreeMap diff --git a/experimental/zerotrie/src/builder/nonconst/store.rs b/experimental/zerotrie/src/builder/nonconst/store.rs index e1f76b839d8..67b77afd8ee 100644 --- a/experimental/zerotrie/src/builder/nonconst/store.rs +++ b/experimental/zerotrie/src/builder/nonconst/store.rs @@ -16,7 +16,7 @@ pub trait TrieBuilderStore { /// Return the length in bytes of the store. fn atbs_len(&self) -> usize; - + /// Push a byte to the front of the store. fn atbs_push_front(&mut self, byte: u8); diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 21445246c7e..6e7d2351e78 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -204,9 +204,9 @@ //! ``` use crate::byte_phf::PerfectByteHashMap; +use crate::helpers::*; use crate::varint::read_varint_meta2; use crate::varint::read_varint_meta3; -use crate::helpers::*; #[cfg(feature = "alloc")] use alloc::string::String; diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 8f41481ccd6..3646ec98a7e 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -545,7 +545,10 @@ impl ZeroTrie { } /// Maps the store into another type. #[cfg(feature = "serde")] - pub(crate) fn map_store(self, f: impl FnOnce(Store) -> NewStore) -> ZeroTrie { + pub(crate) fn map_store( + self, + f: impl FnOnce(Store) -> NewStore, + ) -> ZeroTrie { impl_dispatch!(self, map_store_into_zerotrie(f)) } } From 3b415bd44a444ac530f7588af679a1ec8989dd3d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 17 Jul 2023 08:56:05 -0500 Subject: [PATCH 31/31] Write docs for the builder and simplify it slightly. --- .../zerotrie/src/builder/konst/builder.rs | 36 +++-- .../zerotrie/src/builder/konst/store.rs | 25 +-- experimental/zerotrie/src/builder/litemap.rs | 2 + experimental/zerotrie/src/builder/mod.rs | 142 ++++++++++++++++++ .../zerotrie/src/builder/nonconst/builder.rs | 37 +++-- .../zerotrie/src/builder/nonconst/store.rs | 2 +- experimental/zerotrie/src/byte_phf/builder.rs | 6 +- 7 files changed, 206 insertions(+), 44 deletions(-) diff --git a/experimental/zerotrie/src/builder/konst/builder.rs b/experimental/zerotrie/src/builder/konst/builder.rs index 474d726900b..f291a859003 100644 --- a/experimental/zerotrie/src/builder/konst/builder.rs +++ b/experimental/zerotrie/src/builder/konst/builder.rs @@ -143,7 +143,7 @@ impl ZeroTrieBuilderConst { Ok(result) } - /// The actual builder algorithm. + /// The actual builder algorithm. For an explanation, see [`crate::builder`]. 
#[must_use] const fn create_or_panic( mut self, @@ -154,30 +154,33 @@ impl ZeroTrieBuilderConst { // Empty slice: None => return (Self::new(), 0), }; + // Initialize the main loop to point at the last string. let mut lengths_stack = ConstLengthsStack::::new(); let mut i = all_items.len() - 1; let mut j = all_items.len(); let mut current_len = 0; + // Start the main loop. loop { let item_i = all_items.get_or_panic(i); let item_j = all_items.get_or_panic(j - 1); - assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); + debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); + // Check if we need to add a value node here. if item_i.0.len() == prefix_len { let len; (self, len) = self.prepend_value(item_i.1); current_len += len; } if prefix_len == 0 { + // All done! Leave the main loop. break; } + // Reduce the prefix length by 1 and recalculate i and j. prefix_len -= 1; let mut new_i = i; let mut new_j = j; - let mut diff_i = 0; - let mut diff_j = 0; let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len); let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len); - assert!(ascii_i == ascii_j); + debug_assert!(ascii_i == ascii_j); let key_ascii = ascii_i; loop { if new_i == 0 { @@ -194,12 +197,11 @@ impl ZeroTrieBuilderConst { break; } if candidate.len() == prefix_len { - // A string of length prefix_len can't be preceded by another with that prefix + // A string that equals the prefix does not take part in the branch node. break; } let candidate = candidate.byte_at_or_panic(prefix_len); if candidate != ascii_i { - diff_i += 1; ascii_i = candidate; } } @@ -222,21 +224,23 @@ impl ZeroTrieBuilderConst { } let candidate = candidate.byte_at_or_panic(prefix_len); if candidate != ascii_j { - diff_j += 1; ascii_j = candidate; } } - if diff_i == 0 && diff_j == 0 { + // If there are no different bytes at this prefix level, we can add an ASCII or Span + // node and then continue to the next iteration of the main loop. + if ascii_i == key_ascii && ascii_j == key_ascii { let len; (self, len) = self.prepend_ascii(ascii_i); current_len += len; - assert!(i == new_i || i == new_i + 1); + debug_assert!(i == new_i || i == new_i + 1); i = new_i; - assert!(j == new_j); + debug_assert!(j == new_j); continue; } - // Branch - if diff_j == 0 { + // If i and j changed, we are a target of a branch node. + if ascii_j == key_ascii { + // We are the _last_ target of a branch node. lengths_stack = lengths_stack.push_or_panic(BranchMeta { ascii: key_ascii, cumulative_length: current_len, @@ -244,6 +248,7 @@ impl ZeroTrieBuilderConst { count: 1, }); } else { + // We are the _not the last_ target of a branch node. let BranchMeta { cumulative_length, count, @@ -256,7 +261,9 @@ impl ZeroTrieBuilderConst { count: count + 1, }); } - if diff_i != 0 { + if ascii_i != key_ascii { + // We are _not the first_ target of a branch node. + // Set the cursor to the previous string and continue the loop. j = i; i -= 1; prefix_len = all_items.get_or_panic(i).0.len(); @@ -306,6 +313,7 @@ impl ZeroTrieBuilderConst { } k += 1; } + // Write out the lookup table assert!(0 < total_count && total_count <= 256); let branch_value = (w << 8) + (total_count & 0xff); let slice_len; diff --git a/experimental/zerotrie/src/builder/konst/store.rs b/experimental/zerotrie/src/builder/konst/store.rs index 960411dc3da..252875e00bf 100644 --- a/experimental/zerotrie/src/builder/konst/store.rs +++ b/experimental/zerotrie/src/builder/konst/store.rs @@ -228,23 +228,26 @@ macro_rules! 
const_for_each { pub(crate) use const_for_each; -/// A data structure that holds up to N [`BranchMeta`] items. -pub(crate) struct ConstLengthsStack { - data: [Option; N], +/// A data structure that holds up to K [`BranchMeta`] items. +/// +/// Note: It should be possible to store the required data in the builder buffer itself, +/// which would eliminate the need for this helper struct and the limit it imposes. +pub(crate) struct ConstLengthsStack { + data: [Option; K], idx: usize, } -impl core::fmt::Debug for ConstLengthsStack { +impl core::fmt::Debug for ConstLengthsStack { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { self.as_slice().fmt(f) } } -impl ConstLengthsStack { +impl ConstLengthsStack { /// Creates a new empty [`ConstLengthsStack`]. pub const fn new() -> Self { Self { - data: [None; N], + data: [None; K], idx: 0, } } @@ -257,10 +260,10 @@ impl ConstLengthsStack { /// Adds a [`BranchMeta`] to the stack, panicking if there is no room. #[must_use] pub const fn push_or_panic(mut self, meta: BranchMeta) -> Self { - if self.idx >= N { + if self.idx >= K { panic!(concat!( "AsciiTrie Builder: Need more stack (max ", - stringify!(N), + stringify!(K), ")" )); } @@ -318,10 +321,10 @@ impl ConstLengthsStack { } } -impl ConstArrayBuilder { +impl ConstArrayBuilder { /// Converts this builder-array of [`BranchMeta`] to one of the `ascii` fields. - pub const fn map_to_ascii_bytes(&self) -> ConstArrayBuilder { - let mut result = ConstArrayBuilder::new_empty([0; N], N); + pub const fn map_to_ascii_bytes(&self) -> ConstArrayBuilder { + let mut result = ConstArrayBuilder::new_empty([0; K], K); let self_as_slice = self.as_const_slice(); const_for_each!(self_as_slice, value, { result = result.const_push_front_or_panic(value.ascii);
diff --git a/experimental/zerotrie/src/builder/litemap.rs b/experimental/zerotrie/src/builder/litemap.rs index 64a5915303e..9253dd4c279 100644 --- a/experimental/zerotrie/src/builder/litemap.rs +++ b/experimental/zerotrie/src/builder/litemap.rs @@ -2,6 +2,8 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +//! Impls for functions gated on the "litemap" feature. + use super::konst::*; use crate::builder::bytestr::ByteStr; use crate::error::Error;
diff --git a/experimental/zerotrie/src/builder/mod.rs b/experimental/zerotrie/src/builder/mod.rs index e63d2fb86e6..8278325a9a8 100644 --- a/experimental/zerotrie/src/builder/mod.rs +++ b/experimental/zerotrie/src/builder/mod.rs @@ -2,6 +2,148 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +//! # ZeroTrie Builder +//! +//! There are two implementations of the ZeroTrie Builder: +//! +//! - [konst::ZeroTrieBuilderConst] allows for human-readable const construction +//! - [nonconst::ZeroTrieBuilder] has the full feature set but requires `alloc` +//! +//! The two builders follow the same algorithm but have different capabilities. +//! +//! ## Builder Algorithm Overview +//! +//! The tries are built backwards, from the last node to the first node. The key step of the +//! algorithm is **determining the next node to prepend.** +//! +//! In the simple case of [`ZeroTrieSimpleAscii`], all branch nodes use binary search, so if the input +//! strings are provided in lexicographic order, there is a simple, deterministic method for +//! identifying the next node. This insight is what enables us to make the const builder. +//! +//!
The builder works with the following intermediate state variables: +//! +//! - `prefix_len` indicates the byte index we are currently processing. +//! - `i` and `j` bracket a window of strings in the input that share the same prefix. +//! - `current_len` is the length in bytes of the current self-contained trie. +//! - `lengths_stack` contains metadata for branch nodes. +//! +//! What follows is a verbal explanation of the build steps for a trie containing: +//! +//! - "" → 11 +//! - "ad" → 22 +//! - "adef" → 33 +//! - "adghk" → 44 +//! +//! When a node is prepended, it is shown in **boldface**. +//! +//! 1. Initialize the builder by setting `i=3`, `j=4`, `prefix_len=5` (the last string), +//! `current_len=0`, and `lengths_stack` empty. Start the main loop. +//! 2. Top of loop. The string at `i` is equal in length to `prefix_len`, so we prepend +//! our first node: a **value node 44**, which requires a 2-byte varint. Increase +//! `current_len` to 2. +//! 3. Reduce `prefix_len` to 4, read our `key_ascii="k"`, and recalculate `i` and `j` +//! _(this calculation is a long chunk of code in the builder impls)_. Since there is no +//! other string with the prefix "adgh", `i` and `j` stay the same, so we prepend an +//! **ASCII node "k"**, increase `current_len` to 3, and continue the main loop. +//! 4. Top of loop. The string at `i` is of length 5, but `prefix_len` is 4, so there is +//! no value node to prepend. +//! 5. Reduce `prefix_len` to 3, read our `key_ascii="h"`, and recalculate `i` and `j`. +//! There are no other strings sharing the prefix "adg", so we prepend an +//! **ASCII node "h"**, increase `current_len` to 4, and continue the main loop. +//! 6. Top of loop. There is still no value node to prepend. +//! 7. Reduce `prefix_len` to 2, read our `key_ascii="g"`, and recalculate `i` and `j`. +//! We find that `i=1` and `j=4`, the range of strings sharing the prefix "ad". Since +//! `i` or `j` changed, proceed to evaluate the branch node. +//! 8. The last branch byte `ascii_j` for this prefix is "g", which is the same as `key_ascii`, +//! so we are the _last_ target of a branch node. Push an entry onto `lengths_stack`: +//! `BranchMeta { ascii: "g", cumulative_length: 4, local_length: 4, count: 1 }`. +//! 9. The first branch byte `ascii_i` for this prefix is "e", which is NOT equal to `key_ascii`, +//! so we are _not the first_ target of a branch node. We therefore start evaluating the +//! string preceding where we were at the top of the current loop. We set `i=2`, `j=3`, +//! `prefix_len=4` (length of the string at `i`), reset `current_len` to 0, and continue the main loop. +//! 10. Top of loop. Since the string at `i` is equal in length to `prefix_len`, we prepend a +//! **value node 33** (which requires a 2-byte varint) and increase `current_len` to 2. +//! 11. Reduce `prefix_len` to 3, read our `key_ascii="f"`, and recalculate `i` and `j`. +//! They stay the same, so we prepend an **ASCII node "f"**, increase `current_len` to 3, +//! and continue the main loop. +//! 12. Top of loop. No value node this time. +//! 13. Reduce `prefix_len` to 2, read our `key_ascii="e"`, and recalculate `i` and `j`. +//! They go back to `i=1` and `j=4`. +//! 14. The last branch byte `ascii_j` for this prefix is "g", which is NOT equal to `key_ascii`, +//! so we are _not the last_ target of a branch node. We peek at the entry at the front of +//! the lengths stack and use it to push another entry onto the stack: +//! `BranchMeta { ascii: "e", cumulative_length: 7, local_length: 3, count: 2 }`. +//! 15.
The first branch byte `ascii_i` for this prefix is "e", which is the same as `key_ascii`, +//! so we are the _first_ target of a branch node. We can therefore proceed to prepend the +//! metadata for the branch node. We peek at the top of the stack and find that there are 2 +//! tries reachable from this branch and they have a total byte length of 7. We then pull off +//! 2 entries from the stack into a local variable `branch_metas`. From here, we write out +//! the **offset table**, **lookup table**, and **branch head node**, which are determined +//! from the metadata entries. We set `current_len` to the length of the two tries plus the +//! metadata, which happens to be 11. Then we return to the top of the main loop. +//! 16. Top of loop. The string at `i` is length 2, which is the same as `prefix_len`, so we +//! prepend a **value node 22** (2-byte varint) and increase `current_len` to 13. +//! 17. Reduce `prefix_len` to 1, read our `key_ascii="d"`, and recalculate `i` and `j`. +//! They stay the same, so we prepend an **ASCII node "d"**, increase `current_len` to 14, +//! and continue the main loop. +//! 18. Top of loop. No value node this time. +//! 19. Reduce `prefix_len` to 0, read our `key_ascii="a"`, and recalculate `i` and `j`. +//! They change to `i=0` and `j=4`, since all strings have the empty string as a prefix. +//! However, `ascii_i` and `ascii_j` both equal `key_ascii`, so we prepend **ASCII node "a"**, +//! increase `current_len` to 15, and continue the main loop. +//! 20. Top of loop. The string at `i` is length 0, which is the same as `prefix_len`, so we +//! prepend a **value node 11** and increase `current_len` to 16. +//! 21. We can no longer reduce `prefix_len`, so our trie is complete. +//! +//! ## Perfect Hash Reordering +//! +//! When the PHF is added to the mix, the main change is that the strings are no longer in sorted +//! order when they are in the trie. To resolve this issue, when adding a branch node, the target +//! tries are rearranged in-place in the buffer to be in the correct order for the PHF. +//! +//! ## Example +//! +//! Here is the output of the trie described above. +//! +//! ``` +//! use zerotrie::ZeroTrieSimpleAscii; +//! +//! const DATA: [(&str, usize); 4] = [ +//! ("", 11), +//! ("ad", 22), +//! ("adef", 33), +//! ("adghk", 44), +//! ]; +//! +//! // As demonstrated above, the required capacity for this trie is 16 bytes +//! const TRIE: ZeroTrieSimpleAscii<[u8; 16]> = ZeroTrieSimpleAscii::from_sorted_str_tuples(&DATA); +//! +//! assert_eq!(TRIE.as_bytes(), &[ +//! 0x8B, // value node 11 +//! b'a', // ASCII node 'a' +//! b'd', // ASCII node 'd' +//! 0x90, // value node 22 lead byte +//! 0x06, // value node 22 trail byte +//! 0xC2, // branch node 2 +//! b'e', // first target of branch +//! b'g', // second target of branch +//! 3, // offset +//! b'f', // ASCII node 'f' +//! 0x90, // value node 33 lead byte +//! 0x11, // value node 33 trail byte +//! b'h', // ASCII node 'h' +//! b'k', // ASCII node 'k' +//! 0x90, // value node 44 lead byte +//! 0x1C, // value node 44 trail byte +//! ]); +//! +//! assert_eq!(TRIE.get(b""), Some(11)); +//! assert_eq!(TRIE.get(b"ad"), Some(22)); +//! assert_eq!(TRIE.get(b"adef"), Some(33)); +//! assert_eq!(TRIE.get(b"adghk"), Some(44)); +//! assert_eq!(TRIE.get(b"unknown"), None); +//!
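+//! // A couple of extra lookups: prefixes of stored keys that are not themselves
+//! // keys, such as "a" and "ade", are absent from the map.
+//! assert_eq!(TRIE.get(b"a"), None);
+//! assert_eq!(TRIE.get(b"ade"), None);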
``` + mod branch_meta; pub(crate) mod bytestr; pub(crate) mod konst; diff --git a/experimental/zerotrie/src/builder/nonconst/builder.rs b/experimental/zerotrie/src/builder/nonconst/builder.rs index 78945c01f57..105be09390e 100644 --- a/experimental/zerotrie/src/builder/nonconst/builder.rs +++ b/experimental/zerotrie/src/builder/nonconst/builder.rs @@ -154,7 +154,7 @@ impl ZeroTrieBuilder { Ok(result) } - /// The actual builder algorithm. + /// The actual builder algorithm. For an explanation, see [`crate::builder`]. #[allow(clippy::unwrap_used)] // lots of indexing, but all indexes should be in range fn create(&mut self, all_items: &[(&ByteStr, usize)]) -> Result { let mut prefix_len = match all_items.last() { @@ -162,29 +162,32 @@ impl ZeroTrieBuilder { // Empty slice: None => return Ok(0), }; + // Initialize the main loop to point at the last string. let mut lengths_stack = NonConstLengthsStack::new(); let mut i = all_items.len() - 1; let mut j = all_items.len(); let mut current_len = 0; + // Start the main loop. loop { let item_i = all_items.get(i).unwrap(); let item_j = all_items.get(j - 1).unwrap(); - assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); + debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len)); + // Check if we need to add a value node here. if item_i.0.len() == prefix_len { let len = self.prepend_value(item_i.1); current_len += len; } if prefix_len == 0 { + // All done! Leave the main loop. break; } + // Reduce the prefix length by 1 and recalculate i and j. prefix_len -= 1; let mut new_i = i; let mut new_j = j; - let mut diff_i = 0; - let mut diff_j = 0; let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len); let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len); - assert_eq!(ascii_i, ascii_j); + debug_assert_eq!(ascii_i, ascii_j); let key_ascii = ascii_i; loop { if new_i == 0 { @@ -201,12 +204,11 @@ impl ZeroTrieBuilder { break; } if candidate.len() == prefix_len { - // A string of length prefix_len can't be preceded by another with that prefix + // A string that equals the prefix does not take part in the branch node. break; } let candidate = candidate.byte_at_or_panic(prefix_len); if candidate != ascii_i { - diff_i += 1; ascii_i = candidate; } } @@ -229,20 +231,22 @@ impl ZeroTrieBuilder { } let candidate = candidate.byte_at_or_panic(prefix_len); if candidate != ascii_j { - diff_j += 1; ascii_j = candidate; } } - if diff_i == 0 && diff_j == 0 { - let len = self.prepend_ascii(ascii_i)?; + // If there are no different bytes at this prefix level, we can add an ASCII or Span + // node and then continue to the next iteration of the main loop. + if ascii_i == key_ascii && ascii_j == key_ascii { + let len = self.prepend_ascii(key_ascii)?; current_len += len; - assert!(i == new_i || i == new_i + 1); + debug_assert!(i == new_i || i == new_i + 1); i = new_i; - assert_eq!(j, new_j); + debug_assert_eq!(j, new_j); continue; } - // Branch - if diff_j == 0 { + // If i and j changed, we are a target of a branch node. + if ascii_j == key_ascii { + // We are the _last_ target of a branch node. lengths_stack.push(BranchMeta { ascii: key_ascii, cumulative_length: current_len, @@ -250,6 +254,7 @@ impl ZeroTrieBuilder { count: 1, }); } else { + // We are the _not the last_ target of a branch node. let BranchMeta { cumulative_length, count, @@ -262,7 +267,9 @@ impl ZeroTrieBuilder { count: count + 1, }); } - if diff_i != 0 { + if ascii_i != key_ascii { + // We are _not the first_ target of a branch node. + // Set the cursor to the previous string and continue the loop. 
j = i; i -= 1; prefix_len = all_items.get(i).unwrap().0.len(); diff --git a/experimental/zerotrie/src/builder/nonconst/store.rs b/experimental/zerotrie/src/builder/nonconst/store.rs index 67b77afd8ee..bc2db7506f1 100644 --- a/experimental/zerotrie/src/builder/nonconst/store.rs +++ b/experimental/zerotrie/src/builder/nonconst/store.rs @@ -123,7 +123,7 @@ impl core::fmt::Debug for NonConstLengthsStack { } impl NonConstLengthsStack { - /// Creates a new empty [`ConstLengthsStack`]. + /// Creates a new empty [`NonConstLengthsStack`]. pub const fn new() -> Self { Self { data: Vec::new() } } diff --git a/experimental/zerotrie/src/byte_phf/builder.rs b/experimental/zerotrie/src/byte_phf/builder.rs index 0a846eb285a..75decfbe3a1 100644 --- a/experimental/zerotrie/src/byte_phf/builder.rs +++ b/experimental/zerotrie/src/byte_phf/builder.rs @@ -8,9 +8,9 @@ use alloc::vec; use alloc::vec::Vec; /// To speed up the search algorithm, we limit the number of times the level-2 parameter (q) -/// can hit its max value of 255 before we try the next level-1 parameter (p). In practice, -/// this has a small impact on the resulting perfect hash, resulting in about 1 in 10000 -/// hash maps that fall back to the slow path. +/// can hit its max value (initially Q_FAST_MAX) before we try the next level-1 parameter (p). +/// In practice, this has a small impact on the resulting perfect hash, resulting in about +/// 1 in 10000 hash maps that fall back to the slow path. const MAX_L2_SEARCH_MISSES: usize = 24; /// Directly compute the perfect hash function.