Skip to content

Commit 1af4452

Browse files
authored
Merge pull request #1199 from epage/aho
feat(dictgen): Add aho-corasick support
2 parents 44cf2f8 + 7984d47 commit 1af4452

File tree

9 files changed

+138282
-3
lines changed

9 files changed

+138282
-3
lines changed

Cargo.lock

+3-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/dictgen/Cargo.toml

+2
Original file line number | Diff line number | Diff line change
@@ -19,12 +19,14 @@ default = ["std"]
1919
std = []
2020
codegen = ["std", "dep:phf_codegen"]
2121
map = ["dep:phf", "dep:phf_shared"]
22+
aho-corasick = ["dep:aho-corasick"]
2223

2324
[dependencies]
2425
unicase = "2.7"
2526
phf = { version = "0.11", features = ["unicase"], optional = true }
2627
phf_shared = { version = "0.11", optional = true }
2728
phf_codegen = { version = "0.11", optional = true }
29+
aho-corasick = { version = "1.1.3", optional = true }
2830

2931
[lints]
3032
workspace = true

crates/dictgen/src/aho_corasick.rs

+112
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,112 @@
1+
pub use ::aho_corasick::automaton::Automaton;
2+
pub use ::aho_corasick::dfa::Builder;
3+
pub use ::aho_corasick::dfa::DFA;
4+
pub use ::aho_corasick::Anchored;
5+
pub use ::aho_corasick::Input;
6+
pub use ::aho_corasick::MatchKind;
7+
pub use ::aho_corasick::StartKind;
8+
9+
/// Code generator that emits a dictionary backed by an aho-corasick DFA.
///
/// ASCII keys are compiled into the DFA; non-ASCII keys are emitted into a
/// fallback ordered map named `UNICODE_TABLE`.
#[cfg(feature = "codegen")]
pub struct AhoCorasickGen<'g> {
    pub(crate) gen: crate::DictGen<'g>,
}

#[cfg(feature = "codegen")]
impl AhoCorasickGen<'_> {
    /// Writes the generated lookup type to `file`.
    ///
    /// The emitted code defines a struct named after the generator's `name`
    /// with two methods:
    ///
    /// - `new()` builds the DFA from the ASCII keys and wires in the
    ///   `UNICODE_TABLE` fallback for the remaining keys.
    /// - `find(word)` returns the value whose key equals `word` in full;
    ///   ASCII words go through the DFA, all others through the fallback map.
    ///
    /// `data` yields `(key, value)` pairs; each `value` is written verbatim
    /// (via `Display`) as an expression of the generator's `value_type`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing to `file`.
    pub fn write<W: std::io::Write, V: std::fmt::Display>(
        &self,
        file: &mut W,
        data: impl Iterator<Item = (impl AsRef<str>, V)>,
    ) -> Result<(), std::io::Error> {
        let mut data: Vec<_> = data.collect();
        // Compare borrowed keys so sorting does not allocate a fresh `String`
        // every time a key is evaluated.
        data.sort_unstable_by(|a, b| {
            unicase::UniCase::new(a.0.as_ref()).cmp(&unicase::UniCase::new(b.0.as_ref()))
        });

        let name = self.gen.name;
        let value_type = self.gen.value_type;

        // `NEEDLES` and `PATTERNID_MAP` are both produced from the same sorted,
        // ASCII-filtered sequence, so a DFA pattern id indexes the value table.
        let is_ascii_key = |key: &str| key.is_ascii();

        writeln!(file, "pub struct {name} {{")?;
        writeln!(file, " dfa: dictgen::aho_corasick::DFA,")?;
        writeln!(file, " unicode: &'static dictgen::OrderedMap<dictgen::InsensitiveStr<'static>, {value_type}>,")?;
        writeln!(file, "}}")?;
        writeln!(file)?;
        writeln!(file, "impl {name} {{")?;
        writeln!(file, " pub fn new() -> Self {{")?;
        writeln!(
            file,
            " static NEEDLES: &'static [&'static [u8]] = &["
        )?;
        for (key, _value) in data.iter().filter(|(k, _)| is_ascii_key(k.as_ref())) {
            let key = key.as_ref();
            writeln!(file, " b{key:?},")?;
        }
        writeln!(file, " ];")?;
        writeln!(
            file,
            " let dfa = dictgen::aho_corasick::Builder::new()"
        )?;
        writeln!(
            file,
            " .match_kind(dictgen::aho_corasick::MatchKind::LeftmostLongest)"
        )?;
        writeln!(
            file,
            " .start_kind(dictgen::aho_corasick::StartKind::Anchored)"
        )?;
        writeln!(file, " .ascii_case_insensitive(true)")?;
        writeln!(file, " .build(NEEDLES)")?;
        writeln!(file, " .unwrap();")?;
        // Non-ASCII keys cannot use the DFA's ASCII case folding, so they are
        // emitted into a separate ordered map.
        crate::DictGen::new()
            .name("UNICODE_TABLE")
            .value_type(value_type)
            .ordered_map()
            .write(
                file,
                data.iter()
                    .filter(|(k, _)| !is_ascii_key(k.as_ref()))
                    .map(|(k, v)| (k.as_ref(), v)),
            )?;
        writeln!(file)?;
        writeln!(file, " Self {{")?;
        writeln!(file, " dfa,")?;
        writeln!(file, " unicode: &UNICODE_TABLE,")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file)?;
        writeln!(
            file,
            " pub fn find(&self, word: &'_ unicase::UniCase<&str>) -> Option<&'static {value_type}> {{"
        )?;
        writeln!(
            file,
            " static PATTERNID_MAP: &'static [{value_type}] = &["
        )?;
        for (_key, value) in data.iter().filter(|(k, _)| is_ascii_key(k.as_ref())) {
            writeln!(file, " {value},")?;
        }
        writeln!(file, " ];")?;
        writeln!(file, " if word.is_ascii() {{")?;
        writeln!(
            file,
            " use dictgen::aho_corasick::Automaton as _;"
        )?;
        writeln!(file, " let input = dictgen::aho_corasick::Input::new(word.into_inner().as_bytes()).anchored(dictgen::aho_corasick::Anchored::Yes);")?;
        writeln!(
            file,
            " let mat = self.dfa.try_find(&input).unwrap()?;"
        )?;
        // The search is anchored at the start, so the match covers the whole
        // word only when it also ends at the word's end. Anything shorter is
        // a mere prefix hit and must be rejected.
        writeln!(
            file,
            " if mat.end() != word.into_inner().len() {{"
        )?;
        writeln!(file, " return None;")?;
        writeln!(file, " }}")?;
        writeln!(file, " Some(&PATTERNID_MAP[mat.pattern()])")?;
        writeln!(file, " }} else {{")?;
        writeln!(file, " self.unicode.find(word)")?;
        writeln!(file, " }}")?;
        writeln!(file, " }}")?;
        writeln!(file, "}}")?;

        Ok(())
    }
}

crates/dictgen/src/gen.rs

+5
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,11 @@ impl<'g> DictGen<'g> {
6161
pub fn r#match(self) -> crate::MatchGen<'g> {
6262
crate::MatchGen { gen: self }
6363
}
64+
65+
/// Consumes this generator and wraps it in an [`AhoCorasickGen`](crate::AhoCorasickGen),
/// carrying over the settings configured so far.
#[cfg(feature = "aho-corasick")]
pub fn aho_corasick(self) -> crate::AhoCorasickGen<'g> {
    let settings = self;
    crate::AhoCorasickGen { gen: settings }
}
6469
}
6570

6671
impl Default for DictGen<'static> {

crates/dictgen/src/lib.rs

+5
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
#![warn(clippy::print_stderr)]
33
#![warn(clippy::print_stdout)]
44

5+
#[cfg(feature = "aho-corasick")]
6+
pub mod aho_corasick;
57
#[cfg(feature = "codegen")]
68
mod gen;
79
mod insensitive;
@@ -12,6 +14,9 @@ mod r#match;
1214
mod ordered_map;
1315
mod trie;
1416

17+
#[cfg(feature = "aho-corasick")]
18+
#[cfg(feature = "codegen")]
19+
pub use aho_corasick::AhoCorasickGen;
1520
#[cfg(feature = "codegen")]
1621
pub use gen::*;
1722
pub use insensitive::*;

crates/typos-dict/Cargo.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ itertools = "0.13"
2525
edit-distance = "2.1"
2626
unicase = "2.7"
2727
codegenrs = "3.0"
28-
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map"] }
28+
dictgen = { version = "^0.2", path = "../dictgen", features = ["codegen", "map", "aho-corasick"] }
2929
varcon = { version = "^1.0", path = "../varcon" }
3030
snapbox = "0.6.5"
3131
indexmap = "2.2.6"

0 commit comments

Comments
 (0)