170: Use cargo workspace r=ManyTheFish a=choznerol

# Pull Request

## Related issue
Fixes meilisearch/product#582 (part 1)

## What does this PR do?

Convert the codebase to a [Cargo workspace](https://doc.rust-lang.org/cargo/reference/workspaces.html), following the conclusion in meilisearch/product#582 (reply in thread).

I found that changes related to the Cargo workspace can accumulate merge conflicts pretty easily, so I'm opening this as a standalone PR. The actual work to bring in https://github.com/choznerol/kvariants will come in a follow-up PR, meilisearch#171, based on this branch.
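
For readers skimming the diff: after the conversion, the root `Cargo.toml` becomes a small virtual manifest that only declares the workspace, and all package metadata, dependencies, features, and benches move into `charabia/Cargo.toml`. A minimal sketch of the resulting root manifest (it mirrors the diff below):

```toml
# Root Cargo.toml after the conversion (virtual manifest, no [package] section)
[workspace]
resolver = "2"
members = ["charabia"]
default-members = ["charabia"]
```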

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Lawrence Chou <choznerol@protonmail.com>
bors[bot] and choznerol authored Jan 17, 2023
2 parents 037f912 + 7220046 commit a3eab30
Showing 30 changed files with 72 additions and 66 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -1,5 +1,5 @@
/target
meilisearch-core/target
charabia/target
**/*.csv
**/*.json_lines
**/*.rs.bk
6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -45,10 +45,10 @@ cargo bench
### Implement a `Segmenter`
A `Segmenter` is a Script- or Language-specialized struct that segments a text into several [lemmas](https://en.wikipedia.org/wiki/Lemma_(morphology)), which are later classified as separators or words in the tokenization pipeline.
A Segmenter will never change, add, or skip a lemma, which means that concatenating all lemmas must reproduce the original text.
All Segmenter implementations are stored in `src/segmenter`.
All Segmenter implementations are stored in `charabia/src/segmenter`.
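
To illustrate that invariant, here is a minimal sketch (not charabia's `Segmenter` trait, just plain Rust using the `unicode-segmentation` crate already listed in the dependencies): a segmenter only cuts the text into pieces, so gluing the pieces back together must yield the input unchanged.

```rust
use unicode_segmentation::UnicodeSegmentation;

// Sketch only: cut a text at Unicode word boundaries without changing,
// adding, or skipping anything.
fn naive_segment(text: &str) -> Vec<&str> {
    text.split_word_bounds().collect()
}

fn main() {
    let text = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
    let lemmas = naive_segment(text);
    // Concatenating all lemmas must reproduce the original text.
    assert_eq!(lemmas.concat(), text);
}
```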

#### Start the implementation
We highly recommend starting the implementation by copy-pasting the dummy example (`src/segmenter/dummy_example.rs`) and following the instructions in its comments.
We highly recommend starting the implementation by copy-pasting the dummy example (`charabia/src/segmenter/dummy_example.rs`) and following the instructions in its comments.

#### Add a Benchmark
The only thing needed is two texts detected by the tokenizer as the `Segmenter`'s Script or Language.
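
As a purely illustrative sketch of what two such entries could look like (the `Script`/`Language` stand-ins below are hypothetical placeholders so the snippet compiles on its own, and the meaning of the `usize` field is an assumption; copy the exact shape from the existing benchmark data instead):

```rust
// Hypothetical stand-ins; the real types come from the crate.
enum Script { Latin }
enum Language { Eng }

// Two sample texts keyed the same way as the existing DATA_SET entries.
static DATA_SET: &[((usize, Script, Language), &str)] = &[
    ((0, Script::Latin, Language::Eng), "The quick brown fox jumps over the lazy dog."),
    ((1, Script::Latin, Language::Eng), "Pack my box with five dozen liquor jugs."),
];

fn main() {
    assert_eq!(DATA_SET.len(), 2);
}
```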
@@ -70,7 +70,7 @@ static DATA_SET: &[((usize, Script, Language), &str)] = &[
A `Normalizer` is a struct used to alter the lemma contained in a Token in order to remove features that don't significantly impact the meaning, like lowercasing, removing accents, or converting Traditional Chinese characters into Simplified Chinese characters.
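
To make that concrete, here is a rough sketch of this kind of lemma rewriting (lowercasing plus accent stripping) using the `deunicode` crate already in the dependency list; it is not charabia's `Normalizer` trait, only an illustration of the transformation:

```rust
use deunicode::deunicode;

// Sketch only: normalize a lemma by lowercasing it and transliterating
// accented characters to plain ASCII.
fn naive_normalize(lemma: &str) -> String {
    deunicode(&lemma.to_lowercase())
}

fn main() {
    assert_eq!(naive_normalize("Électricité"), "electricite");
}
```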

#### Start the implementation
We highly recommend starting the implementation by copy-pasting the dummy example (`src/normalizer/dummy_example.rs`) and following the instructions in its comments.
We highly recommend starting the implementation by copy-pasting the dummy example (`charabia/src/normalizer/dummy_example.rs`) and following the instructions in its comments.

## Git Guidelines

60 changes: 4 additions & 56 deletions Cargo.toml
@@ -1,57 +1,5 @@
[package]
name = "charabia"
version = "0.7.0"
license = "MIT"
authors = ["Many <many@meilisearch.com>"]
edition = "2021"
description = "A simple library to detect the language, tokenize the text and normalize the tokens"
documentation = "https://docs.rs/charabia"
repository = "https://github.com/meilisearch/charabia"
keywords = ["segmenter", "tokenizer", "normalize", "language"]
categories = ["text-processing"]
exclude = ["/dictionaries/txt/thai/words.txt"]
[workspace]
resolver = "2"
members = ["charabia"]
default-members = ["charabia"]

[dependencies]
cow-utils = "0.1"
csv = "1.1"
deunicode = "1.1.1"
fst = "0.4"
jieba-rs = { version = "0.6", optional = true }
once_cell = "1.5.2"
serde = "1.0"
slice-group-by = "0.3.0"
unicode-segmentation = "1.6.0"
whatlang = "0.16.1"
lindera = { version = "=0.17.0", default-features = false, optional = true }
pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
wana_kana = { version = "2.1.0", optional = true }
unicode-normalization = "0.1.22"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
japanese = ["lindera/ipadic"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera/ko-dic"]

# allow thai specialized tokenization
thai = []

[dev-dependencies]
criterion = "0.3"
jemallocator = "0.3.0"

[[bench]]
name = "bench"
harness = false
57 changes: 57 additions & 0 deletions charabia/Cargo.toml
@@ -0,0 +1,57 @@
[package]
name = "charabia"
version = "0.7.0"
license = "MIT"
authors = ["Many <many@meilisearch.com>"]
edition = "2021"
description = "A simple library to detect the language, tokenize the text and normalize the tokens"
documentation = "https://docs.rs/charabia"
repository = "https://github.com/meilisearch/charabia"
keywords = ["segmenter", "tokenizer", "normalize", "language"]
categories = ["text-processing"]
exclude = ["../dictionaries/txt/thai/words.txt"]

[dependencies]
cow-utils = "0.1"
csv = "1.1"
deunicode = "1.1.1"
fst = "0.4"
jieba-rs = { version = "0.6", optional = true }
once_cell = "1.5.2"
serde = "1.0"
slice-group-by = "0.3.0"
unicode-segmentation = "1.6.0"
whatlang = "0.16.1"
lindera = { version = "=0.17.0", default-features = false, optional = true }
pinyin = { version = "0.9", default-features = false, features = [
"with_tone",
], optional = true }
wana_kana = { version = "2.1.0", optional = true }
unicode-normalization = "0.1.22"

[features]
default = ["chinese", "hebrew", "japanese", "thai", "korean"]

# allow chinese specialized tokenization
chinese = ["dep:pinyin", "dep:jieba-rs"]

# allow hebrew specialized tokenization
hebrew = []

# allow japanese specialized tokenization
japanese = ["lindera/ipadic"]
japanese-transliteration = ["dep:wana_kana"]

# allow korean specialized tokenization
korean = ["lindera/ko-dic"]

# allow thai specialized tokenization
thai = []

[dev-dependencies]
criterion = "0.3"
jemallocator = "0.3.0"

[[bench]]
name = "bench"
harness = false
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -34,7 +34,7 @@ pub static KVARIANTS: Lazy<HashMap<char, KVariant>> = Lazy::new(|| {
// 㓻 (U+34FB) sem 剛 (U+525B)
// ...
//
let tsv = include_str!("../../../dictionaries/txt/chinese/kVariants.tsv");
let tsv = include_str!("../../../../dictionaries/txt/chinese/kVariants.tsv");
let mut reader =
csv::ReaderBuilder::new().delimiter(b'\t').has_headers(false).from_reader(tsv.as_bytes());
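
Side note on why these paths gain one extra `..`: `include_str!` (and `include_bytes!`) resolve their argument relative to the directory of the source file that calls them, and every source file moved one directory deeper (from `src/...` to `charabia/src/...`) while `dictionaries/` stayed at the repository root. A sketch of the before/after (the exact file location is an assumption here):

```rust
// Before the move, a file under src/segmenter/chinese/ reached the
// repository root with three `..` segments:
//     include_str!("../../../dictionaries/txt/chinese/kVariants.tsv")
//
// After the move the same file lives under charabia/src/segmenter/chinese/,
// one level deeper, so it needs four:
//     include_str!("../../../../dictionaries/txt/chinese/kVariants.tsv")
```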

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion src/normalizer/mod.rs → charabia/src/normalizer/mod.rs
@@ -251,7 +251,7 @@ it's probably due to a bug in the normalizer or a mistake in the provided normal
Global normalization pipeline didn't normalize tokens as expected.
help: The `global_result` provided to `test_normalizer!` does not correspond to the output of the normalizer pipeline; this is probably because the normalizer is missing from the `NORMALIZERS` list or because another normalizer has altered the token.
Check if the `NORMALIZERS` list in `src/normalizer/mod.rs` contains the tested Normalizer.
Check if the `NORMALIZERS` list in `charabia/src/normalizer/mod.rs` contains the tested Normalizer.
Make sure that normalized tokens are valid or change the trigger condition of the noisy normalizers by updating `should_normalize`.
"#
);
@@ -8,7 +8,7 @@ use crate::normalizer::CharOrStr;
use crate::Token;

static NONSPACING_MARKS: Lazy<HashSet<u32>> = Lazy::new(|| {
let bytes = include_bytes!("../../dictionaries/bin/nonspacing_mark/marks.bin");
let bytes = include_bytes!("../../../dictionaries/bin/nonspacing_mark/marks.bin");

HashSet::from_iter(
bytes.chunks_exact(4).map(|chunk| u32::from_ne_bytes(chunk.try_into().unwrap())),
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion src/segmenter/mod.rs → charabia/src/segmenter/mod.rs
@@ -332,7 +332,7 @@ Check if the expected Script/Language corresponds to the detected Script/Languag
The segmenter chosen by the global segment() function didn't segment the text as expected.
help: The selected segmenter is probably the wrong one.
Check if the tested segmenter is assigned to the right Script/Language in the `SEGMENTERS` global in `src/segmenter/mod.rs`.
Check if the tested segmenter is assigned to the right Script/Language in the `SEGMENTERS` global in `charabia/src/segmenter/mod.rs`.
"#);
}

5 changes: 3 additions & 2 deletions src/segmenter/thai.rs → charabia/src/segmenter/thai.rs
@@ -11,8 +11,9 @@ use crate::segmenter::Segmenter;
/// Dictionary source: PyThaiNLP project on https://github.com/PyThaiNLP/nlpo3
pub struct ThaiSegmenter;

static WORDS_FST: Lazy<Fst<&[u8]>> =
Lazy::new(|| Fst::new(&include_bytes!("../../dictionaries/fst/thai/words.fst")[..]).unwrap());
static WORDS_FST: Lazy<Fst<&[u8]>> = Lazy::new(|| {
Fst::new(&include_bytes!("../../../dictionaries/fst/thai/words.fst")[..]).unwrap()
});

static FST_SEGMENTER: Lazy<FstSegmenter> = Lazy::new(|| FstSegmenter::new(&WORDS_FST));

File renamed without changes.
File renamed without changes.
File renamed without changes.
