compenguy · LucaCappelletti94 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024 · Jul 26, 2024
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,7 @@ Cargo.lock
 .vscode/
 test.txt
 rustc-ice-*
-*.graph
+*.graph
+*.offsets
+*.ef
+*.properties
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,17 +17,16 @@ name = "ngrammatic"
 
 [dependencies]
 serde = { version = "1.0", features = ["derive"], optional = true }
-mem_dbg = { version = "0.1.8"}
-sux = {git = "https://github.com/LucaCappelletti94/sux-rs.git", no-default-features = true}
+mem_dbg = { version = "0.2"}
+sux = {git = "https://github.com/vigna/sux-rs.git"}
 half = {version="2.4.0", optional = true, features = ["zerocopy"]}
 rayon = {version="1.10.0", optional=true}
-trie-rs = {git = "https://github.com/LucaCappelletti94/trie-rs.git", optional = true, features = ["mem_dbg"]}
-webgraph = {git="https://github.com/vigna/webgraph-rs.git" }
+webgraph = {git="https://github.com/vigna/webgraph-rs.git", branch="mem_dbg"}
 
 fxhash = "0.2.1"
 tempfile = "3.10.1"
-dsi-bitstream = "0.4.2"
-epserde = "0.4"
+dsi-bitstream = {git="https://github.com/vigna/dsi-bitstream-rs.git", features=["mem_dbg"]}
+epserde = "0.6"
 log = "0.4.21"
 lender = "0.2.9"
 rand = "0.8.5"
@@ -43,8 +42,8 @@ paste = "1.0.14"
 
 [features]
 default = ["rayon"]
-serde = ["dep:serde", "half/serde", "trie-rs/serde"]
-rayon = ["dep:rayon", "sux/rayon", "trie-rs/rayon"]
+serde = ["dep:serde", "half/serde"]
+rayon = ["dep:rayon", "sux/rayon"]
 
 [profile.release]
 overflow-checks = false   # Disable integer overflow checks.

diff --git a/README.md b/README.md
@@ -47,6 +47,29 @@ for search_result in search_results {
 }
 ```
 
+To use a different graph data structure such as the `WeightedVecBipartiteGraph`, you can use the following code:
+
+```rust
+use ngrammatic::prelude::*;
+
+let corpus: Corpus<[&str; 699], TriGram<char>, Lowercase<str>, WeightedVecBipartiteGraph> = Corpus::from(ANIMALS);
+
+// We set up the search configuration
+let search_config = NgramSearchConfig::default()
+    .set_minimum_similarity_score(0.3).unwrap()
+    .set_maximum_number_of_results(5);
+
+// We search for words similar to "Cattos"
+let search_results: Vec<SearchResult<&&str, f32>> = corpus.ngram_search("Cattos", search_config);
+
+assert!(!search_results.is_empty());
+
+// We print the search results
+for search_result in search_results {
+    println!("{}: {}", search_result.key(), search_result.score());
+}
+```
+
 ### Text normalization
 Natural language processing is notoriously difficult, and one of the first steps is to normalize the text. You can add any normalization you want by creating new struct markers that implement [`std::convert::AsRef`] to the type of the keys you want to use, which may be for instance [`str`] or [`String`]. In this case, we use the [`Lowercase`] struct marker to normalize the text to lowercase. Text represented in [`str`] or [`String`] is padded with [`NULL`](https://theasciicode.com.ar/ascii-control-characters/null-character-ascii-code-0.html) characters to ensure that the minimum n-gram length is respected. By default, we also drop all non-alphanumeric characters, remove duplicated spaces, and trim both spaces and [`NULL`](https://theasciicode.com.ar/ascii-control-characters/null-character-ascii-code-0.html) characters from the sides of the text. You can use struct markers to customize the normalization process to remove or add any other normalization steps you may need.
 

diff --git a/benches/README.md b/benches/README.md
@@ -6,9 +6,11 @@ Since cargo bench will run the benchmarks multiple times, we will use only a sub
 To run the time benchmarks, run from the root of the repository the following command:
 
 ```bash
-RUSTFLAGS="-C target-cpu=native" cargo bench
+RUSTFLAGS="-C target-cpu=native" cargo bench  
 ```
 
+## Benchmarks 26 July 2024, 05:00 PM
+
 ## Benchmarks 9 April 2024, 06:00 PM
 The sixth benchmark was run on a 32-core machine with 64 threads and with 256 GB of RAM. In this iteration, we run the benchmarks on loading the first `10_000` taxons from the dataset into memory. The novelty of this benchmark is the use of the RCL data structure for holding the strings of the dataset itself.