From 8959ebf26ec80f50f02445b54d85863828a1c680 Mon Sep 17 00:00:00 2001 From: Nathaniel Daniel Date: Fri, 28 Oct 2022 19:34:57 -0700 Subject: [PATCH] Improve Compression Performance (#36) * Add `rustc-hash` feature * Add features section to readme * Fix compression with `rustc-hash` * Add tests for `rustc-hash` on CI * Remove `cargo-deny` deny section * Expose `rustc-hash` to python binding * Expose `rustc-hash` feature on `lz-str-wasm`` * Add ability to specify features via makefile * Avoid copy in `lz-str-wasm` compress interface * fmt * Add note about removing `IntoWideIter` for `Vec` * Reduce hash lookups with entry api * Reorganize produce_w order to avoid double map lookup --- .github/workflows/Build.yml | 8 +++- Cargo.lock | 7 ++++ Cargo.toml | 1 + Makefile | 6 ++- README.md | 4 ++ benches/compress.rs | 5 ++- bindings/lz-str-py/Cargo.toml | 5 +++ bindings/lz-str-wasm/Cargo.toml | 9 ++++- bindings/lz-str-wasm/src/inline.js | 10 ++--- bindings/lz-str-wasm/src/lib.rs | 13 ++----- deny.toml | 28 -------------- src/compress.rs | 62 +++++++++++++++++++----------- src/lib.rs | 2 + 13 files changed, 89 insertions(+), 71 deletions(-) diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml index ca86149..4b24eb8 100644 --- a/.github/workflows/Build.yml +++ b/.github/workflows/Build.yml @@ -60,5 +60,11 @@ jobs: - name: Build run: cargo build --verbose - - name: Run Tests + - name: Build with `rustc-hash` + run: cargo build --verbose --features=rustc-hash + + - name: Run All Tests run: cargo test --all --verbose + + - name: Run Tests for `lz-str` with `rustc-hash` + run: cargo test --verbose --features=rustc-hash diff --git a/Cargo.lock b/Cargo.lock index 3e1db07..f4e22b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -291,6 +291,7 @@ version = "0.2.0" dependencies = [ "criterion", "rand", + "rustc-hash", ] [[package]] @@ -569,6 +570,12 @@ version = "0.6.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3f87b73ce11b1619a3c6332f45341e0047173771e8b8b73f87bfeefb7b56244" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "ryu" version = "1.0.11" diff --git a/Cargo.toml b/Cargo.toml index fbac11b..932f5f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ exclude = [ ] [dependencies] +rustc-hash = { version = "1.1.0", optional = true } [dev-dependencies] rand = "0.8.3" diff --git a/Makefile b/Makefile index f9b1bd5..86f0a3c 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ +export WASM_FEATURES = + .PHONY: build-wasm # --reference-types build-wasm: - wasm-pack build --target nodejs bindings/lz-str-wasm + wasm-pack build --target nodejs bindings/lz-str-wasm --features=$(WASM_FEATURES) cd bindings/lz-str-wasm && python inject-inline-js.py build-wasm-browser: - wasm-pack build --target web bindings/lz-str-wasm + wasm-pack build --target web bindings/lz-str-wasm --features=$(WASM_FEATURES) cd bindings/lz-str-wasm && python inject-inline-js.py \ No newline at end of file diff --git a/README.md b/README.md index 620c353..fdcff0c 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,10 @@ fn main() { See the [examples](https://github.com/adumbidiot/lz-str-rs/tree/master/examples) directory for more examples. +## Features +`rustc-hash`: This feature will replace some internal maps' hashers with rustc-hash, +boosting performance at the cost of not using a DOS-resistant hasher. + ## Testing ```bash cargo test diff --git a/benches/compress.rs b/benches/compress.rs index bf1f612..8fba154 100644 --- a/benches/compress.rs +++ b/benches/compress.rs @@ -1,4 +1,7 @@ -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use criterion::criterion_group; +use criterion::criterion_main; +use criterion::BenchmarkId; +use criterion::Criterion; const TEST_PHRASE: &str = "During tattooing, ink is injected into the skin, initiating an immune response, and cells called \"macrophages\" move into the area and \"eat up\" the ink. The macrophages carry some of the ink to the body\'s lymph nodes, but some that are filled with ink stay put, embedded in the skin. That\'s what makes the tattoo visible under the skin. Dalhousie Uiversity\'s Alec Falkenham is developing a topical cream that works by targeting the macrophages that have remained at the site of the tattoo. New macrophages move in to consume the previously pigment-filled macrophages and then migrate to the lymph nodes, eventually taking all the dye with them. \"When comparing it to laser-based tattoo removal, in which you see the burns, the scarring, the blisters, in this case, we\'ve designed a drug that doesn\'t really have much off-target effect,\" he said. \"We\'re not targeting any of the normal skin cells, so you won\'t see a lot of inflammation. In fact, based on the process that we\'re actually using, we don\'t think there will be any inflammation at all and it would actually be anti-inflammatory."; diff --git a/bindings/lz-str-py/Cargo.toml b/bindings/lz-str-py/Cargo.toml index 7c06139..63880a0 100644 --- a/bindings/lz-str-py/Cargo.toml +++ b/bindings/lz-str-py/Cargo.toml @@ -10,3 +10,8 @@ crate-type = [ "cdylib" ] [dependencies] lz-str = { path = "../.." } pyo3 = { version = "0.17.2", features = [ "extension-module", "abi3", "abi3-py37" ] } + +[features] +rustc-hash = [ + "lz-str/rustc-hash", +] diff --git a/bindings/lz-str-wasm/Cargo.toml b/bindings/lz-str-wasm/Cargo.toml index d93a525..d886c54 100644 --- a/bindings/lz-str-wasm/Cargo.toml +++ b/bindings/lz-str-wasm/Cargo.toml @@ -7,7 +7,7 @@ repository = "https://github.com/adumbidiot/lz-str-rs" license = "MIT OR Apache-2.0" [lib] -crate-type = ["cdylib", "rlib"] +crate-type = [ "cdylib", "rlib" ] [dependencies] js-sys = "0.3.47" @@ -15,4 +15,9 @@ lz-str = { path = "../.." } wasm-bindgen = "0.2.70" [package.metadata.wasm-pack.profile.release] -wasm-opt = [ '-O4' ] \ No newline at end of file +wasm-opt = [ '-O4' ] + +[features] +rustc-hash = [ + "lz-str/rustc-hash", +] \ No newline at end of file diff --git a/bindings/lz-str-wasm/src/inline.js b/bindings/lz-str-wasm/src/inline.js index c7cb1e9..bab9d98 100644 --- a/bindings/lz-str-wasm/src/inline.js +++ b/bindings/lz-str-wasm/src/inline.js @@ -1,11 +1,11 @@ const CHUNK_SIZE = 10_000; -function convertUint16ArrayToString(array) { +function convertU16SliceToString(slice) { let ret = ''; - let num_chunks = Math.ceil(array.length / CHUNK_SIZE); - for(let i = 0; i < array.length; i += CHUNK_SIZE) { - let end_index = Math.min(i + CHUNK_SIZE, i + array.length); - ret += String.fromCharCode(...array.subarray(i, end_index)); + let num_chunks = Math.ceil(slice.length / CHUNK_SIZE); + for(let i = 0; i < slice.length; i += CHUNK_SIZE) { + let end_index = Math.min(i + CHUNK_SIZE, i + slice.length); + ret += String.fromCharCode(...slice.subarray(i, end_index)); } return ret; diff --git a/bindings/lz-str-wasm/src/lib.rs b/bindings/lz-str-wasm/src/lib.rs index c138087..7cd32c2 100644 --- a/bindings/lz-str-wasm/src/lib.rs +++ b/bindings/lz-str-wasm/src/lib.rs @@ -1,12 +1,11 @@ use js_sys::JsString; -use js_sys::Uint16Array; use wasm_bindgen::prelude::*; use wasm_bindgen::JsCast; #[wasm_bindgen] extern "C" { - #[wasm_bindgen(js_name = "convertUint16ArrayToString")] - fn convert_uint16_array_to_string(array: &Uint16Array) -> JsString; + #[wasm_bindgen(js_name = "convertU16SliceToString")] + fn convert_u16_slice_to_string(slice: &[u16]) -> JsString; } /// Compress a [`JsString`]. @@ -20,8 +19,7 @@ pub fn compress(data: &JsValue) -> JsValue { }; let data: Vec = data.iter().collect(); let compressed = lz_str::compress(&data); - let array: Uint16Array = compressed.as_slice().into(); - convert_uint16_array_to_string(&array).into() + convert_u16_slice_to_string(&compressed).into() } /// Decompress a [`JsString`]. @@ -35,9 +33,6 @@ pub fn decompress(data: &JsValue) -> JsValue { }; let data: Vec = data.iter().collect(); lz_str::decompress(&data) - .map(|decompressed| { - let array: Uint16Array = decompressed.as_slice().into(); - convert_uint16_array_to_string(&array).into() - }) + .map(|decompressed| convert_u16_slice_to_string(&decompressed).into()) .unwrap_or(JsValue::NULL) } diff --git a/deny.toml b/deny.toml index 3b2464c..6177166 100644 --- a/deny.toml +++ b/deny.toml @@ -29,34 +29,6 @@ multiple-versions = "warn" highlight = "all" skip = [] -# Mostly soundness denies since the advisory lacks a section for soundess bugs -deny = [ - # https://github.com/RustSec/advisory-db/issues/298 - { name = "linked-hash-map", version = "<0.5.3" }, - - # https://github.com/RustSec/advisory-db/pull/290 - { name = "bigint", version = "*" }, - - # https://github.com/RustSec/advisory-db/pull/293 - # NOTE: May be sound in the future: https://github.com/RustSec/advisory-db/pull/293#issuecomment-641898680 - { name = "rio", version = "*" }, - - # https://github.com/RustSec/advisory-db/issues/299 - { name = "smallvec", version = "<0.6.13" }, - - # https://github.com/RustSec/advisory-db/pull/268 - { name = "plutonium", version = "*" }, - - # https://github.com/RustSec/advisory-db/pull/308 - { name = "traitobject", version = "*" }, - - # https://github.com/RustSec/advisory-db/issues/305 - { name = "rental", version = "*" }, - - # Appears to be moving towards integrating rio more tightly for io_uring support - { name = "sled", version = "*" }, -] - [sources] unknown-registry = "deny" unknown-git = "deny" diff --git a/src/compress.rs b/src/compress.rs index ed73273..7b36832 100644 --- a/src/compress.rs +++ b/src/compress.rs @@ -5,10 +5,21 @@ use crate::constants::U16_CODE; use crate::constants::U8_CODE; use crate::constants::URI_KEY; use crate::IntoWideIter; -use std::collections::HashMap; -use std::collections::HashSet; +use std::collections::hash_map::Entry as HashMapEntry; use std::convert::TryInto; +#[cfg(not(feature = "rustc-hash"))] +type HashMap = std::collections::HashMap; + +#[cfg(not(feature = "rustc-hash"))] +type HashSet = std::collections::HashSet; + +#[cfg(feature = "rustc-hash")] +type HashMap = rustc_hash::FxHashMap; + +#[cfg(feature = "rustc-hash")] +type HashSet = rustc_hash::FxHashSet; + /// The number of "base codes", /// the default codes of all streams. /// @@ -72,8 +83,8 @@ where assert!(usize::from(bits_per_char) <= std::mem::size_of::() * 8); CompressContext { - dictionary: HashMap::with_capacity(16), - dictionary_to_create: HashSet::with_capacity(16), + dictionary: HashMap::default(), + dictionary_to_create: HashSet::default(), w_start_idx: 0, w_end_idx: 0, @@ -154,30 +165,35 @@ where #[inline] pub fn write_u16(&mut self, i: usize) { let c = &self.input[i]; - if !self.dictionary.contains_key(std::slice::from_ref(c)) { - self.dictionary.insert( - std::slice::from_ref(c), - (self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(), - ); + + let dictionary_len = self.dictionary.len(); + if let HashMapEntry::Vacant(entry) = self.dictionary.entry(std::slice::from_ref(c)) { + entry.insert((dictionary_len + NUM_BASE_CODES).try_into().unwrap()); self.dictionary_to_create.insert(*c); } // wc = w + c. let wc = &self.input[self.w_start_idx..self.w_end_idx + 1]; - if self.dictionary.contains_key(wc) { - // w = wc. - self.w_end_idx += 1; - } else { - self.produce_w(); - // Add wc to the dictionary. - self.dictionary.insert( - wc, - (self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(), - ); - - // w = c. - self.w_start_idx = i; - self.w_end_idx = i + 1; + + let dictionary_len = self.dictionary.len(); + match self.dictionary.entry(wc) { + HashMapEntry::Occupied(_entry) => { + // w = wc. + self.w_end_idx += 1; + } + HashMapEntry::Vacant(entry) => { + // Add wc to the dictionary. + entry.insert((dictionary_len + NUM_BASE_CODES).try_into().unwrap()); + + // Originally, this was before adding wc to the dict. + // However, we only use the dict for a lookup that will crash if it fails in produce_w. + // Therefore, moving it here should be fine. + self.produce_w(); + + // w = c. + self.w_start_idx = i; + self.w_end_idx = i + 1; + } } } diff --git a/src/lib.rs b/src/lib.rs index cd56268..94da9f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -102,6 +102,8 @@ impl<'a> IntoWideIter for &'a [u16] { } } +// TODO: Remove this in the next version. +// We do not benefit from taking ownership of the buffer. impl IntoWideIter for Vec { type Iter = std::vec::IntoIter;