Skip to content

Commit

Permalink
Improve Compression Performance (#36)
Browse files Browse the repository at this point in the history
* Add `rustc-hash` feature

* Add features section to readme

* Fix compression with `rustc-hash`

* Add tests for `rustc-hash` on CI

* Remove `cargo-deny` deny section

* Expose `rustc-hash` to python binding

* Expose `rustc-hash` feature on `lz-str-wasm``

* Add ability to specify features via makefile

* Avoid copy in `lz-str-wasm` compress interface

* fmt

* Add note about removing `IntoWideIter` for `Vec`

* Reduce hash lookups with entry api

* Reorganize produce_w order to avoid double map lookup
  • Loading branch information
adumbidiot authored Oct 29, 2022
1 parent 4eb08c5 commit 8959ebf
Show file tree
Hide file tree
Showing 13 changed files with 89 additions and 71 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/Build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,5 +60,11 @@ jobs:
- name: Build
run: cargo build --verbose

- name: Run Tests
- name: Build with `rustc-hash`
run: cargo build --verbose --features=rustc-hash

- name: Run All Tests
run: cargo test --all --verbose

- name: Run Tests for `lz-str` with `rustc-hash`
run: cargo test --verbose --features=rustc-hash
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ exclude = [
]

[dependencies]
rustc-hash = { version = "1.1.0", optional = true }

[dev-dependencies]
rand = "0.8.3"
Expand Down
6 changes: 4 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
export WASM_FEATURES =

.PHONY: build-wasm

# --reference-types

build-wasm:
wasm-pack build --target nodejs bindings/lz-str-wasm
wasm-pack build --target nodejs bindings/lz-str-wasm --features=$(WASM_FEATURES)
cd bindings/lz-str-wasm && python inject-inline-js.py

build-wasm-browser:
wasm-pack build --target web bindings/lz-str-wasm
wasm-pack build --target web bindings/lz-str-wasm --features=$(WASM_FEATURES)
cd bindings/lz-str-wasm && python inject-inline-js.py
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ fn main() {

See the [examples](https://github.com/adumbidiot/lz-str-rs/tree/master/examples) directory for more examples.

## Features
`rustc-hash`: This feature will replace some internal maps' hashers with rustc-hash,
boosting performance at the cost of not using a DOS-resistant hasher.

## Testing
```bash
cargo test
Expand Down
5 changes: 4 additions & 1 deletion benches/compress.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use criterion::criterion_group;
use criterion::criterion_main;
use criterion::BenchmarkId;
use criterion::Criterion;

const TEST_PHRASE: &str = "During tattooing, ink is injected into the skin, initiating an immune response, and cells called \"macrophages\" move into the area and \"eat up\" the ink. The macrophages carry some of the ink to the body\'s lymph nodes, but some that are filled with ink stay put, embedded in the skin. That\'s what makes the tattoo visible under the skin. Dalhousie Uiversity\'s Alec Falkenham is developing a topical cream that works by targeting the macrophages that have remained at the site of the tattoo. New macrophages move in to consume the previously pigment-filled macrophages and then migrate to the lymph nodes, eventually taking all the dye with them. \"When comparing it to laser-based tattoo removal, in which you see the burns, the scarring, the blisters, in this case, we\'ve designed a drug that doesn\'t really have much off-target effect,\" he said. \"We\'re not targeting any of the normal skin cells, so you won\'t see a lot of inflammation. In fact, based on the process that we\'re actually using, we don\'t think there will be any inflammation at all and it would actually be anti-inflammatory.";

Expand Down
5 changes: 5 additions & 0 deletions bindings/lz-str-py/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,8 @@ crate-type = [ "cdylib" ]
[dependencies]
lz-str = { path = "../.." }
pyo3 = { version = "0.17.2", features = [ "extension-module", "abi3", "abi3-py37" ] }

[features]
rustc-hash = [
"lz-str/rustc-hash",
]
9 changes: 7 additions & 2 deletions bindings/lz-str-wasm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@ repository = "https://github.com/adumbidiot/lz-str-rs"
license = "MIT OR Apache-2.0"

[lib]
crate-type = ["cdylib", "rlib"]
crate-type = [ "cdylib", "rlib" ]

[dependencies]
js-sys = "0.3.47"
lz-str = { path = "../.." }
wasm-bindgen = "0.2.70"

[package.metadata.wasm-pack.profile.release]
wasm-opt = [ '-O4' ]
wasm-opt = [ '-O4' ]

[features]
rustc-hash = [
"lz-str/rustc-hash",
]
10 changes: 5 additions & 5 deletions bindings/lz-str-wasm/src/inline.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
const CHUNK_SIZE = 10_000;

function convertUint16ArrayToString(array) {
function convertU16SliceToString(slice) {
let ret = '';
let num_chunks = Math.ceil(array.length / CHUNK_SIZE);
for(let i = 0; i < array.length; i += CHUNK_SIZE) {
let end_index = Math.min(i + CHUNK_SIZE, i + array.length);
ret += String.fromCharCode(...array.subarray(i, end_index));
let num_chunks = Math.ceil(slice.length / CHUNK_SIZE);
for(let i = 0; i < slice.length; i += CHUNK_SIZE) {
let end_index = Math.min(i + CHUNK_SIZE, i + slice.length);
ret += String.fromCharCode(...slice.subarray(i, end_index));
}

return ret;
Expand Down
13 changes: 4 additions & 9 deletions bindings/lz-str-wasm/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
use js_sys::JsString;
use js_sys::Uint16Array;
use wasm_bindgen::prelude::*;
use wasm_bindgen::JsCast;

#[wasm_bindgen]
extern "C" {
#[wasm_bindgen(js_name = "convertUint16ArrayToString")]
fn convert_uint16_array_to_string(array: &Uint16Array) -> JsString;
#[wasm_bindgen(js_name = "convertU16SliceToString")]
fn convert_u16_slice_to_string(slice: &[u16]) -> JsString;
}

/// Compress a [`JsString`].
Expand All @@ -20,8 +19,7 @@ pub fn compress(data: &JsValue) -> JsValue {
};
let data: Vec<u16> = data.iter().collect();
let compressed = lz_str::compress(&data);
let array: Uint16Array = compressed.as_slice().into();
convert_uint16_array_to_string(&array).into()
convert_u16_slice_to_string(&compressed).into()
}

/// Decompress a [`JsString`].
Expand All @@ -35,9 +33,6 @@ pub fn decompress(data: &JsValue) -> JsValue {
};
let data: Vec<u16> = data.iter().collect();
lz_str::decompress(&data)
.map(|decompressed| {
let array: Uint16Array = decompressed.as_slice().into();
convert_uint16_array_to_string(&array).into()
})
.map(|decompressed| convert_u16_slice_to_string(&decompressed).into())
.unwrap_or(JsValue::NULL)
}
28 changes: 0 additions & 28 deletions deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,34 +29,6 @@ multiple-versions = "warn"
highlight = "all"
skip = []

# Mostly soundness denies since the advisory lacks a section for soundess bugs
deny = [
# https://github.com/RustSec/advisory-db/issues/298
{ name = "linked-hash-map", version = "<0.5.3" },

# https://github.com/RustSec/advisory-db/pull/290
{ name = "bigint", version = "*" },

# https://github.com/RustSec/advisory-db/pull/293
# NOTE: May be sound in the future: https://github.com/RustSec/advisory-db/pull/293#issuecomment-641898680
{ name = "rio", version = "*" },

# https://github.com/RustSec/advisory-db/issues/299
{ name = "smallvec", version = "<0.6.13" },

# https://github.com/RustSec/advisory-db/pull/268
{ name = "plutonium", version = "*" },

# https://github.com/RustSec/advisory-db/pull/308
{ name = "traitobject", version = "*" },

# https://github.com/RustSec/advisory-db/issues/305
{ name = "rental", version = "*" },

# Appears to be moving towards integrating rio more tightly for io_uring support
{ name = "sled", version = "*" },
]

[sources]
unknown-registry = "deny"
unknown-git = "deny"
Expand Down
62 changes: 39 additions & 23 deletions src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,21 @@ use crate::constants::U16_CODE;
use crate::constants::U8_CODE;
use crate::constants::URI_KEY;
use crate::IntoWideIter;
use std::collections::HashMap;
use std::collections::HashSet;
use std::collections::hash_map::Entry as HashMapEntry;
use std::convert::TryInto;

#[cfg(not(feature = "rustc-hash"))]
type HashMap<K, V> = std::collections::HashMap<K, V>;

#[cfg(not(feature = "rustc-hash"))]
type HashSet<T> = std::collections::HashSet<T>;

#[cfg(feature = "rustc-hash")]
type HashMap<K, V> = rustc_hash::FxHashMap<K, V>;

#[cfg(feature = "rustc-hash")]
type HashSet<T> = rustc_hash::FxHashSet<T>;

/// The number of "base codes",
/// the default codes of all streams.
///
Expand Down Expand Up @@ -72,8 +83,8 @@ where
assert!(usize::from(bits_per_char) <= std::mem::size_of::<u16>() * 8);

CompressContext {
dictionary: HashMap::with_capacity(16),
dictionary_to_create: HashSet::with_capacity(16),
dictionary: HashMap::default(),
dictionary_to_create: HashSet::default(),

w_start_idx: 0,
w_end_idx: 0,
Expand Down Expand Up @@ -154,30 +165,35 @@ where
#[inline]
pub fn write_u16(&mut self, i: usize) {
let c = &self.input[i];
if !self.dictionary.contains_key(std::slice::from_ref(c)) {
self.dictionary.insert(
std::slice::from_ref(c),
(self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(),
);

let dictionary_len = self.dictionary.len();
if let HashMapEntry::Vacant(entry) = self.dictionary.entry(std::slice::from_ref(c)) {
entry.insert((dictionary_len + NUM_BASE_CODES).try_into().unwrap());
self.dictionary_to_create.insert(*c);
}

// wc = w + c.
let wc = &self.input[self.w_start_idx..self.w_end_idx + 1];
if self.dictionary.contains_key(wc) {
// w = wc.
self.w_end_idx += 1;
} else {
self.produce_w();
// Add wc to the dictionary.
self.dictionary.insert(
wc,
(self.dictionary.len() + NUM_BASE_CODES).try_into().unwrap(),
);

// w = c.
self.w_start_idx = i;
self.w_end_idx = i + 1;

let dictionary_len = self.dictionary.len();
match self.dictionary.entry(wc) {
HashMapEntry::Occupied(_entry) => {
// w = wc.
self.w_end_idx += 1;
}
HashMapEntry::Vacant(entry) => {
// Add wc to the dictionary.
entry.insert((dictionary_len + NUM_BASE_CODES).try_into().unwrap());

// Originally, this was before adding wc to the dict.
// However, we only use the dict for a lookup that will crash if it fails in produce_w.
// Therefore, moving it here should be fine.
self.produce_w();

// w = c.
self.w_start_idx = i;
self.w_end_idx = i + 1;
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ impl<'a> IntoWideIter for &'a [u16] {
}
}

// TODO: Remove this in the next version.
// We do not benefit from taking ownership of the buffer.
impl IntoWideIter for Vec<u16> {
type Iter = std::vec::IntoIter<u16>;

Expand Down

0 comments on commit 8959ebf

Please sign in to comment.