Skip to content

Commit

Permalink
Merge pull request #44 from EmbarkStudios/master
Browse files Browse the repository at this point in the history
Use zstd over gzip when not compiling for WASM
  • Loading branch information
jpeddicord authored Jul 1, 2019
2 parents 8f71892 + c8fe939 commit f6d98b6
Show file tree
Hide file tree
Showing 22 changed files with 345 additions and 252 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.env
*.bin
*.bin.zstd
*.bin.gz
/Cargo.lock
gh-pages/
Expand Down
12 changes: 7 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,22 @@ exclude = [

[dependencies]
failure = "0.1.5"
flate2 = { version = "1.0.7", features = ["rust_backend"], default_features = false }
lazy_static = "1.3.0"
log = "0.4.6"
regex = "1.1.2"
regex = "1.1.7"
rmp-serde = "0.13.7"
serde = "1.0.89"
serde_derive = "1.0.89"
serde = { version = "1.0.92", features = ["derive"] }
unicode-normalization = "0.1.8"

# spdx deps
serde_json = { version = "1.0.39", optional = true }

[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rayon = "1.0.3"
rayon = "1.1.0"
zstd = "0.4.24+zstd.1.4.0"

[target.'cfg(target_arch = "wasm32")'.dependencies]
zstd = { version = "0.4.24+zstd.1.4.0", default-features = false, features = ["wasm"] }

[dev-dependencies]
env_logger = "0.6.1"
Expand Down
127 changes: 78 additions & 49 deletions cli/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ include = [
"/src/**/*",
"/build.rs",
"/Cargo.*",
"/embedded-cache.bin.gz",
"/embedded-cache.bin.zstd",
]

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion cli/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::path::Path;

use askalono::Store;

const EMBEDDED_CACHE: &str = "embedded-cache.bin.gz";
const EMBEDDED_CACHE: &str = "embedded-cache.bin.zstd";

fn main() {
if env::var("CARGO_FEATURE_EMBEDDED_CACHE").is_err() {
Expand Down
1 change: 0 additions & 1 deletion cli/src/formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ impl<'a> FileResult<'a> {
}

fn as_json(&self) -> String {
use serde_json;
serde_json::to_string(self).expect("must produce valid json output")
}
}
Expand Down
6 changes: 3 additions & 3 deletions cli/src/identify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ pub fn identify_data(
score: cr.score,
license: CLIIdentifiedLicense {
aliases: store.aliases(&cr.license.name).unwrap().clone(),
name: cr.license.name.clone(),
name: cr.license.name.to_owned(),
kind: cr.license.kind,
},
line_range: cr.line_range,
Expand All @@ -126,7 +126,7 @@ pub fn identify_data(
if let Some(license) = result.license {
output.license = Some(CLIIdentifiedLicense {
aliases: store.aliases(&license.name).unwrap().clone(),
name: license.name,
name: license.name.to_owned(),
kind: license.kind,
});

Expand All @@ -138,7 +138,7 @@ pub fn identify_data(
}

// not a good enough match overall, but maybe inside
if output.containing.len() > 0 {
if !output.containing.is_empty() {
if want_diff {
diff_result(&text_data, &result.containing[0].license.data);
}
Expand Down
2 changes: 1 addition & 1 deletion cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn main() {

let cache_file: PathBuf = options
.cache
.unwrap_or_else(|| "./askalono-cache.bin.gz".into());
.unwrap_or_else(|| "./askalono-cache.bin.zstd".into());

let output_format = options.format.unwrap_or(OutputFormat::text);

Expand Down
4 changes: 2 additions & 2 deletions examples/annotate-text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ enum Annotation {
fn main() {
let args: Vec<_> = std::env::args().collect();
if args.len() != 2 {
eprintln!("usage: annotate-text cache.bin.gz < input.txt > output.html");
eprintln!("usage: annotate-text cache.bin.zstd < input.txt > output.html");
std::process::exit(1);
}

Expand All @@ -37,7 +37,7 @@ fn main() {
for result in &results.containing {
annotations.insert(
result.line_range.0,
Annotation::Begin(result.license.name.clone()),
Annotation::Begin(result.license.name.to_owned()),
);
annotations.insert(result.line_range.1, Annotation::End);
}
Expand Down
2 changes: 1 addition & 1 deletion src/license.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

use std::{collections::HashMap, fmt};

use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

use crate::{
ngram::NgramSet,
Expand Down
2 changes: 1 addition & 1 deletion src/ngram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{
collections::{hash_map::Iter, HashMap, VecDeque},
};

use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct NgramSet {
Expand Down
2 changes: 2 additions & 0 deletions src/preproc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ fn trim_line(input: &str) -> String {

// Aggressive preprocessors

#[allow(dead_code)]
fn lcs_substr(fstr: &str, sstr: &str) -> Option<String> {
let mut f_chars = fstr.chars();
let mut s_chars = sstr.chars();
Expand Down Expand Up @@ -161,6 +162,7 @@ fn lcs_substr(fstr: &str, sstr: &str) -> Option<String> {
}
}

#[allow(dead_code)]
fn remove_common_tokens(text: &str) -> String {
let lines: Vec<&str> = text.split('\n').collect();
let mut largest_substr = String::new();
Expand Down
83 changes: 40 additions & 43 deletions src/store/analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

use std::{cmp::Ordering, fmt};

use crate::{license::LicenseType, license::TextData, store::base::Store};
use crate::{
license::LicenseType,
license::TextData,
store::base::{LicenseEntry, Store},
};

/// Information about text that was compared against licenses in the store.
///
Expand All @@ -17,7 +21,7 @@ pub struct Match<'a> {
pub score: f32,
/// The name of the closest matching license in the `Store`. This will
/// always be something that exists in the store, regardless of the score.
pub name: String,
pub name: &'a str,
/// The type of the license that matched. Useful to know if the match was
/// the complete text, a header, or something else.
pub license_type: LicenseType,
Expand Down Expand Up @@ -59,46 +63,41 @@ impl<'a> fmt::Debug for Match<'a> {
}
}

// this could probably be a stand-alone closure, but I was hitting lifetime
// hell, so a macro it is. feel free to attempt it yourself.
// Expands to a fold closure that scores `$text` against a single store
// entry. For each `(name, data)` pair, the closure pushes one candidate
// match for the entry's original text, plus one per alternate form and
// one per header variant, then returns the grown accumulator.
macro_rules! analyze_fold_closure {
    ($text:ident) => {
        // acc: running list of candidates; (name, data): one license entry
        |mut acc: Vec<PartialMatch<'_>>, (name, data)| {
            // score against the canonical license text
            acc.push(PartialMatch {
                score: data.original.match_score($text),
                name,
                license_type: LicenseType::Original,
                data: &data.original,
            });
            // score against every alternate form of the license
            data.alternates.iter().for_each(|alt| {
                acc.push(PartialMatch {
                    score: alt.match_score($text),
                    name,
                    license_type: LicenseType::Alternate,
                    data: alt,
                })
            });
            // score against every header variant of the license
            data.headers.iter().for_each(|head| {
                acc.push(PartialMatch {
                    score: head.match_score($text),
                    name,
                    license_type: LicenseType::Header,
                    data: head,
                })
            });
            acc
        }
    };
}

impl Store {
/// Compare the given `TextData` against all licenses in the `Store`.
///
/// This parallelizes the search as much as it can to find the best match.
/// Once a match is obtained, it can be optimized further; see methods on
/// `TextData` for more information.
pub fn analyze(&self, text: &TextData) -> Match<'_> {
let mut res: Vec<PartialMatch<'_>>;
pub fn analyze<'a>(&'a self, text: &TextData) -> Match<'a> {
let mut res: Vec<PartialMatch<'a>>;

let analyze_fold =
|mut acc: Vec<PartialMatch<'a>>, (name, data): (&'a String, &'a LicenseEntry)| {
acc.push(PartialMatch {
score: data.original.match_score(text),
name,
license_type: LicenseType::Original,
data: &data.original,
});
data.alternates.iter().for_each(|alt| {
acc.push(PartialMatch {
score: alt.match_score(text),
name,
license_type: LicenseType::Alternate,
data: alt,
})
});
data.headers.iter().for_each(|head| {
acc.push(PartialMatch {
score: head.match_score(text),
name,
license_type: LicenseType::Header,
data: head,
})
});
acc
};

// parallel analysis
#[cfg(not(target_arch = "wasm32"))]
Expand All @@ -107,10 +106,10 @@ impl Store {
res = self
.licenses
.par_iter()
.fold(Vec::new, analyze_fold_closure!(text))
.fold(Vec::new, analyze_fold)
.reduce(
Vec::new,
|mut a: Vec<PartialMatch<'_>>, b: Vec<PartialMatch<'_>>| {
|mut a: Vec<PartialMatch<'a>>, b: Vec<PartialMatch<'a>>| {
a.extend(b);
a
},
Expand All @@ -125,17 +124,15 @@ impl Store {
.licenses
.iter()
// len of licenses isn't strictly correct, but it'll do
.fold(
Vec::with_capacity(self.licenses.len()),
analyze_fold_closure!(text),
);
.fold(Vec::with_capacity(self.licenses.len()), analyze_fold);
res.sort_unstable_by(|a, b| b.partial_cmp(a).unwrap());
}

let m = &res[0];

Match {
score: m.score,
name: m.name.to_string(),
name: m.name,
license_type: m.license_type,
data: m.data,
}
Expand Down
4 changes: 2 additions & 2 deletions src/store/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use std::collections::HashMap;

use failure::{format_err, Error};
use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

use crate::{license::LicenseType, license::TextData};

Expand All @@ -31,7 +31,7 @@ pub(crate) struct LicenseEntry {
/// use askalono::{Store, TextData};
///
/// # fn main() -> Result<(), Box<Error>> {
/// let store = Store::from_cache(File::open("askalono-cache.bin.gz")?)?;
/// let store = Store::from_cache(File::open("askalono-cache.bin.zstd")?)?;
/// let result = store.analyze(&TextData::from("what's this"));
/// # Ok(())
/// # }
Expand Down
44 changes: 20 additions & 24 deletions src/store/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

use std::{io::copy, io::prelude::*};

use failure::{bail, format_err, Error};
use flate2::{read::GzDecoder, Compression, GzBuilder};
use failure::Error;
use log::info;
use rmp_serde::Serializer;
use serde::Serialize;

use crate::store::base::Store;

const CACHE_VERSION: &[u8] = b"askalono-03";
const CACHE_VERSION: &[u8] = b"askalono-04";

impl Store {
/// Create a store from a cache file.
Expand All @@ -21,48 +20,45 @@ impl Store {
/// the full SPDX set from disk in 200-300 ms. The cache will be
/// sanity-checked to ensure it was generated with a similar version of
/// askalono.
pub fn from_cache<R>(readable: R) -> Result<Store, Error>
pub fn from_cache<R>(mut readable: R) -> Result<Store, Error>
where
R: Read + Sized,
{
use rmp_serde::decode::from_read;
let mut header = [0u8; 11];
readable.read_exact(&mut header)?;

let dec = GzDecoder::new(readable);
{
let extra = dec
.header()
.ok_or_else(|| format_err!("cache gzip header invalid"))?
.extra()
.ok_or_else(|| format_err!("cache gzip extra header missing"))?;
if extra != CACHE_VERSION {
bail!("cache version mismatch");
}
if header != CACHE_VERSION {
failure::bail!("cache version mismatch");
}

let store = from_read(dec)?;
let dec = zstd::Decoder::new(readable)?;
let store = rmp_serde::decode::from_read(dec)?;
Ok(store)
}

/// Serialize the current store.
///
/// The output will be a MessagePack'd gzip'd binary stream that should be
/// The output will be a MessagePack'd, zstd-compressed binary stream that should be
/// written to disk.
pub fn to_cache<W>(&self, mut writable: W) -> Result<(), Error>
where
W: Write + Sized,
{
let mut buf = Vec::new();
{
let buf = {
// This currently sits around 3.7MiB, so go up to 4 to fit comfortably
let mut buf = Vec::with_capacity(4 * 1024 * 1024);
let mut serializer = Serializer::new(&mut buf);
self.serialize(&mut serializer)?;
}
buf
};

info!("Pre-compressed output is {} bytes", buf.len());

let mut gz = GzBuilder::new()
.extra(CACHE_VERSION)
.write(&mut writable, Compression::best());
copy(&mut buf.as_slice(), &mut gz)?;
writable.write_all(CACHE_VERSION)?;
let mut zenc = zstd::Encoder::new(writable, 21)?;

copy(&mut buf.as_slice(), &mut zenc)?;
zenc.finish()?;

Ok(())
}
Expand Down
Loading

0 comments on commit f6d98b6

Please sign in to comment.