Skip to content

Commit

Permalink
Merge pull request #44 from EmbarkStudios/master
Browse files Browse the repository at this point in the history
Use zstd over gzip when not compiling for WASM
  • Loading branch information
jpeddicord authored Jul 1, 2019
2 parents 8f71892 + c8fe939 commit f6d98b6
Show file tree
Hide file tree
Showing 22 changed files with 345 additions and 252 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.env
*.bin
*.bin.zstd
*.bin.gz
/Cargo.lock
gh-pages/
Expand Down
12 changes: 7 additions & 5 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,22 @@ exclude = [

[dependencies]
failure = "0.1.5"
flate2 = { version = "1.0.7", features = ["rust_backend"], default_features = false }
lazy_static = "1.3.0"
log = "0.4.6"
regex = "1.1.2"
regex = "1.1.7"
rmp-serde = "0.13.7"
serde = "1.0.89"
serde_derive = "1.0.89"
serde = { version = "1.0.92", features = ["derive"] }
unicode-normalization = "0.1.8"

# spdx deps
serde_json = { version = "1.0.39", optional = true }

[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
rayon = "1.0.3"
rayon = "1.1.0"
zstd = "0.4.24+zstd.1.4.0"

[target.'cfg(target_arch = "wasm32")'.dependencies]
zstd = { version = "0.4.24+zstd.1.4.0", default-features = false, features = ["wasm"] }

[dev-dependencies]
env_logger = "0.6.1"
Expand Down
127 changes: 78 additions & 49 deletions cli/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ include = [
"/src/**/*",
"/build.rs",
"/Cargo.*",
"/embedded-cache.bin.gz",
"/embedded-cache.bin.zstd",
]

[dependencies]
Expand Down
2 changes: 1 addition & 1 deletion cli/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::path::Path;

use askalono::Store;

const EMBEDDED_CACHE: &str = "embedded-cache.bin.gz";
const EMBEDDED_CACHE: &str = "embedded-cache.bin.zstd";

fn main() {
if env::var("CARGO_FEATURE_EMBEDDED_CACHE").is_err() {
Expand Down
1 change: 0 additions & 1 deletion cli/src/formats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ impl<'a> FileResult<'a> {
}

fn as_json(&self) -> String {
use serde_json;
serde_json::to_string(self).expect("must produce valid json output")
}
}
Expand Down
6 changes: 3 additions & 3 deletions cli/src/identify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ pub fn identify_data(
score: cr.score,
license: CLIIdentifiedLicense {
aliases: store.aliases(&cr.license.name).unwrap().clone(),
name: cr.license.name.clone(),
name: cr.license.name.to_owned(),
kind: cr.license.kind,
},
line_range: cr.line_range,
Expand All @@ -126,7 +126,7 @@ pub fn identify_data(
if let Some(license) = result.license {
output.license = Some(CLIIdentifiedLicense {
aliases: store.aliases(&license.name).unwrap().clone(),
name: license.name,
name: license.name.to_owned(),
kind: license.kind,
});

Expand All @@ -138,7 +138,7 @@ pub fn identify_data(
}

// not a good enough match overall, but maybe inside
if output.containing.len() > 0 {
if !output.containing.is_empty() {
if want_diff {
diff_result(&text_data, &result.containing[0].license.data);
}
Expand Down
2 changes: 1 addition & 1 deletion cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn main() {

let cache_file: PathBuf = options
.cache
.unwrap_or_else(|| "./askalono-cache.bin.gz".into());
.unwrap_or_else(|| "./askalono-cache.bin.zstd".into());

let output_format = options.format.unwrap_or(OutputFormat::text);

Expand Down
4 changes: 2 additions & 2 deletions examples/annotate-text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ enum Annotation {
fn main() {
let args: Vec<_> = std::env::args().collect();
if args.len() != 2 {
eprintln!("usage: annotate-text cache.bin.gz < input.txt > output.html");
eprintln!("usage: annotate-text cache.bin.zstd < input.txt > output.html");
std::process::exit(1);
}

Expand All @@ -37,7 +37,7 @@ fn main() {
for result in &results.containing {
annotations.insert(
result.line_range.0,
Annotation::Begin(result.license.name.clone()),
Annotation::Begin(result.license.name.to_owned()),
);
annotations.insert(result.line_range.1, Annotation::End);
}
Expand Down
2 changes: 1 addition & 1 deletion src/license.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

use std::{collections::HashMap, fmt};

use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

use crate::{
ngram::NgramSet,
Expand Down
2 changes: 1 addition & 1 deletion src/ngram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::{
collections::{hash_map::Iter, HashMap, VecDeque},
};

use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

#[derive(PartialEq, Clone, Debug, Serialize, Deserialize)]
pub struct NgramSet {
Expand Down
2 changes: 2 additions & 0 deletions src/preproc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ fn trim_line(input: &str) -> String {

// Aggressive preprocessors

#[allow(dead_code)]
fn lcs_substr(fstr: &str, sstr: &str) -> Option<String> {
let mut f_chars = fstr.chars();
let mut s_chars = sstr.chars();
Expand Down Expand Up @@ -161,6 +162,7 @@ fn lcs_substr(fstr: &str, sstr: &str) -> Option<String> {
}
}

#[allow(dead_code)]
fn remove_common_tokens(text: &str) -> String {
let lines: Vec<&str> = text.split('\n').collect();
let mut largest_substr = String::new();
Expand Down
83 changes: 40 additions & 43 deletions src/store/analyze.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

use std::{cmp::Ordering, fmt};

use crate::{license::LicenseType, license::TextData, store::base::Store};
use crate::{
license::LicenseType,
license::TextData,
store::base::{LicenseEntry, Store},
};

/// Information about text that was compared against licenses in the store.
///
Expand All @@ -17,7 +21,7 @@ pub struct Match<'a> {
pub score: f32,
/// The name of the closest matching license in the `Store`. This will
/// always be something that exists in the store, regardless of the score.
pub name: String,
pub name: &'a str,
/// The type of the license that matched. Useful to know if the match was
/// the complete text, a header, or something else.
pub license_type: LicenseType,
Expand Down Expand Up @@ -59,46 +63,41 @@ impl<'a> fmt::Debug for Match<'a> {
}
}

// this could probably be a stand-alone closure, but I was hitting lifetime
// hell, so a macro it is. feel free to attempt it yourself.
// Expands to a fold closure that scores `$text` against a single store
// entry. For each `(name, data)` pair, the closure pushes one candidate
// match for the entry's original text, plus one per alternate form and
// one per header variant, then returns the grown accumulator.
macro_rules! analyze_fold_closure {
    ($text:ident) => {
        // acc: running list of candidates; (name, data): one license entry
        |mut acc: Vec<PartialMatch<'_>>, (name, data)| {
            // score against the canonical license text
            acc.push(PartialMatch {
                score: data.original.match_score($text),
                name,
                license_type: LicenseType::Original,
                data: &data.original,
            });
            // score against every alternate form of the license
            data.alternates.iter().for_each(|alt| {
                acc.push(PartialMatch {
                    score: alt.match_score($text),
                    name,
                    license_type: LicenseType::Alternate,
                    data: alt,
                })
            });
            // score against every header variant of the license
            data.headers.iter().for_each(|head| {
                acc.push(PartialMatch {
                    score: head.match_score($text),
                    name,
                    license_type: LicenseType::Header,
                    data: head,
                })
            });
            acc
        }
    };
}

impl Store {
/// Compare the given `TextData` against all licenses in the `Store`.
///
/// This parallelizes the search as much as it can to find the best match.
/// Once a match is obtained, it can be optimized further; see methods on
/// `TextData` for more information.
pub fn analyze(&self, text: &TextData) -> Match<'_> {
let mut res: Vec<PartialMatch<'_>>;
pub fn analyze<'a>(&'a self, text: &TextData) -> Match<'a> {
let mut res: Vec<PartialMatch<'a>>;

let analyze_fold =
|mut acc: Vec<PartialMatch<'a>>, (name, data): (&'a String, &'a LicenseEntry)| {
acc.push(PartialMatch {
score: data.original.match_score(text),
name,
license_type: LicenseType::Original,
data: &data.original,
});
data.alternates.iter().for_each(|alt| {
acc.push(PartialMatch {
score: alt.match_score(text),
name,
license_type: LicenseType::Alternate,
data: alt,
})
});
data.headers.iter().for_each(|head| {
acc.push(PartialMatch {
score: head.match_score(text),
name,
license_type: LicenseType::Header,
data: head,
})
});
acc
};

// parallel analysis
#[cfg(not(target_arch = "wasm32"))]
Expand All @@ -107,10 +106,10 @@ impl Store {
res = self
.licenses
.par_iter()
.fold(Vec::new, analyze_fold_closure!(text))
.fold(Vec::new, analyze_fold)
.reduce(
Vec::new,
|mut a: Vec<PartialMatch<'_>>, b: Vec<PartialMatch<'_>>| {
|mut a: Vec<PartialMatch<'a>>, b: Vec<PartialMatch<'a>>| {
a.extend(b);
a
},
Expand All @@ -125,17 +124,15 @@ impl Store {
.licenses
.iter()
// len of licenses isn't strictly correct, but it'll do
.fold(
Vec::with_capacity(self.licenses.len()),
analyze_fold_closure!(text),
);
.fold(Vec::with_capacity(self.licenses.len()), analyze_fold);
res.sort_unstable_by(|a, b| b.partial_cmp(a).unwrap());
}

let m = &res[0];

Match {
score: m.score,
name: m.name.to_string(),
name: m.name,
license_type: m.license_type,
data: m.data,
}
Expand Down
4 changes: 2 additions & 2 deletions src/store/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use std::collections::HashMap;

use failure::{format_err, Error};
use serde_derive::{Deserialize, Serialize};
use serde::{Deserialize, Serialize};

use crate::{license::LicenseType, license::TextData};

Expand All @@ -31,7 +31,7 @@ pub(crate) struct LicenseEntry {
/// use askalono::{Store, TextData};
///
/// # fn main() -> Result<(), Box<Error>> {
/// let store = Store::from_cache(File::open("askalono-cache.bin.gz")?)?;
/// let store = Store::from_cache(File::open("askalono-cache.bin.zstd")?)?;
/// let result = store.analyze(&TextData::from("what's this"));
/// # Ok(())
/// # }
Expand Down
44 changes: 20 additions & 24 deletions src/store/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

use std::{io::copy, io::prelude::*};

use failure::{bail, format_err, Error};
use flate2::{read::GzDecoder, Compression, GzBuilder};
use failure::Error;
use log::info;
use rmp_serde::Serializer;
use serde::Serialize;

use crate::store::base::Store;

const CACHE_VERSION: &[u8] = b"askalono-03";
const CACHE_VERSION: &[u8] = b"askalono-04";

impl Store {
/// Create a store from a cache file.
Expand All @@ -21,48 +20,45 @@ impl Store {
/// the full SPDX set from disk in 200-300 ms. The cache will be
/// sanity-checked to ensure it was generated with a similar version of
/// askalono.
pub fn from_cache<R>(readable: R) -> Result<Store, Error>
pub fn from_cache<R>(mut readable: R) -> Result<Store, Error>
where
R: Read + Sized,
{
use rmp_serde::decode::from_read;
let mut header = [0u8; 11];
readable.read_exact(&mut header)?;

let dec = GzDecoder::new(readable);
{
let extra = dec
.header()
.ok_or_else(|| format_err!("cache gzip header invalid"))?
.extra()
.ok_or_else(|| format_err!("cache gzip extra header missing"))?;
if extra != CACHE_VERSION {
bail!("cache version mismatch");
}
if header != CACHE_VERSION {
failure::bail!("cache version mismatch");
}

let store = from_read(dec)?;
let dec = zstd::Decoder::new(readable)?;
let store = rmp_serde::decode::from_read(dec)?;
Ok(store)
}

/// Serialize the current store.
///
/// The output will be a MessagePack'd gzip'd binary stream that should be
/// The output will be a MessagePack'd, zstd-compressed binary stream that should be
/// written to disk.
pub fn to_cache<W>(&self, mut writable: W) -> Result<(), Error>
where
W: Write + Sized,
{
let mut buf = Vec::new();
{
let buf = {
// This currently sits around 3.7MiB, so go up to 4 to fit comfortably
let mut buf = Vec::with_capacity(4 * 1024 * 1024);
let mut serializer = Serializer::new(&mut buf);
self.serialize(&mut serializer)?;
}
buf
};

info!("Pre-compressed output is {} bytes", buf.len());

let mut gz = GzBuilder::new()
.extra(CACHE_VERSION)
.write(&mut writable, Compression::best());
copy(&mut buf.as_slice(), &mut gz)?;
writable.write_all(CACHE_VERSION)?;
let mut zenc = zstd::Encoder::new(writable, 21)?;

copy(&mut buf.as_slice(), &mut zenc)?;
zenc.finish()?;

Ok(())
}
Expand Down
Loading

0 comments on commit f6d98b6

Please sign in to comment.