diff --git a/Cargo.lock b/Cargo.lock index e8714c13..f7207b6a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,3 +1,5 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. [[package]] name = "aho-corasick" version = "0.6.6" diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 64572a7b..557be0f7 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -47,6 +47,7 @@ stats options: This requires storing all CSV data in memory. --median Show the median. This requires storing all CSV data in memory. + --nullcount Show the number of NULLs. --nulls Include NULLs in the population size for computing mean and standard deviation. -j, --jobs The number of jobs to run in parallel. @@ -76,6 +77,7 @@ struct Args { flag_cardinality: bool, flag_median: bool, flag_nulls: bool, + flag_nullcount: bool, flag_jobs: usize, flag_output: Option, flag_no_headers: bool, @@ -209,6 +211,7 @@ impl Args { range: true, dist: true, cardinality: self.flag_cardinality || self.flag_everything, + nullcount: self.flag_nullcount || self.flag_everything, median: self.flag_median || self.flag_everything, mode: self.flag_mode || self.flag_everything, })).take(record_len).collect() @@ -223,6 +226,7 @@ impl Args { if self.flag_median || all { fields.push("median"); } if self.flag_mode || all { fields.push("mode"); } if self.flag_cardinality || all { fields.push("cardinality"); } + if self.flag_nullcount || all { fields.push("nullcount"); } csv::StringRecord::from(fields) } } @@ -234,6 +238,7 @@ struct WhichStats { range: bool, dist: bool, cardinality: bool, + nullcount: bool, median: bool, mode: bool, } @@ -252,6 +257,7 @@ struct Stats { online: Option, mode: Option>>, median: Option>, + nullcount: u64, which: WhichStats, } @@ -271,6 +277,7 @@ impl Stats { online: online, mode: mode, median: median, + nullcount: 0, which: which, } } @@ -283,6 +290,7 @@ impl Stats { self.sum.as_mut().map(|v| v.add(t, sample)); self.minmax.as_mut().map(|v| v.add(t, sample)); self.mode.as_mut().map(|v| v.add(sample.to_vec())); + if sample_type.is_null() { self.nullcount += 1; } match self.typ { TUnknown => {} TNull => { @@ -365,6 +373,9 @@ impl Stats { } } } + if self.which.nullcount { + pieces.push(self.nullcount.to_string()); + } csv::StringRecord::from(pieces) } } @@ -377,6 +388,7 @@ impl Commute for Stats { self.online.merge(other.online); self.mode.merge(other.mode); self.median.merge(other.median); + self.nullcount += other.nullcount; self.which.merge(other.which); } } diff --git a/tests/test_stats.rs b/tests/test_stats.rs index b63b396e..714eb2b1 100644 --- a/tests/test_stats.rs +++ b/tests/test_stats.rs @@ -90,6 +90,7 @@ fn setup(name: S, rows: &[&str], headers: bool, fn get_field_value(wrk: &Workdir, cmd: &mut process::Command, field: &str) -> String { + if field == "nullcount" { cmd.arg("--nullcount"); } if field == "median" { cmd.arg("--median"); } if field == "cardinality" { cmd.arg("--cardinality"); } if field == "mode" { cmd.arg("--mode"); } @@ -181,6 +182,11 @@ stats_tests!(stats_median_even_null, "median", &["", "1", "2", "3", "4"], "2.5"); stats_tests!(stats_median_mix, "median", &["1", "2.5", "3"], "2.5"); +stats_tests!(stats_nullcount, "nullcount", &["", "1", "2"], "1"); +stats_tests!(stats_nullcount_none, "nullcount", &["a", "1", "2"], "0"); +stats_tests!(stats_nullcount_spacenotnull, "nullcount", &[" ", "1", "2"], "0"); +stats_tests!(stats_nullcount_all, "nullcount", &["", "", ""], "3"); + mod stats_infer_nothing { // Only test CSV data with headers. // Empty CSV data with no headers won't produce any statistical analysis.