diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index a83300c24b..a13e90afda 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -2,6 +2,7 @@ //! Analytic table engine implementations +#![feature(option_get_or_insert_default)] mod compaction; mod context; mod engine; diff --git a/analytic_engine/src/sst/parquet/meta_data.rs b/analytic_engine/src/sst/parquet/meta_data.rs index f34f59b2b3..65ea5204a6 100644 --- a/analytic_engine/src/sst/parquet/meta_data.rs +++ b/analytic_engine/src/sst/parquet/meta_data.rs @@ -122,9 +122,7 @@ impl RowGroupFilterBuilder { } pub(crate) fn add_key(&mut self, col_idx: usize, key: &[u8]) { - if let Some(b) = self.builders[col_idx].as_mut() { - b.insert(key) - } + self.builders[col_idx].get_or_insert_default().insert(key) } pub(crate) fn build(self) -> Result { @@ -422,4 +420,22 @@ mod tests { let decoded_parquet_filter = ParquetFilter::try_from(parquet_filter_pb).unwrap(); assert_eq!(decoded_parquet_filter, parquet_filter); } + + #[test] + fn test_row_group_filter_builder() { + let mut builders = RowGroupFilterBuilder::with_num_columns(1); + for key in ["host-123", "host-456", "host-789"] { + builders.add_key(0, key.as_bytes()); + } + let row_group_filter = builders.build().unwrap(); + + let testcase = [("host-123", true), ("host-321", false)]; + for (key, expected) in testcase { + let actual = row_group_filter + .contains_column_data(0, key.as_bytes()) + .unwrap(); + + assert_eq!(expected, actual); + } + } } diff --git a/components/parquet_ext/src/prune/min_max.rs b/components/parquet_ext/src/prune/min_max.rs index df7838080a..6695e3ac05 100644 --- a/components/parquet_ext/src/prune/min_max.rs +++ b/components/parquet_ext/src/prune/min_max.rs @@ -243,43 +243,58 @@ mod test { .unwrap() } + fn int32_stat(min: i32, max: i32) -> Statistics { + Statistics::int32(Some(min), Some(max), None, 0, false) + } + + fn string_stat(min: &str, max: &str) -> Statistics { + Statistics::byte_array(Some(min.into()), Some(max.into()), None, 0, false) + } + #[test] fn test_row_group_filter() { let testcases = vec![ // (expr, min, max, schema, expected) ( col("a").eq(lit(5i64)), // a == 5 - 10, - 20, + int32_stat(10, 20), vec![("a", ArrowDataType::Int64)], vec![], ), ( col("a").eq(lit(14i64)), // a == 14 - 10, - 20, + int32_stat(10, 20), vec![("a", ArrowDataType::Int64)], vec![0], ), ( col("a").lt(col("b")), // a < b - 10, - 20, + int32_stat(10, 20), vec![("a", ArrowDataType::Int32), ("b", ArrowDataType::Int32)], // nothing actually gets calculated. vec![0], ), ( col("a").in_list(vec![lit(17i64), lit(100i64)], false), // a in (17, 100) - 101, - 200, + int32_stat(101, 200), vec![("a", ArrowDataType::Int64)], vec![], ), + ( + col("hostname").eq(lit("host-1794")), // hostname == host-1794 + string_stat("host-18000", "host-20000"), + vec![("hostname", ArrowDataType::Utf8)], + vec![], + ), + ( + col("hostname").eq(lit("host-1794")), // hostname == host-1794 + string_stat("host-1000", "host-20000"), + vec![("hostname", ArrowDataType::Utf8)], + vec![0], + ), ]; - for (expr, min, max, schema, expected) in testcases { - let stat = Statistics::int32(Some(min), Some(max), None, 0, false); + for (expr, stat, schema, expected) in testcases { let schema = prepare_arrow_schema(schema); let metadata = prepare_metadata(&schema, stat);