From a352cf6ae1a2323a011a387080b8648b93e44309 Mon Sep 17 00:00:00 2001 From: zouxiang Date: Fri, 23 Jun 2023 17:48:28 +0800 Subject: [PATCH 1/4] feat: add --stats in sst-metadat tool to query sst file & field statistics. --- tools/src/bin/sst-metadata.rs | 80 ++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/tools/src/bin/sst-metadata.rs b/tools/src/bin/sst-metadata.rs index 16ae26ab94..9c0807220d 100644 --- a/tools/src/bin/sst-metadata.rs +++ b/tools/src/bin/sst-metadata.rs @@ -2,7 +2,7 @@ //! A cli to query sst meta data -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter}; use anyhow::{Context, Result}; @@ -27,6 +27,10 @@ struct Args { #[clap(short, long, required(false))] verbose: bool, + /// File & Field Statistics print + #[clap(short, long, required(false))] + stats: bool, + /// Thread num, 0 means cpu num #[clap(short, long, default_value_t = 0)] threads: usize, @@ -36,6 +40,34 @@ struct Args { page_indexes: bool, } +#[derive(Default, Debug)] +struct FileStatistics { + file_count: u64, + size: usize, + metadata_size: usize, + kv_size: usize, + filter_size: usize, + row_num: i64, +} + +impl ToString for FileStatistics { + fn to_string(&self) -> String { + format!("FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2}, \n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}", + self.file_count, + as_mb(self.size), + as_mb(self.metadata_size), + as_mb(self.kv_size), + as_mb(self.filter_size), + self.row_num) + } +} + +#[derive(Default, Debug)] +struct FieldStatistics { + compressed_size: i64, + uncompressed_size: i64, +} + fn new_runtime(thread_num: usize) -> Runtime { runtime::Builder::default() .thread_name("sst-metadata") @@ -91,6 +123,52 @@ async fn run(args: Args) -> Result<()> { metas.push(meta); } + if args.stats { + let mut file_stats = FileStatistics::default(); + let mut field_stats_map = HashMap::new(); + for (object_meta, sst_metadata, metadata_size, kv_size) in metas { + let parquet_meta = sst_metadata.parquet(); + + file_stats.file_count += 1; + file_stats.size += object_meta.size; + file_stats.metadata_size += metadata_size; + file_stats.kv_size += kv_size; + let filter_size = sst_metadata + .custom() + .parquet_filter + .as_ref() + .map(|f| f.size()) + .unwrap_or(0); + file_stats.filter_size += filter_size; + file_stats.row_num += parquet_meta.file_metadata().num_rows(); + + let fields = parquet_meta.file_metadata().schema().get_fields(); + for (_, row_group) in parquet_meta.row_groups().iter().enumerate() { + for i in 0..fields.len() { + let column_meta = row_group.column(i); + let field_name = fields.get(i).unwrap().get_basic_info().name().to_string(); + if !field_stats_map.contains_key(&field_name) { + field_stats_map.insert(field_name.clone(), FieldStatistics::default()); + } + let field_stats = field_stats_map.get_mut(&field_name).unwrap(); + field_stats.compressed_size += column_meta.compressed_size(); + field_stats.uncompressed_size += column_meta.uncompressed_size(); + } + } + } + println!("{}", file_stats.to_string()); + + println!("FieldStatistics: "); + for (k, v) in field_stats_map.iter() { + println!("{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", + k, + as_mb(v.compressed_size as usize), + as_mb(v.uncompressed_size as usize), + v.uncompressed_size as f64 / v.compressed_size as f64); + } + return Ok(()); + } + // sort by time_range asc metas.sort_by(|a, b| { a.1.custom() From fc58e30daa68b1013f932174e14990b97ecf99c6 Mon Sep 17 00:00:00 2001 From: zouxiang Date: Sun, 25 Jun 2023 16:34:11 +0800 Subject: [PATCH 2/4] remove --stats --- tools/src/bin/sst-metadata.rs | 83 ++++++++++++++--------------------- 1 file changed, 33 insertions(+), 50 deletions(-) diff --git a/tools/src/bin/sst-metadata.rs b/tools/src/bin/sst-metadata.rs index 9c0807220d..dc7dda337a 100644 --- a/tools/src/bin/sst-metadata.rs +++ b/tools/src/bin/sst-metadata.rs @@ -27,10 +27,6 @@ struct Args { #[clap(short, long, required(false))] verbose: bool, - /// File & Field Statistics print - #[clap(short, long, required(false))] - stats: bool, - /// Thread num, 0 means cpu num #[clap(short, long, default_value_t = 0)] threads: usize, @@ -123,52 +119,6 @@ async fn run(args: Args) -> Result<()> { metas.push(meta); } - if args.stats { - let mut file_stats = FileStatistics::default(); - let mut field_stats_map = HashMap::new(); - for (object_meta, sst_metadata, metadata_size, kv_size) in metas { - let parquet_meta = sst_metadata.parquet(); - - file_stats.file_count += 1; - file_stats.size += object_meta.size; - file_stats.metadata_size += metadata_size; - file_stats.kv_size += kv_size; - let filter_size = sst_metadata - .custom() - .parquet_filter - .as_ref() - .map(|f| f.size()) - .unwrap_or(0); - file_stats.filter_size += filter_size; - file_stats.row_num += parquet_meta.file_metadata().num_rows(); - - let fields = parquet_meta.file_metadata().schema().get_fields(); - for (_, row_group) in parquet_meta.row_groups().iter().enumerate() { - for i in 0..fields.len() { - let column_meta = row_group.column(i); - let field_name = fields.get(i).unwrap().get_basic_info().name().to_string(); - if !field_stats_map.contains_key(&field_name) { - field_stats_map.insert(field_name.clone(), FieldStatistics::default()); - } - let field_stats = field_stats_map.get_mut(&field_name).unwrap(); - field_stats.compressed_size += column_meta.compressed_size(); - field_stats.uncompressed_size += column_meta.uncompressed_size(); - } - } - } - println!("{}", file_stats.to_string()); - - println!("FieldStatistics: "); - for (k, v) in field_stats_map.iter() { - println!("{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", - k, - as_mb(v.compressed_size as usize), - as_mb(v.uncompressed_size as usize), - v.uncompressed_size as f64 / v.compressed_size as f64); - } - return Ok(()); - } - // sort by time_range asc metas.sort_by(|a, b| { a.1.custom() @@ -177,6 +127,8 @@ async fn run(args: Args) -> Result<()> { .cmp(&b.1.custom().time_range.inclusive_start()) }); + let mut file_stats = FileStatistics::default(); + let mut field_stats_map = HashMap::new(); for (object_meta, sst_metadata, metadata_size, kv_size) in metas { let ObjectMeta { location, size, .. } = &object_meta; let custom_meta = sst_metadata.custom(); @@ -192,6 +144,28 @@ async fn run(args: Args) -> Result<()> { .unwrap_or(0); let file_metadata = parquet_meta.file_metadata(); let row_num = file_metadata.num_rows(); + + file_stats.file_count += 1; + file_stats.size += object_meta.size; + file_stats.metadata_size += metadata_size; + file_stats.kv_size += kv_size; + file_stats.filter_size += filter_size; + file_stats.row_num += row_num; + + let fields = file_metadata.schema().get_fields(); + for (_, row_group) in parquet_meta.row_groups().iter().enumerate() { + for i in 0..fields.len() { + let column_meta = row_group.column(i); + let field_name = fields.get(i).unwrap().get_basic_info().name().to_string(); + if !field_stats_map.contains_key(&field_name) { + field_stats_map.insert(field_name.clone(), FieldStatistics::default()); + } + let field_stats = field_stats_map.get_mut(&field_name).unwrap(); + field_stats.compressed_size += column_meta.compressed_size(); + field_stats.uncompressed_size += column_meta.uncompressed_size(); + } + } + if verbose { println!("object_meta:{object_meta:?}, parquet_meta:{parquet_meta:?}, custom_meta:{custom_meta:?}"); } else { @@ -205,6 +179,15 @@ async fn run(args: Args) -> Result<()> { } } + println!("{}", file_stats.to_string()); + println!("FieldStatistics: "); + for (k, v) in field_stats_map.iter() { + println!("{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", + k, + as_mb(v.compressed_size as usize), + as_mb(v.uncompressed_size as usize), + v.uncompressed_size as f64 / v.compressed_size as f64); + } Ok(()) } From 5c32d25642743b517b282216cf090a1fbe6d8811 Mon Sep 17 00:00:00 2001 From: zouxiang Date: Sun, 25 Jun 2023 17:06:45 +0800 Subject: [PATCH 3/4] fix: style check --- tools/src/bin/sst-metadata.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/src/bin/sst-metadata.rs b/tools/src/bin/sst-metadata.rs index dc7dda337a..bacfe180c6 100644 --- a/tools/src/bin/sst-metadata.rs +++ b/tools/src/bin/sst-metadata.rs @@ -182,11 +182,13 @@ async fn run(args: Args) -> Result<()> { println!("{}", file_stats.to_string()); println!("FieldStatistics: "); for (k, v) in field_stats_map.iter() { - println!("{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", - k, - as_mb(v.compressed_size as usize), - as_mb(v.uncompressed_size as usize), - v.uncompressed_size as f64 / v.compressed_size as f64); + println!( + "{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", + k, + as_mb(v.compressed_size as usize), + as_mb(v.uncompressed_size as usize), + v.uncompressed_size as f64 / v.compressed_size as f64 + ); } Ok(()) } From 0a6471eea7d25ed883ab672a275b9f75f9b81597 Mon Sep 17 00:00:00 2001 From: zouxiang Date: Sun, 25 Jun 2023 17:47:25 +0800 Subject: [PATCH 4/4] fix: simplify usage of map --- tools/src/bin/sst-metadata.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/src/bin/sst-metadata.rs b/tools/src/bin/sst-metadata.rs index bacfe180c6..a089ad2da5 100644 --- a/tools/src/bin/sst-metadata.rs +++ b/tools/src/bin/sst-metadata.rs @@ -157,10 +157,9 @@ async fn run(args: Args) -> Result<()> { for i in 0..fields.len() { let column_meta = row_group.column(i); let field_name = fields.get(i).unwrap().get_basic_info().name().to_string(); - if !field_stats_map.contains_key(&field_name) { - field_stats_map.insert(field_name.clone(), FieldStatistics::default()); - } - let field_stats = field_stats_map.get_mut(&field_name).unwrap(); + let mut field_stats = field_stats_map + .entry(field_name) + .or_insert(FieldStatistics::default()); field_stats.compressed_size += column_meta.compressed_size(); field_stats.uncompressed_size += column_meta.uncompressed_size(); }