-
Notifications
You must be signed in to change notification settings - Fork 209
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add sst-metadata tool to query sst metadata (#927)
## Related Issues Closes # When debugging compaction-related issues, it's helpful to check SST metadata. ## Detailed Changes - Add new cli tool `sst-metadata`. ## Test Plan Manually ``` $ ./target/release/sst-metadata --dir ~/bench/data/store/2/2199023255817/ Location:2/2199023255817/51491.sst, time_range:[2022-09-05 10:00:00, 2022-09-05 12:00:00), max_seq:305309, size:440.000M, metadata:55.219M, kv:51.565M, filter:38.470M, row_num:14480000 Location:2/2199023255817/53873.sst, time_range:[2022-09-05 10:00:00, 2022-09-05 12:00:00), max_seq:319320, size:666.969M, metadata:83.759M, kv:78.198M, filter:58.342M, row_num:21960000 Location:2/2199023255817/51538.sst, time_range:[2022-09-05 10:00:00, 2022-09-05 12:00:00), max_seq:305001, size:1161.511M, metadata:145.723M, kv:136.025M, filter:101.489M, row_num:38200000 Location:2/2199023255817/53269.sst, time_range:[2022-09-05 10:00:00, 2022-09-05 12:00:00), max_seq:315057, size:1176.751M, metadata:147.631M, kv:137.805M, filter:102.817M, row_num:38700000 Location:2/2199023255817/53973.sst, time_range:[2022-09-05 10:00:00, 2022-09-05 12:00:00), max_seq:320198, size:1183.643M, metadata:148.508M, kv:138.623M, filter:103.428M, row_num:38930000 ```
- Loading branch information
1 parent
6303f4c
commit 7692507
Showing
11 changed files
with
216 additions
and
16 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0.

// Re-export `hex::encode` so crates in this workspace can hex-encode bytes
// without taking a direct dependency on the `hex` crate.
pub use hex::encode;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0. | ||
|
||
//! A cli to query sst meta data | ||
use std::sync::Arc; | ||
|
||
use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter}; | ||
use anyhow::{Context, Result}; | ||
use clap::Parser; | ||
use common_util::{ | ||
runtime::{self, Runtime}, | ||
time::format_as_ymdhms, | ||
}; | ||
use futures::StreamExt; | ||
use object_store::{LocalFileSystem, ObjectMeta, ObjectStoreRef, Path}; | ||
use parquet_ext::meta_data::fetch_parquet_metadata; | ||
use tokio::{runtime::Handle, task::JoinSet}; | ||
|
||
// Command-line arguments for the sst-metadata tool.
//
// NOTE: the `///` doc comments on the fields below are rendered by clap as
// the `--help` text, so they are user-visible strings and must not be
// reworded casually. A struct-level doc comment is deliberately avoided
// because `about` in the clap attribute would pick it up as help output.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// SST directory
    #[clap(short, long, required(true))]
    dir: String,

    /// Verbose print
    // Presence flag: `-v`/`--verbose` switches the per-file output from the
    // one-line summary to full debug dumps (see `run`).
    #[clap(short, long, required(false))]
    verbose: bool,

    /// Thread num, 0 means cpu num
    // 0 is a sentinel resolved in `main` to `num_cpus::get()`.
    #[clap(short, long, default_value_t = 0)]
    threads: usize,
}
|
||
fn new_runtime(thread_num: usize) -> Runtime { | ||
runtime::Builder::default() | ||
.thread_name("sst-metadata") | ||
.worker_threads(thread_num) | ||
.enable_all() | ||
.build() | ||
.unwrap() | ||
} | ||
|
||
fn main() { | ||
let args = Args::parse(); | ||
let thread_num = if args.threads == 0 { | ||
num_cpus::get() | ||
} else { | ||
args.threads | ||
}; | ||
let rt = Arc::new(new_runtime(thread_num)); | ||
rt.block_on(async move { | ||
if let Err(e) = run(args).await { | ||
eprintln!("Run failed, err:{e}"); | ||
} | ||
}); | ||
} | ||
|
||
/// List every object under `args.dir`, fetch and parse each file's parquet
/// metadata concurrently, then print a per-file summary sorted ascending by
/// the start of each SST's time range.
///
/// Returns the first error encountered while listing, joining tasks, or
/// parsing metadata.
async fn run(args: Args) -> Result<()> {
    let handle = Handle::current();
    let storage = LocalFileSystem::new_with_prefix(&args.dir)?;
    let storage: ObjectStoreRef = Arc::new(storage);

    // Spawn one metadata-parsing task per listed object; the tasks run
    // concurrently on the current runtime.
    let mut join_set = JoinSet::new();
    let mut ssts = storage.list(None).await?;
    while let Some(object_meta) = ssts.next().await {
        let object_meta = object_meta?;
        let storage = storage.clone();
        let location = object_meta.location.clone();
        join_set.spawn_on(
            async move {
                let (metadata, metadata_size, kv_size) =
                    parse_metadata(storage, location, object_meta.size).await?;
                Ok::<_, anyhow::Error>((object_meta, metadata, metadata_size, kv_size))
            },
            &handle,
        );
    }

    // Drain the join set; completion order is arbitrary, hence the explicit
    // sort below. The first `context` covers join/panic errors, the second
    // covers errors returned by `parse_metadata` itself.
    let mut metas = Vec::with_capacity(join_set.len());
    while let Some(meta) = join_set.join_next().await {
        let meta = meta.context("join err")?;
        let meta = meta.context("parse metadata err")?;
        metas.push(meta);
    }

    // sort by time_range asc
    metas.sort_by(|a, b| {
        a.1.custom()
            .time_range
            .inclusive_start()
            .cmp(&b.1.custom().time_range.inclusive_start())
    });

    for (object_meta, sst_metadata, metadata_size, kv_size) in metas {
        let ObjectMeta { location, size, .. } = &object_meta;
        let custom_meta = sst_metadata.custom();
        let parquet_meta = sst_metadata.parquet();
        let time_range = custom_meta.time_range;
        let start = format_as_ymdhms(time_range.inclusive_start().as_i64());
        let end = format_as_ymdhms(time_range.exclusive_end().as_i64());
        let seq = custom_meta.max_sequence;
        // The parquet filter is optional; report 0 bytes when absent.
        let filter_size = custom_meta
            .parquet_filter
            .as_ref()
            .map(|f| f.size())
            .unwrap_or(0);
        let file_metadata = parquet_meta.file_metadata();
        let row_num = file_metadata.num_rows();
        if args.verbose {
            // Verbose mode: dump raw debug representations instead of the
            // one-line summary.
            println!("object_meta:{object_meta:?}, parquet_meta:{parquet_meta:?}");
        } else {
            // Sizes are printed in mebibytes with three decimal places.
            let size_mb = as_mb(*size);
            let metadata_mb = as_mb(metadata_size);
            let filter_mb = as_mb(filter_size);
            let kv_mb = as_mb(kv_size);
            println!(
                "Location:{location}, time_range:[{start}, {end}), max_seq:{seq}, size:{size_mb:.3}M, metadata:{metadata_mb:.3}M, kv:{kv_mb:.3}M, filter:{filter_mb:.3}M, row_num:{row_num}"
            );
        }
    }

    Ok(())
}
|
||
/// Convert a byte count into mebibytes (MiB) as a floating-point value.
fn as_mb(v: usize) -> f64 {
    const BYTES_PER_MB: f64 = 1024.0 * 1024.0;
    v as f64 / BYTES_PER_MB
}
|
||
/// Fetch the parquet footer of the object at `path` (whose total file size is
/// `size`) and return a tuple of:
/// - the parsed SST `MetaData`,
/// - the byte size of the fetched parquet metadata,
/// - the total byte size of the key/value metadata entries
///   (key bytes plus value bytes; 0 when no entries are present).
async fn parse_metadata(
    storage: ObjectStoreRef,
    path: Path,
    size: usize,
) -> Result<(MetaData, usize, usize)> {
    let reader = ChunkReaderAdapter::new(&path, &storage);
    let (parquet_metadata, metadata_size) = fetch_parquet_metadata(size, &reader).await?;
    // Sum key + value byte lengths over all key/value metadata entries;
    // a missing value contributes 0.
    let kv_metadata = parquet_metadata.file_metadata().key_value_metadata();
    let kv_size = kv_metadata
        .map(|kvs| {
            kvs.iter()
                .map(|kv| kv.key.as_bytes().len() + kv.value.as_ref().map(|v| v.len()).unwrap_or(0))
                .sum()
        })
        .unwrap_or(0);

    let md = MetaData::try_new(&parquet_metadata, false)?;
    Ok((md, metadata_size, kv_size))
}