Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

parquet: Add row_groups_matched_{statistics,bloom_filter} statistics #9640

Merged
merged 2 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions datafusion/core/src/datasource/physical_plan/parquet/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,12 @@ use crate::physical_plan::metrics::{
pub struct ParquetFileMetrics {
/// Number of times the predicate could not be evaluated
pub predicate_evaluation_errors: Count,
/// Number of row groups whose bloom filters were checked and matched
pub row_groups_matched_bloom_filter: Count,
/// Number of row groups pruned by bloom filters
pub row_groups_pruned_bloom_filter: Count,
/// Number of row groups whose statistics were checked and matched
pub row_groups_matched_statistics: Count,
/// Number of row groups pruned by statistics
pub row_groups_pruned_statistics: Count,
/// Total number of bytes scanned
Expand All @@ -56,10 +60,18 @@ impl ParquetFileMetrics {
.with_new_label("filename", filename.to_string())
.counter("predicate_evaluation_errors", partition);

let row_groups_matched_bloom_filter = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_matched_bloom_filter", partition);

let row_groups_pruned_bloom_filter = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_pruned_bloom_filter", partition);

let row_groups_matched_statistics = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_matched_statistics", partition);

let row_groups_pruned_statistics = MetricBuilder::new(metrics)
.with_new_label("filename", filename.to_string())
.counter("row_groups_pruned_statistics", partition);
Expand All @@ -85,7 +97,9 @@ impl ParquetFileMetrics {

Self {
predicate_evaluation_errors,
row_groups_matched_bloom_filter,
row_groups_pruned_bloom_filter,
row_groups_matched_statistics,
row_groups_pruned_statistics,
bytes_scanned,
pushdown_rows_filtered,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ pub(crate) fn prune_row_groups_by_statistics(
metrics.predicate_evaluation_errors.add(1);
}
}
metrics.row_groups_matched_statistics.add(1);
}

filtered.push(idx)
Expand Down Expand Up @@ -166,6 +167,9 @@ pub(crate) async fn prune_row_groups_by_bloom_filters<
if prune_group {
metrics.row_groups_pruned_bloom_filter.add(1);
} else {
if !stats.column_sbbf.is_empty() {
metrics.row_groups_matched_bloom_filter.add(1);
}
filtered.push(*idx);
}
}
Expand Down
17 changes: 17 additions & 0 deletions datafusion/core/tests/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,16 +117,33 @@ impl TestOutput {
self.metric_value("predicate_evaluation_errors")
}

/// Value of the `row_groups_matched_bloom_filter` metric: the number of row
/// groups whose bloom filters were checked and matched (i.e. were kept
/// rather than pruned).
///
/// The result is whatever `metric_value` reports for that name; presumably
/// `None` when the metric was never recorded (confirm against `metric_value`).
fn row_groups_matched_bloom_filter(&self) -> Option<usize> {
self.metric_value("row_groups_matched_bloom_filter")
}

/// Value of the `row_groups_pruned_bloom_filter` metric: the number of row
/// groups that were pruned by bloom filters.
///
/// The result is whatever `metric_value` reports for that name; presumably
/// `None` when the metric was never recorded (confirm against `metric_value`).
fn row_groups_pruned_bloom_filter(&self) -> Option<usize> {
self.metric_value("row_groups_pruned_bloom_filter")
}

/// Value of the `row_groups_matched_statistics` metric: the number of row
/// groups whose statistics were checked and matched (i.e. were kept rather
/// than pruned).
///
/// The result is whatever `metric_value` reports for that name; presumably
/// `None` when the metric was never recorded (confirm against `metric_value`).
fn row_groups_matched_statistics(&self) -> Option<usize> {
self.metric_value("row_groups_matched_statistics")
}

/// Value of the `row_groups_pruned_statistics` metric: the number of row
/// groups that were pruned by statistics.
///
/// The result is whatever `metric_value` reports for that name; presumably
/// `None` when the metric was never recorded (confirm against `metric_value`).
fn row_groups_pruned_statistics(&self) -> Option<usize> {
self.metric_value("row_groups_pruned_statistics")
}

/// Total number of row groups reported as matched, obtained by adding the
/// bloom filter "matched" counter to the statistics "matched" counter.
///
/// Yields `None` unless both underlying metrics are available.
fn row_groups_matched(&self) -> Option<usize> {
    // Read both counters up front so each accessor is always invoked,
    // regardless of whether the other one produced a value.
    let by_bloom_filter = self.row_groups_matched_bloom_filter();
    let by_statistics = self.row_groups_matched_statistics();

    match (by_bloom_filter, by_statistics) {
        (Some(bloom), Some(stats)) => Some(bloom + stats),
        _ => None,
    }
}

/// The number of row_groups pruned
fn row_groups_pruned(&self) -> Option<usize> {
self.row_groups_pruned_bloom_filter()
Expand Down
Loading
Loading