apache
diff --git a/‎datafusion-examples/examples/json_shredding.rs‎
Lines changed: 1 addition & 1 deletion b/‎datafusion-examples/examples/json_shredding.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎datafusion/core/src/datasource/physical_plan/parquet.rs‎
Lines changed: 37 additions & 10 deletions b/‎datafusion/core/src/datasource/physical_plan/parquet.rs‎
Lines changed: 37 additions & 10 deletions
diff --git a/‎datafusion/core/tests/parquet/external_access_plan.rs‎
Lines changed: 16 additions & 7 deletions b/‎datafusion/core/tests/parquet/external_access_plan.rs‎
Lines changed: 16 additions & 7 deletions
diff --git a/‎datafusion/core/tests/parquet/filter_pushdown.rs‎
Lines changed: 23 additions & 8 deletions b/‎datafusion/core/tests/parquet/filter_pushdown.rs‎
Lines changed: 23 additions & 8 deletions
diff --git a/‎datafusion/core/tests/parquet/mod.rs‎
Lines changed: 54 additions & 36 deletions b/‎datafusion/core/tests/parquet/mod.rs‎
Lines changed: 54 additions & 36 deletions
@@ -142,7 +142,7 @@ async fn main() -> Result<()> {
         .await?;
     let plan = format!("{}", arrow::util::pretty::pretty_format_batches(&batches)?);
     println!("{plan}");
-    assert_contains!(&plan, "row_groups_pruned_statistics=1");
+    assert_contains!(&plan, "row_groups_pruned_statistics=2 total → 1 matched");
     assert_contains!(&plan, "pushdown_rows_pruned=1");
 
     Ok(())
 
@@ -65,7 +65,7 @@ mod tests {
     use datafusion_physical_plan::analyze::AnalyzeExec;
     use datafusion_physical_plan::collect;
     use datafusion_physical_plan::metrics::{
-        ExecutionPlanMetricsSet, MetricType, MetricsSet,
+        ExecutionPlanMetricsSet, MetricType, MetricValue, MetricsSet,
     };
     use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 
@@ -1175,8 +1175,10 @@ mod tests {
         // There are 4 rows pruned in each of batch2, batch3, and
         // batch4 for a total of 12. batch1 had no pruning as c2 was
         // filled in as null
-        assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 12);
-        assert_eq!(get_value(&metrics, "page_index_rows_matched"), 6);
+        let (page_index_pruned, page_index_matched) =
+            get_pruning_metric(&metrics, "page_index_rows_pruned");
+        assert_eq!(page_index_pruned, 12);
+        assert_eq!(page_index_matched, 6);
     }
 
     #[tokio::test]
@@ -1776,8 +1778,10 @@ mod tests {
             | 5   |
             +-----+
         "###);
-        assert_eq!(get_value(&metrics, "page_index_rows_pruned"), 4);
-        assert_eq!(get_value(&metrics, "page_index_rows_matched"), 2);
+        let (page_index_pruned, page_index_matched) =
+            get_pruning_metric(&metrics, "page_index_rows_pruned");
+        assert_eq!(page_index_pruned, 4);
+        assert_eq!(page_index_matched, 2);
         assert!(
             get_value(&metrics, "page_index_eval_time") > 0,
             "no eval time in metrics: {metrics:#?}"
@@ -1866,8 +1870,10 @@ mod tests {
         assert_contains!(&explain, "predicate=c1@0 != bar");
 
         // there's a single row group, but we can check that it matched
-        // if no pruning was done this would be 0 instead of 1
-        assert_contains!(&explain, "row_groups_matched_statistics=1");
+        assert_contains!(
+            &explain,
+            "row_groups_pruned_statistics=1 total \u{2192} 1 matched"
+        );
 
         // check the projection
         assert_contains!(&explain, "projection=[c1]");
@@ -1898,8 +1904,10 @@ mod tests {
 
         // When the pruning predicate is not used at all, no row group is pruned:
         // the single row group is reported as matched (1 total → 1 matched).
-        assert_contains!(&explain, "row_groups_matched_statistics=0");
-        assert_contains!(&explain, "row_groups_pruned_statistics=0");
+        assert_contains!(
+            &explain,
+            "row_groups_pruned_statistics=1 total \u{2192} 1 matched"
+        );
 
         // But pushdown predicate should be present
         assert_contains!(
@@ -1952,7 +1960,12 @@ mod tests {
     /// Panics if no such metric.
     fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize {
         match metrics.sum_by_name(metric_name) {
-            Some(v) => v.as_usize(),
+            Some(v) => match v {
+                MetricValue::PruningMetrics {
+                    pruning_metrics, ..
+                } => pruning_metrics.pruned(),
+                _ => v.as_usize(),
+            },
             _ => {
                 panic!(
                     "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
@@ -1961,6 +1974,20 @@ mod tests {
         }
     }
 
+    fn get_pruning_metric(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) {
+        match metrics.sum_by_name(metric_name) {
+            Some(MetricValue::PruningMetrics {
+                pruning_metrics, ..
+            }) => (pruning_metrics.pruned(), pruning_metrics.matched()),
+            Some(_) => panic!(
+                "Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}"
+            ),
+            None => panic!(
+                "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
+            ),
+        }
+    }
+
     fn populate_csv_partitions(
         tmp_dir: &TempDir,
         partition_count: usize,
 
@@ -33,7 +33,7 @@ use datafusion_common::{assert_contains, DFSchema};
 use datafusion_datasource_parquet::{ParquetAccessPlan, RowGroupAccess};
 use datafusion_execution::object_store::ObjectStoreUrl;
 use datafusion_expr::{col, lit, Expr};
-use datafusion_physical_plan::metrics::MetricsSet;
+use datafusion_physical_plan::metrics::{MetricValue, MetricsSet};
 use datafusion_physical_plan::ExecutionPlan;
 
 use datafusion_datasource::file_scan_config::FileScanConfigBuilder;
@@ -178,12 +178,21 @@ async fn plan_and_filter() {
     .unwrap();
 
     // Verify that row group pruning still happens for just that group
-    let row_groups_pruned_statistics =
-        metric_value(&parquet_metrics, "row_groups_pruned_statistics").unwrap();
-    assert_eq!(
-        row_groups_pruned_statistics, 1,
-        "metrics : {parquet_metrics:#?}",
-    );
+    let row_groups_pruned_statistics = parquet_metrics
+        .sum_by_name("row_groups_pruned_statistics")
+        .unwrap();
+    if let MetricValue::PruningMetrics {
+        pruning_metrics, ..
+    } = row_groups_pruned_statistics
+    {
+        assert_eq!(
+            pruning_metrics.pruned(),
+            1,
+            "metrics : {parquet_metrics:#?}",
+        );
+    } else {
+        unreachable!("metrics `row_groups_pruned_statistics` should exist")
+    }
 }
 
 #[tokio::test]
 
@@ -29,7 +29,7 @@
 use arrow::compute::concat_batches;
 use arrow::record_batch::RecordBatch;
 use datafusion::physical_plan::collect;
-use datafusion::physical_plan::metrics::MetricsSet;
+use datafusion::physical_plan::metrics::{MetricValue, MetricsSet};
 use datafusion::prelude::{
     col, lit, lit_timestamp_nano, Expr, ParquetReadOptions, SessionContext,
 };
@@ -563,9 +563,9 @@ impl<'a> TestCase<'a> {
             }
         };
 
-        let page_index_rows_pruned = get_value(&metrics, "page_index_rows_pruned");
+        let (page_index_rows_pruned, page_index_rows_matched) =
+            get_pruning_metrics(&metrics, "page_index_rows_pruned");
         println!(" page_index_rows_pruned: {page_index_rows_pruned}");
-        let page_index_rows_matched = get_value(&metrics, "page_index_rows_matched");
         println!(" page_index_rows_matched: {page_index_rows_matched}");
 
         let page_index_filtering_expected = if scan_options.enable_page_index {
@@ -592,14 +592,29 @@ impl<'a> TestCase<'a> {
     }
 }
 
+fn get_pruning_metrics(metrics: &MetricsSet, metric_name: &str) -> (usize, usize) {
+    match metrics.sum_by_name(metric_name) {
+        Some(MetricValue::PruningMetrics {
+            pruning_metrics, ..
+        }) => (pruning_metrics.pruned(), pruning_metrics.matched()),
+        Some(_) => {
+            panic!("Metric '{metric_name}' is not a pruning metric in\n\n{metrics:#?}")
+        }
+        None => panic!(
+            "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
+        ),
+    }
+}
+
 fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize {
     match metrics.sum_by_name(metric_name) {
+        Some(MetricValue::PruningMetrics {
+            pruning_metrics, ..
+        }) => pruning_metrics.pruned(),
         Some(v) => v.as_usize(),
-        _ => {
-            panic!(
-                "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
-            );
-        }
+        None => panic!(
+            "Expected metric not found. Looking for '{metric_name}' in\n\n{metrics:#?}"
+        ),
     }
 }
 
 
@@ -126,79 +126,97 @@ struct TestOutput {
 impl TestOutput {
     /// retrieve the value of the named metric, if any (the `pruned` count for pruning metrics)
     fn metric_value(&self, metric_name: &str) -> Option<usize> {
+        if let Some((pruned, _matched)) = self.pruning_metric(metric_name) {
+            return Some(pruned);
+        }
+
         self.parquet_metrics
             .sum(|metric| metric.value().name() == metric_name)
-            .map(|v| v.as_usize())
-    }
-
-    /// The number of times the pruning predicate evaluation errors
-    fn predicate_evaluation_errors(&self) -> Option<usize> {
-        self.metric_value("predicate_evaluation_errors")
-    }
-
-    /// The number of row_groups matched by bloom filter
-    fn row_groups_matched_bloom_filter(&self) -> Option<usize> {
-        self.metric_value("row_groups_matched_bloom_filter")
-    }
-
-    /// The number of row_groups pruned by bloom filter
-    fn row_groups_pruned_bloom_filter(&self) -> Option<usize> {
-        self.metric_value("row_groups_pruned_bloom_filter")
-    }
-
-    /// The number of row_groups matched by statistics
-    fn row_groups_matched_statistics(&self) -> Option<usize> {
-        self.metric_value("row_groups_matched_statistics")
-    }
-
-    /// The number of row_groups pruned by statistics
-    fn row_groups_pruned_statistics(&self) -> Option<usize> {
-        self.metric_value("row_groups_pruned_statistics")
+            .map(|v| match v {
+                MetricValue::PruningMetrics {
+                    pruning_metrics, ..
+                } => pruning_metrics.pruned(),
+                _ => v.as_usize(),
+            })
     }
 
-    /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched count,
-    /// for testing purpose, here it only aggregate the `pruned` count.
-    fn files_ranges_pruned_statistics(&self) -> Option<usize> {
+    fn pruning_metric(&self, metric_name: &str) -> Option<(usize, usize)> {
         let mut total_pruned = 0;
+        let mut total_matched = 0;
         let mut found = false;
 
         for metric in self.parquet_metrics.iter() {
             let metric = metric.as_ref();
-            if metric.value().name() == "files_ranges_pruned_statistics" {
+            if metric.value().name() == metric_name {
                 if let MetricValue::PruningMetrics {
                     pruning_metrics, ..
                 } = metric.value()
                 {
                     total_pruned += pruning_metrics.pruned();
+                    total_matched += pruning_metrics.matched();
                     found = true;
                 }
             }
         }
 
         if found {
-            Some(total_pruned)
+            Some((total_pruned, total_matched))
         } else {
             None
         }
     }
 
+    /// The number of times the pruning predicate evaluation errors
+    fn predicate_evaluation_errors(&self) -> Option<usize> {
+        self.metric_value("predicate_evaluation_errors")
+    }
+
+    /// The number of row_groups `(pruned, matched)` by bloom filter
+    fn row_groups_bloom_filter(&self) -> Option<(usize, usize)> {
+        self.pruning_metric("row_groups_pruned_bloom_filter")
+    }
+
+    /// The number of row_groups matched by statistics
+    fn row_groups_matched_statistics(&self) -> Option<usize> {
+        self.pruning_metric("row_groups_pruned_statistics")
+            .map(|(_pruned, matched)| matched)
+    }
+
+    /// The number of row_groups pruned by statistics
+    fn row_groups_pruned_statistics(&self) -> Option<usize> {
+        self.pruning_metric("row_groups_pruned_statistics")
+            .map(|(pruned, _matched)| pruned)
+    }
+
+    /// Metric `files_ranges_pruned_statistics` tracks both pruned and matched counts;
+    /// for testing purposes, this only aggregates the `pruned` count.
+    fn files_ranges_pruned_statistics(&self) -> Option<usize> {
+        self.pruning_metric("files_ranges_pruned_statistics")
+            .map(|(pruned, _matched)| pruned)
+    }
+
     /// The number of row_groups matched by bloom filter or statistics
+    ///
+    /// E.g. starting with 10 row groups, statistics: 10 total -> 7 matched, bloom
+    /// filter: 7 total -> 3 matched, this function returns 3 for the final matched
+    /// count.
     fn row_groups_matched(&self) -> Option<usize> {
-        self.row_groups_matched_bloom_filter()
-            .zip(self.row_groups_matched_statistics())
-            .map(|(a, b)| a + b)
+        self.row_groups_bloom_filter()
+            .map(|(_pruned, matched)| matched)
     }
 
     /// The number of row_groups pruned
     fn row_groups_pruned(&self) -> Option<usize> {
-        self.row_groups_pruned_bloom_filter()
+        self.row_groups_bloom_filter()
+            .map(|(pruned, _matched)| pruned)
             .zip(self.row_groups_pruned_statistics())
             .map(|(a, b)| a + b)
     }
 
     /// The number of row pages pruned
     fn row_pages_pruned(&self) -> Option<usize> {
-        self.metric_value("page_index_rows_pruned")
+        self.pruning_metric("page_index_rows_pruned")
+            .map(|(pruned, _matched)| pruned)
     }
 
     fn description(&self) -> String {