diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 1db4f8ede692..683cb809a5b1 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1430,12 +1430,12 @@ impl TableProvider for DataFrameTableProvider { Some(&self.plan) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> Result { + filters: &[&Expr], + ) -> Result> { // A filter is added on the DataFrame when given - Ok(TableProviderFilterPushDown::Exact) + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) } fn schema(&self) -> SchemaRef { diff --git a/datafusion/core/src/datasource/cte_worktable.rs b/datafusion/core/src/datasource/cte_worktable.rs index 71075839b9a0..f8fd94d4d3fd 100644 --- a/datafusion/core/src/datasource/cte_worktable.rs +++ b/datafusion/core/src/datasource/cte_worktable.rs @@ -89,11 +89,14 @@ impl TableProvider for CteWorkTable { ))) } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> Result { + filters: &[&Expr], + ) -> Result> { // TODO: should we support filter pushdown? - Ok(TableProviderFilterPushDown::Unsupported) + Ok(vec![ + TableProviderFilterPushDown::Unsupported; + filters.len() + ]) } } diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index c1e337b5c44a..5ef7b6241b60 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -685,26 +685,32 @@ impl TableProvider for ListingTable { .await } - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - filter: &Expr, - ) -> Result { - if expr_applicable_for_cols( - &self - .options - .table_partition_cols - .iter() - .map(|x| x.0.clone()) - .collect::>(), - filter, - ) { - // if filter can be handled by partiton pruning, it is exact - Ok(TableProviderFilterPushDown::Exact) - } else { - // otherwise, we still might be able to handle the filter with file - // level mechanisms such as Parquet row group pruning. - Ok(TableProviderFilterPushDown::Inexact) - } + filters: &[&Expr], + ) -> Result> { + let support: Vec<_> = filters + .iter() + .map(|filter| { + if expr_applicable_for_cols( + &self + .options + .table_partition_cols + .iter() + .map(|x| x.0.clone()) + .collect::>(), + filter, + ) { + // if filter can be handled by partition pruning, it is exact + TableProviderFilterPushDown::Exact + } else { + // otherwise, we still might be able to handle the filter with file + // level mechanisms such as Parquet row group pruning. + TableProviderFilterPushDown::Inexact + } + }) + .collect(); + Ok(support) } fn get_table_definition(&self) -> Option<&str> { diff --git a/datafusion/core/src/datasource/provider.rs b/datafusion/core/src/datasource/provider.rs index f2e3e907e5ce..9aac072ed4e2 100644 --- a/datafusion/core/src/datasource/provider.rs +++ b/datafusion/core/src/datasource/provider.rs @@ -96,6 +96,10 @@ pub trait TableProvider: Sync + Send { /// which *all* of the `Expr`s evaluate to `true` must be returned (aka the /// expressions are `AND`ed together). /// + /// To enable filter pushdown you must override + /// [`Self::supports_filters_pushdown`] as the default implementation does + /// not and `filters` will be empty. + /// /// DataFusion pushes filtering into the scans whenever possible /// ("Filter Pushdown"), and depending on the format and the /// implementation of the format, evaluating the predicate during the scan @@ -154,28 +158,31 @@ pub trait TableProvider: Sync + Send { limit: Option, ) -> Result>; - /// Tests whether the table provider can make use of a filter expression - /// to optimise data retrieval. - #[deprecated(since = "20.0.0", note = "use supports_filters_pushdown instead")] - fn supports_filter_pushdown( - &self, - _filter: &Expr, - ) -> Result { - Ok(TableProviderFilterPushDown::Unsupported) - } - - /// Tests whether the table provider can make use of any or all filter expressions - /// to optimise data retrieval. - /// Note: the returned vector much have the same size as the filters argument. - #[allow(deprecated)] + /// Specify if DataFusion should provide filter expressions to the + /// TableProvider to apply *during* the scan. + /// + /// The return value must have one element for each filter expression passed + /// in. The value of each element indicates if the TableProvider can apply + /// that particular filter during the scan. + /// + /// Some TableProviders can evaluate filters more efficiently than the + /// `Filter` operator in DataFusion, for example by using an index. + /// + /// By default, returns [`Unsupported`] for all filters, meaning no filters + /// will be provided to [`Self::scan`]. If the TableProvider can implement + /// filter pushdown, it should return either [`Exact`] or [`Inexact`]. + /// + /// [`Unsupported`]: TableProviderFilterPushDown::Unsupported + /// [`Exact`]: TableProviderFilterPushDown::Exact + /// [`Inexact`]: TableProviderFilterPushDown::Inexact fn supports_filters_pushdown( &self, filters: &[&Expr], ) -> Result> { - filters - .iter() - .map(|f| self.supports_filter_pushdown(f)) - .collect() + Ok(vec![ + TableProviderFilterPushDown::Unsupported; + filters.len() + ]) } /// Get statistics for this table, if available diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index d1b7dad15225..31e812332c94 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -93,13 +93,12 @@ impl TableProvider for ViewTable { fn get_table_definition(&self) -> Option<&str> { self.definition.as_deref() } - - fn supports_filter_pushdown( + fn supports_filters_pushdown( &self, - _filter: &Expr, - ) -> Result { + filters: &[&Expr], + ) -> Result> { // A filter is added on the View when given - Ok(TableProviderFilterPushDown::Exact) + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) } async fn scan( diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index bc6d85a74a51..2ae41391f42d 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -214,8 +214,11 @@ impl TableProvider for CustomProvider { } } - fn supports_filter_pushdown(&self, _: &Expr) -> Result { - Ok(TableProviderFilterPushDown::Exact) + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) } } diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs index 565f48c1c5a9..f662f4d9f77d 100644 --- a/datafusion/expr/src/table_source.rs +++ b/datafusion/expr/src/table_source.rs @@ -24,20 +24,29 @@ use datafusion_common::{Constraints, Result}; use std::any::Any; -/// Indicates whether and how a filter expression can be handled by a -/// TableProvider for table scans. +/// Indicates how a filter expression is handled by +/// [`TableProvider::scan`]. +/// +/// Filter expressions are boolean expressions used to reduce the number of +/// rows that are read from a table. Only rows that evaluate to `true` ("pass +/// the filter") are returned. Rows that evaluate to `false` or `NULL` are +/// omitted. +/// +/// [`TableProvider::scan`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html#tymethod.scan #[derive(Debug, Clone, PartialEq, Eq)] pub enum TableProviderFilterPushDown { - /// The expression cannot be used by the provider. + /// The filter cannot be used by the provider and will not be pushed down. Unsupported, - /// The expression can be used to reduce the data retrieved, - /// but the provider cannot guarantee it will omit all tuples that - /// may be filtered. In this case, DataFusion will apply an additional - /// `Filter` operation after the scan to ensure all rows are filtered correctly. + /// The filter can be used, but the provider might still return some tuples + /// that do not pass the filter. + /// + /// In this case, DataFusion applies an additional `Filter` operation + /// after the scan to ensure all rows are filtered correctly. Inexact, - /// The provider **guarantees** that it will omit **all** tuples that are - /// filtered by the filter expression. This is the fastest option, if available - /// as DataFusion will not apply additional filtering. + /// The provider **guarantees** that it will omit **only** tuples which + /// pass the filter. + /// + /// In this case, DataFusion will not apply additional filtering. Exact, }