Skip to content

Commit 5aaebd6

Browse files
adriangbclaude
authored andcommitted
Fix predicate simplification for incompatible types in push_down_filter (#17521)
* Fix predicate simplification for incompatible types in push_down_filter Handle the case where scalar values with incompatible types (e.g., TimestampMicrosecond vs Time64Nanosecond) cannot be compared during predicate simplification. The fix catches the comparison error in find_most_restrictive_predicate and returns None to indicate we can't simplify when types are incompatible, preventing the "Uncomparable values" error. Fixes #17512 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> * minimize diff * use proposed fix, add test * revert other changes --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent a7e1472 commit 5aaebd6

File tree

3 files changed

+121
-6
lines changed

3 files changed

+121
-6
lines changed

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder {
166166
}
167167
}
168168

169-
170169
impl ParquetOptions {
171170
/// Convert the global session options, [`ParquetOptions`], into a single write action's [`WriterPropertiesBuilder`].
172171
///
@@ -502,7 +501,7 @@ mod tests {
502501
EnabledStatistics::Chunk => "chunk",
503502
EnabledStatistics::Page => "page",
504503
}
505-
.into(),
504+
.into(),
506505
),
507506
bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp),
508507
bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv),
@@ -647,7 +646,7 @@ mod tests {
647646
COL_NAME.into(),
648647
column_options_with_non_defaults(&parquet_options),
649648
)]
650-
.into(),
649+
.into(),
651650
key_value_metadata: [(key, value)].into(),
652651
crypto: Default::default(),
653652
};

datafusion/optimizer/src/simplify_expressions/simplify_predicates.rs

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
//! encompasses the former, resulting in fewer checks during query execution.
2727
2828
use datafusion_common::{Column, Result, ScalarValue};
29-
use datafusion_expr::{BinaryExpr, Cast, Expr, Operator};
29+
use datafusion_expr::{BinaryExpr, Expr, Operator};
3030
use std::collections::BTreeMap;
3131

3232
/// Simplifies a list of predicates by removing redundancies.
@@ -239,8 +239,99 @@ fn find_most_restrictive_predicate(
239239
fn extract_column_from_expr(expr: &Expr) -> Option<Column> {
240240
match expr {
241241
Expr::Column(col) => Some(col.clone()),
242-
// Handle cases where the column might be wrapped in a cast or other operation
243-
Expr::Cast(Cast { expr, .. }) => extract_column_from_expr(expr),
244242
_ => None,
245243
}
246244
}
245+
246+
#[cfg(test)]
247+
mod tests {
248+
use super::*;
249+
use arrow::datatypes::DataType;
250+
use datafusion_expr::{cast, col, lit};
251+
252+
#[test]
253+
fn test_simplify_predicates_with_cast() {
254+
// Test that predicates on cast expressions are not grouped with predicates on the raw column
255+
// a < 5 AND CAST(a AS varchar) < 'abc' AND a < 6
256+
// Should simplify to:
257+
// a < 5 AND CAST(a AS varchar) < 'abc'
258+
259+
let predicates = vec![
260+
col("a").lt(lit(5i32)),
261+
cast(col("a"), DataType::Utf8).lt(lit("abc")),
262+
col("a").lt(lit(6i32)),
263+
];
264+
265+
let result = simplify_predicates(predicates).unwrap();
266+
267+
// Should have 2 predicates: a < 5 and CAST(a AS varchar) < 'abc'
268+
assert_eq!(result.len(), 2);
269+
270+
// Check that the cast predicate is preserved
271+
let has_cast_predicate = result.iter().any(|p| {
272+
matches!(p, Expr::BinaryExpr(BinaryExpr {
273+
left,
274+
op: Operator::Lt,
275+
right
276+
}) if matches!(left.as_ref(), Expr::Cast(_)) && right == &Box::new(lit("abc")))
277+
});
278+
assert!(has_cast_predicate, "Cast predicate should be preserved");
279+
280+
// Check that we have the more restrictive column predicate (a < 5)
281+
let has_column_predicate = result.iter().any(|p| {
282+
matches!(p, Expr::BinaryExpr(BinaryExpr {
283+
left,
284+
op: Operator::Lt,
285+
right
286+
}) if left == &Box::new(col("a")) && right == &Box::new(lit(5i32)))
287+
});
288+
assert!(has_column_predicate, "Should have a < 5 predicate");
289+
}
290+
291+
#[test]
292+
fn test_extract_column_ignores_cast() {
293+
// Test that extract_column_from_expr does not extract columns from cast expressions
294+
let cast_expr = cast(col("a"), DataType::Utf8);
295+
assert_eq!(extract_column_from_expr(&cast_expr), None);
296+
297+
// Test that it still extracts from direct column references
298+
let col_expr = col("a");
299+
assert_eq!(extract_column_from_expr(&col_expr), Some(Column::from("a")));
300+
}
301+
302+
#[test]
303+
fn test_simplify_predicates_direct_columns_only() {
304+
// Test that only predicates on direct columns are simplified together
305+
let predicates = vec![
306+
col("a").lt(lit(5i32)),
307+
col("a").lt(lit(3i32)),
308+
col("b").gt(lit(10i32)),
309+
col("b").gt(lit(20i32)),
310+
];
311+
312+
let result = simplify_predicates(predicates).unwrap();
313+
314+
// Should have 2 predicates: a < 3 and b > 20 (most restrictive for each column)
315+
assert_eq!(result.len(), 2);
316+
317+
// Check for a < 3
318+
let has_a_predicate = result.iter().any(|p| {
319+
matches!(p, Expr::BinaryExpr(BinaryExpr {
320+
left,
321+
op: Operator::Lt,
322+
right
323+
}) if left == &Box::new(col("a")) && right == &Box::new(lit(3i32)))
324+
});
325+
assert!(has_a_predicate, "Should have a < 3 predicate");
326+
327+
// Check for b > 20
328+
let has_b_predicate = result.iter().any(|p| {
329+
matches!(p, Expr::BinaryExpr(BinaryExpr {
330+
left,
331+
op: Operator::Gt,
332+
right
333+
}) if left == &Box::new(col("b")) && right == &Box::new(lit(20i32)))
334+
});
335+
assert!(has_b_predicate, "Should have b > 20 predicate");
336+
}
337+
}

datafusion/sqllogictest/test_files/push_down_filter.slt

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,3 +395,28 @@ order by t1.k, t2.v;
395395
----
396396
1 1 1
397397
10000000 10000000 10000000
398+
399+
# Regression test for https://github.com/apache/datafusion/issues/17512
400+
401+
query I
402+
COPY (
403+
SELECT arrow_cast('2025-01-01T00:00:00Z'::timestamptz, 'Timestamp(Microsecond, Some("UTC"))') AS start_timestamp
404+
)
405+
TO 'test_files/scratch/push_down_filter/17512.parquet'
406+
STORED AS PARQUET;
407+
----
408+
1
409+
410+
statement ok
411+
CREATE EXTERNAL TABLE records STORED AS PARQUET LOCATION 'test_files/scratch/push_down_filter/17512.parquet';
412+
413+
query I
414+
SELECT 1
415+
FROM (
416+
SELECT start_timestamp
417+
FROM records
418+
WHERE start_timestamp <= '2025-01-01T00:00:00Z'::timestamptz
419+
) AS t
420+
WHERE t.start_timestamp::time < '00:00:01'::time;
421+
----
422+
1

0 commit comments

Comments
 (0)