From 216c4a87deedab6c9fd2ca4401865c740a93c725 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 11 Feb 2026 16:04:29 -0700 Subject: [PATCH] fix: [df52] route timestamp timezone mismatches through spark_parquet_convert INT96 Parquet timestamps are coerced to Timestamp(us, None) by DataFusion but the logical schema expects Timestamp(us, Some("UTC")). The schema adapter was routing this mismatch through Spark's Cast expression, which incorrectly treats None-timezone values as TimestampNTZ (local time) and applies a timezone conversion. This caused results to be shifted by the session timezone offset (e.g., -5h45m for Asia/Kathmandu). Route Timestamp->Timestamp mismatches through CometCastColumnExpr which delegates to spark_parquet_convert, handling this as a metadata-only timezone relabel without modifying the underlying values. Co-Authored-By: Claude Opus 4.6 --- native/core/src/parquet/schema_adapter.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/native/core/src/parquet/schema_adapter.rs b/native/core/src/parquet/schema_adapter.rs index 2f0ecb8e87..2a10829427 100644 --- a/native/core/src/parquet/schema_adapter.rs +++ b/native/core/src/parquet/schema_adapter.rs @@ -194,14 +194,23 @@ impl SparkPhysicalExprAdapter { let physical_type = cast.input_field().data_type(); let target_type = cast.target_field().data_type(); - // For complex nested types (Struct, List, Map), use CometCastColumnExpr - // with spark_parquet_convert which handles field-name-based selection, - // reordering, and nested type casting correctly. + // For complex nested types (Struct, List, Map) and Timestamp timezone + // mismatches, use CometCastColumnExpr with spark_parquet_convert which + // handles field-name-based selection, reordering, nested type casting, + // and metadata-only timestamp timezone relabeling correctly. 
+ // + // Timestamp mismatches (e.g., Timestamp(us, None) -> Timestamp(us, Some("UTC"))) + // occur when INT96 Parquet timestamps are coerced to Timestamp(us, None) by + // DataFusion but the logical schema expects Timestamp(us, Some("UTC")). + // Using Spark's Cast here would incorrectly treat the None-timezone values as + // local time (TimestampNTZ) and apply a timezone conversion, but the values are + // already in UTC. spark_parquet_convert handles this as a metadata-only change. if matches!( (physical_type, target_type), (DataType::Struct(_), DataType::Struct(_)) | (DataType::List(_), DataType::List(_)) | (DataType::Map(_, _), DataType::Map(_, _)) + | (DataType::Timestamp(_, _), DataType::Timestamp(_, _)) ) { let comet_cast: Arc<dyn PhysicalExpr> = Arc::new( CometCastColumnExpr::new(