From b9b4779c76cf56d93a21b31799d3c7d8a282bba1 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Wed, 22 May 2024 16:26:15 -0700 Subject: [PATCH 1/3] fix Incorrect statistics read for i8 i16 columns in parquet --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 6 ++++++ datafusion/core/tests/parquet/arrow_statistics.rs | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 0ebf7dfe2384..489c855dd01f 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -75,6 +75,12 @@ macro_rules! get_statistic { *scale, )) } + Some(DataType::Int8) => { + Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap()))) + } + Some(DataType::Int16) => { + Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap()))) + } _ => Some(ScalarValue::Int32(Some(*s.$func()))), } } diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index 36fffe5ac4d9..6235fcd71843 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -383,7 +383,6 @@ async fn test_int_32() { // Note that the file has 4 columns named "i8", "i16", "i32", "i64". // - The tests on column i32 and i64 passed. // - The tests on column i8 and i16 failed. -#[ignore] #[tokio::test] async fn test_int_16() { // This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64" @@ -419,7 +418,6 @@ async fn test_int_16() { // BUG (same as above): ignore this test for now // https://github.com/apache/datafusion/issues/10585 -#[ignore] #[tokio::test] async fn test_int_8() { // This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64" From ca5141f892b49953a01680c503ea55d4175d8624 Mon Sep 17 00:00:00 2001 From: Yanxin Xiang Date: Wed, 22 May 2024 16:56:41 -0700 Subject: [PATCH 2/3] fix failed test --- .../physical_plan/parquet/statistics.rs | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index 489c855dd01f..260f75ec96c2 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -371,8 +371,8 @@ mod test { use super::*; use arrow_array::{ new_null_array, Array, BinaryArray, BooleanArray, Decimal128Array, Float32Array, - Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, StructArray, - TimestampNanosecondArray, + Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, + StringArray, StructArray, TimestampNanosecondArray, }; use arrow_schema::{Field, SchemaRef}; use bytes::Bytes; @@ -792,13 +792,13 @@ mod test { }) .with_column(ExpectedColumn { name: "tinyint_col", - expected_min: i32_array([Some(0)]), - expected_max: i32_array([Some(9)]), + expected_min: i8_array([Some(0)]), + expected_max: i8_array([Some(9)]), }) .with_column(ExpectedColumn { name: "smallint_col", - expected_min: i32_array([Some(0)]), - expected_max: i32_array([Some(9)]), + expected_min: i16_array([Some(0)]), + expected_max: i16_array([Some(9)]), }) .with_column(ExpectedColumn { name: "int_col", @@ -1024,6 +1024,16 @@ mod test { Arc::new(array) } + fn i8_array(input: impl IntoIterator>) -> ArrayRef { + let array: Int8Array = input.into_iter().collect(); + Arc::new(array) + } + + fn i16_array(input: impl IntoIterator>) -> ArrayRef { + let array: Int16Array = input.into_iter().collect(); + Arc::new(array) + } + fn i32_array(input: impl IntoIterator>) -> ArrayRef { let array: Int32Array = input.into_iter().collect(); Arc::new(array) From c0ddc2dc43318b42b7de53717bbc5f4ca2078a06 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 23 May 2024 13:49:51 -0400 Subject: [PATCH 3/3] Fix merge problem --- .../core/src/datasource/physical_plan/parquet/statistics.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs index b248a5a9c82d..8d17354839a8 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs @@ -379,8 +379,8 @@ mod test { use arrow::datatypes::{Date32Type, Date64Type}; use arrow_array::{ new_null_array, Array, BinaryArray, BooleanArray, Date32Array, Date64Array, - Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, RecordBatch, - StringArray, StructArray, TimestampNanosecondArray, + Decimal128Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, RecordBatch, StringArray, StructArray, TimestampNanosecondArray, }; use arrow_schema::{Field, SchemaRef}; use bytes::Bytes;