Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Fixed panic in deserializing nested statistics #1139

Merged
merged 1 commit into from
Jul 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 93 additions & 32 deletions src/io/parquet/read/statistics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,12 @@ use super::get_field_columns;
/// Enum of the possible layouts of a count statistic
#[derive(Debug, PartialEq)]
pub enum Count {
/// simple arrays (every type not a Struct) have a count of UInt64
/// simple arrays have a count of UInt64
Single(UInt64Array),
/// list arrays have a count as a list of UInt64
List(ListArray<i32>),
/// large list arrays have a count as a large list of UInt64
LargeList(ListArray<i64>),
/// struct arrays have a count as a struct of UInt64
Struct(StructArray),
/// map arrays have a count as a map of UInt64
Expand Down Expand Up @@ -88,6 +92,24 @@ impl From<MutableStatistics> for Statistics {
.unwrap()
.clone();
Count::Map(a)
} else if let PhysicalType::List = s.null_count.data_type().to_physical_type() {
let a = s
.null_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.clone();
Count::List(a)
} else if let PhysicalType::LargeList = s.null_count.data_type().to_physical_type() {
let a = s
.null_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i64>>()
.unwrap()
.clone();
Count::LargeList(a)
} else {
let a = s
.null_count
Expand All @@ -98,35 +120,54 @@ impl From<MutableStatistics> for Statistics {
.clone();
Count::Single(a)
};
let distinct_count =
if let PhysicalType::Struct = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<StructArray>()
.unwrap()
.clone();
Count::Struct(a)
} else if let PhysicalType::Map = s.null_count.data_type().to_physical_type() {
let a = s
.null_count
.as_box()
.as_any()
.downcast_ref::<MapArray>()
.unwrap()
.clone();
Count::Map(a)
} else {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<UInt64Array>()
.unwrap()
.clone();
Count::Single(a)
};
let distinct_count = if let PhysicalType::Struct =
s.distinct_count.data_type().to_physical_type()
{
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<StructArray>()
.unwrap()
.clone();
Count::Struct(a)
} else if let PhysicalType::Map = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<MapArray>()
.unwrap()
.clone();
Count::Map(a)
} else if let PhysicalType::List = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.clone();
Count::List(a)
} else if let PhysicalType::LargeList = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i64>>()
.unwrap()
.clone();
Count::LargeList(a)
} else {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<UInt64Array>()
.unwrap()
.clone();
Count::Single(a)
};
Self {
null_count,
distinct_count,
Expand Down Expand Up @@ -198,6 +239,18 @@ fn create_dt(data_type: &DataType) -> DataType {
Box::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)),
*ordered,
)
} else if let DataType::List(f) = data_type.to_logical_type() {
DataType::List(Box::new(Field::new(
&f.name,
create_dt(&f.data_type),
f.is_nullable,
)))
} else if let DataType::LargeList(f) = data_type.to_logical_type() {
DataType::LargeList(Box::new(Field::new(
&f.name,
create_dt(&f.data_type),
f.is_nullable,
)))
} else {
DataType::UInt64
}
Expand Down Expand Up @@ -301,12 +354,20 @@ fn push(
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let distinct_count = distinct_count
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let null_count = null_count
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
return push(
stats,
min.inner.as_mut(),
max.inner.as_mut(),
distinct_count,
null_count,
distinct_count.inner.as_mut(),
null_count.inner.as_mut(),
);
}
Dictionary(_, _, _) => {
Expand Down
120 changes: 72 additions & 48 deletions tests/it/io/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ pub fn pyarrow_required_statistics(column: &str) -> Statistics {

pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics {
let new_list = |array: Box<dyn Array>, nullable: bool| {
Box::new(ListArray::<i32>::new(
ListArray::<i32>::new(
DataType::List(Box::new(Field::new(
"item",
array.data_type().clone(),
Expand All @@ -572,77 +572,101 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics {
vec![0, array.len() as i32].into(),
array,
None,
)) as Box<dyn Array>
)
};

match column {
"list_int16" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(1)])),
min_value: new_list(Box::new(Int16Array::from_slice([0])), true),
max_value: new_list(Box::new(Int16Array::from_slice([10])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Int16Array::from_slice([0])), true).boxed(),
max_value: new_list(Box::new(Int16Array::from_slice([10])), true).boxed(),
},
"list_bool" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(1)])),
min_value: new_list(Box::new(BooleanArray::from_slice([false])), true),
max_value: new_list(Box::new(BooleanArray::from_slice([true])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(BooleanArray::from_slice([false])), true).boxed(),
max_value: new_list(Box::new(BooleanArray::from_slice([true])), true).boxed(),
},
"list_utf8" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(Utf8Array::<i32>::from_slice([""])), true),
max_value: new_list(Box::new(Utf8Array::<i32>::from_slice(["ccc"])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Utf8Array::<i32>::from_slice([""])), true).boxed(),
max_value: new_list(Box::new(Utf8Array::<i32>::from_slice(["ccc"])), true).boxed(),
},
"list_large_binary" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b""])), true),
max_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b"ccc"])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b""])), true).boxed(),
max_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b"ccc"])), true).boxed(),
},
"list_int64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(Int64Array::from_slice([0])), true),
max_value: new_list(Box::new(Int64Array::from_slice([10])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
max_value: new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
},
"list_int64_required" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false).boxed(),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false).boxed(),
},
"list_int64_required_required" | "list_int64_optional_required" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(0)].into()),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), false)),
null_count: Count::List(new_list(UInt64Array::from([Some(0)]).boxed(), false)),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false).boxed(),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false).boxed(),
},
"list_nested_i64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(2)].into()),
min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true),
max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(2)]).boxed(), true)),
min_value: new_list(
new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
true,
)
.boxed(),
max_value: new_list(
new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
true,
)
.boxed(),
},
"list_nested_inner_required_required_i64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(0)].into()),
min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true),
max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true),
min_value: new_list(
new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
true,
)
.boxed(),
max_value: new_list(
new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
true,
)
.boxed(),
},
"list_nested_inner_required_i64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(0)].into()),
min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true),
max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true),
min_value: new_list(
new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
true,
)
.boxed(),
max_value: new_list(
new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
true,
)
.boxed(),
},
other => todo!("{}", other),
}
}

pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics {
let new_list = |array: Box<dyn Array>| {
Box::new(ListArray::<i32>::new(
ListArray::<i32>::new(
DataType::List(Box::new(Field::new(
"item",
array.data_type().clone(),
Expand All @@ -651,21 +675,21 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics {
vec![0, array.len() as i32].into(),
array,
None,
))
)
};

match column {
"simple" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(0)])),
min_value: new_list(Box::new(Int64Array::from([Some(0)]))),
max_value: new_list(Box::new(Int64Array::from([Some(1)]))),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed())),
null_count: Count::List(new_list(UInt64Array::from([Some(0)]).boxed())),
min_value: new_list(Box::new(Int64Array::from([Some(0)]))).boxed(),
max_value: new_list(Box::new(Int64Array::from([Some(1)]))).boxed(),
},
"null" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(1)])),
min_value: new_list(Box::new(Int64Array::from([None]))),
max_value: new_list(Box::new(Int64Array::from([None]))),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed())),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed())),
min_value: new_list(Box::new(Int64Array::from([None]))).boxed(),
max_value: new_list(Box::new(Int64Array::from([None]))).boxed(),
},
_ => unreachable!(),
}
Expand Down