Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Fixed error in nested stats (#1139)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Jul 5, 2022
1 parent 7e08d70 commit d87b38b
Show file tree
Hide file tree
Showing 2 changed files with 165 additions and 80 deletions.
125 changes: 93 additions & 32 deletions src/io/parquet/read/statistics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,12 @@ use super::get_field_columns;
/// Enum of a count statistics
#[derive(Debug, PartialEq)]
pub enum Count {
/// simple arrays (every type not a Struct) have a count of UInt64
/// simple arrays have a count of UInt64
Single(UInt64Array),
/// list arrays have a count as a list of UInt64
List(ListArray<i32>),
/// large list arrays have a count as a large list of UInt64
LargeList(ListArray<i64>),
/// struct arrays have a count as a struct of UInt64
Struct(StructArray),
/// map arrays have a count as a map of UInt64
Expand Down Expand Up @@ -88,6 +92,24 @@ impl From<MutableStatistics> for Statistics {
.unwrap()
.clone();
Count::Map(a)
} else if let PhysicalType::List = s.null_count.data_type().to_physical_type() {
let a = s
.null_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.clone();
Count::List(a)
} else if let PhysicalType::LargeList = s.null_count.data_type().to_physical_type() {
let a = s
.null_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i64>>()
.unwrap()
.clone();
Count::LargeList(a)
} else {
let a = s
.null_count
Expand All @@ -98,35 +120,54 @@ impl From<MutableStatistics> for Statistics {
.clone();
Count::Single(a)
};
let distinct_count =
if let PhysicalType::Struct = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<StructArray>()
.unwrap()
.clone();
Count::Struct(a)
} else if let PhysicalType::Map = s.null_count.data_type().to_physical_type() {
let a = s
.null_count
.as_box()
.as_any()
.downcast_ref::<MapArray>()
.unwrap()
.clone();
Count::Map(a)
} else {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<UInt64Array>()
.unwrap()
.clone();
Count::Single(a)
};
let distinct_count = if let PhysicalType::Struct =
s.distinct_count.data_type().to_physical_type()
{
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<StructArray>()
.unwrap()
.clone();
Count::Struct(a)
} else if let PhysicalType::Map = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<MapArray>()
.unwrap()
.clone();
Count::Map(a)
} else if let PhysicalType::List = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i32>>()
.unwrap()
.clone();
Count::List(a)
} else if let PhysicalType::LargeList = s.distinct_count.data_type().to_physical_type() {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<ListArray<i64>>()
.unwrap()
.clone();
Count::LargeList(a)
} else {
let a = s
.distinct_count
.as_box()
.as_any()
.downcast_ref::<UInt64Array>()
.unwrap()
.clone();
Count::Single(a)
};
Self {
null_count,
distinct_count,
Expand Down Expand Up @@ -198,6 +239,18 @@ fn create_dt(data_type: &DataType) -> DataType {
Box::new(Field::new(&f.name, create_dt(&f.data_type), f.is_nullable)),
*ordered,
)
} else if let DataType::List(f) = data_type.to_logical_type() {
DataType::List(Box::new(Field::new(
&f.name,
create_dt(&f.data_type),
f.is_nullable,
)))
} else if let DataType::LargeList(f) = data_type.to_logical_type() {
DataType::LargeList(Box::new(Field::new(
&f.name,
create_dt(&f.data_type),
f.is_nullable,
)))
} else {
DataType::UInt64
}
Expand Down Expand Up @@ -301,12 +354,20 @@ fn push(
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let distinct_count = distinct_count
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
let null_count = null_count
.as_mut_any()
.downcast_mut::<list::DynMutableListArray>()
.unwrap();
return push(
stats,
min.inner.as_mut(),
max.inner.as_mut(),
distinct_count,
null_count,
distinct_count.inner.as_mut(),
null_count.inner.as_mut(),
);
}
Dictionary(_, _, _) => {
Expand Down
120 changes: 72 additions & 48 deletions tests/it/io/parquet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,7 @@ pub fn pyarrow_required_statistics(column: &str) -> Statistics {

pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics {
let new_list = |array: Box<dyn Array>, nullable: bool| {
Box::new(ListArray::<i32>::new(
ListArray::<i32>::new(
DataType::List(Box::new(Field::new(
"item",
array.data_type().clone(),
Expand All @@ -572,77 +572,101 @@ pub fn pyarrow_nested_nullable_statistics(column: &str) -> Statistics {
vec![0, array.len() as i32].into(),
array,
None,
)) as Box<dyn Array>
)
};

match column {
"list_int16" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(1)])),
min_value: new_list(Box::new(Int16Array::from_slice([0])), true),
max_value: new_list(Box::new(Int16Array::from_slice([10])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Int16Array::from_slice([0])), true).boxed(),
max_value: new_list(Box::new(Int16Array::from_slice([10])), true).boxed(),
},
"list_bool" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(1)])),
min_value: new_list(Box::new(BooleanArray::from_slice([false])), true),
max_value: new_list(Box::new(BooleanArray::from_slice([true])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(BooleanArray::from_slice([false])), true).boxed(),
max_value: new_list(Box::new(BooleanArray::from_slice([true])), true).boxed(),
},
"list_utf8" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(Utf8Array::<i32>::from_slice([""])), true),
max_value: new_list(Box::new(Utf8Array::<i32>::from_slice(["ccc"])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Utf8Array::<i32>::from_slice([""])), true).boxed(),
max_value: new_list(Box::new(Utf8Array::<i32>::from_slice(["ccc"])), true).boxed(),
},
"list_large_binary" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b""])), true),
max_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b"ccc"])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b""])), true).boxed(),
max_value: new_list(Box::new(BinaryArray::<i64>::from_slice([b"ccc"])), true).boxed(),
},
"list_int64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(Int64Array::from_slice([0])), true),
max_value: new_list(Box::new(Int64Array::from_slice([10])), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
max_value: new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
},
"list_int64_required" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(1)].into()),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed(), true)),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false).boxed(),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false).boxed(),
},
"list_int64_required_required" | "list_int64_optional_required" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(0)].into()),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), false)),
null_count: Count::List(new_list(UInt64Array::from([Some(0)]).boxed(), false)),
min_value: new_list(Box::new(Int64Array::from_slice([0])), false).boxed(),
max_value: new_list(Box::new(Int64Array::from_slice([10])), false).boxed(),
},
"list_nested_i64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(2)].into()),
min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true),
max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed(), true)),
null_count: Count::List(new_list(UInt64Array::from([Some(2)]).boxed(), true)),
min_value: new_list(
new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
true,
)
.boxed(),
max_value: new_list(
new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
true,
)
.boxed(),
},
"list_nested_inner_required_required_i64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(0)].into()),
min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true),
max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true),
min_value: new_list(
new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
true,
)
.boxed(),
max_value: new_list(
new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
true,
)
.boxed(),
},
"list_nested_inner_required_i64" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single([Some(0)].into()),
min_value: new_list(new_list(Box::new(Int64Array::from_slice([0])), true), true),
max_value: new_list(new_list(Box::new(Int64Array::from_slice([10])), true), true),
min_value: new_list(
new_list(Box::new(Int64Array::from_slice([0])), true).boxed(),
true,
)
.boxed(),
max_value: new_list(
new_list(Box::new(Int64Array::from_slice([10])), true).boxed(),
true,
)
.boxed(),
},
other => todo!("{}", other),
}
}

pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics {
let new_list = |array: Box<dyn Array>| {
Box::new(ListArray::<i32>::new(
ListArray::<i32>::new(
DataType::List(Box::new(Field::new(
"item",
array.data_type().clone(),
Expand All @@ -651,21 +675,21 @@ pub fn pyarrow_nested_edge_statistics(column: &str) -> Statistics {
vec![0, array.len() as i32].into(),
array,
None,
))
)
};

match column {
"simple" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(0)])),
min_value: new_list(Box::new(Int64Array::from([Some(0)]))),
max_value: new_list(Box::new(Int64Array::from([Some(1)]))),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed())),
null_count: Count::List(new_list(UInt64Array::from([Some(0)]).boxed())),
min_value: new_list(Box::new(Int64Array::from([Some(0)]))).boxed(),
max_value: new_list(Box::new(Int64Array::from([Some(1)]))).boxed(),
},
"null" => Statistics {
distinct_count: Count::Single(UInt64Array::from([None])),
null_count: Count::Single(UInt64Array::from([Some(1)])),
min_value: new_list(Box::new(Int64Array::from([None]))),
max_value: new_list(Box::new(Int64Array::from([None]))),
distinct_count: Count::List(new_list(UInt64Array::from([None]).boxed())),
null_count: Count::List(new_list(UInt64Array::from([Some(1)]).boxed())),
min_value: new_list(Box::new(Int64Array::from([None]))).boxed(),
max_value: new_list(Box::new(Int64Array::from([None]))).boxed(),
},
_ => unreachable!(),
}
Expand Down

0 comments on commit d87b38b

Please sign in to comment.