Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions parquet-variant-compute/src/type_conversion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use arrow::datatypes::{
self, ArrowPrimitiveType, ArrowTimestampType, Decimal32Type, Decimal64Type, Decimal128Type,
DecimalType,
};
use chrono::Timelike;
use parquet_variant::{Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16};

/// Options for controlling the behavior of `cast_to_variant_with_options`.
Expand Down Expand Up @@ -89,6 +90,9 @@ impl_primitive_from_variant!(
as_naive_date,
datatypes::Date32Type::from_naive_date
);
impl_primitive_from_variant!(datatypes::Time64MicrosecondType, as_time_utc, |v| {
(v.num_seconds_from_midnight() * 1_000_000 + v.nanosecond() / 1_000) as i64
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it normal to take the floor instead of rounding?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is OK because nanosecond() returns a u32, so the integer division by 1_000 truncates toward zero.

});
impl_timestamp_from_variant!(
datatypes::TimestampMicrosecondType,
as_timestamp_ntz_micros,
Expand Down
63 changes: 54 additions & 9 deletions parquet-variant-compute/src/variant_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@ use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Float16Type, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type, Int64Type,
Date32Type, Decimal32Type, Decimal64Type, Decimal128Type, Float16Type, Float32Type,
Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
TimestampMicrosecondType, TimestampNanosecondType,
};
use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
use chrono::DateTime;
use chrono::{DateTime, NaiveTime};
use parquet_variant::{
Uuid, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, VariantDecimalType as _,
};
Expand Down Expand Up @@ -539,7 +540,7 @@ impl<'a> DoubleEndedIterator for VariantArrayIter<'a> {

impl<'a> ExactSizeIterator for VariantArrayIter<'a> {}

/// One shredded field of a partially or prefectly shredded variant. For example, suppose the
/// One shredded field of a partially or perfectly shredded variant. For example, suppose the
/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is
/// itself a struct with the single field `b` of type INT. Then the physical layout of the column
/// is:
Expand Down Expand Up @@ -920,17 +921,12 @@ fn typed_value_to_variant<'a>(
panic!("Invalid variant, conflicting value and typed_value");
}
match data_type {
DataType::Null => Variant::Null,
DataType::Boolean => {
let boolean_array = typed_value.as_boolean();
let value = boolean_array.value(index);
Variant::from(value)
}
DataType::Date32 => {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been adjusted to be grouped together with the Timestamp/Time.

let array = typed_value.as_primitive::<Date32Type>();
let value = array.value(index);
let date = Date32Type::to_naive_date(value);
Variant::from(date)
}
// 16-byte FixedSizeBinary always corresponds to a UUID; all other sizes are illegal.
DataType::FixedSizeBinary(16) => {
let array = typed_value.as_fixed_size_binary();
Expand Down Expand Up @@ -968,6 +964,55 @@ fn typed_value_to_variant<'a>(
DataType::Float64 => {
primitive_conversion_single_value!(Float64Type, typed_value, index)
}
DataType::Decimal32(_, s) => {
generic_conversion_single_value!(
Decimal32Type,
as_primitive,
|v| VariantDecimal4::try_new(v, *s as u8).map_or(Variant::Null, Variant::from),
typed_value,
index
)
}
DataType::Decimal64(_, s) => {
generic_conversion_single_value!(
Decimal64Type,
as_primitive,
|v| VariantDecimal8::try_new(v, *s as u8).map_or(Variant::Null, Variant::from),
typed_value,
index
)
}
DataType::Decimal128(_, s) => {
generic_conversion_single_value!(
Decimal128Type,
as_primitive,
|v| VariantDecimal16::try_new(v, *s as u8).map_or(Variant::Null, Variant::from),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may need a follow-up item to track turning this into a proper Result instead of silently converting to NULL on failure? That would require VariantArray::value to return a Result<Variant>, which is a biggish change but honestly seems appropriate given that we can't guarantee the input shredding is even physically valid, let alone logically valid? It seems too sharp an edge to just panic.

@alamb -- thoughts?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed an issue #8672 to track this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may need a follow-up item to track turning this into a proper Result instead of silently converting to NULL on failure?

I think this makes a lot of sense.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may need a follow-up item to track turning this into a proper Result instead of silently converting to NULL on failure? That would require VariantArray::value to return a Result, which is a biggish change but honestly seems appropriate given that we can't guarantee the input shredding is even physically valid, let alone logically valid? It seems too sharp an edge to just panic.

@alamb -- thoughts?

How about we add a try_value() method that returns a Result and just have value() unwrap the result?

typed_value,
index
)
}
DataType::Date32 => {
generic_conversion_single_value!(
Date32Type,
as_primitive,
Date32Type::to_naive_date,
typed_value,
index
)
}
DataType::Time64(TimeUnit::Microsecond) => {
generic_conversion_single_value!(
Time64MicrosecondType,
as_primitive,
|v| NaiveTime::from_num_seconds_from_midnight_opt(
(v / 1_000_000) as u32,
(v % 1_000_000) as u32 * 1000
)
.map_or(Variant::Null, Variant::from),
typed_value,
index
)
}
DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
generic_conversion_single_value!(
TimestampMicrosecondType,
Expand Down
159 changes: 157 additions & 2 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -293,13 +293,17 @@ impl<'a> GetOptions<'a> {

#[cfg(test)]
mod test {
use std::str::FromStr;
use std::sync::Arc;

use super::{GetOptions, variant_get};
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
use crate::{VariantArray, VariantArrayBuilder, json_to_variant};
use arrow::array::{
Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array,
Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array,
Int16Array, Int32Array, Int64Array, StringArray, StructArray,
Int16Array, Int32Array, Int64Array, NullBuilder, StringArray, StructArray,
Time64MicrosecondArray,
};
use arrow::buffer::NullBuffer;
use arrow::compute::CastOptions;
Expand All @@ -312,7 +316,6 @@ mod test {
EMPTY_VARIANT_METADATA_BYTES, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16,
VariantDecimalType, VariantPath,
};
use std::sync::Arc;

fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) {
// Create input array from JSON string
Expand Down Expand Up @@ -969,6 +972,158 @@ mod test {
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
);

perfectly_shredded_variant_array_fn!(perfectly_shredded_time_variant_array, || {
Time64MicrosecondArray::from(vec![Some(12345000), Some(87654000), Some(135792000)])
});

perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_time_as_time,
DataType::Time64(TimeUnit::Microsecond),
perfectly_shredded_time_variant_array,
Time64MicrosecondArray::from(vec![Some(12345000), Some(87654000), Some(135792000)])
);

perfectly_shredded_variant_array_fn!(perfectly_shredded_null_variant_array, || {
let mut builder = NullBuilder::new();
builder.append_nulls(3);
builder.finish()
});

perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_null_as_null,
DataType::Null,
perfectly_shredded_null_variant_array,
arrow::array::NullArray::new(3)
);

perfectly_shredded_variant_array_fn!(perfectly_shredded_decimal4_variant_array, || {
Decimal32Array::from(vec![Some(12345), Some(23400), Some(-12342)])
.with_precision_and_scale(5, 2)
.unwrap()
});

perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_decimal4_as_decimal4,
DataType::Decimal32(5, 2),
perfectly_shredded_decimal4_variant_array,
Decimal32Array::from(vec![Some(12345), Some(23400), Some(-12342)])
.with_precision_and_scale(5, 2)
.unwrap()
);

perfectly_shredded_variant_array_fn!(
perfectly_shredded_decimal8_variant_array_cast2decimal32,
|| {
Decimal64Array::from(vec![Some(123456), Some(145678), Some(-123456)])
.with_precision_and_scale(6, 1)
.unwrap()
}
);

// The input will be cast to Decimal32 when transformed to Variant.
// This test covers the path DataType::Decimal64 (the original array)
// -> Variant::Decimal4 (VariantArray) -> DataType::Decimal64 (the result array)
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_decimal8_through_decimal32_as_decimal8,
DataType::Decimal64(6, 1),
perfectly_shredded_decimal8_variant_array_cast2decimal32,
Decimal64Array::from(vec![Some(123456), Some(145678), Some(-123456)])
.with_precision_and_scale(6, 1)
.unwrap()
);

// This test covers the path DataType::Decimal64 (the original array)
// -> Variant::Decimal8 (VariantArray) -> DataType::Decimal64 (the result array)
perfectly_shredded_variant_array_fn!(perfectly_shredded_decimal8_variant_array, || {
Decimal64Array::from(vec![Some(1234567809), Some(1456787000), Some(-1234561203)])
.with_precision_and_scale(10, 1)
.unwrap()
});

perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_decimal8_as_decimal8,
DataType::Decimal64(10, 1),
perfectly_shredded_decimal8_variant_array,
Decimal64Array::from(vec![Some(1234567809), Some(1456787000), Some(-1234561203)])
.with_precision_and_scale(10, 1)
.unwrap()
);

// This test covers the path DataType::Decimal128 (the original array)
// -> Variant::Decimal4 (VariantArray) -> DataType::Decimal128 (the result array)
perfectly_shredded_variant_array_fn!(
perfectly_shredded_decimal16_within_decimal4_variant_array,
|| {
Decimal128Array::from(vec![
Some(i128::from(1234589)),
Some(i128::from(2344444)),
Some(i128::from(-1234789)),
])
.with_precision_and_scale(7, 3)
.unwrap()
}
);

// This test covers the path DataType::Decimal128 (the original array)
// -> Variant::Decimal4 (VariantArray) -> DataType::Decimal128 (the result array)
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_decimal16_within_decimal4_as_decimal16,
DataType::Decimal128(7, 3),
perfectly_shredded_decimal16_within_decimal4_variant_array,
Decimal128Array::from(vec![
Some(i128::from(1234589)),
Some(i128::from(2344444)),
Some(i128::from(-1234789)),
])
.with_precision_and_scale(7, 3)
.unwrap()
);

perfectly_shredded_variant_array_fn!(
perfectly_shredded_decimal16_within_decimal8_variant_array,
|| {
Decimal128Array::from(vec![Some(1234567809), Some(1456787000), Some(-1234561203)])
.with_precision_and_scale(10, 1)
.unwrap()
}
);

// This test covers the path DataType::Decimal128 (the original array)
// -> Variant::Decimal8 (VariantArray) -> DataType::Decimal128 (the result array)
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_decimal16_within8_as_decimal16,
DataType::Decimal128(10, 1),
perfectly_shredded_decimal16_within_decimal8_variant_array,
Decimal128Array::from(vec![Some(1234567809), Some(1456787000), Some(-1234561203)])
.with_precision_and_scale(10, 1)
.unwrap()
);

perfectly_shredded_variant_array_fn!(perfectly_shredded_decimal16_variant_array, || {
Decimal128Array::from(vec![
Some(i128::from_str("12345678901234567899").unwrap()),
Some(i128::from_str("23445677483748324300").unwrap()),
Some(i128::from_str("-12345678901234567899").unwrap()),
])
.with_precision_and_scale(20, 3)
.unwrap()
});

// This test covers the path DataType::Decimal128 (the original array)
// -> Variant::Decimal16 (VariantArray) -> DataType::Decimal128 (the result array)
perfectly_shredded_to_arrow_primitive_test!(
get_variant_perfectly_shredded_decimal16_as_decimal16,
DataType::Decimal128(20, 3),
perfectly_shredded_decimal16_variant_array,
Decimal128Array::from(vec![
Some(i128::from_str("12345678901234567899").unwrap()),
Some(i128::from_str("23445677483748324300").unwrap()),
Some(i128::from_str("-12345678901234567899").unwrap())
])
.with_precision_and_scale(20, 3)
.unwrap()
);

macro_rules! assert_variant_get_as_variant_array_with_default_option {
($variant_array: expr, $array_expected: expr) => {{
let options = GetOptions::new();
Expand Down
Loading
Loading