From 8c7e4e5758ea5cd39fe6ebd7fe659f94e31fb43d Mon Sep 17 00:00:00 2001 From: b41sh Date: Sun, 5 Sep 2021 22:59:15 +0800 Subject: [PATCH 01/17] support interval MonthDayNano --- arrow/Cargo.toml | 2 +- arrow/src/array/array.rs | 6 +++ arrow/src/array/array_primitive.rs | 17 +++++++ arrow/src/array/builder.rs | 4 ++ arrow/src/array/data.rs | 4 ++ arrow/src/array/equal/mod.rs | 3 ++ arrow/src/array/mod.rs | 3 ++ arrow/src/array/ord.rs | 3 ++ arrow/src/array/transform/mod.rs | 4 ++ arrow/src/compute/kernels/cast.rs | 2 + arrow/src/compute/kernels/sort.rs | 5 ++ arrow/src/compute/kernels/take.rs | 12 +++++ arrow/src/datatypes/datatype.rs | 14 +++++- arrow/src/datatypes/mod.rs | 80 +++++++++++++++++++----------- arrow/src/datatypes/native.rs | 58 ++++++++++++++++++++++ arrow/src/datatypes/numeric.rs | 1 + arrow/src/datatypes/types.rs | 6 +++ arrow/src/ipc/convert.rs | 9 ++++ arrow/src/ipc/gen/Schema.rs | 14 ++++-- arrow/src/ipc/reader.rs | 3 +- arrow/src/util/display.rs | 42 ++++++++++++++++ arrow/src/util/integration_util.rs | 4 ++ format/Schema.fbs | 13 ++++- integration-testing/src/lib.rs | 23 +++++++++ parquet/src/arrow/array_reader.rs | 17 ++++++- parquet/src/arrow/arrow_writer.rs | 21 ++++++++ parquet/src/arrow/converter.rs | 52 +++++++++++++++++-- parquet/src/arrow/schema.rs | 41 +++++++-------- 28 files changed, 401 insertions(+), 62 deletions(-) diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index f597c22dccd3..50f415786321 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -38,7 +38,7 @@ path = "src/lib.rs" [dependencies] serde = { version = "1.0", features = ["rc"] } serde_derive = "1.0" -serde_json = { version = "1.0", features = ["preserve_order"] } +serde_json = { version = "1.0", features = ["preserve_order", "arbitrary_precision"] } indexmap = "1.6" rand = { version = "0.8", optional = true } num = "0.4" diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index be19fea37cc8..b28a726fe528 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -275,6 +275,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Interval(IntervalUnit::DayTime) => { Arc::new(IntervalDayTimeArray::from(data)) as ArrayRef } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Arc::new(IntervalMonthDayNanoArray::from(data)) as ArrayRef + } DataType::Duration(TimeUnit::Second) => { Arc::new(DurationSecondArray::from(data)) as ArrayRef } @@ -413,6 +416,9 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { IntervalUnit::DayTime => { new_null_sized_array::(data_type, length) } + IntervalUnit::MonthDayNano => { + new_null_sized_array::(data_type, length) + } }, DataType::FixedSizeBinary(value_len) => make_array(ArrayData::new( data_type.clone(), diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index 5777a0304844..a27093129f52 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -419,6 +419,7 @@ def_numeric_from_vec!(Time64MicrosecondType); def_numeric_from_vec!(Time64NanosecondType); def_numeric_from_vec!(IntervalYearMonthType); def_numeric_from_vec!(IntervalDayTimeType); +def_numeric_from_vec!(IntervalMonthDayNanoType); def_numeric_from_vec!(DurationSecondType); def_numeric_from_vec!(DurationMillisecondType); def_numeric_from_vec!(DurationMicrosecondType); @@ -624,6 +625,22 @@ mod tests { assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); assert_eq!(-5, arr.values()[2]); + + // a month_day_nano interval contains months, days and nanoseconds, + // but we do not yet have accessors for the values + let arr = IntervalMonthDayNanoArray::from(vec![ + Some(100000000000000000000), + None, + Some(-500000000000000000000), + ]); + assert_eq!(3, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(1, arr.null_count()); + assert_eq!(100000000000000000000, arr.value(0)); + assert_eq!(100000000000000000000, arr.values()[0]); + assert!(arr.is_null(1)); + assert_eq!(-500000000000000000000, arr.value(2)); + assert_eq!(-500000000000000000000, arr.values()[2]); } #[test] diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 8139b798b945..f6ea7c91837c 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -1594,6 +1594,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(IntervalDayTimeBuilder::new(capacity)) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Box::new(IntervalMonthDayNanoBuilder::new(capacity)) + } DataType::Duration(TimeUnit::Second) => { Box::new(DurationSecondBuilder::new(capacity)) } @@ -1937,6 +1940,7 @@ impl FieldData { | DataType::Time64(_) | DataType::Interval(IntervalUnit::DayTime) | DataType::Duration(_) => self.append_null::()?, + DataType::Interval(IntervalUnit::MonthDayNano) => self.append_null::()?, DataType::UInt8 => self.append_null::()?, DataType::UInt16 => self.append_null::()?, DataType::UInt32 => self.append_null::()?, diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index cb389cacc7f6..47437871146d 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -114,6 +114,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, ], + DataType::Interval(IntervalUnit::MonthDayNano) => [ + MutableBuffer::new(capacity * mem::size_of::()), + empty_buffer, + ], DataType::Utf8 | DataType::Binary => { let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); // safety: `unsafe` code assumes that this buffer is initialized with one element diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index 8368717c6747..958ac3cfe8f6 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -199,6 +199,9 @@ fn equal_values( | DataType::Duration(_) => primitive_equal::( lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, ), + DataType::Interval(IntervalUnit::MonthDayNano) => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), DataType::Utf8 | DataType::Binary => variable_sized_equal::( lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, ), diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index bd791f96c64f..6fbdda2f0624 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -167,6 +167,7 @@ pub type Time64MicrosecondArray = PrimitiveArray; pub type Time64NanosecondArray = PrimitiveArray; pub type IntervalYearMonthArray = PrimitiveArray; pub type IntervalDayTimeArray = PrimitiveArray; +pub type IntervalMonthDayNanoArray = PrimitiveArray; pub type DurationSecondArray = PrimitiveArray; pub type DurationMillisecondArray = PrimitiveArray; pub type DurationMicrosecondArray = PrimitiveArray; @@ -207,6 +208,7 @@ pub type Time64MicrosecondBufferBuilder = BufferBuilder; pub type Time64NanosecondBufferBuilder = BufferBuilder; pub type IntervalYearMonthBufferBuilder = BufferBuilder; pub type IntervalDayTimeBufferBuilder = BufferBuilder; +pub type IntervalMonthDayNanoBufferBuilder = BufferBuilder; pub type DurationSecondBufferBuilder = BufferBuilder; pub type DurationMillisecondBufferBuilder = BufferBuilder; pub type DurationMicrosecondBufferBuilder = BufferBuilder; @@ -253,6 +255,7 @@ pub type Time64MicrosecondBuilder = PrimitiveBuilder; pub type Time64NanosecondBuilder = PrimitiveBuilder; pub type IntervalYearMonthBuilder = PrimitiveBuilder; pub type IntervalDayTimeBuilder = PrimitiveBuilder; +pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; pub type DurationSecondBuilder = PrimitiveBuilder; pub type DurationMillisecondBuilder = PrimitiveBuilder; pub type DurationMicrosecondBuilder = PrimitiveBuilder; diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index d6534efc9286..37bd0091eee0 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -174,6 +174,9 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { compare_primitives::(left, right) } + (Interval(MonthDayNano), Interval(MonthDayNano)) => { + compare_primitives::(left, right) + } (Duration(Second), Duration(Second)) => { compare_primitives::(left, right) } diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 69092c1af55d..938b0f0b2758 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -259,6 +259,9 @@ fn build_extend(array: &ArrayData) -> Extend { | DataType::Interval(IntervalUnit::DayTime) => { primitive::build_extend::(array) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + primitive::build_extend::(array) + } DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), DataType::LargeUtf8 | DataType::LargeBinary => { variable_size::build_extend::(array) @@ -300,6 +303,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { | DataType::Timestamp(_, _) | DataType::Duration(_) | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, + DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, DataType::List(_) => list::extend_nulls::, diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 593adecc381c..f4871c94bec9 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -3736,6 +3736,7 @@ mod tests { Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), + Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), Arc::new(DurationSecondArray::from(vec![1000, 2000])), Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), @@ -3884,6 +3885,7 @@ mod tests { Duration(TimeUnit::Nanosecond), Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::DayTime), + Interval(IntervalUnit::MonthDayNano), Binary, FixedSizeBinary(10), LargeBinary, diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 6f42be34aa44..3674855d2758 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -243,6 +243,11 @@ pub fn sort_to_indices( DataType::Interval(IntervalUnit::DayTime) => { sort_primitive::(values, v, n, cmp, &options, limit) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + sort_primitive::( + values, v, n, cmp, &options, limit, + ) + } DataType::Duration(TimeUnit::Second) => { sort_primitive::(values, v, n, cmp, &options, limit) } diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 71479723e022..95662d4e90cf 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -171,6 +171,9 @@ where DataType::Interval(IntervalUnit::DayTime) => { downcast_take!(IntervalDayTimeType, values, indices) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + downcast_take!(IntervalMonthDayNanoType, values, indices) + } DataType::Duration(TimeUnit::Second) => { downcast_take!(DurationSecondType, values, indices) } @@ -1170,6 +1173,15 @@ mod tests { ) .unwrap(); + // interval_month_day_nano + test_take_primitive_arrays::( + vec![Some(0), None, Some(2), Some(-15), None], + &index, + None, + vec![Some(-15), None, None, Some(-15), Some(2)], + ) + .unwrap(); + // duration_second test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(-15), None], diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 1cbec341cf37..b12c9b0c9ce2 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -158,7 +158,7 @@ pub enum TimeUnit { Nanosecond, } -/// YEAR_MONTH or DAY_TIME interval in SQL style. +/// YEAR_MONTH, DAY_TIME, MonthDayNano interval in SQL style. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum IntervalUnit { /// Indicates the number of elapsed whole months, stored as 4-byte integers. @@ -166,6 +166,14 @@ pub enum IntervalUnit { /// Indicates the number of elapsed days and milliseconds, /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). DayTime, + /// A triple of the number of elapsed months, days, and nanoseconds. + /// The values are stored contiguously in 16 byte blocks. Months and + /// days are encoded as 32 bit integers and nanoseconds is encoded as a + /// 64 bit integer. All integers are signed. Each field is independent + /// (e.g. there is no constraint that nanoseconds have the same sign + /// as days or that the quantity of nanoseconds represents less + /// than a day's worth of time). + MonthDayNano, } impl fmt::Display for DataType { @@ -287,6 +295,9 @@ impl DataType { Some(p) if p == "YEAR_MONTH" => { Ok(DataType::Interval(IntervalUnit::YearMonth)) } + Some(p) if p == "MONTH_DAY_NANO" => { + Ok(DataType::Interval(IntervalUnit::MonthDayNano)) + } _ => Err(ArrowError::ParseError( "interval unit missing or invalid".to_string(), )), @@ -442,6 +453,7 @@ impl DataType { DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { IntervalUnit::YearMonth => "YEAR_MONTH", IntervalUnit::DayTime => "DAY_TIME", + IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", }}), DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { TimeUnit::Second => "SECOND", diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 9920cf95d3c6..45b8d1822329 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -454,13 +454,14 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), Field::new( - "c21", + "c22", DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), false, ), Field::new( - "c22", + "c23", DataType::FixedSizeList( Box::new(Field::new("bools", DataType::Boolean, false)), 5, @@ -468,7 +469,7 @@ mod tests { false, ), Field::new( - "c23", + "c24", DataType::List(Box::new(Field::new( "inner_list", DataType::List(Box::new(Field::new( @@ -481,21 +482,22 @@ mod tests { true, ), Field::new( - "c24", + "c25", DataType::Struct(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::UInt16, false), ]), false, ), - Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), - Field::new("c27", DataType::Duration(TimeUnit::Second), false), - Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), - Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), - Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), + Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), + Field::new("c29", DataType::Duration(TimeUnit::Second), false), + Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), + Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), + Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( - "c31", + "c33", DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), @@ -504,10 +506,10 @@ mod tests { 123, true, ), - Field::new("c32", DataType::LargeBinary, true), - Field::new("c33", DataType::LargeUtf8, true), + Field::new("c34", DataType::LargeBinary, true), + Field::new("c35", DataType::LargeUtf8, true), Field::new( - "c34", + "c36", DataType::LargeList(Box::new(Field::new( "inner_large_list", DataType::LargeList(Box::new(Field::new( @@ -520,7 +522,7 @@ mod tests { true, ), Field::new( - "c35", + "c37", DataType::Map( Box::new(Field::new( "my_entries", @@ -731,6 +733,15 @@ mod tests { { "name": "c21", "nullable": false, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c22", + "nullable": false, "type": { "name": "list" }, @@ -746,7 +757,7 @@ mod tests { ] }, { - "name": "c22", + "name": "c23", "nullable": false, "type": { "name": "fixedsizelist", @@ -764,7 +775,7 @@ mod tests { ] }, { - "name": "c23", + "name": "c24", "nullable": true, "type": { "name": "list" @@ -790,7 +801,7 @@ mod tests { ] }, { - "name": "c24", + "name": "c25", "nullable": false, "type": { "name": "struct" @@ -817,7 +828,7 @@ mod tests { ] }, { - "name": "c25", + "name": "c26", "nullable": true, "type": { "name": "interval", @@ -826,7 +837,7 @@ mod tests { "children": [] }, { - "name": "c26", + "name": "c27", "nullable": true, "type": { "name": "interval", @@ -835,7 +846,16 @@ mod tests { "children": [] }, { - "name": "c27", + "name": "c28", + "nullable": true, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c29", "nullable": false, "type": { "name": "duration", @@ -844,7 +864,7 @@ mod tests { "children": [] }, { - "name": "c28", + "name": "c30", "nullable": false, "type": { "name": "duration", @@ -853,7 +873,7 @@ mod tests { "children": [] }, { - "name": "c29", + "name": "c31", "nullable": false, "type": { "name": "duration", @@ -862,7 +882,7 @@ mod tests { "children": [] }, { - "name": "c30", + "name": "c32", "nullable": false, "type": { "name": "duration", @@ -871,7 +891,7 @@ mod tests { "children": [] }, { - "name": "c31", + "name": "c33", "nullable": true, "children": [], "type": { @@ -888,7 +908,7 @@ mod tests { } }, { - "name": "c32", + "name": "c34", "nullable": true, "type": { "name": "largebinary" @@ -896,7 +916,7 @@ mod tests { "children": [] }, { - "name": "c33", + "name": "c35", "nullable": true, "type": { "name": "largeutf8" @@ -904,7 +924,7 @@ mod tests { "children": [] }, { - "name": "c34", + "name": "c36", "nullable": true, "type": { "name": "largelist" @@ -930,7 +950,7 @@ mod tests { ] }, { - "name": "c35", + "name": "c37", "nullable": false, "type": { "name": "map", @@ -1156,10 +1176,12 @@ mod tests { assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1i128.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1u128.into_json_value()); assert_eq!( Some(VNumber(Number::from_f64(0.01f64).unwrap())), 0.01.into_json_value() diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 6e8cf892237e..579536333abc 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -67,6 +67,12 @@ pub trait ArrowNativeType: fn from_i64(_: i64) -> Option { None } + + /// Convert native type from i128. + #[inline] + fn from_i128(_: i128) -> Option { + None + } } /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the @@ -201,6 +207,35 @@ impl ArrowNativeType for i64 { } } +impl JsonSerializable for i128 { + fn into_json_value(self) -> Option { + Some(self.into()) + } +} + +impl ArrowNativeType for i128 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } + + /// Convert native type from i128. + #[inline] + fn from_i128(val: i128) -> Option { + Some(val) + } +} + impl JsonSerializable for u8 { fn into_json_value(self) -> Option { Some(self.into()) @@ -293,6 +328,29 @@ impl ArrowNativeType for u64 { } } +impl JsonSerializable for u128 { + fn into_json_value(self) -> Option { + Some(self.into()) + } +} + +impl ArrowNativeType for u128 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } +} + impl JsonSerializable for f32 { fn into_json_value(self) -> Option { Number::from_f64(f64::round(self as f64 * 1000.0) / 1000.0).map(Value::Number) diff --git a/arrow/src/datatypes/numeric.rs b/arrow/src/datatypes/numeric.rs index 39c6732c3231..cbb953c57b94 100644 --- a/arrow/src/datatypes/numeric.rs +++ b/arrow/src/datatypes/numeric.rs @@ -348,6 +348,7 @@ make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8); make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8); make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16); make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8); +make_numeric_type!(IntervalMonthDayNanoType, i128, i128x4, m128x4); make_numeric_type!(DurationSecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); diff --git a/arrow/src/datatypes/types.rs b/arrow/src/datatypes/types.rs index 30c9aae89565..03643228f138 100644 --- a/arrow/src/datatypes/types.rs +++ b/arrow/src/datatypes/types.rs @@ -96,6 +96,11 @@ make_type!( i64, DataType::Interval(IntervalUnit::DayTime) ); +make_type!( + IntervalMonthDayNanoType, + i128, + DataType::Interval(IntervalUnit::MonthDayNano) +); make_type!( DurationSecondType, i64, @@ -152,6 +157,7 @@ impl ArrowTemporalType for Time64MicrosecondType {} impl ArrowTemporalType for Time64NanosecondType {} // impl ArrowTemporalType for IntervalYearMonthType {} // impl ArrowTemporalType for IntervalDayTimeType {} +// impl ArrowTemporalType for IntervalMonthDayNanoType {} impl ArrowTemporalType for DurationSecondType {} impl ArrowTemporalType for DurationMillisecondType {} impl ArrowTemporalType for DurationMicrosecondType {} diff --git a/arrow/src/ipc/convert.rs b/arrow/src/ipc/convert.rs index 5244a387c90b..f456c676b9ba 100644 --- a/arrow/src/ipc/convert.rs +++ b/arrow/src/ipc/convert.rs @@ -263,6 +263,9 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT DataType::Interval(IntervalUnit::YearMonth) } ipc::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime), + ipc::IntervalUnit::MONTH_DAY_NANO => { + DataType::Interval(IntervalUnit::MonthDayNano) + } z => panic!("Interval type with unit of {:?} unsupported", z), } } @@ -557,6 +560,7 @@ pub(crate) fn get_fb_field_type<'a>( let interval_unit = match unit { IntervalUnit::YearMonth => ipc::IntervalUnit::YEAR_MONTH, IntervalUnit::DayTime => ipc::IntervalUnit::DAY_TIME, + IntervalUnit::MonthDayNano => ipc::IntervalUnit::MONTH_DAY_NANO, }; builder.add_unit(interval_unit); FBFieldType { @@ -771,6 +775,11 @@ mod tests { DataType::Interval(IntervalUnit::DayTime), true, ), + Field::new( + "interval[mdn]", + DataType::Interval(IntervalUnit::MonthDayNano), + true, + ), Field::new("utf8", DataType::Utf8, false), Field::new("binary", DataType::Binary, false), Field::new( diff --git a/arrow/src/ipc/gen/Schema.rs b/arrow/src/ipc/gen/Schema.rs index 12af5b5b0806..dd204e0704df 100644 --- a/arrow/src/ipc/gen/Schema.rs +++ b/arrow/src/ipc/gen/Schema.rs @@ -639,8 +639,11 @@ pub const ENUM_MAX_INTERVAL_UNIT: i16 = 1; note = "Use associated constants instead. This will no longer be generated in 2021." )] #[allow(non_camel_case_types)] -pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 2] = - [IntervalUnit::YEAR_MONTH, IntervalUnit::DAY_TIME]; +pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 3] = [ + IntervalUnit::YEAR_MONTH, + IntervalUnit::DAY_TIME, + IntervalUnit::MONTH_DAY_NANO, +]; #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(transparent)] @@ -649,15 +652,18 @@ pub struct IntervalUnit(pub i16); impl IntervalUnit { pub const YEAR_MONTH: Self = Self(0); pub const DAY_TIME: Self = Self(1); + pub const MONTH_DAY_NANO: Self = Self(2); pub const ENUM_MIN: i16 = 0; - pub const ENUM_MAX: i16 = 1; - pub const ENUM_VALUES: &'static [Self] = &[Self::YEAR_MONTH, Self::DAY_TIME]; + pub const ENUM_MAX: i16 = 2; + pub const ENUM_VALUES: &'static [Self] = + &[Self::YEAR_MONTH, Self::DAY_TIME, Self::MONTH_DAY_NANO]; /// Returns the variant's name or "" if unknown. pub fn variant_name(self) -> Option<&'static str> { match self { Self::YEAR_MONTH => Some("YEAR_MONTH"), Self::DAY_TIME => Some("DAY_TIME"), + Self::MONTH_DAY_NANO => Some("MONTH_DAY_NANO"), _ => None, } } diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 50e858f098a8..cca0c3196c6e 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -309,7 +309,8 @@ fn create_primitive_array( | Timestamp(_, _) | Date64 | Duration(_) - | Interval(IntervalUnit::DayTime) => { + | Interval(IntervalUnit::DayTime) + | Interval(IntervalUnit::MonthDayNano) => { let mut builder = ArrayData::builder(data_type.clone()) .len(length) .buffers(buffers[1..].to_vec()) diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index fbc0bd579e0b..4167dd5b408e 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -106,6 +106,45 @@ macro_rules! make_string_interval_day_time { }}; } +macro_rules! make_string_interval_month_day_nano { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u128 = array.value($row) as u128; + + let months_part: i32 = + ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; + let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + + let secs = nanoseconds_part / 1000000000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years {} mons {} days {} hours {} mins {}.{:02} secs", + months_part, + days_part, + hours, + mins, + secs, + (nanoseconds_part % 1000000000), + ) + }; + + Ok(s) + }}; +} + macro_rules! make_string_date { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); @@ -292,6 +331,9 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result { make_string_interval_year_month!(column, row) } + IntervalUnit::MonthDayNano => { + make_string_interval_month_day_nano!(column, row) + } }, DataType::List(_) => make_string_from_list!(column, row), DataType::Dictionary(index_type, _value_type) => match **index_type { diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index ada2494d3c2d..2adcfd734d76 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -286,6 +286,10 @@ impl ArrowJsonBatch { .collect::>(); arr.equals_json(&x.iter().collect::>()[..]) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let arr = IntervalMonthDayNanoArray::from(arr.data().clone()); + arr.equals_json(&json_array.iter().collect::>()[..]) + } DataType::UInt8 => { let arr = arr.as_any().downcast_ref::().unwrap(); arr.equals_json(&json_array.iter().collect::>()[..]) diff --git a/format/Schema.fbs b/format/Schema.fbs index 3b00dd4780d6..9da095177c7d 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -246,15 +246,24 @@ table Timestamp { timezone: string; } -enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} +enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO} // A "calendar" interval which models types that don't necessarily // have a precise duration without the context of a base timestamp (e.g. // days can differ in length during day light savings time transitions). +// All integers in the types below are stored in the endianness indicated +// by the schema. // YEAR_MONTH - Indicates the number of elapsed whole months, stored as -// 4-byte integers. +// 4-byte signed integers. // DAY_TIME - Indicates the number of elapsed days and milliseconds, // stored as 2 contiguous 32-bit integers (8-bytes in total). Support // of this IntervalUnit is not required for full arrow compatibility. +// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds. +// The values are stored contiguously in 16 byte blocks. Months and +// days are encoded as 32 bit integers and nanoseconds is encoded as a +// 64 bit integer. All integers are signed. Each field is independent +// (e.g. there is no constraint that nanoseconds have the same sign +// as days or that the quantity of nanoseconds represents less +// than a day's worth of time). table Interval { unit: IntervalUnit; } diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 6db3fce91e0d..06a0eb1a7a70 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -210,6 +210,29 @@ fn array_from_json( let array = Arc::new(b.finish()) as ArrayRef; arrow::compute::cast(&array, field.data_type()) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Number(n) => n.as_i64().unwrap() as i128, + Value::String(s) => { + s.parse().expect("Unable to parse string as i64") + } + _ => panic!("Unable to parse {:?} as number", value), + }), + _ => b.append_null(), + }?; + } + let array = Arc::new(b.finish()) as ArrayRef; + arrow::compute::cast(&array, field.data_type()) + } DataType::UInt8 => { let mut b = UInt8Builder::new(json_col.count); for (is_valid, value) in json_col diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader.rs index d3259c46bbad..c44971bd899e 100644 --- a/parquet/src/arrow/array_reader.rs +++ b/parquet/src/arrow/array_reader.rs @@ -58,7 +58,8 @@ use crate::arrow::converter::{ BinaryArrayConverter, BinaryConverter, Converter, DecimalArrayConverter, DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter, Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter, - IntervalDayTimeConverter, IntervalYearMonthArrayConverter, + IntervalDayTimeConverter, IntervalMonthDayNanoArrayConverter, + IntervalMonthDayNanoConverter, IntervalYearMonthArrayConverter, IntervalYearMonthConverter, LargeBinaryArrayConverter, LargeBinaryConverter, LargeUtf8ArrayConverter, LargeUtf8Converter, }; @@ -1807,6 +1808,20 @@ impl<'a> ArrayReaderBuilder { arrow_type, )?)) } + Some(ArrowType::Interval(IntervalUnit::MonthDayNano)) => { + let converter = IntervalMonthDayNanoConverter::new( + IntervalMonthDayNanoArrayConverter {}, + ); + Ok(Box::new(ComplexObjectArrayReader::< + FixedLenByteArrayType, + IntervalMonthDayNanoConverter, + >::new( + page_iterator, + column_desc, + converter, + arrow_type, + )?)) + } Some(t) => Err(ArrowError(format!( "Cannot write a Parquet interval to {:?}", t diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index 7728cd4cb2f2..d4964d9b4d4b 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -426,6 +426,13 @@ fn write_leaf( .unwrap(); get_interval_dt_array_slice(array, &indices) } + IntervalUnit::MonthDayNano => { + let array = column + .as_any() + .downcast_ref::() + .unwrap(); + get_interval_mdn_array_slice(array, &indices) + } }, ArrowDataType::FixedSizeBinary(_) => { let array = column @@ -542,6 +549,20 @@ fn get_interval_dt_array_slice( values } +/// Returns 16-byte values representing 3 values of months, days (4-bytes each) and nanoseconds (8-bytes). +fn get_interval_mdn_array_slice( + array: &arrow_array::IntervalMonthDayNanoArray, + indices: &[usize], +) -> Vec { + let mut values = Vec::with_capacity(indices.len()); + for i in indices { + let value = array.value(*i).to_le_bytes().to_vec(); + debug_assert_eq!(value.len(), 16); + values.push(FixedLenByteArray::from(ByteArray::from(value))); + } + values +} + fn get_decimal_array_slice( array: &arrow_array::DecimalArray, indices: &[usize], diff --git a/parquet/src/arrow/converter.rs b/parquet/src/arrow/converter.rs index 1672be9c0462..3e9cc6e27963 100644 --- a/parquet/src/arrow/converter.rs +++ b/parquet/src/arrow/converter.rs @@ -19,9 +19,10 @@ use crate::data_type::{ByteArray, DataType, FixedLenByteArray, Int96}; // TODO: clean up imports (best done when there are few moving parts) use arrow::array::{ Array, ArrayRef, BinaryBuilder, DecimalBuilder, FixedSizeBinaryBuilder, - IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalYearMonthArray, - IntervalYearMonthBuilder, LargeBinaryBuilder, LargeStringBuilder, PrimitiveBuilder, - PrimitiveDictionaryBuilder, StringBuilder, StringDictionaryBuilder, + IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalMonthDayNanoArray, + IntervalMonthDayNanoBuilder, IntervalYearMonthArray, IntervalYearMonthBuilder, + LargeBinaryBuilder, LargeStringBuilder, PrimitiveBuilder, PrimitiveDictionaryBuilder, + StringBuilder, StringDictionaryBuilder, }; use arrow::compute::cast; use std::convert::{From, TryInto}; @@ -167,6 +168,46 @@ impl Converter>, IntervalDayTimeArray> } } +/// An Arrow Interval converter, which reads the last 16 bytes of a Parquet interval, +/// and interprets it as an i128 value representing the Arrow MonthDayNano value +pub struct IntervalMonthDayNanoArrayConverter {} + +impl IntervalMonthDayNanoArrayConverter { + fn from_bytes_to_i128(b: &[u8]) -> i128 { + assert!( + b.len() <= 16, + "IntervalMonthDayNanoArray supports only up to size 16" + ); + let first_bit = b[0] & 128u8 == 128u8; + let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; + for (i, v) in b.iter().enumerate() { + result[i + (16 - b.len())] = *v; + } + i128::from_be_bytes(result) + } +} + +impl Converter>, IntervalMonthDayNanoArray> + for IntervalMonthDayNanoArrayConverter +{ + fn convert( + &self, + source: Vec>, + ) -> Result { + let mut builder = IntervalMonthDayNanoBuilder::new(source.len()); + for v in source { + match v { + Some(array) => { + builder.append_value(Self::from_bytes_to_i128(array.data())) + } + None => builder.append_null(), + }? + } + + Ok(builder.finish()) + } +} + pub struct Int96ArrayConverter { pub timezone: Option, } @@ -388,6 +429,11 @@ pub type IntervalDayTimeConverter = ArrayRefConverter< IntervalDayTimeArray, IntervalDayTimeArrayConverter, >; +pub type IntervalMonthDayNanoConverter = ArrayRefConverter< + Vec>, + IntervalMonthDayNanoArray, + IntervalMonthDayNanoArrayConverter, +>; pub type DecimalConverter = ArrayRefConverter< Vec>, diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 5fe94cef94db..70a94a7e5c1b 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -2013,18 +2013,19 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), Field::new( - "c21", + "c22", DataType::List(Box::new(Field::new("list", DataType::Boolean, true))), false, ), // Field::new( - // "c22", + // "c23", // DataType::FixedSizeList(Box::new(DataType::Boolean), 5), // false, // ), // Field::new( - // "c23", + // "c24", // DataType::List(Box::new(DataType::LargeList(Box::new( // DataType::Struct(vec![ // Field::new("a", DataType::Int16, true), @@ -2034,21 +2035,21 @@ mod tests { // true, // ), Field::new( - "c24", + "c25", DataType::Struct(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::UInt16, false), ]), false, ), - Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), - // Field::new("c27", DataType::Duration(TimeUnit::Second), false), - // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), - // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), - // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), + // Field::new("c28", DataType::Duration(TimeUnit::Second), false), + // Field::new("c29", DataType::Duration(TimeUnit::Millisecond), false), + // Field::new("c30", DataType::Duration(TimeUnit::Microsecond), false), + // Field::new("c31", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( - "c31", + "c32", DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), @@ -2057,10 +2058,10 @@ mod tests { 123, true, ), - Field::new("c32", DataType::LargeBinary, true), - Field::new("c33", DataType::LargeUtf8, true), + Field::new("c33", DataType::LargeBinary, true), + Field::new("c34", DataType::LargeUtf8, true), // Field::new( - // "c34", + // "c35", // DataType::LargeList(Box::new(DataType::List(Box::new( // DataType::Struct(vec![ // Field::new("a", DataType::Int16, true), @@ -2069,12 +2070,12 @@ mod tests { // )))), // true, // ), - Field::new("c35", DataType::Null, true), - Field::new("c36", DataType::Decimal(2, 1), false), - Field::new("c37", DataType::Decimal(50, 20), false), - Field::new("c38", DataType::Decimal(18, 12), true), + Field::new("c36", DataType::Null, true), + Field::new("c37", DataType::Decimal(2, 1), false), + Field::new("c38", DataType::Decimal(50, 20), false), + Field::new("c39", DataType::Decimal(18, 12), true), Field::new( - "c39", + "c40", DataType::Map( Box::new(Field::new( "key_value", @@ -2097,7 +2098,7 @@ mod tests { true, ), Field::new( - "c40", + "c41", DataType::Map( Box::new(Field::new( "my_entries", From 4791a976f7160b4afc89a88f6d77d5045d80276a Mon Sep 17 00:00:00 2001 From: b41sh Date: Sat, 18 Sep 2021 18:16:11 +0800 Subject: [PATCH 02/17] fix --- integration-testing/src/lib.rs | 23 -------------- parquet/src/arrow/array_reader.rs | 17 +--------- parquet/src/arrow/arrow_writer.rs | 38 +++++++++++----------- parquet/src/arrow/converter.rs | 52 ++----------------------------- parquet/src/arrow/schema.rs | 41 ++++++++++++------------ 5 files changed, 42 insertions(+), 129 deletions(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 06a0eb1a7a70..6db3fce91e0d 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -210,29 +210,6 @@ fn array_from_json( let array = Arc::new(b.finish()) as ArrayRef; arrow::compute::cast(&array, field.data_type()) } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(match value { - Value::Number(n) => n.as_i64().unwrap() as i128, - Value::String(s) => { - s.parse().expect("Unable to parse string as i64") - } - _ => panic!("Unable to parse {:?} as number", value), - }), - _ => b.append_null(), - }?; - } - let array = Arc::new(b.finish()) as ArrayRef; - arrow::compute::cast(&array, field.data_type()) - } DataType::UInt8 => { let mut b = UInt8Builder::new(json_col.count); for (is_valid, value) in json_col diff --git a/parquet/src/arrow/array_reader.rs b/parquet/src/arrow/array_reader.rs index c44971bd899e..d3259c46bbad 100644 --- a/parquet/src/arrow/array_reader.rs +++ b/parquet/src/arrow/array_reader.rs @@ -58,8 +58,7 @@ use crate::arrow::converter::{ BinaryArrayConverter, BinaryConverter, Converter, DecimalArrayConverter, DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter, Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter, - IntervalDayTimeConverter, IntervalMonthDayNanoArrayConverter, - IntervalMonthDayNanoConverter, IntervalYearMonthArrayConverter, + IntervalDayTimeConverter, IntervalYearMonthArrayConverter, IntervalYearMonthConverter, LargeBinaryArrayConverter, LargeBinaryConverter, LargeUtf8ArrayConverter, LargeUtf8Converter, }; @@ -1808,20 +1807,6 @@ impl<'a> ArrayReaderBuilder { arrow_type, )?)) } - Some(ArrowType::Interval(IntervalUnit::MonthDayNano)) => { - let converter = IntervalMonthDayNanoConverter::new( - IntervalMonthDayNanoArrayConverter {}, - ); - Ok(Box::new(ComplexObjectArrayReader::< - FixedLenByteArrayType, - IntervalMonthDayNanoConverter, - >::new( - page_iterator, - column_desc, - converter, - arrow_type, - )?)) - } Some(t) => Err(ArrowError(format!( "Cannot write a Parquet interval to {:?}", t diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index d4964d9b4d4b..dba3fb4c6277 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -426,12 +426,13 @@ fn write_leaf( .unwrap(); get_interval_dt_array_slice(array, &indices) } - IntervalUnit::MonthDayNano => { - let array = column - .as_any() - .downcast_ref::() - .unwrap(); - get_interval_mdn_array_slice(array, &indices) + _ => { + return Err(ParquetError::NYI( + format!( + "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", + interval_unit + ) + )); } }, ArrowDataType::FixedSizeBinary(_) => { @@ -549,20 +550,6 @@ fn get_interval_dt_array_slice( values } -/// Returns 16-byte values representing 3 values of months, days (4-bytes each) and nanoseconds (8-bytes). -fn get_interval_mdn_array_slice( - array: &arrow_array::IntervalMonthDayNanoArray, - indices: &[usize], -) -> Vec { - let mut values = Vec::with_capacity(indices.len()); - for i in indices { - let value = array.value(*i).to_le_bytes().to_vec(); - debug_assert_eq!(value.len(), 16); - values.push(FixedLenByteArray::from(ByteArray::from(value))); - } - values -} - fn get_decimal_array_slice( array: &arrow_array::DecimalArray, indices: &[usize], @@ -1463,6 +1450,17 @@ mod tests { ); } + #[test] + #[should_panic( + expected = "Attempting to write an Arrow type MonthDayNano to parquet that is not yet implemented" + )] + fn interval_month_day_nano_single_column() { + required_and_optional::( + 0..SMALL_SIZE as i128, + "interval_month_day_nano_single_column", + ); + } + #[test] fn binary_single_column() { let one_vec: Vec = (0..SMALL_SIZE as u8).collect(); diff --git a/parquet/src/arrow/converter.rs b/parquet/src/arrow/converter.rs index 3e9cc6e27963..1672be9c0462 100644 --- a/parquet/src/arrow/converter.rs +++ b/parquet/src/arrow/converter.rs @@ -19,10 +19,9 @@ use crate::data_type::{ByteArray, DataType, FixedLenByteArray, Int96}; // TODO: clean up imports (best done when there are few moving parts) use arrow::array::{ Array, ArrayRef, BinaryBuilder, DecimalBuilder, FixedSizeBinaryBuilder, - IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalMonthDayNanoArray, - IntervalMonthDayNanoBuilder, IntervalYearMonthArray, IntervalYearMonthBuilder, - LargeBinaryBuilder, LargeStringBuilder, PrimitiveBuilder, PrimitiveDictionaryBuilder, - StringBuilder, StringDictionaryBuilder, + IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalYearMonthArray, + IntervalYearMonthBuilder, LargeBinaryBuilder, LargeStringBuilder, PrimitiveBuilder, + PrimitiveDictionaryBuilder, StringBuilder, StringDictionaryBuilder, }; use arrow::compute::cast; use std::convert::{From, TryInto}; @@ -168,46 +167,6 @@ impl Converter>, IntervalDayTimeArray> } } -/// An Arrow Interval converter, which reads the last 16 bytes of a Parquet interval, -/// and interprets it as an i128 value representing the Arrow MonthDayNano value -pub struct IntervalMonthDayNanoArrayConverter {} - -impl IntervalMonthDayNanoArrayConverter { - fn from_bytes_to_i128(b: &[u8]) -> i128 { - assert!( - b.len() <= 16, - "IntervalMonthDayNanoArray supports only up to size 16" - ); - let first_bit = b[0] & 128u8 == 128u8; - let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] }; - for (i, v) in b.iter().enumerate() { - result[i + (16 - b.len())] = *v; - } - i128::from_be_bytes(result) - } -} - -impl Converter>, IntervalMonthDayNanoArray> - for IntervalMonthDayNanoArrayConverter -{ - fn convert( - &self, - source: Vec>, - ) -> Result { - let mut builder = IntervalMonthDayNanoBuilder::new(source.len()); - for v in source { - match v { - Some(array) => { - builder.append_value(Self::from_bytes_to_i128(array.data())) - } - None => builder.append_null(), - }? - } - - Ok(builder.finish()) - } -} - pub struct Int96ArrayConverter { pub timezone: Option, } @@ -429,11 +388,6 @@ pub type IntervalDayTimeConverter = ArrayRefConverter< IntervalDayTimeArray, IntervalDayTimeArrayConverter, >; -pub type IntervalMonthDayNanoConverter = ArrayRefConverter< - Vec>, - IntervalMonthDayNanoArray, - IntervalMonthDayNanoArrayConverter, ->; pub type DecimalConverter = ArrayRefConverter< Vec>, diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 70a94a7e5c1b..5fe94cef94db 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -2013,19 +2013,18 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), - Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), Field::new( - "c22", + "c21", DataType::List(Box::new(Field::new("list", DataType::Boolean, true))), false, ), // Field::new( - // "c23", + // "c22", // DataType::FixedSizeList(Box::new(DataType::Boolean), 5), // false, // ), // Field::new( - // "c24", + // "c23", // DataType::List(Box::new(DataType::LargeList(Box::new( // DataType::Struct(vec![ // Field::new("a", DataType::Int16, true), @@ -2035,21 +2034,21 @@ mod tests { // true, // ), Field::new( - "c25", + "c24", DataType::Struct(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::UInt16, false), ]), false, ), - Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), - // Field::new("c28", DataType::Duration(TimeUnit::Second), false), - // Field::new("c29", DataType::Duration(TimeUnit::Millisecond), false), - // Field::new("c30", DataType::Duration(TimeUnit::Microsecond), false), - // Field::new("c31", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), + // Field::new("c27", DataType::Duration(TimeUnit::Second), false), + // Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), + // Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), + // Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( - "c32", + "c31", DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), @@ -2058,10 +2057,10 @@ mod tests { 123, true, ), - Field::new("c33", DataType::LargeBinary, true), - Field::new("c34", DataType::LargeUtf8, true), + Field::new("c32", DataType::LargeBinary, true), + Field::new("c33", DataType::LargeUtf8, true), // Field::new( - // "c35", + // "c34", // DataType::LargeList(Box::new(DataType::List(Box::new( // DataType::Struct(vec![ // Field::new("a", DataType::Int16, true), @@ -2070,12 +2069,12 @@ mod tests { // )))), // true, // ), - Field::new("c36", DataType::Null, true), - Field::new("c37", DataType::Decimal(2, 1), false), - Field::new("c38", DataType::Decimal(50, 20), false), - Field::new("c39", DataType::Decimal(18, 12), true), + Field::new("c35", DataType::Null, true), + Field::new("c36", DataType::Decimal(2, 1), false), + Field::new("c37", DataType::Decimal(50, 20), false), + Field::new("c38", DataType::Decimal(18, 12), true), Field::new( - "c40", + "c39", DataType::Map( Box::new(Field::new( "key_value", @@ -2098,7 +2097,7 @@ mod tests { true, ), Field::new( - "c41", + "c40", DataType::Map( Box::new(Field::new( "my_entries", From 41132568ffe8a0481163110cf51d0b773a89c1a7 Mon Sep 17 00:00:00 2001 From: b41sh Date: Thu, 23 Sep 2021 01:32:18 +0800 Subject: [PATCH 03/17] fix --- arrow/src/array/array_primitive.rs | 3 ++- arrow/src/datatypes/datatype.rs | 2 +- arrow/src/datatypes/native.rs | 23 ----------------------- parquet/src/arrow/arrow_writer.rs | 2 +- 4 files changed, 4 insertions(+), 26 deletions(-) diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index a27093129f52..da1769b0aadc 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -627,7 +627,8 @@ mod tests { assert_eq!(-5, arr.values()[2]); // a month_day_nano interval contains months, days and nanoseconds, - // but we do not yet have accessors for the values + // but we do not yet have accessors for the values. + // TODO: implement month, day, and nanos access method for month_day_nano. let arr = IntervalMonthDayNanoArray::from(vec![ Some(100000000000000000000), None, diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index b12c9b0c9ce2..ab8cf0cc648d 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -158,7 +158,7 @@ pub enum TimeUnit { Nanosecond, } -/// YEAR_MONTH, DAY_TIME, MonthDayNano interval in SQL style. +/// YEAR_MONTH, DAY_TIME, Month_Day_Nano interval in SQL style. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum IntervalUnit { /// Indicates the number of elapsed whole months, stored as 4-byte integers. diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 579536333abc..d91f58153459 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -328,29 +328,6 @@ impl ArrowNativeType for u64 { } } -impl JsonSerializable for u128 { - fn into_json_value(self) -> Option { - Some(self.into()) - } -} - -impl ArrowNativeType for u128 { - #[inline] - fn from_usize(v: usize) -> Option { - num::FromPrimitive::from_usize(v) - } - - #[inline] - fn to_usize(&self) -> Option { - num::ToPrimitive::to_usize(self) - } - - #[inline] - fn to_isize(&self) -> Option { - num::ToPrimitive::to_isize(self) - } -} - impl JsonSerializable for f32 { fn into_json_value(self) -> Option { Number::from_f64(f64::round(self as f64 * 1000.0) / 1000.0).map(Value::Number) diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index dba3fb4c6277..ddc2dc5ecc9e 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -429,7 +429,7 @@ fn write_leaf( _ => { return Err(ParquetError::NYI( format!( - "Attempting to write an Arrow type {:?} to parquet that is not yet implemented", + "Attempting to write an Arrow interval type {:?} to parquet that is not yet implemented", interval_unit ) )); From 3c2155e15bcf528fb49726533880e88ac421f4d6 Mon Sep 17 00:00:00 2001 From: b41sh Date: Thu, 23 Sep 2021 01:34:54 +0800 Subject: [PATCH 04/17] fix --- arrow/src/datatypes/datatype.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index ab8cf0cc648d..b08f714a8209 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -158,7 +158,7 @@ pub enum TimeUnit { Nanosecond, } -/// YEAR_MONTH, DAY_TIME, Month_Day_Nano interval in SQL style. +/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum IntervalUnit { /// Indicates the number of elapsed whole months, stored as 4-byte integers. From 1ff27a3c54c55f8284a2e473965d4633701a6cb8 Mon Sep 17 00:00:00 2001 From: b41sh Date: Thu, 23 Sep 2021 01:48:39 +0800 Subject: [PATCH 05/17] fix test --- arrow/src/datatypes/mod.rs | 1 - parquet/src/arrow/arrow_writer.rs | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 45b8d1822329..bc866b0145d7 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -1181,7 +1181,6 @@ mod tests { assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value()); - assert_eq!(Some(VNumber(Number::from(1))), 1u128.into_json_value()); assert_eq!( Some(VNumber(Number::from_f64(0.01f64).unwrap())), 0.01.into_json_value() diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index ddc2dc5ecc9e..90cf38816c83 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -1452,7 +1452,7 @@ mod tests { #[test] #[should_panic( - expected = "Attempting to write an Arrow type MonthDayNano to parquet that is not yet implemented" + expected = "Attempting to write an Arrow interval type MonthDayNano to parquet that is not yet implemented" )] fn interval_month_day_nano_single_column() { required_and_optional::( From 2ad60f1fc2c234fed6fd257a2f79c7e08ddc6844 Mon Sep 17 00:00:00 2001 From: b41sh Date: Sun, 5 Dec 2021 01:23:36 +0800 Subject: [PATCH 06/17] add IPC integration test --- .github/workflows/integration.yml | 4 +- integration-testing/unskip.patch | 70 +++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 integration-testing/unskip.patch diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 41b1dcbe8eb9..10db6ceb9f98 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -38,6 +38,8 @@ jobs: with: path: rust fetch-depth: 0 + - name: Test more cases + run: git apply rust/integration-testing/unskip.patch - name: Setup Python uses: actions/setup-python@v2 with: @@ -84,7 +86,7 @@ jobs: virtualenv venv source venv/bin/activate pip install maturin toml pytest pytz pyarrow>=5.0 - - name: Run tests + - name: Run tests env: CARGO_HOME: "/home/runner/.cargo" CARGO_TARGET_DIR: "/home/runner/target" diff --git a/integration-testing/unskip.patch b/integration-testing/unskip.patch new file mode 100644 index 000000000000..436e2b5cfa2e --- /dev/null +++ b/integration-testing/unskip.patch @@ -0,0 +1,70 @@ +diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py +index 6a077a893..cab6ecd37 100644 +--- a/dev/archery/archery/integration/datagen.py ++++ b/dev/archery/archery/integration/datagen.py +@@ -1561,8 +1561,7 @@ def get_generated_json_files(tempdir=None): + .skip_category('C#') + .skip_category('JS'), # TODO(ARROW-7900) + +- generate_decimal128_case() +- .skip_category('Rust'), ++ generate_decimal128_case(), + + generate_decimal256_case() + .skip_category('Go') # TODO(ARROW-7948): Decimal + Go +@@ -1574,18 +1573,15 @@ def get_generated_json_files(tempdir=None): + + generate_interval_case() + .skip_category('C#') +- .skip_category('JS') # TODO(ARROW-5239): Intervals + JS +- .skip_category('Rust'), ++ .skip_category('JS'), # TODO(ARROW-5239): Intervals + JS + + generate_month_day_nano_interval_case() + .skip_category('C#') +- .skip_category('JS') +- .skip_category('Rust'), ++ .skip_category('JS'), + + + generate_map_case() +- .skip_category('C#') +- .skip_category('Rust'), ++ .skip_category('C#'), + + generate_non_canonical_map_case() + .skip_category('C#') +@@ -1602,14 +1598,12 @@ def get_generated_json_files(tempdir=None): + generate_nested_large_offsets_case() + .skip_category('C#') + .skip_category('Go') +- .skip_category('JS') +- .skip_category('Rust'), ++ .skip_category('JS'), + + generate_unions_case() + .skip_category('C#') + .skip_category('Go') +- .skip_category('JS') +- .skip_category('Rust'), ++ .skip_category('JS'), + + generate_custom_metadata_case() + .skip_category('C#') +@@ -1634,14 +1628,12 @@ def get_generated_json_files(tempdir=None): + .skip_category('C#') + .skip_category('Go') + .skip_category('Java') # TODO(ARROW-7779) +- .skip_category('JS') +- .skip_category('Rust'), ++ .skip_category('JS'), + + generate_extension_case() + .skip_category('C#') + .skip_category('Go') # TODO(ARROW-3039): requires dictionaries +- .skip_category('JS') +- .skip_category('Rust'), ++ .skip_category('JS'), + ] + + generated_paths = [] From 4e19eb8cff858caeac42b216c7961a53237f417c Mon Sep 17 00:00:00 2001 From: b41sh Date: Sun, 5 Dec 2021 02:02:34 +0800 Subject: [PATCH 07/17] fix rat --- integration-testing/unskip.patch | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/integration-testing/unskip.patch b/integration-testing/unskip.patch index 436e2b5cfa2e..47e0877b8fa1 100644 --- a/integration-testing/unskip.patch +++ b/integration-testing/unskip.patch @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 6a077a893..cab6ecd37 100644 --- a/dev/archery/archery/integration/datagen.py From cb5a23715f7629ac301023e876ef6b5a3c36c945 Mon Sep 17 00:00:00 2001 From: b41sh Date: Sun, 5 Dec 2021 07:32:58 +0800 Subject: [PATCH 08/17] update patch --- integration-testing/unskip.patch | 62 ++++---------------------------- 1 file changed, 6 insertions(+), 56 deletions(-) diff --git a/integration-testing/unskip.patch b/integration-testing/unskip.patch index 47e0877b8fa1..0b5ba317f2e8 100644 --- a/integration-testing/unskip.patch +++ b/integration-testing/unskip.patch @@ -16,72 +16,22 @@ // under the License. diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py -index 6a077a893..cab6ecd37 100644 +index 6a077a893..277722c25 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py -@@ -1561,8 +1561,7 @@ def get_generated_json_files(tempdir=None): - .skip_category('C#') - .skip_category('JS'), # TODO(ARROW-7900) - -- generate_decimal128_case() -- .skip_category('Rust'), -+ generate_decimal128_case(), - - generate_decimal256_case() - .skip_category('Go') # TODO(ARROW-7948): Decimal + Go -@@ -1574,18 +1573,15 @@ def get_generated_json_files(tempdir=None): - +@@ -1574,13 +1574,11 @@ def get_generated_json_files(tempdir=None): + generate_interval_case() .skip_category('C#') - .skip_category('JS') # TODO(ARROW-5239): Intervals + JS - .skip_category('Rust'), + .skip_category('JS'), # TODO(ARROW-5239): Intervals + JS - + generate_month_day_nano_interval_case() .skip_category('C#') - .skip_category('JS') - .skip_category('Rust'), + .skip_category('JS'), - - + + generate_map_case() -- .skip_category('C#') -- .skip_category('Rust'), -+ .skip_category('C#'), - - generate_non_canonical_map_case() - .skip_category('C#') -@@ -1602,14 +1598,12 @@ def get_generated_json_files(tempdir=None): - generate_nested_large_offsets_case() - .skip_category('C#') - .skip_category('Go') -- .skip_category('JS') -- .skip_category('Rust'), -+ .skip_category('JS'), - - generate_unions_case() - .skip_category('C#') - .skip_category('Go') -- .skip_category('JS') -- .skip_category('Rust'), -+ .skip_category('JS'), - - generate_custom_metadata_case() - .skip_category('C#') -@@ -1634,14 +1628,12 @@ def get_generated_json_files(tempdir=None): - .skip_category('C#') - .skip_category('Go') - .skip_category('Java') # TODO(ARROW-7779) -- .skip_category('JS') -- .skip_category('Rust'), -+ .skip_category('JS'), - - generate_extension_case() - .skip_category('C#') - .skip_category('Go') # TODO(ARROW-3039): requires dictionaries -- .skip_category('JS') -- .skip_category('Rust'), -+ .skip_category('JS'), - ] - - generated_paths = [] From b2065adf75bff03e839738dcfd07b8d0fbb03354 Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 01:08:42 +0800 Subject: [PATCH 09/17] fix --- .github/workflows/integration.yml | 2 +- integration-testing/src/lib.rs | 35 +++++++++++++++++++++++++++++++ integration-testing/unskip.patch | 11 +++------- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 10db6ceb9f98..1dc84878e61b 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -86,7 +86,7 @@ jobs: virtualenv venv source venv/bin/activate pip install maturin toml pytest pytz pyarrow>=5.0 - - name: Run tests + - name: Run tests env: CARGO_HOME: "/home/runner/.cargo" CARGO_TARGET_DIR: "/home/runner/target" diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index f25157f635bc..14ace11bc7e0 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -280,6 +280,41 @@ fn array_from_json( } Ok(Arc::new(b.finish())) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Object(v) => { + let months = v.get("months").unwrap(); + let days = v.get("days").unwrap(); + let nanoseconds = v.get("nanoseconds").unwrap(); + match (months, days, nanoseconds) { + (Value::Number(months), Value::Number(days), Value::Number(nanoseconds)) => { + let months = months.as_i64().unwrap() as u32; + let days = days.as_i64().unwrap() as i32; + let nanoseconds = nanoseconds.as_i64().unwrap(); + let months_days_ns: i128 = (months as i128) << 96 | (days as i128) << 64 | (nanoseconds as i128); + months_days_ns + } + (_, _, _) => panic!("Unable to parse {:?} as MonthDayNano", v), + } + } + _ => panic!("Unable to parse {:?} as MonthDayNano", value), + }), + _ => b.append_null(), + }?; + } + let array = Arc::new(b.finish()) as ArrayRef; + arrow::compute::cast(&array, field.data_type()) + } + DataType::Float32 => { let mut b = Float32Builder::new(json_col.count); for (is_valid, value) in json_col diff --git a/integration-testing/unskip.patch b/integration-testing/unskip.patch index 0b5ba317f2e8..5f53afd60f0c 100644 --- a/integration-testing/unskip.patch +++ b/integration-testing/unskip.patch @@ -16,16 +16,10 @@ // under the License. diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py -index 6a077a893..277722c25 100644 +index 6a077a893..104c49c52 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py -@@ -1574,13 +1574,11 @@ def get_generated_json_files(tempdir=None): - - generate_interval_case() - .skip_category('C#') -- .skip_category('JS') # TODO(ARROW-5239): Intervals + JS -- .skip_category('Rust'), -+ .skip_category('JS'), # TODO(ARROW-5239): Intervals + JS +@@ -1579,8 +1579,7 @@ def get_generated_json_files(tempdir=None): generate_month_day_nano_interval_case() .skip_category('C#') @@ -35,3 +29,4 @@ index 6a077a893..277722c25 100644 generate_map_case() + From 1304df6b6f3f072baec17d9db3a3312c08b2eb67 Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 01:26:10 +0800 Subject: [PATCH 10/17] fmt --- integration-testing/src/lib.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 14ace11bc7e0..8cd507bde4e0 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -296,14 +296,22 @@ fn array_from_json( let days = v.get("days").unwrap(); let nanoseconds = v.get("nanoseconds").unwrap(); match (months, days, nanoseconds) { - (Value::Number(months), Value::Number(days), Value::Number(nanoseconds)) => { + ( + Value::Number(months), + Value::Number(days), + Value::Number(nanoseconds), + ) => { let months = months.as_i64().unwrap() as u32; let days = days.as_i64().unwrap() as i32; let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = (months as i128) << 96 | (days as i128) << 64 | (nanoseconds as i128); + let months_days_ns: i128 = (months as i128) << 96 + | (days as i128) << 64 + | (nanoseconds as i128); months_days_ns } - (_, _, _) => panic!("Unable to parse {:?} as MonthDayNano", v), + (_, _, _) => { + panic!("Unable to parse {:?} as MonthDayNano", v) + } } } _ => panic!("Unable to parse {:?} as MonthDayNano", value), From 74abfcba13c541432780ca9634ad106e724c4f79 Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 16:30:42 +0800 Subject: [PATCH 11/17] fix --- integration-testing/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 8cd507bde4e0..840377b49624 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -301,7 +301,7 @@ fn array_from_json( Value::Number(days), Value::Number(nanoseconds), ) => { - let months = months.as_i64().unwrap() as u32; + let months = months.as_i64().unwrap() as i32; let days = days.as_i64().unwrap() as i32; let nanoseconds = nanoseconds.as_i64().unwrap(); let months_days_ns: i128 = (months as i128) << 96 From 49d503ef058b552e3549bda030a7efc7b5a76ec6 Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 17:21:47 +0800 Subject: [PATCH 12/17] fix --- integration-testing/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 840377b49624..45b8c1a5e9d5 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -301,8 +301,8 @@ fn array_from_json( Value::Number(days), Value::Number(nanoseconds), ) => { - let months = months.as_i64().unwrap() as i32; - let days = days.as_i64().unwrap() as i32; + let months = months.as_i64().unwrap(); + let days = days.as_i64().unwrap(); let nanoseconds = nanoseconds.as_i64().unwrap(); let months_days_ns: i128 = (months as i128) << 96 | (days as i128) << 64 From 3571b1bbcdc511f3d325c683f5604c7c8fd999b8 Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 18:27:57 +0800 Subject: [PATCH 13/17] fix --- integration-testing/src/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 45b8c1a5e9d5..7d398af54429 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -295,6 +295,10 @@ fn array_from_json( let months = v.get("months").unwrap(); let days = v.get("days").unwrap(); let nanoseconds = v.get("nanoseconds").unwrap(); + println!( + "months={:?} days={:?} nanos={:?}", + months, days, nanoseconds + ); match (months, days, nanoseconds) { ( Value::Number(months), @@ -307,6 +311,7 @@ fn array_from_json( let months_days_ns: i128 = (months as i128) << 96 | (days as i128) << 64 | (nanoseconds as i128); + println!("months_days_ns={:?}", months_days_ns); months_days_ns } (_, _, _) => { From adc36f106aceff21996c372b79991402b2991895 Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 20:02:01 +0800 Subject: [PATCH 14/17] fix --- integration-testing/src/lib.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 7d398af54429..02d20e90b385 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -305,12 +305,13 @@ fn array_from_json( Value::Number(days), Value::Number(nanoseconds), ) => { - let months = months.as_i64().unwrap(); - let days = days.as_i64().unwrap(); + let months = months.as_i64().unwrap() as i32; + let days = days.as_i64().unwrap() as i32; let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = (months as i128) << 96 - | (days as i128) << 64 - | (nanoseconds as i128); + let months_days_ns: i128 = (nanoseconds as i128) + << 64 + | (days as i128) << 32 + | (months as i128); println!("months_days_ns={:?}", months_days_ns); months_days_ns } From cf8402d32f86c148c5bb347adf05c36e647bc15e Mon Sep 17 00:00:00 2001 From: b41sh Date: Mon, 6 Dec 2021 21:05:49 +0800 Subject: [PATCH 15/17] fix --- integration-testing/src/lib.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 02d20e90b385..d55728c86180 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -308,10 +308,11 @@ fn array_from_json( let months = months.as_i64().unwrap() as i32; let days = days.as_i64().unwrap() as i32; let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = (nanoseconds as i128) + let months_days_ns: i128 = ((nanoseconds as i128) + & 0xFFFFFFFFFFFFFFFF) << 64 - | (days as i128) << 32 - | (months as i128); + | ((days as i128) & 0xFFFFFFFF) << 32 + | ((months as i128) & 0xFFFFFFFF); println!("months_days_ns={:?}", months_days_ns); months_days_ns } From b628d499226f4f7af7b79e4d4d502fa949d1f2ed Mon Sep 17 00:00:00 2001 From: b41sh Date: Tue, 7 Dec 2021 10:19:04 +0800 Subject: [PATCH 16/17] fix --- integration-testing/src/lib.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index d55728c86180..cb57ffc2d0dc 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -295,10 +295,6 @@ fn array_from_json( let months = v.get("months").unwrap(); let days = v.get("days").unwrap(); let nanoseconds = v.get("nanoseconds").unwrap(); - println!( - "months={:?} days={:?} nanos={:?}", - months, days, nanoseconds - ); match (months, days, nanoseconds) { ( Value::Number(months), @@ -313,7 +309,6 @@ fn array_from_json( << 64 | ((days as i128) & 0xFFFFFFFF) << 32 | ((months as i128) & 0xFFFFFFFF); - println!("months_days_ns={:?}", months_days_ns); months_days_ns } (_, _, _) => { @@ -326,10 +321,8 @@ fn array_from_json( _ => b.append_null(), }?; } - let array = Arc::new(b.finish()) as ArrayRef; - arrow::compute::cast(&array, field.data_type()) + Ok(Arc::new(b.finish())) } - DataType::Float32 => { let mut b = Float32Builder::new(json_col.count); for (is_valid, value) in json_col From 841321ae7aa95552f46437a731731acc8c68082c Mon Sep 17 00:00:00 2001 From: b41sh Date: Thu, 9 Dec 2021 16:47:30 +0800 Subject: [PATCH 17/17] remove integration-testing/unskip.patch --- .github/workflows/integration.yml | 2 -- integration-testing/unskip.patch | 32 ------------------------------- 2 files changed, 34 deletions(-) delete mode 100644 integration-testing/unskip.patch diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 1dc84878e61b..41b1dcbe8eb9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -38,8 +38,6 @@ jobs: with: path: rust fetch-depth: 0 - - name: Test more cases - run: git apply rust/integration-testing/unskip.patch - name: Setup Python uses: actions/setup-python@v2 with: diff --git a/integration-testing/unskip.patch b/integration-testing/unskip.patch deleted file mode 100644 index 5f53afd60f0c..000000000000 --- a/integration-testing/unskip.patch +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py -index 6a077a893..104c49c52 100644 ---- a/dev/archery/archery/integration/datagen.py -+++ b/dev/archery/archery/integration/datagen.py -@@ -1579,8 +1579,7 @@ def get_generated_json_files(tempdir=None): - - generate_month_day_nano_interval_case() - .skip_category('C#') -- .skip_category('JS') -- .skip_category('Rust'), -+ .skip_category('JS'), - - - generate_map_case() -