diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 12c141c0a8f8..d7903877165d 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -39,7 +39,7 @@ path = "src/lib.rs" [dependencies] serde = { version = "1.0" } serde_derive = "1.0" -serde_json = { version = "1.0", features = ["preserve_order"] } +serde_json = { version = "1.0", features = ["preserve_order", "arbitrary_precision"] } indexmap = { version = "1.6", features = ["std"] } rand = { version = "0.8", optional = true } num = "0.4" diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 34cdb73f7166..7f790ef8f796 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -275,6 +275,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::Interval(IntervalUnit::DayTime) => { Arc::new(IntervalDayTimeArray::from(data)) as ArrayRef } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Arc::new(IntervalMonthDayNanoArray::from(data)) as ArrayRef + } DataType::Duration(TimeUnit::Second) => { Arc::new(DurationSecondArray::from(data)) as ArrayRef } @@ -415,6 +418,9 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { IntervalUnit::DayTime => { new_null_sized_array::(data_type, length) } + IntervalUnit::MonthDayNano => { + new_null_sized_array::(data_type, length) + } }, DataType::FixedSizeBinary(value_len) => make_array(unsafe { ArrayData::new_unchecked( diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs index ac56b4a97cc7..a9e1ba87207e 100644 --- a/arrow/src/array/array_primitive.rs +++ b/arrow/src/array/array_primitive.rs @@ -444,6 +444,7 @@ def_numeric_from_vec!(Time64MicrosecondType); def_numeric_from_vec!(Time64NanosecondType); def_numeric_from_vec!(IntervalYearMonthType); def_numeric_from_vec!(IntervalDayTimeType); +def_numeric_from_vec!(IntervalMonthDayNanoType); def_numeric_from_vec!(DurationSecondType); def_numeric_from_vec!(DurationMillisecondType); def_numeric_from_vec!(DurationMicrosecondType); @@ -649,6 +650,23 @@ mod tests { assert!(arr.is_null(1)); assert_eq!(-5, arr.value(2)); assert_eq!(-5, arr.values()[2]); + + // a month_day_nano interval contains months, days and nanoseconds, + // but we do not yet have accessors for the values. + // TODO: implement month, day, and nanos access method for month_day_nano. + let arr = IntervalMonthDayNanoArray::from(vec![ + Some(100000000000000000000), + None, + Some(-500000000000000000000), + ]); + assert_eq!(3, arr.len()); + assert_eq!(0, arr.offset()); + assert_eq!(1, arr.null_count()); + assert_eq!(100000000000000000000, arr.value(0)); + assert_eq!(100000000000000000000, arr.values()[0]); + assert!(arr.is_null(1)); + assert_eq!(-500000000000000000000, arr.value(2)); + assert_eq!(-500000000000000000000, arr.values()[2]); } #[test] diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index af6f3c39a713..8d650db032d6 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -1686,6 +1686,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(IntervalDayTimeBuilder::new(capacity)) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + Box::new(IntervalMonthDayNanoBuilder::new(capacity)) + } DataType::Duration(TimeUnit::Second) => { Box::new(DurationSecondBuilder::new(capacity)) } @@ -2031,6 +2034,7 @@ impl FieldData { | DataType::Time64(_) | DataType::Interval(IntervalUnit::DayTime) | DataType::Duration(_) => self.append_null::()?, + DataType::Interval(IntervalUnit::MonthDayNano) => self.append_null::()?, DataType::UInt8 => self.append_null::()?, DataType::UInt16 => self.append_null::()?, DataType::UInt32 => self.append_null::()?, diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 2dc694c852a9..9b46a79ed799 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -121,6 +121,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, ], + DataType::Interval(IntervalUnit::MonthDayNano) => [ + MutableBuffer::new(capacity * mem::size_of::()), + empty_buffer, + ], DataType::Utf8 | DataType::Binary => { let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); // safety: `unsafe` code assumes that this buffer is initialized with one element @@ -1178,6 +1182,9 @@ fn layout(data_type: &DataType) -> DataTypeLayout { DataType::Interval(IntervalUnit::DayTime) => { DataTypeLayout::new_fixed_width(size_of::()) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + DataTypeLayout::new_fixed_width(size_of::()) + } DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::()), DataType::Binary => DataTypeLayout::new_binary(size_of::()), DataType::FixedSizeBinary(bytes_per_value) => { diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index 0e8d8bb2731b..9a044e612906 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -199,6 +199,9 @@ fn equal_values( | DataType::Duration(_) => primitive_equal::( lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, ), + DataType::Interval(IntervalUnit::MonthDayNano) => primitive_equal::( + lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, + ), DataType::Utf8 | DataType::Binary => variable_sized_equal::( lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len, ), diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 26db3e6004a4..1145e833f4bd 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -385,6 +385,7 @@ pub type Time64MicrosecondArray = PrimitiveArray; pub type Time64NanosecondArray = PrimitiveArray; pub type IntervalYearMonthArray = PrimitiveArray; pub type IntervalDayTimeArray = PrimitiveArray; +pub type IntervalMonthDayNanoArray = PrimitiveArray; pub type DurationSecondArray = PrimitiveArray; pub type DurationMillisecondArray = PrimitiveArray; pub type DurationMicrosecondArray = PrimitiveArray; @@ -425,6 +426,7 @@ pub type Time64MicrosecondBufferBuilder = BufferBuilder; pub type Time64NanosecondBufferBuilder = BufferBuilder; pub type IntervalYearMonthBufferBuilder = BufferBuilder; pub type IntervalDayTimeBufferBuilder = BufferBuilder; +pub type IntervalMonthDayNanoBufferBuilder = BufferBuilder; pub type DurationSecondBufferBuilder = BufferBuilder; pub type DurationMillisecondBufferBuilder = BufferBuilder; pub type DurationMicrosecondBufferBuilder = BufferBuilder; @@ -473,6 +475,7 @@ pub type Time64MicrosecondBuilder = PrimitiveBuilder; pub type Time64NanosecondBuilder = PrimitiveBuilder; pub type IntervalYearMonthBuilder = PrimitiveBuilder; pub type IntervalDayTimeBuilder = PrimitiveBuilder; +pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder; pub type DurationSecondBuilder = PrimitiveBuilder; pub type DurationMillisecondBuilder = PrimitiveBuilder; pub type DurationMicrosecondBuilder = PrimitiveBuilder; diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index d6534efc9286..37bd0091eee0 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -174,6 +174,9 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { compare_primitives::(left, right) } + (Interval(MonthDayNano), Interval(MonthDayNano)) => { + compare_primitives::(left, right) + } (Duration(Second), Duration(Second)) => { compare_primitives::(left, right) } diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 9ad3dbf7c13b..4d2d139b982c 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -257,6 +257,9 @@ fn build_extend(array: &ArrayData) -> Extend { | DataType::Interval(IntervalUnit::DayTime) => { primitive::build_extend::(array) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + primitive::build_extend::(array) + } DataType::Utf8 | DataType::Binary => variable_size::build_extend::(array), DataType::LargeUtf8 | DataType::LargeBinary => { variable_size::build_extend::(array) @@ -298,6 +301,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { | DataType::Timestamp(_, _) | DataType::Duration(_) | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::, + DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::, DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::, DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, DataType::List(_) => list::extend_nulls::, diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 2c694248490a..48073637b148 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -3785,6 +3785,7 @@ mod tests { Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), + Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), Arc::new(DurationSecondArray::from(vec![1000, 2000])), Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), @@ -3940,6 +3941,7 @@ mod tests { Duration(TimeUnit::Nanosecond), Interval(IntervalUnit::YearMonth), Interval(IntervalUnit::DayTime), + Interval(IntervalUnit::MonthDayNano), Binary, FixedSizeBinary(10), LargeBinary, diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 6a72224979cd..99ad726f06a2 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -243,6 +243,11 @@ pub fn sort_to_indices( DataType::Interval(IntervalUnit::DayTime) => { sort_primitive::(values, v, n, cmp, &options, limit) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + sort_primitive::( + values, v, n, cmp, &options, limit, + ) + } DataType::Duration(TimeUnit::Second) => { sort_primitive::(values, v, n, cmp, &options, limit) } diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 9fe00ea9a7b9..cd836376cdf2 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -171,6 +171,9 @@ where DataType::Interval(IntervalUnit::DayTime) => { downcast_take!(IntervalDayTimeType, values, indices) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + downcast_take!(IntervalMonthDayNanoType, values, indices) + } DataType::Duration(TimeUnit::Second) => { downcast_take!(DurationSecondType, values, indices) } @@ -1185,6 +1188,15 @@ mod tests { ) .unwrap(); + // interval_month_day_nano + test_take_primitive_arrays::( + vec![Some(0), None, Some(2), Some(-15), None], + &index, + None, + vec![Some(-15), None, None, Some(-15), Some(2)], + ) + .unwrap(); + // duration_second test_take_primitive_arrays::( vec![Some(0), None, Some(2), Some(-15), None], diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 96fb18be6ec9..ae61f0831522 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -158,7 +158,7 @@ pub enum TimeUnit { Nanosecond, } -/// YEAR_MONTH or DAY_TIME interval in SQL style. +/// YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO interval in SQL style. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub enum IntervalUnit { /// Indicates the number of elapsed whole months, stored as 4-byte integers. @@ -166,6 +166,14 @@ pub enum IntervalUnit { /// Indicates the number of elapsed days and milliseconds, /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). DayTime, + /// A triple of the number of elapsed months, days, and nanoseconds. + /// The values are stored contiguously in 16 byte blocks. Months and + /// days are encoded as 32 bit integers and nanoseconds is encoded as a + /// 64 bit integer. All integers are signed. Each field is independent + /// (e.g. there is no constraint that nanoseconds have the same sign + /// as days or that the quantity of nanoseconds represents less + /// than a day's worth of time). + MonthDayNano, } impl fmt::Display for DataType { @@ -287,6 +295,9 @@ impl DataType { Some(p) if p == "YEAR_MONTH" => { Ok(DataType::Interval(IntervalUnit::YearMonth)) } + Some(p) if p == "MONTH_DAY_NANO" => { + Ok(DataType::Interval(IntervalUnit::MonthDayNano)) + } _ => Err(ArrowError::ParseError( "interval unit missing or invalid".to_string(), )), @@ -442,6 +453,7 @@ impl DataType { DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { IntervalUnit::YearMonth => "YEAR_MONTH", IntervalUnit::DayTime => "DAY_TIME", + IntervalUnit::MonthDayNano => "MONTH_DAY_NANO", }}), DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { TimeUnit::Second => "SECOND", diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index 9920cf95d3c6..bc866b0145d7 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -454,13 +454,14 @@ mod tests { ), Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), + Field::new("c21", DataType::Interval(IntervalUnit::MonthDayNano), false), Field::new( - "c21", + "c22", DataType::List(Box::new(Field::new("item", DataType::Boolean, true))), false, ), Field::new( - "c22", + "c23", DataType::FixedSizeList( Box::new(Field::new("bools", DataType::Boolean, false)), 5, @@ -468,7 +469,7 @@ mod tests { false, ), Field::new( - "c23", + "c24", DataType::List(Box::new(Field::new( "inner_list", DataType::List(Box::new(Field::new( @@ -481,21 +482,22 @@ mod tests { true, ), Field::new( - "c24", + "c25", DataType::Struct(vec![ Field::new("a", DataType::Utf8, false), Field::new("b", DataType::UInt16, false), ]), false, ), - Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), - Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), - Field::new("c27", DataType::Duration(TimeUnit::Second), false), - Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), - Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), - Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), + Field::new("c26", DataType::Interval(IntervalUnit::YearMonth), true), + Field::new("c27", DataType::Interval(IntervalUnit::DayTime), true), + Field::new("c28", DataType::Interval(IntervalUnit::MonthDayNano), true), + Field::new("c29", DataType::Duration(TimeUnit::Second), false), + Field::new("c30", DataType::Duration(TimeUnit::Millisecond), false), + Field::new("c31", DataType::Duration(TimeUnit::Microsecond), false), + Field::new("c32", DataType::Duration(TimeUnit::Nanosecond), false), Field::new_dict( - "c31", + "c33", DataType::Dictionary( Box::new(DataType::Int32), Box::new(DataType::Utf8), @@ -504,10 +506,10 @@ mod tests { 123, true, ), - Field::new("c32", DataType::LargeBinary, true), - Field::new("c33", DataType::LargeUtf8, true), + Field::new("c34", DataType::LargeBinary, true), + Field::new("c35", DataType::LargeUtf8, true), Field::new( - "c34", + "c36", DataType::LargeList(Box::new(Field::new( "inner_large_list", DataType::LargeList(Box::new(Field::new( @@ -520,7 +522,7 @@ mod tests { true, ), Field::new( - "c35", + "c37", DataType::Map( Box::new(Field::new( "my_entries", @@ -731,6 +733,15 @@ mod tests { { "name": "c21", "nullable": false, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c22", + "nullable": false, "type": { "name": "list" }, @@ -746,7 +757,7 @@ mod tests { ] }, { - "name": "c22", + "name": "c23", "nullable": false, "type": { "name": "fixedsizelist", @@ -764,7 +775,7 @@ mod tests { ] }, { - "name": "c23", + "name": "c24", "nullable": true, "type": { "name": "list" @@ -790,7 +801,7 @@ mod tests { ] }, { - "name": "c24", + "name": "c25", "nullable": false, "type": { "name": "struct" @@ -817,7 +828,7 @@ mod tests { ] }, { - "name": "c25", + "name": "c26", "nullable": true, "type": { "name": "interval", @@ -826,7 +837,7 @@ mod tests { "children": [] }, { - "name": "c26", + "name": "c27", "nullable": true, "type": { "name": "interval", @@ -835,7 +846,16 @@ mod tests { "children": [] }, { - "name": "c27", + "name": "c28", + "nullable": true, + "type": { + "name": "interval", + "unit": "MONTH_DAY_NANO" + }, + "children": [] + }, + { + "name": "c29", "nullable": false, "type": { "name": "duration", @@ -844,7 +864,7 @@ mod tests { "children": [] }, { - "name": "c28", + "name": "c30", "nullable": false, "type": { "name": "duration", @@ -853,7 +873,7 @@ mod tests { "children": [] }, { - "name": "c29", + "name": "c31", "nullable": false, "type": { "name": "duration", @@ -862,7 +882,7 @@ mod tests { "children": [] }, { - "name": "c30", + "name": "c32", "nullable": false, "type": { "name": "duration", @@ -871,7 +891,7 @@ mod tests { "children": [] }, { - "name": "c31", + "name": "c33", "nullable": true, "children": [], "type": { @@ -888,7 +908,7 @@ mod tests { } }, { - "name": "c32", + "name": "c34", "nullable": true, "type": { "name": "largebinary" @@ -896,7 +916,7 @@ mod tests { "children": [] }, { - "name": "c33", + "name": "c35", "nullable": true, "type": { "name": "largeutf8" @@ -904,7 +924,7 @@ mod tests { "children": [] }, { - "name": "c34", + "name": "c36", "nullable": true, "type": { "name": "largelist" @@ -930,7 +950,7 @@ mod tests { ] }, { - "name": "c35", + "name": "c37", "nullable": false, "type": { "name": "map", @@ -1156,6 +1176,7 @@ mod tests { assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value()); + assert_eq!(Some(VNumber(Number::from(1))), 1i128.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); diff --git a/arrow/src/datatypes/native.rs b/arrow/src/datatypes/native.rs index 18d593b72980..019a1f225bb2 100644 --- a/arrow/src/datatypes/native.rs +++ b/arrow/src/datatypes/native.rs @@ -67,6 +67,12 @@ pub trait ArrowNativeType: fn from_i64(_: i64) -> Option { None } + + /// Convert native type from i128. + #[inline] + fn from_i128(_: i128) -> Option { + None + } } /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the @@ -201,6 +207,35 @@ impl ArrowNativeType for i64 { } } +impl JsonSerializable for i128 { + fn into_json_value(self) -> Option { + Some(self.into()) + } +} + +impl ArrowNativeType for i128 { + #[inline] + fn from_usize(v: usize) -> Option { + num::FromPrimitive::from_usize(v) + } + + #[inline] + fn to_usize(&self) -> Option { + num::ToPrimitive::to_usize(self) + } + + #[inline] + fn to_isize(&self) -> Option { + num::ToPrimitive::to_isize(self) + } + + /// Convert native type from i128. + #[inline] + fn from_i128(val: i128) -> Option { + Some(val) + } +} + impl JsonSerializable for u8 { fn into_json_value(self) -> Option { Some(self.into()) diff --git a/arrow/src/datatypes/numeric.rs b/arrow/src/datatypes/numeric.rs index 39c6732c3231..cbb953c57b94 100644 --- a/arrow/src/datatypes/numeric.rs +++ b/arrow/src/datatypes/numeric.rs @@ -348,6 +348,7 @@ make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8); make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8); make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16); make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8); +make_numeric_type!(IntervalMonthDayNanoType, i128, i128x4, m128x4); make_numeric_type!(DurationSecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); diff --git a/arrow/src/datatypes/types.rs b/arrow/src/datatypes/types.rs index 2731e3d46658..0937c3b3c9d7 100644 --- a/arrow/src/datatypes/types.rs +++ b/arrow/src/datatypes/types.rs @@ -98,6 +98,11 @@ make_type!( i64, DataType::Interval(IntervalUnit::DayTime) ); +make_type!( + IntervalMonthDayNanoType, + i128, + DataType::Interval(IntervalUnit::MonthDayNano) +); make_type!( DurationSecondType, i64, @@ -154,6 +159,7 @@ impl ArrowTemporalType for Time64MicrosecondType {} impl ArrowTemporalType for Time64NanosecondType {} // impl ArrowTemporalType for IntervalYearMonthType {} // impl ArrowTemporalType for IntervalDayTimeType {} +// impl ArrowTemporalType for IntervalMonthDayNanoType {} impl ArrowTemporalType for DurationSecondType {} impl ArrowTemporalType for DurationMillisecondType {} impl ArrowTemporalType for DurationMicrosecondType {} diff --git a/arrow/src/ipc/convert.rs b/arrow/src/ipc/convert.rs index dcc9fcc84a0f..4d64583d6fa0 100644 --- a/arrow/src/ipc/convert.rs +++ b/arrow/src/ipc/convert.rs @@ -263,6 +263,9 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT DataType::Interval(IntervalUnit::YearMonth) } ipc::IntervalUnit::DAY_TIME => DataType::Interval(IntervalUnit::DayTime), + ipc::IntervalUnit::MONTH_DAY_NANO => { + DataType::Interval(IntervalUnit::MonthDayNano) + } z => panic!("Interval type with unit of {:?} unsupported", z), } } @@ -557,6 +560,7 @@ pub(crate) fn get_fb_field_type<'a>( let interval_unit = match unit { IntervalUnit::YearMonth => ipc::IntervalUnit::YEAR_MONTH, IntervalUnit::DayTime => ipc::IntervalUnit::DAY_TIME, + IntervalUnit::MonthDayNano => ipc::IntervalUnit::MONTH_DAY_NANO, }; builder.add_unit(interval_unit); FBFieldType { @@ -757,6 +761,11 @@ mod tests { DataType::Interval(IntervalUnit::DayTime), true, ), + Field::new( + "interval[mdn]", + DataType::Interval(IntervalUnit::MonthDayNano), + true, + ), Field::new("utf8", DataType::Utf8, false), Field::new("binary", DataType::Binary, false), Field::new( diff --git a/arrow/src/ipc/gen/Schema.rs b/arrow/src/ipc/gen/Schema.rs index 12af5b5b0806..dd204e0704df 100644 --- a/arrow/src/ipc/gen/Schema.rs +++ b/arrow/src/ipc/gen/Schema.rs @@ -639,8 +639,11 @@ pub const ENUM_MAX_INTERVAL_UNIT: i16 = 1; note = "Use associated constants instead. This will no longer be generated in 2021." )] #[allow(non_camel_case_types)] -pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 2] = - [IntervalUnit::YEAR_MONTH, IntervalUnit::DAY_TIME]; +pub const ENUM_VALUES_INTERVAL_UNIT: [IntervalUnit; 3] = [ + IntervalUnit::YEAR_MONTH, + IntervalUnit::DAY_TIME, + IntervalUnit::MONTH_DAY_NANO, +]; #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] #[repr(transparent)] @@ -649,15 +652,18 @@ pub struct IntervalUnit(pub i16); impl IntervalUnit { pub const YEAR_MONTH: Self = Self(0); pub const DAY_TIME: Self = Self(1); + pub const MONTH_DAY_NANO: Self = Self(2); pub const ENUM_MIN: i16 = 0; - pub const ENUM_MAX: i16 = 1; - pub const ENUM_VALUES: &'static [Self] = &[Self::YEAR_MONTH, Self::DAY_TIME]; + pub const ENUM_MAX: i16 = 2; + pub const ENUM_VALUES: &'static [Self] = + &[Self::YEAR_MONTH, Self::DAY_TIME, Self::MONTH_DAY_NANO]; /// Returns the variant's name or "" if unknown. pub fn variant_name(self) -> Option<&'static str> { match self { Self::YEAR_MONTH => Some("YEAR_MONTH"), Self::DAY_TIME => Some("DAY_TIME"), + Self::MONTH_DAY_NANO => Some("MONTH_DAY_NANO"), _ => None, } } diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 3878586e50c2..1d9f36d78337 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -312,7 +312,8 @@ fn create_primitive_array( | Timestamp(_, _) | Date64 | Duration(_) - | Interval(IntervalUnit::DayTime) => { + | Interval(IntervalUnit::DayTime) + | Interval(IntervalUnit::MonthDayNano) => { let mut builder = ArrayData::builder(data_type.clone()) .len(length) .buffers(buffers[1..].to_vec()) diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index 09872a79661b..e427dcca91be 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -106,6 +106,45 @@ macro_rules! make_string_interval_day_time { }}; } +macro_rules! make_string_interval_month_day_nano { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::() + .unwrap(); + + let s = if array.is_null($row) { + "NULL".to_string() + } else { + let value: u128 = array.value($row) as u128; + + let months_part: i32 = + ((value & 0xFFFFFFFF000000000000000000000000) >> 96) as i32; + let days_part: i32 = ((value & 0xFFFFFFFF0000000000000000) >> 64) as i32; + let nanoseconds_part: i64 = (value & 0xFFFFFFFFFFFFFFFF) as i64; + + let secs = nanoseconds_part / 1000000000; + let mins = secs / 60; + let hours = mins / 60; + + let secs = secs - (mins * 60); + let mins = mins - (hours * 60); + + format!( + "0 years {} mons {} days {} hours {} mins {}.{:02} secs", + months_part, + days_part, + hours, + mins, + secs, + (nanoseconds_part % 1000000000), + ) + }; + + Ok(s) + }}; +} + macro_rules! make_string_date { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); @@ -308,6 +347,9 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result { make_string_interval_year_month!(column, row) } + IntervalUnit::MonthDayNano => { + make_string_interval_month_day_nano!(column, row) + } }, DataType::List(_) => make_string_from_list!(column, row), DataType::Dictionary(index_type, _value_type) => match **index_type { diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs index 1a402bc6e368..e10de768aa25 100644 --- a/arrow/src/util/integration_util.rs +++ b/arrow/src/util/integration_util.rs @@ -286,6 +286,10 @@ impl ArrowJsonBatch { .collect::>(); arr.equals_json(&x.iter().collect::>()[..]) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let arr = IntervalMonthDayNanoArray::from(arr.data().clone()); + arr.equals_json(&json_array.iter().collect::>()[..]) + } DataType::UInt8 => { let arr = arr.as_any().downcast_ref::().unwrap(); arr.equals_json(&json_array.iter().collect::>()[..]) diff --git a/format/Schema.fbs b/format/Schema.fbs index 3b00dd4780d6..9da095177c7d 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -246,15 +246,24 @@ table Timestamp { timezone: string; } -enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} +enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO} // A "calendar" interval which models types that don't necessarily // have a precise duration without the context of a base timestamp (e.g. // days can differ in length during day light savings time transitions). +// All integers in the types below are stored in the endianness indicated +// by the schema. // YEAR_MONTH - Indicates the number of elapsed whole months, stored as -// 4-byte integers. +// 4-byte signed integers. // DAY_TIME - Indicates the number of elapsed days and milliseconds, // stored as 2 contiguous 32-bit integers (8-bytes in total). Support // of this IntervalUnit is not required for full arrow compatibility. +// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds. +// The values are stored contiguously in 16 byte blocks. Months and +// days are encoded as 32 bit integers and nanoseconds is encoded as a +// 64 bit integer. All integers are signed. Each field is independent +// (e.g. there is no constraint that nanoseconds have the same sign +// as days or that the quantity of nanoseconds represents less +// than a day's worth of time). table Interval { unit: IntervalUnit; } diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index f25157f635bc..cb57ffc2d0dc 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -280,6 +280,49 @@ fn array_from_json( } Ok(Arc::new(b.finish())) } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Object(v) => { + let months = v.get("months").unwrap(); + let days = v.get("days").unwrap(); + let nanoseconds = v.get("nanoseconds").unwrap(); + match (months, days, nanoseconds) { + ( + Value::Number(months), + Value::Number(days), + Value::Number(nanoseconds), + ) => { + let months = months.as_i64().unwrap() as i32; + let days = days.as_i64().unwrap() as i32; + let nanoseconds = nanoseconds.as_i64().unwrap(); + let months_days_ns: i128 = ((nanoseconds as i128) + & 0xFFFFFFFFFFFFFFFF) + << 64 + | ((days as i128) & 0xFFFFFFFF) << 32 + | ((months as i128) & 0xFFFFFFFF); + months_days_ns + } + (_, _, _) => { + panic!("Unable to parse {:?} as MonthDayNano", v) + } + } + } + _ => panic!("Unable to parse {:?} as MonthDayNano", value), + }), + _ => b.append_null(), + }?; + } + Ok(Arc::new(b.finish())) + } DataType::Float32 => { let mut b = Float32Builder::new(json_col.count); for (is_valid, value) in json_col diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs index 643f5a29f488..9f8742871026 100644 --- a/parquet/src/arrow/arrow_writer.rs +++ b/parquet/src/arrow/arrow_writer.rs @@ -426,6 +426,14 @@ fn write_leaf( .unwrap(); get_interval_dt_array_slice(array, &indices) } + _ => { + return Err(ParquetError::NYI( + format!( + "Attempting to write an Arrow interval type {:?} to parquet that is not yet implemented", + interval_unit + ) + )); + } }, ArrowDataType::FixedSizeBinary(_) => { let array = column @@ -1462,6 +1470,17 @@ mod tests { ); } + #[test] + #[should_panic( + expected = "Attempting to write an Arrow interval type MonthDayNano to parquet that is not yet implemented" + )] + fn interval_month_day_nano_single_column() { + required_and_optional::( + 0..SMALL_SIZE as i128, + "interval_month_day_nano_single_column", + ); + } + #[test] fn binary_single_column() { let one_vec: Vec = (0..SMALL_SIZE as u8).collect();