Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MONTH_DAY_NANO interval type, impl ArrowNativeType for i128 #779

Merged
merged 20 commits into from
Dec 20, 2021
2 changes: 1 addition & 1 deletion arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ path = "src/lib.rs"
[dependencies]
serde = { version = "1.0" }
serde_derive = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] }
serde_json = { version = "1.0", features = ["preserve_order", "arbitrary_precision"] }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is often much concern about adding new dependencies to arrow - however, this feature does not seem to add any new dependencies: https://github.com/serde-rs/json/blob/master/Cargo.toml#L74

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need this feature because we need to deserialize i128 numbers
https://github.com/serde-rs/json/blob/master/src/number.rs#L534

indexmap = { version = "1.6", features = ["std"] }
rand = { version = "0.8", optional = true }
num = "0.4"
Expand Down
6 changes: 6 additions & 0 deletions arrow/src/array/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,9 @@ pub fn make_array(data: ArrayData) -> ArrayRef {
DataType::Interval(IntervalUnit::DayTime) => {
Arc::new(IntervalDayTimeArray::from(data)) as ArrayRef
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
Arc::new(IntervalMonthDayNanoArray::from(data)) as ArrayRef
}
DataType::Duration(TimeUnit::Second) => {
Arc::new(DurationSecondArray::from(data)) as ArrayRef
}
Expand Down Expand Up @@ -415,6 +418,9 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
IntervalUnit::DayTime => {
new_null_sized_array::<IntervalDayTimeType>(data_type, length)
}
IntervalUnit::MonthDayNano => {
new_null_sized_array::<IntervalMonthDayNanoType>(data_type, length)
}
},
DataType::FixedSizeBinary(value_len) => make_array(unsafe {
ArrayData::new_unchecked(
Expand Down
18 changes: 18 additions & 0 deletions arrow/src/array/array_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ def_numeric_from_vec!(Time64MicrosecondType);
def_numeric_from_vec!(Time64NanosecondType);
def_numeric_from_vec!(IntervalYearMonthType);
def_numeric_from_vec!(IntervalDayTimeType);
def_numeric_from_vec!(IntervalMonthDayNanoType);
def_numeric_from_vec!(DurationSecondType);
def_numeric_from_vec!(DurationMillisecondType);
def_numeric_from_vec!(DurationMicrosecondType);
Expand Down Expand Up @@ -649,6 +650,23 @@ mod tests {
assert!(arr.is_null(1));
assert_eq!(-5, arr.value(2));
assert_eq!(-5, arr.values()[2]);

// a month_day_nano interval contains months, days and nanoseconds,
// but we do not yet have accessors for the values.
// TODO: implement month, day, and nanos access method for month_day_nano.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 When this PR is merged, I will try and file a ticket for adding these accessors -- I think it would be a fairly good "first PR" type change for new contributors

let arr = IntervalMonthDayNanoArray::from(vec![
Some(100000000000000000000),
None,
Some(-500000000000000000000),
]);
assert_eq!(3, arr.len());
assert_eq!(0, arr.offset());
assert_eq!(1, arr.null_count());
assert_eq!(100000000000000000000, arr.value(0));
assert_eq!(100000000000000000000, arr.values()[0]);
assert!(arr.is_null(1));
assert_eq!(-500000000000000000000, arr.value(2));
assert_eq!(-500000000000000000000, arr.values()[2]);
}

#[test]
Expand Down
4 changes: 4 additions & 0 deletions arrow/src/array/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1686,6 +1686,9 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde
DataType::Interval(IntervalUnit::DayTime) => {
Box::new(IntervalDayTimeBuilder::new(capacity))
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
Box::new(IntervalMonthDayNanoBuilder::new(capacity))
}
DataType::Duration(TimeUnit::Second) => {
Box::new(DurationSecondBuilder::new(capacity))
}
Expand Down Expand Up @@ -2031,6 +2034,7 @@ impl FieldData {
| DataType::Time64(_)
| DataType::Interval(IntervalUnit::DayTime)
| DataType::Duration(_) => self.append_null::<Int64Type>()?,
DataType::Interval(IntervalUnit::MonthDayNano) => self.append_null::<IntervalMonthDayNanoType>()?,
DataType::UInt8 => self.append_null::<UInt8Type>()?,
DataType::UInt16 => self.append_null::<UInt16Type>()?,
DataType::UInt32 => self.append_null::<UInt32Type>()?,
Expand Down
7 changes: 7 additions & 0 deletions arrow/src/array/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
MutableBuffer::new(capacity * mem::size_of::<i64>()),
empty_buffer,
],
DataType::Interval(IntervalUnit::MonthDayNano) => [
MutableBuffer::new(capacity * mem::size_of::<i128>()),
empty_buffer,
],
DataType::Utf8 | DataType::Binary => {
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
// safety: `unsafe` code assumes that this buffer is initialized with one element
Expand Down Expand Up @@ -1178,6 +1182,9 @@ fn layout(data_type: &DataType) -> DataTypeLayout {
DataType::Interval(IntervalUnit::DayTime) => {
DataTypeLayout::new_fixed_width(size_of::<i64>())
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
DataTypeLayout::new_fixed_width(size_of::<i128>())
}
DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()),
DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()),
DataType::FixedSizeBinary(bytes_per_value) => {
Expand Down
3 changes: 3 additions & 0 deletions arrow/src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,9 @@ fn equal_values(
| DataType::Duration(_) => primitive_equal::<i64>(
lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len,
),
DataType::Interval(IntervalUnit::MonthDayNano) => primitive_equal::<i128>(
lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len,
),
DataType::Utf8 | DataType::Binary => variable_sized_equal::<i32>(
lhs, rhs, lhs_nulls, rhs_nulls, lhs_start, rhs_start, len,
),
Expand Down
3 changes: 3 additions & 0 deletions arrow/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,7 @@ pub type Time64MicrosecondArray = PrimitiveArray<Time64MicrosecondType>;
pub type Time64NanosecondArray = PrimitiveArray<Time64NanosecondType>;
pub type IntervalYearMonthArray = PrimitiveArray<IntervalYearMonthType>;
pub type IntervalDayTimeArray = PrimitiveArray<IntervalDayTimeType>;
pub type IntervalMonthDayNanoArray = PrimitiveArray<IntervalMonthDayNanoType>;
pub type DurationSecondArray = PrimitiveArray<DurationSecondType>;
pub type DurationMillisecondArray = PrimitiveArray<DurationMillisecondType>;
pub type DurationMicrosecondArray = PrimitiveArray<DurationMicrosecondType>;
Expand Down Expand Up @@ -425,6 +426,7 @@ pub type Time64MicrosecondBufferBuilder = BufferBuilder<Time64MicrosecondType>;
pub type Time64NanosecondBufferBuilder = BufferBuilder<Time64NanosecondType>;
pub type IntervalYearMonthBufferBuilder = BufferBuilder<IntervalYearMonthType>;
pub type IntervalDayTimeBufferBuilder = BufferBuilder<IntervalDayTimeType>;
pub type IntervalMonthDayNanoBufferBuilder = BufferBuilder<IntervalMonthDayNanoType>;
pub type DurationSecondBufferBuilder = BufferBuilder<DurationSecondType>;
pub type DurationMillisecondBufferBuilder = BufferBuilder<DurationMillisecondType>;
pub type DurationMicrosecondBufferBuilder = BufferBuilder<DurationMicrosecondType>;
Expand Down Expand Up @@ -473,6 +475,7 @@ pub type Time64MicrosecondBuilder = PrimitiveBuilder<Time64MicrosecondType>;
pub type Time64NanosecondBuilder = PrimitiveBuilder<Time64NanosecondType>;
pub type IntervalYearMonthBuilder = PrimitiveBuilder<IntervalYearMonthType>;
pub type IntervalDayTimeBuilder = PrimitiveBuilder<IntervalDayTimeType>;
pub type IntervalMonthDayNanoBuilder = PrimitiveBuilder<IntervalMonthDayNanoType>;
pub type DurationSecondBuilder = PrimitiveBuilder<DurationSecondType>;
pub type DurationMillisecondBuilder = PrimitiveBuilder<DurationMillisecondType>;
pub type DurationMicrosecondBuilder = PrimitiveBuilder<DurationMicrosecondType>;
Expand Down
3 changes: 3 additions & 0 deletions arrow/src/array/ord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result<DynComparato
(Interval(DayTime), Interval(DayTime)) => {
compare_primitives::<IntervalDayTimeType>(left, right)
}
(Interval(MonthDayNano), Interval(MonthDayNano)) => {
compare_primitives::<IntervalMonthDayNanoType>(left, right)
}
(Duration(Second), Duration(Second)) => {
compare_primitives::<DurationSecondType>(left, right)
}
Expand Down
4 changes: 4 additions & 0 deletions arrow/src/array/transform/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,9 @@ fn build_extend(array: &ArrayData) -> Extend {
| DataType::Interval(IntervalUnit::DayTime) => {
primitive::build_extend::<i64>(array)
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
primitive::build_extend::<i128>(array)
}
DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array),
DataType::LargeUtf8 | DataType::LargeBinary => {
variable_size::build_extend::<i64>(array)
Expand Down Expand Up @@ -298,6 +301,7 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
| DataType::Timestamp(_, _)
| DataType::Duration(_)
| DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::<i64>,
DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::<i128>,
DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>,
DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
DataType::List(_) => list::extend_nulls::<i32>,
Expand Down
2 changes: 2 additions & 0 deletions arrow/src/compute/kernels/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3785,6 +3785,7 @@ mod tests {
Arc::new(Time64NanosecondArray::from(vec![1000, 2000])),
Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])),
Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])),
Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])),
Arc::new(DurationSecondArray::from(vec![1000, 2000])),
Arc::new(DurationMillisecondArray::from(vec![1000, 2000])),
Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])),
Expand Down Expand Up @@ -3940,6 +3941,7 @@ mod tests {
Duration(TimeUnit::Nanosecond),
Interval(IntervalUnit::YearMonth),
Interval(IntervalUnit::DayTime),
Interval(IntervalUnit::MonthDayNano),
Binary,
FixedSizeBinary(10),
LargeBinary,
Expand Down
5 changes: 5 additions & 0 deletions arrow/src/compute/kernels/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,11 @@ pub fn sort_to_indices(
DataType::Interval(IntervalUnit::DayTime) => {
sort_primitive::<IntervalDayTimeType, _>(values, v, n, cmp, &options, limit)
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
sort_primitive::<IntervalMonthDayNanoType, _>(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The i128 order relationship does not hold for months, days, nanos. AFAIK months, days, nanos do not have a partial order relationship.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can I write a function like sort_month_day_nanos for it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that in general time intervals do not have a natural order without an associated datetime pinning them to a specific timeline. The conversion months, days, nanos -> seconds is lossy because one day is not always 24 hours (some days are 23 and others 25).

values, v, n, cmp, &options, limit,
)
}
DataType::Duration(TimeUnit::Second) => {
sort_primitive::<DurationSecondType, _>(values, v, n, cmp, &options, limit)
}
Expand Down
12 changes: 12 additions & 0 deletions arrow/src/compute/kernels/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,9 @@ where
DataType::Interval(IntervalUnit::DayTime) => {
downcast_take!(IntervalDayTimeType, values, indices)
}
DataType::Interval(IntervalUnit::MonthDayNano) => {
downcast_take!(IntervalMonthDayNanoType, values, indices)
}
DataType::Duration(TimeUnit::Second) => {
downcast_take!(DurationSecondType, values, indices)
}
Expand Down Expand Up @@ -1185,6 +1188,15 @@ mod tests {
)
.unwrap();

// interval_month_day_nano
test_take_primitive_arrays::<IntervalMonthDayNanoType>(
vec![Some(0), None, Some(2), Some(-15), None],
&index,
None,
vec![Some(-15), None, None, Some(-15), Some(2)],
)
.unwrap();

// duration_second
test_take_primitive_arrays::<DurationSecondType>(
vec![Some(0), None, Some(2), Some(-15), None],
Expand Down
14 changes: 13 additions & 1 deletion arrow/src/datatypes/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,22 @@ pub enum TimeUnit {
Nanosecond,
}

/// YEAR_MONTH or DAY_TIME interval in SQL style.
/// YEAR_MONTH, DAY_TIME, or MONTH_DAY_NANO interval in SQL style.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum IntervalUnit {
/// Indicates the number of elapsed whole months, stored as 4-byte integers.
YearMonth,
/// Indicates the number of elapsed days and milliseconds,
/// stored as 2 contiguous 32-bit integers (days, milliseconds) (8 bytes in total).
DayTime,
/// A triple of the number of elapsed months, days, and nanoseconds.
/// The values are stored contiguously in 16-byte blocks. Months and
/// days are encoded as 32-bit integers and nanoseconds is encoded as a
/// 64-bit integer. All integers are signed. Each field is independent
/// (e.g. there is no constraint that nanoseconds have the same sign
/// as days or that the quantity of nanoseconds represents less
/// than a day's worth of time).
MonthDayNano,
}

impl fmt::Display for DataType {
Expand Down Expand Up @@ -287,6 +295,9 @@ impl DataType {
Some(p) if p == "YEAR_MONTH" => {
Ok(DataType::Interval(IntervalUnit::YearMonth))
}
Some(p) if p == "MONTH_DAY_NANO" => {
Ok(DataType::Interval(IntervalUnit::MonthDayNano))
}
_ => Err(ArrowError::ParseError(
"interval unit missing or invalid".to_string(),
)),
Expand Down Expand Up @@ -442,6 +453,7 @@ impl DataType {
DataType::Interval(unit) => json!({"name": "interval", "unit": match unit {
IntervalUnit::YearMonth => "YEAR_MONTH",
IntervalUnit::DayTime => "DAY_TIME",
IntervalUnit::MonthDayNano => "MONTH_DAY_NANO",
}}),
DataType::Duration(unit) => json!({"name": "duration", "unit": match unit {
TimeUnit::Second => "SECOND",
Expand Down
Loading