From 946e9d45c695b5dfeeab95001c6cbffea4c9e2fa Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 13 Jun 2022 22:45:57 -0700 Subject: [PATCH 1/3] Add Decimal128 --- arrow/src/array/array_binary.rs | 44 +++----- arrow/src/array/builder.rs | 31 +++++- arrow/src/array/equal_json.rs | 2 +- arrow/src/array/iterator.rs | 2 +- arrow/src/compute/kernels/cast.rs | 41 +++---- arrow/src/compute/kernels/sort.rs | 2 +- arrow/src/compute/kernels/take.rs | 2 +- arrow/src/util/decimal.rs | 148 ++++++++++++++++++++++++++ arrow/src/util/mod.rs | 1 + parquet/src/arrow/arrow_reader.rs | 2 +- parquet/src/arrow/arrow_writer/mod.rs | 2 +- 11 files changed, 217 insertions(+), 60 deletions(-) create mode 100644 arrow/src/util/decimal.rs diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index b1fc06d369a4..481ea92d66c3 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -33,6 +33,7 @@ use crate::datatypes::{ }; use crate::error::{ArrowError, Result}; use crate::util::bit_util; +use crate::util::decimal::Decimal128; use crate::{buffer::MutableBuffer, datatypes::DataType}; /// See [`BinaryArray`] and [`LargeBinaryArray`] for storing @@ -756,7 +757,7 @@ impl Array for FixedSizeBinaryArray { /// .unwrap(); /// /// assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); -/// assert_eq!(8_887_000_000, decimal_array.value(0)); +/// assert_eq!(8_887_000_000_i128, decimal_array.value(0).as_i128()); /// assert_eq!("8887.000000", decimal_array.value_as_string(0)); /// assert_eq!(3, decimal_array.len()); /// assert_eq!(1, decimal_array.null_count()); @@ -775,8 +776,8 @@ pub struct DecimalArray { } impl DecimalArray { - /// Returns the element at index `i` as i128. - pub fn value(&self, i: usize) -> i128 { + /// Returns the element at index `i`. + pub fn value(&self, i: usize) -> Decimal128 { assert!(i < self.data.len(), "DecimalArray out of bounds access"); let offset = i.checked_add(self.data.offset()).unwrap(); let raw_val = unsafe { @@ -787,10 +788,11 @@ impl DecimalArray { ) }; let as_array = raw_val.try_into(); - match as_array { + let integer = match as_array { Ok(v) if raw_val.len() == 16 => i128::from_le_bytes(v), _ => panic!("DecimalArray elements are not 128bit integers."), - } + }; + Decimal128::new_from_i128(self.precision, self.scale, integer) } /// Returns the offset for the element at index `i`. @@ -821,23 +823,7 @@ impl DecimalArray { #[inline] pub fn value_as_string(&self, row: usize) -> String { - let value = self.value(row); - let value_str = value.to_string(); - - if self.scale == 0 { - value_str - } else { - let (sign, rest) = value_str.split_at(if value >= 0 { 0 } else { 1 }); - - if rest.len() > self.scale { - // Decimal separator is in the middle of the string - let (whole, decimal) = value_str.split_at(value_str.len() - self.scale); - format!("{}.{}", whole, decimal) - } else { - // String has to be padded - format!("{}0.{:0>width$}", sign, rest, width = self.scale) - } - } + self.value(row).as_string() } pub fn from_fixed_size_list_array( @@ -1498,8 +1484,8 @@ mod tests { .build() .unwrap(); let decimal_array = DecimalArray::from(array_data); - assert_eq!(8_887_000_000, decimal_array.value(0)); - assert_eq!(-8_887_000_000, decimal_array.value(1)); + assert_eq!(8_887_000_000_i128, decimal_array.value(0).into()); + assert_eq!(-8_887_000_000_i128, decimal_array.value(1).into()); assert_eq!(16, decimal_array.value_length()); } @@ -1550,11 +1536,11 @@ mod tests { let array = DecimalArray::from_iter_values(vec![-100, 0, 101].into_iter()); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal(38, 10)); - assert_eq!(-100, array.value(0)); + assert_eq!(-100_i128, array.value(0).into()); assert!(!array.is_null(0)); - assert_eq!(0, array.value(1)); + assert_eq!(0_i128, array.value(1).into()); assert!(!array.is_null(1)); - assert_eq!(101, array.value(2)); + assert_eq!(101_i128, array.value(2).into()); assert!(!array.is_null(2)); } @@ -1563,10 +1549,10 @@ mod tests { let array: DecimalArray = vec![Some(-100), None, Some(101)].into_iter().collect(); assert_eq!(array.len(), 3); assert_eq!(array.data_type(), &DataType::Decimal(38, 10)); - assert_eq!(-100, array.value(0)); + assert_eq!(-100_i128, array.value(0).into()); assert!(!array.is_null(0)); assert!(array.is_null(1)); - assert_eq!(101, array.value(2)); + assert_eq!(101_i128, array.value(2).into()); assert!(!array.is_null(2)); } diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 041b7a92c33f..d47f93c2a4ef 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -1478,11 +1478,11 @@ impl DecimalBuilder { /// Automatically calls the `append` method to delimit the slice appended in as a /// distinct array element. #[inline] - pub fn append_value(&mut self, value: i128) -> Result<()> { + pub fn append_value(&mut self, value: impl Into) -> Result<()> { let value = if self.value_validation { - validate_decimal_precision(value, self.precision)? + validate_decimal_precision(value.into(), self.precision)? } else { - value + value.into() }; let value_as_bytes = Self::from_i128_to_fixed_size_bytes( @@ -2530,6 +2530,7 @@ mod tests { use crate::array::Array; use crate::bitmap::Bitmap; + use crate::util::decimal::Decimal128; #[test] fn test_builder_i32_empty() { @@ -3442,9 +3443,29 @@ mod tests { fn test_decimal_builder() { let mut builder = DecimalBuilder::new(30, 38, 6); - builder.append_value(8_887_000_000).unwrap(); + builder.append_value(8_887_000_000_i128).unwrap(); builder.append_null().unwrap(); - builder.append_value(-8_887_000_000).unwrap(); + builder.append_value(-8_887_000_000_i128).unwrap(); + let decimal_array: DecimalArray = builder.finish(); + + assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); + assert_eq!(3, decimal_array.len()); + assert_eq!(1, decimal_array.null_count()); + assert_eq!(32, decimal_array.value_offset(2)); + assert_eq!(16, decimal_array.value_length()); + } + + #[test] + fn test_decimal_builder_with_decimal128() { + let mut builder = DecimalBuilder::new(30, 38, 6); + + builder + .append_value(Decimal128::new_from_i128(30, 38, 8_887_000_000_i128)) + .unwrap(); + builder.append_null().unwrap(); + builder + .append_value(Decimal128::new_from_i128(30, 38, -8_887_000_000_i128)) + .unwrap(); let decimal_array: DecimalArray = builder.finish(); assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); diff --git a/arrow/src/array/equal_json.rs b/arrow/src/array/equal_json.rs index 64f109df5ff9..9db1a4397cb8 100644 --- a/arrow/src/array/equal_json.rs +++ b/arrow/src/array/equal_json.rs @@ -370,7 +370,7 @@ impl JsonEqual for DecimalArray { self.is_valid(i) && (s .parse::() - .map_or_else(|_| false, |v| v == self.value(i))) + .map_or_else(|_| false, |v| v == self.value(i).as_i128())) } JNull => self.is_null(i), _ => false, diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index 18bdca621795..bc70d1a2a8ed 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -425,7 +425,7 @@ impl<'a> std::iter::Iterator for DecimalIter<'a> { if self.array.is_null(old) { Some(None) } else { - Some(Some(self.array.value(old))) + Some(Some(self.array.value(old).as_i128())) } } } diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 93a8ebcb6b5a..1b17f28db1a1 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -353,7 +353,7 @@ macro_rules! cast_decimal_to_integer { if array.is_null(i) { value_builder.append_null()?; } else { - let v = array.value(i) / div; + let v = array.value(i).as_i128() / div; // check the overflow // For example: Decimal(128,10,0) as i8 // 128 is out of range i8 @@ -383,7 +383,7 @@ macro_rules! cast_decimal_to_float { } else { // The range of f32 or f64 is larger than i128, we don't need to check overflow. // cast the i128 to f64 will lose precision, for example the `112345678901234568` will be as `112345678901234560`. - let v = (array.value(i) as f64 / div) as $NATIVE_TYPE; + let v = (array.value(i).as_i128() as f64 / div) as $NATIVE_TYPE; value_builder.append_value(v)?; } } @@ -2196,6 +2196,7 @@ where #[cfg(test)] mod tests { use super::*; + use crate::util::decimal::Decimal128; use crate::{buffer::Buffer, util::display::array_value_to_string}; macro_rules! generate_cast_test_case { @@ -2247,9 +2248,9 @@ mod tests { DecimalArray, &output_type, vec![ - Some(11234560_i128), - Some(21234560_i128), - Some(31234560_i128), + Some(Decimal128::new_from_i128(20, 4, 11234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 21234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 31234560_i128)), None ] ); @@ -2426,11 +2427,11 @@ mod tests { DecimalArray, &decimal_type, vec![ - Some(1000000_i128), - Some(2000000_i128), - Some(3000000_i128), + Some(Decimal128::new_from_i128(38, 6, 1000000_i128)), + Some(Decimal128::new_from_i128(38, 6, 2000000_i128)), + Some(Decimal128::new_from_i128(38, 6, 3000000_i128)), None, - Some(5000000_i128) + Some(Decimal128::new_from_i128(38, 6, 5000000_i128)) ] ); } @@ -2458,12 +2459,12 @@ mod tests { DecimalArray, &decimal_type, vec![ - Some(1100000_i128), - Some(2200000_i128), - Some(4400000_i128), + Some(Decimal128::new_from_i128(38, 6, 1100000_i128)), + Some(Decimal128::new_from_i128(38, 6, 2200000_i128)), + Some(Decimal128::new_from_i128(38, 6, 4400000_i128)), None, - Some(1123456_i128), - Some(1123456_i128), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), ] ); @@ -2483,13 +2484,13 @@ mod tests { DecimalArray, &decimal_type, vec![ - Some(1100000_i128), - Some(2200000_i128), - Some(4400000_i128), + Some(Decimal128::new_from_i128(38, 6, 1100000_i128)), + Some(Decimal128::new_from_i128(38, 6, 2200000_i128)), + Some(Decimal128::new_from_i128(38, 6, 4400000_i128)), None, - Some(1123456_i128), - Some(1123456_i128), - Some(1123456_i128), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), + Some(Decimal128::new_from_i128(38, 6, 1123456_i128)), ] ); } diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 140a57f33ed5..6d0df6566ebb 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -504,7 +504,7 @@ where .expect("Unable to downcast to decimal array"); let valids = value_indices .into_iter() - .map(|index| (index, decimal_array.value(index as usize))) + .map(|index| (index, decimal_array.value(index as usize).as_i128())) .collect::>(); sort_primitive_inner(decimal_values, null_indices, cmp, options, limit, valids) } diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 567bf5c8ba27..1c4b8d5b2f2e 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -524,7 +524,7 @@ where if decimal_values.is_null(index) { Ok(None) } else { - Ok(Some(decimal_values.value(index))) + Ok(Some(decimal_values.value(index).as_i128())) } }); let t: Result>> = t.transpose(); diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs new file mode 100644 index 000000000000..5f68bc154762 --- /dev/null +++ b/arrow/src/util/decimal.rs @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Decimal related utils + +use std::cmp::Ordering; + +#[derive(Clone, Debug)] +pub struct Decimal128 { + precision: usize, + scale: usize, + value: i128, +} + +impl PartialOrd for Decimal128 { + fn partial_cmp(&self, other: &Self) -> Option { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimal128 with different scale: {}, {}", + self.scale, other.scale + ); + self.value.partial_cmp(&other.value) + } +} + +impl Ord for Decimal128 { + fn cmp(&self, other: &Self) -> Ordering { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimal128 with different scale: {}, {}", + self.scale, other.scale + ); + self.value.cmp(&other.value) + } +} + +impl PartialEq for Decimal128 { + fn eq(&self, other: &Self) -> bool { + assert_eq!( + self.scale, other.scale, + "Cannot compare two Decimal128 with different scale: {}, {}", + self.scale, other.scale + ); + self.value.eq(&other.value) + } +} + +impl Eq for Decimal128 {} + +impl Decimal128 { + pub fn new_from_bytes(precision: usize, scale: usize, bytes: &[u8]) -> Self { + let as_array = bytes.try_into(); + let value = match as_array { + Ok(v) if bytes.len() == 16 => i128::from_le_bytes(v), + _ => panic!("Input to Decimal128 is not 128bit integer."), + }; + + Decimal128 { + precision, + scale, + value, + } + } + + pub fn new_from_i128(precision: usize, scale: usize, value: i128) -> Self { + Decimal128 { + precision, + scale, + value, + } + } + + pub fn as_i128(&self) -> i128 { + self.value + } + + pub fn as_string(&self) -> String { + let value_str = self.value.to_string(); + + if self.scale == 0 { + value_str + } else { + let (sign, rest) = value_str.split_at(if self.value >= 0 { 0 } else { 1 }); + + if rest.len() > self.scale { + // Decimal separator is in the middle of the string + let (whole, decimal) = value_str.split_at(value_str.len() - self.scale); + format!("{}.{}", whole, decimal) + } else { + // String has to be padded + format!("{}0.{:0>width$}", sign, rest, width = self.scale) + } + } + } +} + +/// Converts `Decimal128` to i128 to keep API +impl Into for Decimal128 { + fn into(self) -> i128 { + self.value + } +} + +#[cfg(test)] +mod tests { + use crate::util::decimal::Decimal128; + + #[test] + fn decimal_128_to_string() { + let mut value = Decimal128::new_from_i128(5, 2, 100); + assert_eq!(value.as_string(), "1.00"); + + value = Decimal128::new_from_i128(5, 3, 100); + assert_eq!(value.as_string(), "0.100"); + } + + #[test] + fn decimal_128_from_bytes() { + let bytes = 100_i128.to_le_bytes(); + let value = Decimal128::new_from_bytes(5, 2, &bytes); + assert_eq!(value.as_string(), "1.00"); + } + + fn i128_func(value: impl Into) -> i128 { + value.into() + } + + #[test] + fn decimal_128_to_i128() { + let value = Decimal128::new_from_i128(5, 2, 100); + let integer = i128_func(value); + assert_eq!(integer, 100); + } +} diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 3b6de8a4b263..dcb9cd52a0bf 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -35,4 +35,5 @@ pub mod test_util; mod trusted_len; pub(crate) use trusted_len::trusted_len_unzip; +pub mod decimal; pub(crate) mod reader_parser; diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader.rs index 92d4ff264c19..89406cd616a4 100644 --- a/parquet/src/arrow/arrow_reader.rs +++ b/parquet/src/arrow/arrow_reader.rs @@ -644,7 +644,7 @@ mod tests { assert_eq!(col.scale(), 2); for (i, v) in expected.enumerate() { - assert_eq!(col.value(i), v * 100_i128); + assert_eq!(col.value(i).as_i128(), v * 100_i128); } } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 44631e57409a..99e0001c122c 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -673,7 +673,7 @@ fn get_decimal_array_slice( let mut values = Vec::with_capacity(indices.len()); let size = decimal_length_from_precision(array.precision()); for i in indices { - let as_be_bytes = array.value(*i).to_be_bytes(); + let as_be_bytes = array.value(*i).as_i128().to_be_bytes(); let resized_value = as_be_bytes[(16 - size)..].to_vec(); values.push(FixedLenByteArray::from(ByteArray::from(resized_value))); } From ecb026f222c22ab81f16f69b451eb5bb2e9f3641 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 13 Jun 2022 23:17:04 -0700 Subject: [PATCH 2/3] Fix clippy --- arrow/src/util/decimal.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs index 5f68bc154762..967b3a81c398 100644 --- a/arrow/src/util/decimal.rs +++ b/arrow/src/util/decimal.rs @@ -19,8 +19,9 @@ use std::cmp::Ordering; -#[derive(Clone, Debug)] +#[derive(Debug)] pub struct Decimal128 { + #[allow(dead_code)] precision: usize, scale: usize, value: i128, @@ -108,10 +109,9 @@ impl Decimal128 { } } -/// Converts `Decimal128` to i128 to keep API -impl Into for Decimal128 { - fn into(self) -> i128 { - self.value +impl From for i128 { + fn from(decimal: Decimal128) -> Self { + decimal.as_i128() } } From 77ad6f353472e57376e1f985e5390e24a045ee4d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 15 Jun 2022 09:02:21 -0700 Subject: [PATCH 3/3] Add code comment --- arrow/src/util/decimal.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs index 967b3a81c398..b78af3acc6cd 100644 --- a/arrow/src/util/decimal.rs +++ b/arrow/src/util/decimal.rs @@ -19,6 +19,8 @@ use std::cmp::Ordering; +/// Represents a decimal value with precision and scale. +/// The decimal value is represented by a signed 128-bit integer. #[derive(Debug)] pub struct Decimal128 { #[allow(dead_code)]