diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 036ef0cdd52f..4ff0ed4d93e6 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -203,10 +203,10 @@ pub type DurationMicrosecondArray = PrimitiveArray; pub type DurationNanosecondArray = PrimitiveArray; /// An array where each element is a 128-bits decimal with precision in [1, 38] and -/// scale in [-38, 38]. +/// scale less than or equal to 38. pub type Decimal128Array = PrimitiveArray; /// An array where each element is a 256-bits decimal with precision in [1, 76] and -/// scale in [-76, 76]. +/// scale less than or equal to 76. pub type Decimal256Array = PrimitiveArray; /// Trait bridging the dynamic-typed nature of Arrow (via [`DataType`]) with the @@ -1121,13 +1121,6 @@ impl PrimitiveArray { T::MAX_SCALE ))); } - if scale < -T::MAX_SCALE { - return Err(ArrowError::InvalidArgumentError(format!( - "scale {} is smaller than min {}", - scale, - -Decimal128Type::MAX_SCALE - ))); - } if scale > 0 && scale as u8 > precision { return Err(ArrowError::InvalidArgumentError(format!( "scale {} is greater than precision {}", @@ -1151,6 +1144,14 @@ impl PrimitiveArray { }) } + /// Returns a new array in which each value that fails `T::validate_decimal_precision` for the + /// given `precision` is replaced with null; existing nulls and all other values are kept + pub fn null_if_overflow_precision(&self, precision: u8) -> Self { + self.unary_opt::<_, T>(|v| { + (T::validate_decimal_precision(v, precision).is_ok()).then_some(v) + }) + } + /// Returns [`Self::value`] formatted as a string pub fn value_as_string(&self, row: usize) -> String { T::format_decimal(self.value(row), self.precision(), self.scale()) @@ -2055,6 +2056,15 @@ mod tests { .unwrap(); } + #[test] + fn test_decimal_array_set_null_if_overflow_with_precision() { + let array = + Decimal128Array::from(vec![Some(123456), Some(123), None, Some(123456)]); + let result = array.null_if_overflow_precision(5); + let expected = 
Decimal128Array::from(vec![None, Some(123), None, None]); + assert_eq!(result, expected); + } + #[test] fn test_decimal256_iter() { let mut builder = Decimal256Builder::with_capacity(30); diff --git a/arrow-array/src/builder/map_builder.rs b/arrow-array/src/builder/map_builder.rs index 831128c29d05..0de89e7b73da 100644 --- a/arrow-array/src/builder/map_builder.rs +++ b/arrow-array/src/builder/map_builder.rs @@ -26,40 +26,32 @@ use std::sync::Arc; /// Creates a new `MapBuilder` /// ``` -/// use arrow_array::builder::{MapBuilder, Int32Builder, StringBuilder}; -/// use arrow_array::{StringArray, Int32Array}; -/// use std::sync::Arc; +/// # use arrow_array::builder::{Int32Builder, MapBuilder, StringBuilder}; +/// # use arrow_array::{Int32Array, StringArray}; /// /// let string_builder = StringBuilder::new(); /// let int_builder = Int32Builder::with_capacity(4); /// +/// // Construct `[{"joe": 1}, {"blogs": 2, "foo": 4}, {}, null]` /// let mut builder = MapBuilder::new(None, string_builder, int_builder); /// -/// let string_builder = builder.keys(); -/// string_builder.append_value("joe"); -/// string_builder.append_value("n1"); -/// string_builder.append_value("n2"); -/// string_builder.append_value("mark"); -/// -/// let int_builder = builder.values(); -/// int_builder.append_value(1); -/// int_builder.append_value(2); -/// int_builder.append_null(); -/// int_builder.append_value(4); +/// builder.keys().append_value("joe"); +/// builder.values().append_value(1); +/// builder.append(true).unwrap(); /// +/// builder.keys().append_value("blogs"); +/// builder.values().append_value(2); +/// builder.keys().append_value("foo"); +/// builder.values().append_value(4); /// builder.append(true).unwrap(); -/// builder.append(false).unwrap(); /// builder.append(true).unwrap(); +/// builder.append(false).unwrap(); +/// +/// let array = builder.finish(); +/// assert_eq!(array.value_offsets(), &[0, 1, 3, 3, 3]); +/// assert_eq!(*array.values(), Int32Array::from(vec![1, 2, 4])); +/// 
assert_eq!(*array.keys(), StringArray::from(vec!["joe", "blogs", "foo"])); /// -/// let arr = builder.finish(); -/// assert_eq!( -/// *arr.values(), -/// Int32Array::from(vec![Some(1), Some(2), None, Some(4)]) -/// ); -/// assert_eq!( -/// *arr.keys(), -/// StringArray::from(vec![Some("joe"), Some("n1"), Some("n2"), Some("mark")]) -/// ); /// ``` #[derive(Debug)] pub struct MapBuilder { @@ -91,7 +83,6 @@ impl Default for MapFieldNames { } } -#[allow(dead_code)] impl MapBuilder { /// Creates a new `MapBuilder` pub fn new( @@ -264,67 +255,3 @@ impl ArrayBuilder for MapBuilder { self } } - -#[cfg(test)] -mod tests { - use super::*; - use arrow_buffer::Buffer; - use arrow_data::Bitmap; - - use crate::builder::{Int32Builder, StringBuilder}; - - // TODO: add a test that finishes building, after designing a spec-compliant - // way of inserting values to the map. - // A map's values shouldn't be repeated within a slot - - #[test] - fn test_map_array_builder() { - let string_builder = StringBuilder::new(); - let int_builder = Int32Builder::with_capacity(4); - - let mut builder = MapBuilder::new(None, string_builder, int_builder); - - let string_builder = builder.keys(); - string_builder.append_value("joe"); - string_builder.append_value("n1"); - string_builder.append_value("n2"); - string_builder.append_value("mark"); - - let int_builder = builder.values(); - int_builder.append_value(1); - int_builder.append_value(2); - int_builder.append_null(); - int_builder.append_value(4); - - builder.append(true).unwrap(); - builder.append(false).unwrap(); - builder.append(true).unwrap(); - - let arr = builder.finish(); - - let map_data = arr.data(); - assert_eq!(3, map_data.len()); - assert_eq!(1, map_data.null_count()); - assert_eq!( - Some(&Bitmap::from(Buffer::from(&[5_u8]))), - map_data.null_bitmap() - ); - - let expected_string_data = ArrayData::builder(DataType::Utf8) - .len(4) - .add_buffer(Buffer::from_slice_ref([0, 3, 5, 7, 11])) - 
.add_buffer(Buffer::from_slice_ref(b"joen1n2mark")) - .build() - .unwrap(); - - let expected_int_data = ArrayData::builder(DataType::Int32) - .len(4) - .null_bit_buffer(Some(Buffer::from_slice_ref([11_u8]))) - .add_buffer(Buffer::from_slice_ref([1, 2, 0, 4])) - .build() - .unwrap(); - - assert_eq!(&expected_string_data, arr.keys().data()); - assert_eq!(&expected_int_data, arr.values().data()); - } -} diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 1480e03729d1..3f0b3d1be4e6 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -160,7 +160,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (Utf8, _) => DataType::is_numeric(to_type), + (Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (LargeUtf8, LargeBinary | Date32 @@ -171,11 +171,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (LargeUtf8, _) => DataType::is_numeric(to_type), + (LargeUtf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, (Date32, Utf8) | (Date32, LargeUtf8) => true, (Date64, Utf8) | (Date64, LargeUtf8) => true, - (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) || from_type == &Binary, + (_, Utf8 | LargeUtf8) => (DataType::is_numeric(from_type) && from_type != &Float16) || from_type == &Binary, // start numeric casts ( @@ -972,6 +972,7 @@ pub fn cast_with_options( Int16 => cast_numeric_to_bool::(array), Int32 => cast_numeric_to_bool::(array), Int64 => cast_numeric_to_bool::(array), + Float16 => cast_numeric_to_bool::(array), Float32 => cast_numeric_to_bool::(array), Float64 => cast_numeric_to_bool::(array), Utf8 => cast_utf8_to_boolean(array, cast_options), @@ -989,6 +990,7 @@ pub fn cast_with_options( Int16 => cast_bool_to_numeric::(array, 
cast_options), Int32 => cast_bool_to_numeric::(array, cast_options), Int64 => cast_bool_to_numeric::(array, cast_options), + Float16 => cast_bool_to_numeric::(array, cast_options), Float32 => cast_bool_to_numeric::(array, cast_options), Float64 => cast_bool_to_numeric::(array, cast_options), Utf8 => { @@ -3811,7 +3813,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_decimal128_to_decimal128() { let input_type = DataType::Decimal128(20, 3); let output_type = DataType::Decimal128(20, 4); @@ -4202,7 +4203,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal128() { let decimal_type = DataType::Decimal128(38, 6); // u8, u16, u32, u64 @@ -4374,7 +4374,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_numeric_to_decimal256() { // test negative cast type let decimal_type = DataType::Decimal256(58, 6); @@ -5352,25 +5351,6 @@ mod tests { assert!(c.is_null(2)); } - #[test] - #[cfg(feature = "chrono-tz")] - fn test_cast_timestamp_to_string() { - let a = TimestampMillisecondArray::from(vec![ - Some(864000000005), - Some(1545696000001), - None, - ]) - .with_timezone("UTC".to_string()); - let array = Arc::new(a) as ArrayRef; - dbg!(&array); - let b = cast(&array, &DataType::Utf8).unwrap(); - let c = b.as_any().downcast_ref::().unwrap(); - assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); - assert!(c.is_null(2)); - } - #[test] fn test_cast_date32_to_string() { let a = Date32Array::from(vec![10000, 17890]); @@ -6877,41 +6857,6 @@ mod tests { assert!(!c.is_valid(5)); // "2000-01-01" } - #[test] - #[cfg_attr(miri, ignore)] // running forever - #[cfg(feature = "chrono-tz")] - fn test_can_cast_types() { - // this function attempts to ensure that can_cast_types stays - // in sync with cast. 
It simply tries all combinations of - // types and makes sure that if `can_cast_types` returns - // true, so does `cast` - - let all_types = get_all_types(); - - for array in get_arrays_of_all_types() { - for to_type in &all_types { - println!("Test casting {:?} --> {:?}", array.data_type(), to_type); - let cast_result = cast(&array, to_type); - let reported_cast_ability = can_cast_types(array.data_type(), to_type); - - // check for mismatch - match (cast_result, reported_cast_ability) { - (Ok(_), false) => { - panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", - array, array.data_type(), to_type) - } - (Err(e), true) => { - panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. \ - Error was {:?}", - array, array.data_type(), to_type, e) - } - // otherwise it was a match - _ => {} - }; - } - } - } - #[test] fn test_cast_list_containers() { // large-list to list @@ -6946,99 +6891,6 @@ mod tests { assert_eq!(&expected.value(2), &actual.value(2)); } - /// Create instances of arrays with varying types for cast tests - #[cfg(feature = "chrono-tz")] - fn get_arrays_of_all_types() -> Vec { - let tz_name = String::from("America/New_York"); - let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; - vec![ - Arc::new(BinaryArray::from(binary_data.clone())), - Arc::new(LargeBinaryArray::from(binary_data.clone())), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_primitive::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - make_dictionary_utf8::(), - Arc::new(make_list_array()), - Arc::new(make_large_list_array()), - Arc::new(make_fixed_size_list_array()), - 
Arc::new(make_fixed_size_binary_array()), - Arc::new(StructArray::from(vec![ - ( - Field::new("a", DataType::Boolean, false), - Arc::new(BooleanArray::from(vec![false, false, true, true])) - as Arc, - ), - ( - Field::new("b", DataType::Int32, false), - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ), - ])), - Arc::new(make_union_array()), - Arc::new(NullArray::new(10)), - Arc::new(StringArray::from(vec!["foo", "bar"])), - Arc::new(LargeStringArray::from(vec!["foo", "bar"])), - Arc::new(BooleanArray::from(vec![true, false])), - Arc::new(Int8Array::from(vec![1, 2])), - Arc::new(Int16Array::from(vec![1, 2])), - Arc::new(Int32Array::from(vec![1, 2])), - Arc::new(Int64Array::from(vec![1, 2])), - Arc::new(UInt8Array::from(vec![1, 2])), - Arc::new(UInt16Array::from(vec![1, 2])), - Arc::new(UInt32Array::from(vec![1, 2])), - Arc::new(UInt64Array::from(vec![1, 2])), - Arc::new(Float32Array::from(vec![1.0, 2.0])), - Arc::new(Float64Array::from(vec![1.0, 2.0])), - Arc::new(TimestampSecondArray::from(vec![1000, 2000])), - Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), - Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), - Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), - Arc::new( - TimestampSecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMillisecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampMicrosecondArray::from(vec![1000, 2000]) - .with_timezone(tz_name.clone()), - ), - Arc::new( - TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name), - ), - Arc::new(Date32Array::from(vec![1000, 2000])), - Arc::new(Date64Array::from(vec![1000, 2000])), - Arc::new(Time32SecondArray::from(vec![1000, 2000])), - Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), - Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), - Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), - Arc::new(IntervalYearMonthArray::from(vec![1000, 
2000])), - Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), - Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), - Arc::new(DurationSecondArray::from(vec![1000, 2000])), - Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), - Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), - Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), - Arc::new( - create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0) - .unwrap(), - ), - ] - } - fn make_list_array() -> ListArray { // Construct a value array let value_data = ArrayData::builder(DataType::Int32) @@ -7087,140 +6939,6 @@ mod tests { LargeListArray::from(list_data) } - #[cfg(feature = "chrono-tz")] - fn make_fixed_size_list_array() -> FixedSizeListArray { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(10) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) - .build() - .unwrap(); - - // Construct a fixed size list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, true)), - 2, - ); - let list_data = ArrayData::builder(list_data_type) - .len(5) - .add_child_data(value_data) - .build() - .unwrap(); - FixedSizeListArray::from(list_data) - } - - #[cfg(feature = "chrono-tz")] - fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - FixedSizeBinaryArray::from(array_data) - } - - #[cfg(feature = "chrono-tz")] - fn make_union_array() -> UnionArray { - let mut builder = UnionBuilder::with_capacity_dense(7); - builder.append::("a", 1).unwrap(); - builder.append::("b", 2).unwrap(); - builder.build().unwrap() - } - - /// Creates a dictionary with primitive dictionary values, and keys of type K - #[cfg(feature = "chrono-tz")] - fn make_dictionary_primitive() -> 
ArrayRef { - // Pick Int32 arbitrarily for dictionary values - let mut b: PrimitiveDictionaryBuilder = - PrimitiveDictionaryBuilder::new(); - b.append(1).unwrap(); - b.append(2).unwrap(); - Arc::new(b.finish()) - } - - /// Creates a dictionary with utf8 values, and keys of type K - #[cfg(feature = "chrono-tz")] - fn make_dictionary_utf8() -> ArrayRef { - // Pick Int32 arbitrarily for dictionary values - let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); - b.append("foo").unwrap(); - b.append("bar").unwrap(); - Arc::new(b.finish()) - } - - // Get a selection of datatypes to try and cast to - #[cfg(feature = "chrono-tz")] - fn get_all_types() -> Vec { - use DataType::*; - let tz_name = String::from("America/New_York"); - - vec![ - Null, - Boolean, - Int8, - Int16, - Int32, - UInt64, - UInt8, - UInt16, - UInt32, - UInt64, - Float16, - Float32, - Float64, - Timestamp(TimeUnit::Second, None), - Timestamp(TimeUnit::Millisecond, None), - Timestamp(TimeUnit::Microsecond, None), - Timestamp(TimeUnit::Nanosecond, None), - Timestamp(TimeUnit::Second, Some(tz_name.clone())), - Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), - Timestamp(TimeUnit::Nanosecond, Some(tz_name)), - Date32, - Date64, - Time32(TimeUnit::Second), - Time32(TimeUnit::Millisecond), - Time64(TimeUnit::Microsecond), - Time64(TimeUnit::Nanosecond), - Duration(TimeUnit::Second), - Duration(TimeUnit::Millisecond), - Duration(TimeUnit::Microsecond), - Duration(TimeUnit::Nanosecond), - Interval(IntervalUnit::YearMonth), - Interval(IntervalUnit::DayTime), - Interval(IntervalUnit::MonthDayNano), - Binary, - FixedSizeBinary(10), - LargeBinary, - Utf8, - LargeUtf8, - List(Box::new(Field::new("item", DataType::Int8, true))), - List(Box::new(Field::new("item", DataType::Utf8, true))), - FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), - FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), - 
LargeList(Box::new(Field::new("item", DataType::Int8, true))), - LargeList(Box::new(Field::new("item", DataType::Utf8, false))), - Struct(vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ]), - Union( - vec![ - Field::new("f1", DataType::Int32, false), - Field::new("f2", DataType::Utf8, true), - ], - vec![0, 1], - UnionMode::Dense, - ), - Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), - Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), - Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - Decimal128(38, 0), - ] - } - #[test] fn test_utf8_cast_offsets() { // test if offset of the array is taken into account during cast @@ -7247,41 +6965,6 @@ mod tests { assert_eq!(&out1, &out2.slice(1, 2)) } - #[test] - #[cfg(feature = "chrono-tz")] - fn test_timestamp_cast_utf8() { - let array: PrimitiveArray = - vec![Some(37800000000), None, Some(86339000000)].into(); - let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); - - let expected = StringArray::from(vec![ - Some("1970-01-01 10:30:00"), - None, - Some("1970-01-01 23:58:59"), - ]); - - assert_eq!( - out.as_any().downcast_ref::().unwrap(), - &expected - ); - - let array: PrimitiveArray = - vec![Some(37800000000), None, Some(86339000000)].into(); - let array = array.with_timezone("Australia/Sydney".to_string()); - let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); - - let expected = StringArray::from(vec![ - Some("1970-01-01 20:30:00 +10:00"), - None, - Some("1970-01-02 09:58:59 +10:00"), - ]); - - assert_eq!( - out.as_any().downcast_ref::().unwrap(), - &expected - ); - } - #[test] fn test_list_to_string() { let str_array = StringArray::from(vec!["a", "b", "c", "d", "e", "f", "g", "h"]); @@ -7346,7 +7029,6 @@ mod tests { } #[test] - #[cfg(not(feature = "force_validate"))] fn test_cast_f64_to_decimal128() { // to reproduce https://github.com/apache/arrow-rs/issues/2997 diff --git 
a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 32f580afbf55..ef0a49be693b 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -131,7 +131,7 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - create_list_array(list_node, data_type, &list_buffers, triple.0) + create_list_array(list_node, data_type, &list_buffers, triple.0)? } FixedSizeList(ref list_field, _) => { let list_node = nodes.get(node_index); @@ -156,7 +156,7 @@ fn create_array( node_index = triple.1; buffer_index = triple.2; - create_list_array(list_node, data_type, &list_buffers, triple.0) + create_list_array(list_node, data_type, &list_buffers, triple.0)? } Struct(struct_fields) => { let struct_node = nodes.get(node_index); @@ -220,7 +220,7 @@ fn create_array( data_type, &index_buffers, value_array.clone(), - ) + )? } Union(fields, field_type_ids, mode) => { let union_node = nodes.get(node_index); @@ -527,7 +527,7 @@ fn create_list_array( data_type: &DataType, buffers: &[Buffer], child_array: ArrayRef, -) -> ArrayRef { +) -> Result { let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone()); let length = field_node.length() as usize; let child_data = child_array.into_data(); @@ -545,7 +545,7 @@ fn create_list_array( _ => unreachable!("Cannot create list or map array from {:?}", data_type), }; - make_array(builder.build().unwrap()) + Ok(make_array(builder.build()?)) } /// Reads the correct number of buffers based on list type and null_count, and creates a @@ -555,7 +555,7 @@ fn create_dictionary_array( data_type: &DataType, buffers: &[Buffer], value_array: ArrayRef, -) -> ArrayRef { +) -> Result { if let Dictionary(_, _) = *data_type { let null_buffer = (field_node.null_count() > 0).then_some(buffers[0].clone()); let builder = ArrayData::builder(data_type.clone()) @@ -564,7 +564,7 @@ fn create_dictionary_array( .add_child_data(value_array.into_data()) .null_bit_buffer(null_buffer); - make_array(unsafe { builder.build_unchecked() }) + 
Ok(make_array(builder.build()?)) } else { unreachable!("Cannot create dictionary array from {:?}", data_type) } diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index f1d13aefd279..4162d41bf1b4 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -190,6 +190,13 @@ pub enum DataType { /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale can be a negative number. A + /// negative scale is the number of padding zeros to the right + /// of the digits. + /// + /// For example the number 12300 can be treated as a decimal + /// that has precision 3 and scale -2. Decimal128(u8, i8), /// Exact 256-bit width decimal value with precision and scale /// @@ -197,6 +204,13 @@ pub enum DataType { /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. + /// + /// In certain situations, scale can be a negative number. A + /// negative scale is the number of padding zeros to the right + /// of the digits. + /// + /// For example the number 12300 can be treated as a decimal + /// that has precision 3 and scale -2. Decimal256(u8, i8), /// A Map is a logical nested type that is represented as /// diff --git a/arrow-select/src/lib.rs b/arrow-select/src/lib.rs index cf887dfca47c..c468e20a511e 100644 --- a/arrow-select/src/lib.rs +++ b/arrow-select/src/lib.rs @@ -20,6 +20,7 @@ pub mod concat; pub mod filter; pub mod interleave; +pub mod nullif; pub mod take; pub mod window; pub mod zip; diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs new file mode 100644 index 000000000000..a0a1a3a2206b --- /dev/null +++ b/arrow-select/src/nullif.rs @@ -0,0 +1,454 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::builder::BooleanBufferBuilder; +use arrow_array::{make_array, Array, ArrayRef, BooleanArray}; +use arrow_buffer::buffer::{ + bitwise_bin_op_helper, bitwise_unary_op_helper, buffer_bin_and, +}; +use arrow_schema::ArrowError; + +/// Copies original array, setting validity bit to false if a secondary comparison +/// boolean array is set to true +/// +/// Typically used to implement NULLIF. 
+pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { + let left_data = left.data(); + let right_data = right.data(); + + if left_data.len() != right_data.len() { + return Err(ArrowError::ComputeError( + "Cannot perform comparison operation on arrays of different length" + .to_string(), + )); + } + let len = left_data.len(); + let left_offset = left_data.offset(); + + // left=0 (null) right=null output bitmap=null + // left=0 right=1 output bitmap=null + // left=1 (set) right=null output bitmap=set (passthrough) + // left=1 right=1 & comp=true output bitmap=null + // left=1 right=1 & comp=false output bitmap=set + // + // Thus: result = left null bitmap & (!right_values | !right_bitmap) + // OR left null bitmap & !(right_values & right_bitmap) + + // Compute right_values & right_bitmap + let (right, right_offset) = match right_data.null_buffer() { + Some(buffer) => ( + buffer_bin_and( + &right_data.buffers()[0], + right_data.offset(), + buffer, + right_data.offset(), + len, + ), + 0, + ), + None => (right_data.buffers()[0].clone(), right_data.offset()), + }; + + // Compute left null bitmap & !right + let mut valid_count = 0; + let combined = match left_data.null_buffer() { + Some(left) => { + bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { + let t = l & !r; + valid_count += t.count_ones() as usize; + t + }) + } + None => { + let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { + let t = !b; + valid_count += t.count_ones() as usize; + t + }); + // We need to compensate for the additional bits read from the end + let remainder_len = len % 64; + if remainder_len != 0 { + valid_count -= 64 - remainder_len + } + buffer + } + }; + + // Need to construct null buffer with offset of left + let null_buffer = match left_data.offset() { + 0 => combined, + _ => { + let mut builder = BooleanBufferBuilder::new(len + left_offset); + // Pad with 0s up to offset + builder.resize(left_offset); + 
builder.append_packed_range(0..len, &combined); + builder.finish() + } + }; + + let null_count = len - valid_count; + let data = left_data + .clone() + .into_builder() + .null_bit_buffer(Some(null_buffer)) + .null_count(null_count); + + // SAFETY: + // Only altered null mask + Ok(make_array(unsafe { data.build_unchecked() })) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; + use arrow_array::cast::{as_boolean_array, as_primitive_array, as_string_array}; + use arrow_array::types::Int32Type; + use arrow_array::{Int32Array, StringArray, StructArray}; + use arrow_schema::{DataType, Field}; + + #[test] + fn test_nullif_int_array() { + let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); + let comp = + BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let res = nullif(&a, &comp).unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), + None, + None, // comp true, slot 2 turned into null + Some(1), + // Even though comp array / right is null, should still pass through original value + // comp true, slot 2 turned into null + Some(9), + ]); + + let res = as_primitive_array::(&res); + assert_eq!(&expected, res); + } + + #[test] + fn test_nullif_int_array_offset() { + let a = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]); + let a = a.slice(1, 3); // Some(15), Some(8), Some(1) + let a = a.as_any().downcast_ref::().unwrap(); + let comp = BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(a, comp).unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + let res = as_primitive_array::(&res); + assert_eq!(&expected, res) + } + + #[test] + 
fn test_nullif_string() { + let s = StringArray::from_iter([ + Some("hello"), + None, + Some("world"), + Some("a"), + Some("b"), + None, + None, + ]); + let select = BooleanArray::from_iter([ + Some(true), + Some(true), + Some(false), + Some(true), + Some(false), + Some(false), + None, + ]); + + let a = nullif(&s, &select).unwrap(); + let r: Vec<_> = as_string_array(&a).iter().collect(); + assert_eq!( + r, + vec![None, None, Some("world"), None, Some("b"), None, None] + ); + + let s = s.slice(2, 3); + let select = select.slice(1, 3); + let select = as_boolean_array(select.as_ref()); + let a = nullif(s.as_ref(), select).unwrap(); + let r: Vec<_> = as_string_array(&a).iter().collect(); + assert_eq!(r, vec![None, Some("a"), None]); + } + + #[test] + fn test_nullif_int_large_left_offset() { + let a = Int32Array::from(vec![ + Some(-1), // 0 + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), // 8 + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + Some(-1), + None, // 16 + Some(15), // 17 + Some(8), + Some(1), + Some(9), + ]); + let a = a.slice(17, 3); // Some(15), Some(8), Some(1) + + let comp = BooleanArray::from(vec![ + Some(false), + Some(false), + Some(false), + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_int_large_right_offset() { + let a = Int32Array::from(vec![ + None, // 0 + Some(15), // 1 + Some(8), + Some(1), + Some(9), + ]); + let a = a.slice(1, 3); // Some(15), Some(8), Some(1) + + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), + Some(false), + Some(false), + 
Some(false), + Some(false), + Some(false), + Some(false), + Some(false), // 8 + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), + Some(false), // 16 + Some(false), // 17 + Some(false), // 18 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(18, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = Int32Array::from(vec![ + Some(15), // False => keep it + Some(8), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + #[test] + fn test_nullif_boolean_offset() { + let a = BooleanArray::from(vec![ + None, // 0 + Some(true), // 1 + Some(false), + Some(true), + Some(true), + ]); + let a = a.slice(1, 3); // Some(true), Some(false), Some(true) + + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), // 1 + Some(false), // 2 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 3); // Some(false), None, Some(true) + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&a, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = BooleanArray::from(vec![ + Some(true), // False => keep it + Some(false), // None => keep it + None, // true => None + ]); + assert_eq!(&expected, res) + } + + struct Foo { + a: Option, + b: Option, + /// Whether the entry should be valid. + is_valid: bool, + } + + impl Foo { + fn new_valid(a: i32, b: bool) -> Foo { + Self { + a: Some(a), + b: Some(b), + is_valid: true, + } + } + + fn new_null() -> Foo { + Self { + a: None, + b: None, + is_valid: false, + } + } + } + + /// Struct Array equality is a bit weird -- we need to have the *child values* + /// correct even if the enclosing struct indicates it is null. But we + /// also need the top level is_valid bits to be correct. 
+ fn create_foo_struct(values: Vec) -> StructArray { + let mut struct_array = StructBuilder::new( + vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Boolean, true), + ], + vec![ + Box::new(Int32Builder::with_capacity(values.len())), + Box::new(BooleanBuilder::with_capacity(values.len())), + ], + ); + + for value in values { + struct_array + .field_builder::(0) + .unwrap() + .append_option(value.a); + struct_array + .field_builder::(1) + .unwrap() + .append_option(value.b); + struct_array.append(value.is_valid); + } + + struct_array.finish() + } + + #[test] + fn test_nullif_struct_slices() { + let struct_array = create_foo_struct(vec![ + Foo::new_valid(7, true), + Foo::new_valid(15, false), + Foo::new_valid(8, true), + Foo::new_valid(12, false), + Foo::new_null(), + Foo::new_null(), + Foo::new_valid(42, true), + ]); + + // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}), + // None, None + let struct_array = struct_array.slice(1, 5); + let comp = BooleanArray::from(vec![ + Some(false), // 0 + Some(false), // 1 + Some(false), // 2 + None, + Some(true), + Some(false), + None, + ]); + let comp = comp.slice(2, 5); // Some(false), None, Some(true), Some(false), None + let comp = comp.as_any().downcast_ref::().unwrap(); + let res = nullif(&struct_array, comp).unwrap(); + let res = res.as_any().downcast_ref::().unwrap(); + + let expected = create_foo_struct(vec![ + // Some(false) -> keep + Foo::new_valid(15, false), + // None -> keep + Foo::new_valid(8, true), + // Some(true) -> null out. But child values are still there. 
+ Foo { + a: Some(12), + b: Some(false), + is_valid: false, + }, + // Some(false) -> keep, but was null + Foo::new_null(), + // None -> keep, but was null + Foo::new_null(), + ]); + + assert_eq!(&expected, res); + } + + #[test] + fn test_nullif_no_nulls() { + let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); + let comp = + BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); + let res = nullif(&a, &comp).unwrap(); + let res = as_primitive_array::(res.as_ref()); + + let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); + assert_eq!(res, &expected); + } +} diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a97ec1ac123f..876d0d65084e 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -273,3 +273,7 @@ required-features = ["csv", "chrono-tz"] [[test]] name = "pyarrow" required-features = ["pyarrow"] + +[[test]] +name = "array_cast" +required-features = ["chrono-tz"] diff --git a/arrow/src/compute/kernels/arithmetic.rs b/arrow/src/compute/kernels/arithmetic.rs index f9deada5389b..c57e27095c23 100644 --- a/arrow/src/compute/kernels/arithmetic.rs +++ b/arrow/src/compute/kernels/arithmetic.rs @@ -1624,7 +1624,7 @@ where mod tests { use super::*; use crate::array::Int32Array; - use crate::compute::{try_unary_mut, unary_mut}; + use crate::compute::{binary_mut, try_binary_mut, try_unary_mut, unary_mut}; use crate::datatypes::{Date64Type, Int32Type, Int8Type}; use arrow_buffer::i256; use chrono::NaiveDate; @@ -3100,6 +3100,35 @@ mod tests { assert_eq!(result.null_count(), 13); } + #[test] + fn test_primitive_array_add_mut_by_binary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + + let c = binary_mut(a, &b, |a, b| a.add_wrapping(b)) + .unwrap() + .unwrap(); + let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); + assert_eq!(c, expected); + } + + #[test] + fn 
test_primitive_add_mut_wrapping_overflow_by_try_binary_mut() { + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let b = Int32Array::from(vec![1, 1]); + + let wrapped = binary_mut(a, &b, |a, b| a.add_wrapping(b)) + .unwrap() + .unwrap(); + let expected = Int32Array::from(vec![-2147483648, -2147483647]); + assert_eq!(expected, wrapped); + + let a = Int32Array::from(vec![i32::MAX, i32::MIN]); + let b = Int32Array::from(vec![1, 1]); + let overflow = try_binary_mut(a, &b, |a, b| a.add_checked(b)); + let _ = overflow.unwrap().expect_err("overflow should be detected"); + } + #[test] fn test_primitive_add_scalar_by_unary_mut() { let a = Int32Array::from(vec![15, 14, 9, 8, 1]); diff --git a/arrow/src/compute/kernels/arity.rs b/arrow/src/compute/kernels/arity.rs index 946d15e9e984..d0f18cf5866d 100644 --- a/arrow/src/compute/kernels/arity.rs +++ b/arrow/src/compute/kernels/arity.rs @@ -232,6 +232,75 @@ where Ok(unsafe { build_primitive_array(len, buffer, null_count, null_buffer) }) } +/// Given two arrays of length `len`, calls `op(a[i], b[i])` for `i` in `0..len`, mutating +/// the mutable [`PrimitiveArray`] `a`. If any index is null in either `a` or `b`, the +/// corresponding index in the result will also be null. +/// +/// Mutable primitive array means that the buffer is not shared with other arrays. +/// As a result, this mutates the buffer directly without allocating new buffer. +/// +/// Like [`unary`] the provided function is evaluated for every index, ignoring validity. This +/// is beneficial when the cost of the operation is low compared to the cost of branching, and +/// especially when the operation can be vectorised, however, requires `op` to be infallible +/// for all possible values of its inputs +/// +/// # Error +/// +/// This function gives error if the arrays have different lengths. +/// This function gives error of original [`PrimitiveArray`] `a` if it is not a mutable +/// primitive array. 
+pub fn binary_mut( + a: PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + T: ArrowPrimitiveType, + F: Fn(T::Native, T::Native) -> T::Native, +{ + if a.len() != b.len() { + return Ok(Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + ))); + } + + if a.is_empty() { + return Ok(Ok(PrimitiveArray::from(ArrayData::new_empty( + &T::DATA_TYPE, + )))); + } + + let len = a.len(); + + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits_offset(0, len)) + .unwrap_or_default(); + + let mut builder = a.into_builder()?; + + builder + .values_slice_mut() + .iter_mut() + .zip(b.values()) + .for_each(|(l, r)| *l = op(*l, *r)); + + let array_builder = builder + .finish() + .data() + .clone() + .into_builder() + .null_bit_buffer(null_buffer) + .null_count(null_count); + + let array_data = unsafe { array_builder.build_unchecked() }; + Ok(Ok(PrimitiveArray::::from(array_data))) +} + /// Applies the provided fallible binary operation across `a` and `b`, returning any error, /// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a` /// or `b`, the corresponding index in the result will also be null @@ -289,6 +358,83 @@ where } } +/// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable +/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in +/// either `a` or `b`, the corresponding index in the result will also be null +/// +/// Like [`try_unary`] the function is only evaluated for non-null indices +/// +/// Mutable primitive array means that the buffer is not shared with other arrays. +/// As a result, this mutates the buffer directly without allocating new buffer. 
+/// +/// # Error +/// +/// Return an error if the arrays have different lengths or +/// the operation is under erroneous. +/// This function gives error of original [`PrimitiveArray`] `a` if it is not a mutable +/// primitive array. +pub fn try_binary_mut( + a: PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + T: ArrowPrimitiveType, + F: Fn(T::Native, T::Native) -> Result, +{ + if a.len() != b.len() { + return Ok(Err(ArrowError::ComputeError( + "Cannot perform binary operation on arrays of different length".to_string(), + ))); + } + let len = a.len(); + + if a.is_empty() { + return Ok(Ok(PrimitiveArray::from(ArrayData::new_empty( + &T::DATA_TYPE, + )))); + } + + if a.null_count() == 0 && b.null_count() == 0 { + try_binary_no_nulls_mut(len, a, b, op) + } else { + let null_buffer = combine_option_bitmap(&[a.data(), b.data()], len).unwrap(); + let null_count = null_buffer + .as_ref() + .map(|x| len - x.count_set_bits_offset(0, len)) + .unwrap_or_default(); + + let mut builder = a.into_builder()?; + + let slice = builder.values_slice_mut(); + + match try_for_each_valid_idx(len, 0, null_count, null_buffer.as_deref(), |idx| { + unsafe { + *slice.get_unchecked_mut(idx) = + op(*slice.get_unchecked(idx), b.value_unchecked(idx))? + }; + Ok::<_, ArrowError>(()) + }) { + Ok(_) => {} + Err(err) => return Ok(Err(err)), + }; + + let array_builder = builder + .finish() + .data() + .clone() + .into_builder() + .null_bit_buffer(null_buffer) + .null_count(null_count); + + let array_data = unsafe { array_builder.build_unchecked() }; + Ok(Ok(PrimitiveArray::::from(array_data))) + } +} + /// This intentional inline(never) attribute helps LLVM optimize the loop. #[inline(never)] fn try_binary_no_nulls( @@ -310,6 +456,35 @@ where Ok(unsafe { build_primitive_array(len, buffer.into(), 0, None) }) } +/// This intentional inline(never) attribute helps LLVM optimize the loop. 
+#[inline(never)] +fn try_binary_no_nulls_mut( + len: usize, + a: PrimitiveArray, + b: &PrimitiveArray, + op: F, +) -> std::result::Result< + std::result::Result, ArrowError>, + PrimitiveArray, +> +where + T: ArrowPrimitiveType, + F: Fn(T::Native, T::Native) -> Result, +{ + let mut builder = a.into_builder()?; + let slice = builder.values_slice_mut(); + + for idx in 0..len { + unsafe { + match op(*slice.get_unchecked(idx), b.value_unchecked(idx)) { + Ok(value) => *slice.get_unchecked_mut(idx) = value, + Err(err) => return Ok(Err(err)), + }; + }; + } + Ok(Ok(builder.finish())) +} + #[inline(never)] fn try_binary_opt_no_nulls( len: usize, @@ -385,6 +560,7 @@ mod tests { use super::*; use crate::array::{as_primitive_array, Float64Array, PrimitiveDictionaryBuilder}; use crate::datatypes::{Float64Type, Int32Type, Int8Type}; + use arrow_array::Int32Array; #[test] fn test_unary_f64_slice() { @@ -444,4 +620,44 @@ mod tests { &expected ); } + + #[test] + fn test_binary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + let c = binary_mut(a, &b, |l, r| l + r).unwrap().unwrap(); + + let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); + assert_eq!(c, expected); + } + + #[test] + fn test_try_binary_mut() { + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)]); + let c = try_binary_mut(a, &b, |l, r| Ok(l + r)).unwrap().unwrap(); + + let expected = Int32Array::from(vec![Some(16), None, Some(12), None, Some(6)]); + assert_eq!(c, expected); + + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![1, 2, 3, 4, 5]); + let c = try_binary_mut(a, &b, |l, r| Ok(l + r)).unwrap().unwrap(); + let expected = Int32Array::from(vec![16, 16, 12, 12, 6]); + assert_eq!(c, expected); + + let a = Int32Array::from(vec![15, 14, 9, 8, 1]); + let b = Int32Array::from(vec![Some(1), None, Some(3), 
None, Some(5)]); + let _ = try_binary_mut(a, &b, |l, r| { + if l == 1 { + Err(ArrowError::InvalidArgumentError( + "got error".parse().unwrap(), + )) + } else { + Ok(l + r) + } + }) + .unwrap() + .expect_err("should got error"); + } } diff --git a/arrow/src/compute/kernels/boolean.rs b/arrow/src/compute/kernels/boolean.rs index dee5d0d1b3ba..1b33fa19ea02 100644 --- a/arrow/src/compute/kernels/boolean.rs +++ b/arrow/src/compute/kernels/boolean.rs @@ -22,6 +22,8 @@ //! `RUSTFLAGS="-C target-feature=+avx2"` for example. See the documentation //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. +pub use arrow_select::nullif; + use crate::array::{Array, ArrayData, BooleanArray}; use crate::buffer::{ bitwise_bin_op_helper, bitwise_quaternary_op_helper, buffer_bin_and, buffer_bin_or, @@ -31,9 +33,6 @@ use crate::compute::util::combine_option_bitmap; use crate::datatypes::DataType; use crate::error::{ArrowError, Result}; use crate::util::bit_util::ceil; -use arrow_array::builder::BooleanBufferBuilder; -use arrow_array::{make_array, ArrayRef}; -use arrow_buffer::buffer::bitwise_unary_op_helper; /// Updates null buffer based on data buffer and null buffer of the operand at other side /// in boolean AND kernel with Kleene logic. In short, because for AND kernel, null AND false @@ -471,105 +470,10 @@ pub fn is_not_null(input: &dyn Array) -> Result { Ok(BooleanArray::from(data)) } -/// Copies original array, setting validity bit to false if a secondary comparison -/// boolean array is set to true -/// -/// Typically used to implement NULLIF. 
-pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { - let left_data = left.data(); - let right_data = right.data(); - - if left_data.len() != right_data.len() { - return Err(ArrowError::ComputeError( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - let len = left_data.len(); - let left_offset = left_data.offset(); - - // left=0 (null) right=null output bitmap=null - // left=0 right=1 output bitmap=null - // left=1 (set) right=null output bitmap=set (passthrough) - // left=1 right=1 & comp=true output bitmap=null - // left=1 right=1 & comp=false output bitmap=set - // - // Thus: result = left null bitmap & (!right_values | !right_bitmap) - // OR left null bitmap & !(right_values & right_bitmap) - - // Compute right_values & right_bitmap - let (right, right_offset) = match right_data.null_buffer() { - Some(buffer) => ( - buffer_bin_and( - &right_data.buffers()[0], - right_data.offset(), - buffer, - right_data.offset(), - len, - ), - 0, - ), - None => (right_data.buffers()[0].clone(), right_data.offset()), - }; - - // Compute left null bitmap & !right - let mut valid_count = 0; - let combined = match left_data.null_buffer() { - Some(left) => { - bitwise_bin_op_helper(left, left_offset, &right, right_offset, len, |l, r| { - let t = l & !r; - valid_count += t.count_ones() as usize; - t - }) - } - None => { - let buffer = bitwise_unary_op_helper(&right, right_offset, len, |b| { - let t = !b; - valid_count += t.count_ones() as usize; - t - }); - // We need to compensate for the additional bits read from the end - let remainder_len = len % 64; - if remainder_len != 0 { - valid_count -= 64 - remainder_len - } - buffer - } - }; - - // Need to construct null buffer with offset of left - let null_buffer = match left_data.offset() { - 0 => combined, - _ => { - let mut builder = BooleanBufferBuilder::new(len + left_offset); - // Pad with 0s up to offset - builder.resize(left_offset); - 
builder.append_packed_range(0..len, &combined); - builder.finish() - } - }; - - let null_count = len - valid_count; - let data = left_data - .clone() - .into_builder() - .null_bit_buffer(Some(null_buffer)) - .null_count(null_count); - - // SAFETY: - // Only altered null mask - Ok(make_array(unsafe { data.build_unchecked() })) -} - #[cfg(test)] mod tests { use super::*; use crate::array::{ArrayRef, Int32Array}; - use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder}; - use arrow_array::cast::{as_boolean_array, as_primitive_array, as_string_array}; - use arrow_array::types::Int32Type; - use arrow_array::{StringArray, StructArray}; - use arrow_schema::Field; use std::sync::Arc; #[test] @@ -1100,335 +1004,4 @@ mod tests { assert_eq!(expected, res); assert_eq!(None, res.data_ref().null_bitmap()); } - - #[test] - fn test_nullif_int_array() { - let a = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); - let res = nullif(&a, &comp).unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), - None, - None, // comp true, slot 2 turned into null - Some(1), - // Even though comp array / right is null, should still pass through original value - // comp true, slot 2 turned into null - Some(9), - ]); - - let res = as_primitive_array::(&res); - assert_eq!(&expected, res); - } - - #[test] - fn test_nullif_int_array_offset() { - let a = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]); - let a = a.slice(1, 3); // Some(15), Some(8), Some(1) - let a = a.as_any().downcast_ref::().unwrap(); - let comp = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(a, comp).unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => 
keep it - Some(8), // None => keep it - None, // true => None - ]); - let res = as_primitive_array::(&res); - assert_eq!(&expected, res) - } - - #[test] - fn test_nullif_string() { - let s = StringArray::from_iter([ - Some("hello"), - None, - Some("world"), - Some("a"), - Some("b"), - None, - None, - ]); - let select = BooleanArray::from_iter([ - Some(true), - Some(true), - Some(false), - Some(true), - Some(false), - Some(false), - None, - ]); - - let a = nullif(&s, &select).unwrap(); - let r: Vec<_> = as_string_array(&a).iter().collect(); - assert_eq!( - r, - vec![None, None, Some("world"), None, Some("b"), None, None] - ); - - let s = s.slice(2, 3); - let select = select.slice(1, 3); - let select = as_boolean_array(select.as_ref()); - let a = nullif(s.as_ref(), select).unwrap(); - let r: Vec<_> = as_string_array(&a).iter().collect(); - assert_eq!(r, vec![None, Some("a"), None]); - } - - #[test] - fn test_nullif_int_large_left_offset() { - let a = Int32Array::from(vec![ - Some(-1), // 0 - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), // 8 - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - Some(-1), - None, // 16 - Some(15), // 17 - Some(8), - Some(1), - Some(9), - ]); - let a = a.slice(17, 3); // Some(15), Some(8), Some(1) - - let comp = BooleanArray::from(vec![ - Some(false), - Some(false), - Some(false), - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => keep it - Some(8), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, res) - } - - #[test] - fn test_nullif_int_large_right_offset() { - let a = Int32Array::from(vec![ - None, // 0 - Some(15), // 1 - Some(8), - Some(1), - Some(9), - ]); - let a = 
a.slice(1, 3); // Some(15), Some(8), Some(1) - - let comp = BooleanArray::from(vec![ - Some(false), // 0 - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), // 8 - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), - Some(false), // 16 - Some(false), // 17 - Some(false), // 18 - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(18, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = Int32Array::from(vec![ - Some(15), // False => keep it - Some(8), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, res) - } - - #[test] - fn test_nullif_boolean_offset() { - let a = BooleanArray::from(vec![ - None, // 0 - Some(true), // 1 - Some(false), - Some(true), - Some(true), - ]); - let a = a.slice(1, 3); // Some(true), Some(false), Some(true) - - let comp = BooleanArray::from(vec![ - Some(false), // 0 - Some(false), // 1 - Some(false), // 2 - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 3); // Some(false), None, Some(true) - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&a, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = BooleanArray::from(vec![ - Some(true), // False => keep it - Some(false), // None => keep it - None, // true => None - ]); - assert_eq!(&expected, res) - } - - struct Foo { - a: Option, - b: Option, - /// Whether the entry should be valid. 
- is_valid: bool, - } - - impl Foo { - fn new_valid(a: i32, b: bool) -> Foo { - Self { - a: Some(a), - b: Some(b), - is_valid: true, - } - } - - fn new_null() -> Foo { - Self { - a: None, - b: None, - is_valid: false, - } - } - } - - /// Struct Array equality is a bit weird -- we need to have the *child values* - /// correct even if the enclosing struct indicates it is null. But we - /// also need the top level is_valid bits to be correct. - fn create_foo_struct(values: Vec) -> StructArray { - let mut struct_array = StructBuilder::new( - vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Boolean, true), - ], - vec![ - Box::new(Int32Builder::with_capacity(values.len())), - Box::new(BooleanBuilder::with_capacity(values.len())), - ], - ); - - for value in values { - struct_array - .field_builder::(0) - .unwrap() - .append_option(value.a); - struct_array - .field_builder::(1) - .unwrap() - .append_option(value.b); - struct_array.append(value.is_valid); - } - - struct_array.finish() - } - - #[test] - fn test_nullif_struct_slices() { - let struct_array = create_foo_struct(vec![ - Foo::new_valid(7, true), - Foo::new_valid(15, false), - Foo::new_valid(8, true), - Foo::new_valid(12, false), - Foo::new_null(), - Foo::new_null(), - Foo::new_valid(42, true), - ]); - - // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}), - // None, None - let struct_array = struct_array.slice(1, 5); - let comp = BooleanArray::from(vec![ - Some(false), // 0 - Some(false), // 1 - Some(false), // 2 - None, - Some(true), - Some(false), - None, - ]); - let comp = comp.slice(2, 5); // Some(false), None, Some(true), Some(false), None - let comp = comp.as_any().downcast_ref::().unwrap(); - let res = nullif(&struct_array, comp).unwrap(); - let res = res.as_any().downcast_ref::().unwrap(); - - let expected = create_foo_struct(vec![ - // Some(false) -> keep - Foo::new_valid(15, false), - // None -> keep - Foo::new_valid(8, true), - // Some(true) -> null out. 
But child values are still there. - Foo { - a: Some(12), - b: Some(false), - is_valid: false, - }, - // Some(false) -> keep, but was null - Foo::new_null(), - // None -> keep, but was null - Foo::new_null(), - ]); - - assert_eq!(&expected, res); - } - - #[test] - fn test_nullif_no_nulls() { - let a = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]); - let comp = - BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]); - let res = nullif(&a, &comp).unwrap(); - let res = as_primitive_array::(res.as_ref()); - - let expected = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]); - assert_eq!(res, &expected); - } } diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs new file mode 100644 index 000000000000..95fb973289a5 --- /dev/null +++ b/arrow/tests/array_cast.rs @@ -0,0 +1,407 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow_array::builder::{ + PrimitiveDictionaryBuilder, StringDictionaryBuilder, UnionBuilder, +}; +use arrow_array::types::{ + ArrowDictionaryKeyType, Int16Type, Int32Type, Int64Type, Int8Type, + TimestampMicrosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, +}; +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, + Decimal128Array, DurationMicrosecondArray, DurationMillisecondArray, + DurationNanosecondArray, DurationSecondArray, FixedSizeBinaryArray, + FixedSizeListArray, Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, + Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, + IntervalYearMonthArray, LargeBinaryArray, LargeListArray, LargeStringArray, + ListArray, NullArray, PrimitiveArray, StringArray, StructArray, + Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, + Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, + UInt64Array, UInt8Array, UnionArray, +}; +use arrow_buffer::Buffer; +use arrow_cast::{can_cast_types, cast}; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType, Field, IntervalUnit, TimeUnit, UnionMode}; +use half::f16; +use std::sync::Arc; + +#[test] +fn test_cast_timestamp_to_string() { + let a = TimestampMillisecondArray::from(vec![ + Some(864000000005), + Some(1545696000001), + None, + ]) + .with_timezone("UTC".to_string()); + let array = Arc::new(a) as ArrayRef; + dbg!(&array); + let b = cast(&array, &DataType::Utf8).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(&DataType::Utf8, c.data_type()); + assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); + assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); + assert!(c.is_null(2)); +} + +#[test] +#[cfg_attr(miri, ignore)] // running forever +fn test_can_cast_types() { + // this function attempts to ensure that can_cast_types stays + // in 
sync with cast. It simply tries all combinations of + // types and makes sure that if `can_cast_types` returns + // true, so does `cast` + + let all_types = get_all_types(); + + for array in get_arrays_of_all_types() { + for to_type in &all_types { + println!("Test casting {:?} --> {:?}", array.data_type(), to_type); + let cast_result = cast(&array, to_type); + let reported_cast_ability = can_cast_types(array.data_type(), to_type); + + // check for mismatch + match (cast_result, reported_cast_ability) { + (Ok(_), false) => { + panic!("Was able to cast array {:?} from {:?} to {:?} but can_cast_types reported false", + array, array.data_type(), to_type) + } + (Err(e), true) => { + panic!("Was not able to cast array {:?} from {:?} to {:?} but can_cast_types reported true. \ + Error was {:?}", + array, array.data_type(), to_type, e) + } + // otherwise it was a match + _ => {} + }; + } + } +} + +/// Create instances of arrays with varying types for cast tests +fn get_arrays_of_all_types() -> Vec { + let tz_name = String::from("America/New_York"); + let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"]; + vec![ + Arc::new(BinaryArray::from(binary_data.clone())), + Arc::new(LargeBinaryArray::from(binary_data.clone())), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_primitive::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + make_dictionary_utf8::(), + Arc::new(make_list_array()), + Arc::new(make_large_list_array()), + Arc::new(make_fixed_size_list_array()), + Arc::new(make_fixed_size_binary_array()), + Arc::new(StructArray::from(vec![ + ( + Field::new("a", DataType::Boolean, false), + Arc::new(BooleanArray::from(vec![false, 
false, true, true])) + as Arc, + ), + ( + Field::new("b", DataType::Int32, false), + Arc::new(Int32Array::from(vec![42, 28, 19, 31])), + ), + ])), + Arc::new(make_union_array()), + Arc::new(NullArray::new(10)), + Arc::new(StringArray::from(vec!["foo", "bar"])), + Arc::new(LargeStringArray::from(vec!["foo", "bar"])), + Arc::new(BooleanArray::from(vec![true, false])), + Arc::new(Int8Array::from(vec![1, 2])), + Arc::new(Int16Array::from(vec![1, 2])), + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(Int64Array::from(vec![1, 2])), + Arc::new(UInt8Array::from(vec![1, 2])), + Arc::new(UInt16Array::from(vec![1, 2])), + Arc::new(UInt32Array::from(vec![1, 2])), + Arc::new(UInt64Array::from(vec![1, 2])), + Arc::new( + [Some(f16::from_f64(1.0)), Some(f16::from_f64(2.0))] + .into_iter() + .collect::(), + ), + Arc::new(Float32Array::from(vec![1.0, 2.0])), + Arc::new(Float64Array::from(vec![1.0, 2.0])), + Arc::new(TimestampSecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMillisecondArray::from(vec![1000, 2000])), + Arc::new(TimestampMicrosecondArray::from(vec![1000, 2000])), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000])), + Arc::new( + TimestampSecondArray::from(vec![1000, 2000]).with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMillisecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new( + TimestampMicrosecondArray::from(vec![1000, 2000]) + .with_timezone(tz_name.clone()), + ), + Arc::new(TimestampNanosecondArray::from(vec![1000, 2000]).with_timezone(tz_name)), + Arc::new(Date32Array::from(vec![1000, 2000])), + Arc::new(Date64Array::from(vec![1000, 2000])), + Arc::new(Time32SecondArray::from(vec![1000, 2000])), + Arc::new(Time32MillisecondArray::from(vec![1000, 2000])), + Arc::new(Time64MicrosecondArray::from(vec![1000, 2000])), + Arc::new(Time64NanosecondArray::from(vec![1000, 2000])), + Arc::new(IntervalYearMonthArray::from(vec![1000, 2000])), + Arc::new(IntervalDayTimeArray::from(vec![1000, 2000])), + 
Arc::new(IntervalMonthDayNanoArray::from(vec![1000, 2000])), + Arc::new(DurationSecondArray::from(vec![1000, 2000])), + Arc::new(DurationMillisecondArray::from(vec![1000, 2000])), + Arc::new(DurationMicrosecondArray::from(vec![1000, 2000])), + Arc::new(DurationNanosecondArray::from(vec![1000, 2000])), + Arc::new( + create_decimal_array(vec![Some(1), Some(2), Some(3), None], 38, 0).unwrap(), + ), + ] +} + +fn make_fixed_size_list_array() -> FixedSizeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(10) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) + .build() + .unwrap(); + + // Construct a fixed size list array from the above two + let list_data_type = + DataType::FixedSizeList(Box::new(Field::new("item", DataType::Int32, true)), 2); + let list_data = ArrayData::builder(list_data_type) + .len(5) + .add_child_data(value_data) + .build() + .unwrap(); + FixedSizeListArray::from(list_data) +} + +fn make_fixed_size_binary_array() -> FixedSizeBinaryArray { + let values: [u8; 15] = *b"hellotherearrow"; + + let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + .len(3) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + FixedSizeBinaryArray::from(array_data) +} + +fn make_list_array() -> ListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0, 3, 6, 8]); + + // Construct a list array from the above two + let list_data_type = + DataType::List(Box::new(Field::new("item", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + ListArray::from(list_data) +} + +fn 
make_large_list_array() -> LargeListArray { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(8) + .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) + .build() + .unwrap(); + + // Construct a buffer for value offsets, for the nested array: + // [[0, 1, 2], [3, 4, 5], [6, 7]] + let value_offsets = Buffer::from_slice_ref([0i64, 3, 6, 8]); + + // Construct a list array from the above two + let list_data_type = + DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true))); + let list_data = ArrayData::builder(list_data_type) + .len(3) + .add_buffer(value_offsets) + .add_child_data(value_data) + .build() + .unwrap(); + LargeListArray::from(list_data) +} + +fn make_union_array() -> UnionArray { + let mut builder = UnionBuilder::with_capacity_dense(7); + builder.append::("a", 1).unwrap(); + builder.append::("b", 2).unwrap(); + builder.build().unwrap() +} + +/// Creates a dictionary with primitive dictionary values, and keys of type K +fn make_dictionary_primitive() -> ArrayRef { + // Pick Int32 arbitrarily for dictionary values + let mut b: PrimitiveDictionaryBuilder = + PrimitiveDictionaryBuilder::new(); + b.append(1).unwrap(); + b.append(2).unwrap(); + Arc::new(b.finish()) +} + +/// Creates a dictionary with utf8 values, and keys of type K +fn make_dictionary_utf8() -> ArrayRef { + // Pick Int32 arbitrarily for dictionary values + let mut b: StringDictionaryBuilder = StringDictionaryBuilder::new(); + b.append("foo").unwrap(); + b.append("bar").unwrap(); + Arc::new(b.finish()) +} + +fn create_decimal_array( + array: Vec>, + precision: u8, + scale: i8, +) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) +} + +// Get a selection of datatypes to try and cast to +fn get_all_types() -> Vec { + use DataType::*; + let tz_name = String::from("America/New_York"); + + vec![ + Null, + Boolean, + Int8, + Int16, + Int32, + UInt64, + UInt8, + UInt16, + UInt32, + UInt64, + 
Float16, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Timestamp(TimeUnit::Second, Some(tz_name.clone())), + Timestamp(TimeUnit::Millisecond, Some(tz_name.clone())), + Timestamp(TimeUnit::Microsecond, Some(tz_name.clone())), + Timestamp(TimeUnit::Nanosecond, Some(tz_name)), + Date32, + Date64, + Time32(TimeUnit::Second), + Time32(TimeUnit::Millisecond), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + Interval(IntervalUnit::YearMonth), + Interval(IntervalUnit::DayTime), + Interval(IntervalUnit::MonthDayNano), + Binary, + FixedSizeBinary(10), + LargeBinary, + Utf8, + LargeUtf8, + List(Box::new(Field::new("item", DataType::Int8, true))), + List(Box::new(Field::new("item", DataType::Utf8, true))), + FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10), + FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10), + LargeList(Box::new(Field::new("item", DataType::Int8, true))), + LargeList(Box::new(Field::new("item", DataType::Utf8, false))), + Struct(vec![ + Field::new("f1", DataType::Int32, true), + Field::new("f2", DataType::Utf8, true), + ]), + Union( + vec![ + Field::new("f1", DataType::Int32, false), + Field::new("f2", DataType::Utf8, true), + ], + vec![0, 1], + UnionMode::Dense, + ), + Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), + Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), + Decimal128(38, 0), + ] +} + +#[test] +fn test_timestamp_cast_utf8() { + let array: PrimitiveArray = + vec![Some(37800000000), None, Some(86339000000)].into(); + let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); + + let expected = 
StringArray::from(vec![ + Some("1970-01-01 10:30:00"), + None, + Some("1970-01-01 23:58:59"), + ]); + + assert_eq!( + out.as_any().downcast_ref::().unwrap(), + &expected + ); + + let array: PrimitiveArray = + vec![Some(37800000000), None, Some(86339000000)].into(); + let array = array.with_timezone("Australia/Sydney".to_string()); + let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); + + let expected = StringArray::from(vec![ + Some("1970-01-01 20:30:00 +10:00"), + None, + Some("1970-01-02 09:58:59 +10:00"), + ]); + + assert_eq!( + out.as_any().downcast_ref::().unwrap(), + &expected + ); +}