diff --git a/Cargo.toml b/Cargo.toml index ef685f338f5..8c506cf135b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -172,6 +172,7 @@ compute_take = [] compute_temporal = [] compute_window = ["compute_concatenate"] compute_lower = [] +compute_upper = [] compute = [ "compute_aggregate", "compute_arithmetics", @@ -198,6 +199,7 @@ compute = [ "compute_temporal", "compute_window", "compute_lower", + "compute_upper" ] # base64 + io_ipc because arrow schemas are stored as base64-encoded ipc format. io_parquet = ["parquet2", "io_ipc", "base64", "futures"] diff --git a/src/compute/mod.rs b/src/compute/mod.rs index 5608e3ebfe5..99568b0aaf9 100644 --- a/src/compute/mod.rs +++ b/src/compute/mod.rs @@ -85,6 +85,9 @@ pub mod take; #[cfg(feature = "compute_temporal")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_temporal")))] pub mod temporal; +#[cfg(feature = "compute_upper")] +#[cfg_attr(docsrs, doc(cfg(feature = "compute_upper")))] +pub mod upper; mod utils; #[cfg(feature = "compute_window")] #[cfg_attr(docsrs, doc(cfg(feature = "compute_window")))] diff --git a/src/compute/upper.rs b/src/compute/upper.rs new file mode 100644 index 00000000000..6afbdd09286 --- /dev/null +++ b/src/compute/upper.rs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines a kernel that upper-cases the values of a \[Large\]StringArray + +use super::utils::utf8_apply; +use crate::array::*; +use crate::{ + datatypes::DataType, + error::{ArrowError, Result}, +}; + +/// Returns a new `Array` where each of the elements is upper-cased. +/// This function errors when the passed array is not a \[Large\]String array. +pub fn upper(array: &dyn Array) -> Result<Box<dyn Array>> { + match array.data_type() { + DataType::LargeUtf8 => Ok(Box::new(utf8_apply( + str::to_uppercase, + array + .as_any() + .downcast_ref::<Utf8Array<i64>>() + .expect("A large string is expected"), + ))), + DataType::Utf8 => Ok(Box::new(utf8_apply( + str::to_uppercase, + array + .as_any() + .downcast_ref::<Utf8Array<i32>>() + .expect("A string is expected"), + ))), + _ => Err(ArrowError::InvalidArgumentError(format!( + "upper does not support type {:?}", + array.data_type() + ))), + } +} + +/// Checks if an array of type `data_type` can perform the upper operation +/// +/// # Examples +/// ``` +/// use arrow2::compute::upper::can_upper; +/// use arrow2::datatypes::{DataType}; +/// +/// let data_type = DataType::Utf8; +/// assert_eq!(can_upper(&data_type), true); +/// +/// let data_type = DataType::Null; +/// assert_eq!(can_upper(&data_type), false); +/// ``` +pub fn can_upper(data_type: &DataType) -> bool { + matches!(data_type, DataType::LargeUtf8 | DataType::Utf8) +} diff --git a/tests/it/compute/mod.rs b/tests/it/compute/mod.rs index d4bd0b008eb..9b5079be712 100644 --- a/tests/it/compute/mod.rs +++ b/tests/it/compute/mod.rs @@ -44,5 +44,7 @@ mod substring; mod take; #[cfg(feature = "compute_temporal")] mod temporal; +#[cfg(feature = "compute_upper")] +mod upper; #[cfg(feature = "compute_window")] mod window; diff --git a/tests/it/compute/upper.rs b/tests/it/compute/upper.rs new file mode 100644 index 00000000000..4daf35626a9 --- /dev/null +++ b/tests/it/compute/upper.rs @@ -0,0 +1,186 @@ +use 
arrow2::{array::*, compute::upper::*, error::Result}; + +fn with_nulls_utf8() -> Result<()> { + let cases = vec![ + // identity + ( + vec![Some("hello"), None, Some("world")], + vec![Some("HELLO"), None, Some("WORLD")], + ), + // part of input + ( + vec![Some("Hello"), None, Some("wOrld")], + vec![Some("HELLO"), None, Some("WORLD")], + ), + // all input + ( + vec![Some("hello"), None, Some("world")], + vec![Some("HELLO"), None, Some("WORLD")], + ), + // UTF8 characters + ( + vec![ + None, + Some("السلام عليكم"), + Some("Dobrý den"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("Olá"), + Some("Здравствуйте"), + Some("Hola"), + ], + vec![ + None, + Some("السلام عليكم"), + Some("DOBRÝ DEN"), + Some("שָׁלוֹם"), + Some("नमस्ते"), + Some("こんにちは"), + Some("안녕하세요"), + Some("你好"), + Some("OLÁ"), + Some("ЗДРАВСТВУЙТЕ"), + Some("HOLA"), + ], + ), + ]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from(&array); + let result = upper(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from(&expected); + + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn with_nulls_string() -> Result<()> { + with_nulls_utf8::() +} + +#[test] +fn with_nulls_large_string() -> Result<()> { + with_nulls_utf8::() +} + +fn without_nulls_utf8() -> Result<()> { + let cases = vec![ + // identity + (vec!["hello", "world"], vec!["HELLO", "WORLD"]), + // part of input + (vec!["Hello", "wOrld"], vec!["HELLO", "WORLD"]), + // all input + (vec!["HELLO", "WORLD"], vec!["HELLO", "WORLD"]), + // UTF8 characters + ( + vec![ + "السلام عليكم", + "Dobrý den", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "Olá", + "Здравствуйте", + "Hola", + ], + vec![ + "السلام عليكم", + "DOBRÝ DEN", + "שָׁלוֹם", + "नमस्ते", + "こんにちは", + "안녕하세요", + "你好", + "OLÁ", + "ЗДРАВСТВУЙТЕ", + "HOLA", + ], + ), + 
]; + + cases + .into_iter() + .try_for_each::<_, Result<()>>(|(array, expected)| { + let array = Utf8Array::::from_slice(&array); + let result = upper(&array)?; + assert_eq!(array.len(), result.len()); + + let result = result.as_any().downcast_ref::>().unwrap(); + let expected = Utf8Array::::from_slice(&expected); + assert_eq!(&expected, result); + Ok(()) + })?; + + Ok(()) +} + +#[test] +fn without_nulls_string() -> Result<()> { + without_nulls_utf8::() +} + +#[test] +fn without_nulls_large_string() -> Result<()> { + without_nulls_utf8::() +} + +#[test] +fn consistency() { + use arrow2::datatypes::DataType::*; + use arrow2::datatypes::TimeUnit; + let datatypes = vec![ + Null, + Boolean, + UInt8, + UInt16, + UInt32, + UInt64, + Int8, + Int16, + Int32, + Int64, + Float32, + Float64, + Timestamp(TimeUnit::Second, None), + Timestamp(TimeUnit::Millisecond, None), + Timestamp(TimeUnit::Microsecond, None), + Timestamp(TimeUnit::Nanosecond, None), + Time64(TimeUnit::Microsecond), + Time64(TimeUnit::Nanosecond), + Date32, + Time32(TimeUnit::Second), + Time32(TimeUnit::Millisecond), + Date64, + Utf8, + LargeUtf8, + Binary, + LargeBinary, + Duration(TimeUnit::Second), + Duration(TimeUnit::Millisecond), + Duration(TimeUnit::Microsecond), + Duration(TimeUnit::Nanosecond), + ]; + + datatypes.into_iter().for_each(|d1| { + let array = new_null_array(d1.clone(), 10); + if can_upper(&d1) { + assert!(upper(array.as_ref()).is_ok()); + } else { + assert!(upper(array.as_ref()).is_err()); + } + }); +}