diff --git a/native/spark-expr/src/kernels/mod.rs b/native/spark-expr/src/kernels/mod.rs index 3669ff13a..88aa34b1a 100644 --- a/native/spark-expr/src/kernels/mod.rs +++ b/native/spark-expr/src/kernels/mod.rs @@ -17,5 +17,4 @@ //! Kernels -pub mod strings; pub(crate) mod temporal; diff --git a/native/spark-expr/src/kernels/strings.rs b/native/spark-expr/src/kernels/strings.rs deleted file mode 100644 index e26085760..000000000 --- a/native/spark-expr/src/kernels/strings.rs +++ /dev/null @@ -1,63 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! String kernels - -use std::sync::Arc; - -use arrow::{ - array::* - , - compute::kernels::substring::{substring as arrow_substring, substring_by_char}, - datatypes::{DataType, Int32Type}, -}; -use datafusion_common::DataFusionError; - -pub fn substring(array: &dyn Array, start: i64, length: u64) -> Result { - match array.data_type() { - DataType::LargeUtf8 => substring_by_char( - array - .as_any() - .downcast_ref::() - .expect("A large string is expected"), - start, - Some(length), - ) - .map_err(|e| e.into()) - .map(|t| make_array(t.into_data())), - DataType::Utf8 => substring_by_char( - array - .as_any() - .downcast_ref::() - .expect("A string is expected"), - start, - Some(length), - ) - .map_err(|e| e.into()) - .map(|t| make_array(t.into_data())), - DataType::Binary | DataType::LargeBinary => { - arrow_substring(array, start, Some(length)).map_err(|e| e.into()) - } - DataType::Dictionary(_, _) => { - let dict = as_dictionary_array::(array); - let values = substring(dict.values(), start, length)?; - let result = DictionaryArray::try_new(dict.keys().clone(), values)?; - Ok(Arc::new(result)) - } - dt => panic!("Unsupported input type for function 'substring': {:?}", dt), - } -} diff --git a/native/spark-expr/src/string_funcs/substring.rs b/native/spark-expr/src/string_funcs/substring.rs index 8abecf0ac..28f155479 100644 --- a/native/spark-expr/src/string_funcs/substring.rs +++ b/native/spark-expr/src/string_funcs/substring.rs @@ -17,9 +17,7 @@ #![allow(deprecated)] -use crate::kernels::strings::substring; use arrow::record_batch::RecordBatch; -use arrow_schema::{DataType, Schema}; use datafusion::logical_expr::ColumnarValue; use datafusion_common::DataFusionError; use datafusion_physical_expr::PhysicalExpr; @@ -29,6 +27,16 @@ use std::{ hash::Hash, sync::Arc, }; +use arrow_array::{make_array, Array, ArrayRef, DictionaryArray, LargeStringArray, StringArray}; +use arrow_array::cast::as_dictionary_array; + +use arrow::{ + array::* + , + compute::kernels::substring::{substring as arrow_substring, substring_by_char}, + datatypes::{Schema, DataType, Int32Type}, +}; + #[derive(Debug, Eq)] pub struct SubstringExpr { @@ -84,7 +92,7 @@ impl PhysicalExpr for SubstringExpr { let arg = self.child.evaluate(batch)?; match arg { ColumnarValue::Array(array) => { - let result = substring(&array, self.start, self.len)?; + let result = substring_kernel(&array, self.start, self.len)?; Ok(ColumnarValue::Array(result)) } @@ -110,3 +118,38 @@ impl PhysicalExpr for SubstringExpr { } } + +pub fn substring_kernel(array: &dyn Array, start: i64, length: u64) -> Result { + match array.data_type() { + DataType::LargeUtf8 => substring_by_char( + array + .as_any() + .downcast_ref::() + .expect("A large string is expected"), + start, + Some(length), + ) + .map_err(|e| e.into()) + .map(|t| make_array(t.into_data())), + DataType::Utf8 => substring_by_char( + array + .as_any() + .downcast_ref::() + .expect("A string is expected"), + start, + Some(length), + ) + .map_err(|e| e.into()) + .map(|t| make_array(t.into_data())), + DataType::Binary | DataType::LargeBinary => { + arrow_substring(array, start, Some(length)).map_err(|e| e.into()) + } + DataType::Dictionary(_, _) => { + let dict = as_dictionary_array::(array); + let values = substring_kernel(dict.values(), start, length)?; + let result = DictionaryArray::try_new(dict.keys().clone(), values)?; + Ok(Arc::new(result)) + } + dt => panic!("Unsupported input type for function 'substring': {:?}", dt), + } +}