From dac2132f157b1ef0dda04119d4a8b0221efeadec Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Mon, 6 Jun 2022 21:28:14 +0800 Subject: [PATCH] Add `Substring_by_char` (#1784) * add substring by char Signed-off-by: remzi <13716567376yh@gmail.com> * update docs Signed-off-by: remzi <13716567376yh@gmail.com> * update docs Signed-off-by: remzi <13716567376yh@gmail.com> * add benchmark Signed-off-by: remzi <13716567376yh@gmail.com> * Update arrow/src/compute/kernels/substring.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow/src/compute/kernels/substring.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow/src/compute/kernels/substring.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> * Update arrow/src/compute/kernels/substring.rs Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> --- arrow/benches/string_kernels.rs | 14 +- arrow/src/compute/kernels/substring.rs | 220 ++++++++++++++++++++++++- 2 files changed, 228 insertions(+), 6 deletions(-) diff --git a/arrow/benches/string_kernels.rs b/arrow/benches/string_kernels.rs index 7df52a6bbd4c..6bbfc9c09839 100644 --- a/arrow/benches/string_kernels.rs +++ b/arrow/benches/string_kernels.rs @@ -22,13 +22,21 @@ use criterion::Criterion; extern crate arrow; use arrow::array::*; -use arrow::compute::kernels::substring::substring; +use arrow::compute::kernels::substring::*; use arrow::util::bench_util::*; fn bench_substring(arr: &dyn Array, start: i64, length: Option) { substring(criterion::black_box(arr), start, length).unwrap(); } +fn bench_substring_by_char( + arr: &GenericStringArray, + start: i64, + length: Option, +) { + substring_by_char(criterion::black_box(arr), start, length).unwrap(); +} + fn add_benchmark(c: &mut Criterion) { let size = 65536; let val_len = 1000; @@ -44,6 +52,10 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64))) }); + c.bench_function("substring utf8 by char", |b| { + b.iter(|| bench_substring_by_char(&arr_string, 1, Some((val_len - 1) as u64))) + }); + c.bench_function("substring fixed size binary array", |b| { b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64))) }); diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index f1b6e8d4aa79..1954307e9ac7 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -16,7 +16,8 @@ // under the License. //! Defines kernel to extract a substring of an Array -//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray +//! Supported array types: +//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray] use crate::array::DictionaryArray; use crate::buffer::MutableBuffer; @@ -29,7 +30,7 @@ use crate::{ use std::cmp::Ordering; use std::sync::Arc; -/// Returns an ArrayRef with substrings of all the elements in `array`. +/// Returns an [`ArrayRef`] with substrings of all the elements in `array`. /// /// # Arguments /// @@ -38,7 +39,7 @@ use std::sync::Arc; /// otherwise count from the end of the string. /// /// * `length`(option) - The length of all substrings. -/// If `length` is `None`, then the substring is from `start` to the end of the string. +/// If `length` is [None], then the substring is from `start` to the end of the string. /// /// Attention: Both `start` and `length` are counted by byte, not by char. /// @@ -53,9 +54,10 @@ use std::sync::Arc; /// ``` /// /// # Error -/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary -/// array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type. +/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`] +/// or [`DictionaryArray`] with supported array type as its value type. /// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array). +/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars. /// /// ## Example of trying to get an invalid utf-8 format substring /// ``` @@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option) -> Result= 0`, then count from the start of the string, +/// otherwise count from the end of the string. +/// +/// * `length`(option) - The length of all substrings. +/// If `length` is `None`, then the substring is from `start` to the end of the string. +/// +/// Attention: Both `start` and `length` are counted by char. +/// +/// # Performance +/// This function is slower than [substring]. +/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer. +/// It is recommended to use [substring] if the input array only contains ASCII chars. +/// +/// # Basic usage +/// ``` +/// # use arrow::array::StringArray; +/// # use arrow::compute::kernels::substring::substring_by_char; +/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]); +/// let result = substring_by_char(&array, 1, Some(4)).unwrap(); +/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")])); +/// ``` +pub fn substring_by_char( + array: &GenericStringArray, + start: i64, + length: Option, +) -> Result> { + Ok(array + .iter() + .map(|val| { + val.map(|val| { + let char_count = val.chars().count(); + let start = if start >= 0 { + start.to_usize().unwrap().min(char_count) + } else { + char_count - (-start).to_usize().unwrap().min(char_count) + }; + let length = length.map_or(char_count - start, |length| { + length.to_usize().unwrap().min(char_count - start) + }); + + val.chars().skip(start).take(length).collect::() + }) + }) + .collect::>()) +} + fn binary_substring( array: &GenericBinaryArray, start: OffsetSize, @@ -1083,6 +1135,164 @@ mod tests { generic_string_with_non_zero_offset::() } + fn with_nulls_generic_string_by_char() -> Result<()> { + let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")]; + let cases = vec![ + // all-nulls array is always identical + (vec![None, None, None], 0, None, vec![None, None, None]), + // identity + ( + input_vals.clone(), + 0, + None, + vec![Some("hello"), None, Some("Γ ⊢x:T")], + ), + // 0 length -> Nothing + ( + input_vals.clone(), + 0, + Some(0), + vec![Some(""), None, Some("")], + ), + // high start -> Nothing + ( + input_vals.clone(), + 1000, + Some(0), + vec![Some(""), None, Some("")], + ), + // high negative start -> identity + ( + input_vals.clone(), + -1000, + None, + vec![Some("hello"), None, Some("Γ ⊢x:T")], + ), + // high length -> identity + ( + input_vals.clone(), + 0, + Some(1000), + vec![Some("hello"), None, Some("Γ ⊢x:T")], + ), + ]; + + cases.into_iter().try_for_each::<_, Result<()>>( + |(array, start, length, expected)| { + let array = GenericStringArray::::from(array); + let result = substring_by_char(&array, start, length)?; + assert_eq!(array.len(), result.len()); + + let expected = GenericStringArray::::from(expected); + assert_eq!(expected, result); + Ok(()) + }, + )?; + + Ok(()) + } + + #[test] + fn with_nulls_string_by_char() -> Result<()> { + with_nulls_generic_string_by_char::() + } + + #[test] + fn with_nulls_large_string_by_char() -> Result<()> { + with_nulls_generic_string_by_char::() + } + + fn without_nulls_generic_string_by_char() -> Result<()> { + let input_vals = vec!["hello", "", "Γ ⊢x:T"]; + let cases = vec![ + // empty array is always identical + (vec!["", "", ""], 0, None, vec!["", "", ""]), + // increase start + (input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]), + (input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]), + (input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]), + (input_vals.clone(), 3, None, vec!["lo", "", "x:T"]), + (input_vals.clone(), 10, None, vec!["", "", ""]), + // increase start negatively + (input_vals.clone(), -1, None, vec!["o", "", "T"]), + (input_vals.clone(), -2, None, vec!["lo", "", ":T"]), + (input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]), + (input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]), + // increase length + (input_vals.clone(), 1, Some(1), vec!["e", "", " "]), + (input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]), + (input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]), + (input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]), + (input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]), + (input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]), + (input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]), + (input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]), + ]; + + cases.into_iter().try_for_each::<_, Result<()>>( + |(array, start, length, expected)| { + let array = GenericStringArray::::from(array); + let result = substring_by_char(&array, start, length)?; + assert_eq!(array.len(), result.len()); + let expected = GenericStringArray::::from(expected); + assert_eq!(expected, result); + Ok(()) + }, + )?; + + Ok(()) + } + + #[test] + fn without_nulls_string_by_char() -> Result<()> { + without_nulls_generic_string_by_char::() + } + + #[test] + fn without_nulls_large_string_by_char() -> Result<()> { + without_nulls_generic_string_by_char::() + } + + fn generic_string_by_char_with_non_zero_offset() -> Result<()> { + let values = "S→T = Πx:S.T"; + let offsets = &[ + O::zero(), + O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()) + .unwrap(), + O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()) + .unwrap(), + O::from_usize(values.len()).unwrap(), + ]; + // set the first and third element to be valid + let bitmap = [0b101_u8]; + + let data = ArrayData::builder(GenericStringArray::::get_data_type()) + .len(2) + .add_buffer(Buffer::from_slice_ref(offsets)) + .add_buffer(Buffer::from(values)) + .null_bit_buffer(Some(Buffer::from(bitmap))) + .offset(1) + .build()?; + // array is `[null, "Πx:S.T"]` + let array = GenericStringArray::::from(data); + // result is `[null, "x:S.T"]` + let result = substring_by_char(&array, 1, None)?; + let expected = GenericStringArray::::from(vec![None, Some("x:S.T")]); + assert_eq!(result, expected); + + Ok(()) + } + + #[test] + fn string_with_non_zero_offset_by_char() -> Result<()> { + generic_string_by_char_with_non_zero_offset::() + } + + #[test] + fn large_string_with_non_zero_offset_by_char() -> Result<()> { + generic_string_by_char_with_non_zero_offset::() + } + #[test] fn dictionary() -> Result<()> { _dictionary::()?;