Skip to content

Commit

Permalink
Add Substring_by_char (#1784)
Browse files Browse the repository at this point in the history
* add substring by char

Signed-off-by: remzi <13716567376yh@gmail.com>

* update docs

Signed-off-by: remzi <13716567376yh@gmail.com>

* update docs

Signed-off-by: remzi <13716567376yh@gmail.com>

* add benchmark

Signed-off-by: remzi <13716567376yh@gmail.com>

* Update arrow/src/compute/kernels/substring.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

* Update arrow/src/compute/kernels/substring.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

* Update arrow/src/compute/kernels/substring.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

* Update arrow/src/compute/kernels/substring.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
  • Loading branch information
HaoYang670 and tustvold authored Jun 6, 2022
1 parent d4df1d9 commit dac2132
Show file tree
Hide file tree
Showing 2 changed files with 228 additions and 6 deletions.
14 changes: 13 additions & 1 deletion arrow/benches/string_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,21 @@ use criterion::Criterion;
extern crate arrow;

use arrow::array::*;
use arrow::compute::kernels::substring::substring;
use arrow::compute::kernels::substring::*;
use arrow::util::bench_util::*;

fn bench_substring(arr: &dyn Array, start: i64, length: Option<u64>) {
substring(criterion::black_box(arr), start, length).unwrap();
}

fn bench_substring_by_char<O: OffsetSizeTrait>(
arr: &GenericStringArray<O>,
start: i64,
length: Option<u64>,
) {
substring_by_char(criterion::black_box(arr), start, length).unwrap();
}

fn add_benchmark(c: &mut Criterion) {
let size = 65536;
let val_len = 1000;
Expand All @@ -44,6 +52,10 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_substring(&arr_string, 1, Some((val_len - 1) as u64)))
});

c.bench_function("substring utf8 by char", |b| {
b.iter(|| bench_substring_by_char(&arr_string, 1, Some((val_len - 1) as u64)))
});

c.bench_function("substring fixed size binary array", |b| {
b.iter(|| bench_substring(&arr_fsb, 1, Some((val_len - 1) as u64)))
});
Expand Down
220 changes: 215 additions & 5 deletions arrow/src/compute/kernels/substring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
// under the License.

//! Defines kernel to extract a substring of an Array
//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray
//! Supported array types:
//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray]
use crate::array::DictionaryArray;
use crate::buffer::MutableBuffer;
Expand All @@ -29,7 +30,7 @@ use crate::{
use std::cmp::Ordering;
use std::sync::Arc;

/// Returns an ArrayRef with substrings of all the elements in `array`.
/// Returns an [`ArrayRef`] with substrings of all the elements in `array`.
///
/// # Arguments
///
Expand All @@ -38,7 +39,7 @@ use std::sync::Arc;
/// otherwise count from the end of the string.
///
/// * `length`(option) - The length of all substrings.
/// If `length` is `None`, then the substring is from `start` to the end of the string.
/// If `length` is [None], then the substring is from `start` to the end of the string.
///
/// Attention: Both `start` and `length` are counted by byte, not by char.
///
Expand All @@ -53,9 +54,10 @@ use std::sync::Arc;
/// ```
///
/// # Error
/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary
/// array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type.
/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`]
/// or [`DictionaryArray`] with supported array type as its value type.
/// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array).
/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars.
///
/// ## Example of trying to get an invalid utf-8 format substring
/// ```
Expand Down Expand Up @@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A
}
}

/// # Arguments
/// * `array` - The input string array
///
/// * `start` - The start index of all substrings.
/// If `start >= 0`, then count from the start of the string,
/// otherwise count from the end of the string.
///
/// * `length`(option) - The length of all substrings.
/// If `length` is `None`, then the substring is from `start` to the end of the string.
///
/// Attention: Both `start` and `length` are counted by char.
///
/// # Performance
/// This function is slower than [substring].
/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer.
/// It is recommended to use [substring] if the input array only contains ASCII chars.
///
/// # Basic usage
/// ```
/// # use arrow::array::StringArray;
/// # use arrow::compute::kernels::substring::substring_by_char;
/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]);
/// let result = substring_by_char(&array, 1, Some(4)).unwrap();
/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")]));
/// ```
pub fn substring_by_char<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
start: i64,
length: Option<u64>,
) -> Result<GenericStringArray<OffsetSize>> {
Ok(array
.iter()
.map(|val| {
val.map(|val| {
let char_count = val.chars().count();
let start = if start >= 0 {
start.to_usize().unwrap().min(char_count)
} else {
char_count - (-start).to_usize().unwrap().min(char_count)
};
let length = length.map_or(char_count - start, |length| {
length.to_usize().unwrap().min(char_count - start)
});

val.chars().skip(start).take(length).collect::<String>()
})
})
.collect::<GenericStringArray<OffsetSize>>())
}

fn binary_substring<OffsetSize: OffsetSizeTrait>(
array: &GenericBinaryArray<OffsetSize>,
start: OffsetSize,
Expand Down Expand Up @@ -1083,6 +1135,164 @@ mod tests {
generic_string_with_non_zero_offset::<i64>()
}

fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")];
let cases = vec![
// all-nulls array is always identical
(vec![None, None, None], 0, None, vec![None, None, None]),
// identity
(
input_vals.clone(),
0,
None,
vec![Some("hello"), None, Some("Γ ⊢x:T")],
),
// 0 length -> Nothing
(
input_vals.clone(),
0,
Some(0),
vec![Some(""), None, Some("")],
),
// high start -> Nothing
(
input_vals.clone(),
1000,
Some(0),
vec![Some(""), None, Some("")],
),
// high negative start -> identity
(
input_vals.clone(),
-1000,
None,
vec![Some("hello"), None, Some("Γ ⊢x:T")],
),
// high length -> identity
(
input_vals.clone(),
0,
Some(1000),
vec![Some("hello"), None, Some("Γ ⊢x:T")],
),
];

cases.into_iter().try_for_each::<_, Result<()>>(
|(array, start, length, expected)| {
let array = GenericStringArray::<O>::from(array);
let result = substring_by_char(&array, start, length)?;
assert_eq!(array.len(), result.len());

let expected = GenericStringArray::<O>::from(expected);
assert_eq!(expected, result);
Ok(())
},
)?;

Ok(())
}

#[test]
fn with_nulls_string_by_char() -> Result<()> {
with_nulls_generic_string_by_char::<i32>()
}

#[test]
fn with_nulls_large_string_by_char() -> Result<()> {
with_nulls_generic_string_by_char::<i64>()
}

fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> {
let input_vals = vec!["hello", "", "Γ ⊢x:T"];
let cases = vec![
// empty array is always identical
(vec!["", "", ""], 0, None, vec!["", "", ""]),
// increase start
(input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]),
(input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]),
(input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]),
(input_vals.clone(), 3, None, vec!["lo", "", "x:T"]),
(input_vals.clone(), 10, None, vec!["", "", ""]),
// increase start negatively
(input_vals.clone(), -1, None, vec!["o", "", "T"]),
(input_vals.clone(), -2, None, vec!["lo", "", ":T"]),
(input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]),
(input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]),
// increase length
(input_vals.clone(), 1, Some(1), vec!["e", "", " "]),
(input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]),
(input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]),
(input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]),
(input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]),
(input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]),
(input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]),
(input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]),
];

cases.into_iter().try_for_each::<_, Result<()>>(
|(array, start, length, expected)| {
let array = GenericStringArray::<O>::from(array);
let result = substring_by_char(&array, start, length)?;
assert_eq!(array.len(), result.len());
let expected = GenericStringArray::<O>::from(expected);
assert_eq!(expected, result);
Ok(())
},
)?;

Ok(())
}

#[test]
fn without_nulls_string_by_char() -> Result<()> {
without_nulls_generic_string_by_char::<i32>()
}

#[test]
fn without_nulls_large_string_by_char() -> Result<()> {
without_nulls_generic_string_by_char::<i64>()
}

fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() -> Result<()> {
let values = "S→T = Πx:S.T";
let offsets = &[
O::zero(),
O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap())
.unwrap(),
O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap())
.unwrap(),
O::from_usize(values.len()).unwrap(),
];
// set the first and third element to be valid
let bitmap = [0b101_u8];

let data = ArrayData::builder(GenericStringArray::<O>::get_data_type())
.len(2)
.add_buffer(Buffer::from_slice_ref(offsets))
.add_buffer(Buffer::from(values))
.null_bit_buffer(Some(Buffer::from(bitmap)))
.offset(1)
.build()?;
// array is `[null, "Πx:S.T"]`
let array = GenericStringArray::<O>::from(data);
// result is `[null, "x:S.T"]`
let result = substring_by_char(&array, 1, None)?;
let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]);
assert_eq!(result, expected);

Ok(())
}

#[test]
fn string_with_non_zero_offset_by_char() -> Result<()> {
generic_string_by_char_with_non_zero_offset::<i32>()
}

#[test]
fn large_string_with_non_zero_offset_by_char() -> Result<()> {
generic_string_by_char_with_non_zero_offset::<i64>()
}

#[test]
fn dictionary() -> Result<()> {
_dictionary::<Int8Type>()?;
Expand Down

0 comments on commit dac2132

Please sign in to comment.