-
Notifications
You must be signed in to change notification settings - Fork 875
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Substring_by_char
#1784
Add Substring_by_char
#1784
Changes from all commits
341d844
5184109
e68b989
f91ff1b
a059f17
03bff8e
cd2a659
fcc0ee3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,8 @@ | |
// under the License. | ||
|
||
//! Defines kernel to extract a substring of an Array | ||
//! Supported array types: \[Large\]StringArray, \[Large\]BinaryArray | ||
//! Supported array types: | ||
//! [GenericStringArray], [GenericBinaryArray], [FixedSizeBinaryArray], [DictionaryArray] | ||
use crate::array::DictionaryArray; | ||
use crate::buffer::MutableBuffer; | ||
|
@@ -29,7 +30,7 @@ use crate::{ | |
use std::cmp::Ordering; | ||
use std::sync::Arc; | ||
|
||
/// Returns an ArrayRef with substrings of all the elements in `array`. | ||
/// Returns an [`ArrayRef`] with substrings of all the elements in `array`. | ||
/// | ||
/// # Arguments | ||
/// | ||
|
@@ -38,7 +39,7 @@ use std::sync::Arc; | |
/// otherwise count from the end of the string. | ||
/// | ||
/// * `length`(option) - The length of all substrings. | ||
/// If `length` is `None`, then the substring is from `start` to the end of the string. | ||
/// If `length` is [None], then the substring is from `start` to the end of the string. | ||
/// | ||
/// Attention: Both `start` and `length` are counted by byte, not by char. | ||
/// | ||
|
@@ -53,9 +54,10 @@ use std::sync::Arc; | |
/// ``` | ||
/// | ||
/// # Error | ||
/// - The function errors when the passed array is not a \[Large\]String array, \[Large\]Binary | ||
/// array, or DictionaryArray with \[Large\]String or \[Large\]Binary as its value type. | ||
/// - The function errors when the passed array is not a [`GenericStringArray`], [`GenericBinaryArray`], [`FixedSizeBinaryArray`] | ||
/// or [`DictionaryArray`] with supported array type as its value type. | ||
/// - The function errors if the offset of a substring in the input array is at invalid char boundary (only for \[Large\]String array). | ||
/// It is recommended to use [`substring_by_char`] if the input array may contain non-ASCII chars. | ||
/// | ||
/// ## Example of trying to get an invalid utf-8 format substring | ||
/// ``` | ||
|
@@ -150,6 +152,56 @@ pub fn substring(array: &dyn Array, start: i64, length: Option<u64>) -> Result<A | |
} | ||
} | ||
|
||
/// # Arguments | ||
/// * `array` - The input string array | ||
/// | ||
/// * `start` - The start index of all substrings. | ||
/// If `start >= 0`, then count from the start of the string, | ||
/// otherwise count from the end of the string. | ||
/// | ||
/// * `length`(option) - The length of all substrings. | ||
/// If `length` is `None`, then the substring is from `start` to the end of the string. | ||
/// | ||
/// Attention: Both `start` and `length` are counted by char. | ||
/// | ||
/// # Performance | ||
/// This function is slower than [substring]. | ||
/// Theoretically, the time complexity is `O(n)` where `n` is the length of the value buffer. | ||
/// It is recommended to use [substring] if the input array only contains ASCII chars. | ||
/// | ||
/// # Basic usage | ||
/// ``` | ||
/// # use arrow::array::StringArray; | ||
/// # use arrow::compute::kernels::substring::substring_by_char; | ||
/// let array = StringArray::from(vec![Some("arrow"), None, Some("Γ ⊢x:T")]); | ||
/// let result = substring_by_char(&array, 1, Some(4)).unwrap(); | ||
/// assert_eq!(result, StringArray::from(vec![Some("rrow"), None, Some(" ⊢x:")])); | ||
/// ``` | ||
pub fn substring_by_char<OffsetSize: OffsetSizeTrait>( | ||
array: &GenericStringArray<OffsetSize>, | ||
start: i64, | ||
length: Option<u64>, | ||
) -> Result<GenericStringArray<OffsetSize>> { | ||
Ok(array | ||
.iter() | ||
.map(|val| { | ||
val.map(|val| { | ||
let char_count = val.chars().count(); | ||
let start = if start >= 0 { | ||
start.to_usize().unwrap().min(char_count) | ||
} else { | ||
char_count - (-start).to_usize().unwrap().min(char_count) | ||
}; | ||
let length = length.map_or(char_count - start, |length| { | ||
length.to_usize().unwrap().min(char_count - start) | ||
}); | ||
|
||
val.chars().skip(start).take(length).collect::<String>() | ||
}) | ||
}) | ||
.collect::<GenericStringArray<OffsetSize>>()) | ||
} | ||
|
||
fn binary_substring<OffsetSize: OffsetSizeTrait>( | ||
array: &GenericBinaryArray<OffsetSize>, | ||
start: OffsetSize, | ||
|
@@ -1083,6 +1135,164 @@ mod tests { | |
generic_string_with_non_zero_offset::<i64>() | ||
} | ||
|
||
fn with_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Loving the test coverage 👍 |
||
let input_vals = vec![Some("hello"), None, Some("Γ ⊢x:T")]; | ||
let cases = vec![ | ||
// all-nulls array is always identical | ||
(vec![None, None, None], 0, None, vec![None, None, None]), | ||
// identity | ||
( | ||
input_vals.clone(), | ||
0, | ||
None, | ||
vec![Some("hello"), None, Some("Γ ⊢x:T")], | ||
), | ||
// 0 length -> Nothing | ||
( | ||
input_vals.clone(), | ||
0, | ||
Some(0), | ||
vec![Some(""), None, Some("")], | ||
), | ||
// high start -> Nothing | ||
( | ||
input_vals.clone(), | ||
1000, | ||
Some(0), | ||
vec![Some(""), None, Some("")], | ||
), | ||
// high negative start -> identity | ||
( | ||
input_vals.clone(), | ||
-1000, | ||
None, | ||
vec![Some("hello"), None, Some("Γ ⊢x:T")], | ||
), | ||
// high length -> identity | ||
( | ||
input_vals.clone(), | ||
0, | ||
Some(1000), | ||
vec![Some("hello"), None, Some("Γ ⊢x:T")], | ||
), | ||
]; | ||
|
||
cases.into_iter().try_for_each::<_, Result<()>>( | ||
|(array, start, length, expected)| { | ||
let array = GenericStringArray::<O>::from(array); | ||
let result = substring_by_char(&array, start, length)?; | ||
assert_eq!(array.len(), result.len()); | ||
|
||
let expected = GenericStringArray::<O>::from(expected); | ||
assert_eq!(expected, result); | ||
Ok(()) | ||
}, | ||
)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn with_nulls_string_by_char() -> Result<()> { | ||
with_nulls_generic_string_by_char::<i32>() | ||
} | ||
|
||
#[test] | ||
fn with_nulls_large_string_by_char() -> Result<()> { | ||
with_nulls_generic_string_by_char::<i64>() | ||
} | ||
|
||
fn without_nulls_generic_string_by_char<O: OffsetSizeTrait>() -> Result<()> { | ||
let input_vals = vec!["hello", "", "Γ ⊢x:T"]; | ||
let cases = vec![ | ||
// empty array is always identical | ||
(vec!["", "", ""], 0, None, vec!["", "", ""]), | ||
// increase start | ||
(input_vals.clone(), 0, None, vec!["hello", "", "Γ ⊢x:T"]), | ||
(input_vals.clone(), 1, None, vec!["ello", "", " ⊢x:T"]), | ||
(input_vals.clone(), 2, None, vec!["llo", "", "⊢x:T"]), | ||
(input_vals.clone(), 3, None, vec!["lo", "", "x:T"]), | ||
(input_vals.clone(), 10, None, vec!["", "", ""]), | ||
// increase start negatively | ||
(input_vals.clone(), -1, None, vec!["o", "", "T"]), | ||
(input_vals.clone(), -2, None, vec!["lo", "", ":T"]), | ||
(input_vals.clone(), -4, None, vec!["ello", "", "⊢x:T"]), | ||
(input_vals.clone(), -10, None, vec!["hello", "", "Γ ⊢x:T"]), | ||
// increase length | ||
(input_vals.clone(), 1, Some(1), vec!["e", "", " "]), | ||
(input_vals.clone(), 1, Some(2), vec!["el", "", " ⊢"]), | ||
(input_vals.clone(), 1, Some(3), vec!["ell", "", " ⊢x"]), | ||
(input_vals.clone(), 1, Some(6), vec!["ello", "", " ⊢x:T"]), | ||
(input_vals.clone(), -4, Some(1), vec!["e", "", "⊢"]), | ||
(input_vals.clone(), -4, Some(2), vec!["el", "", "⊢x"]), | ||
(input_vals.clone(), -4, Some(3), vec!["ell", "", "⊢x:"]), | ||
(input_vals.clone(), -4, Some(4), vec!["ello", "", "⊢x:T"]), | ||
]; | ||
|
||
cases.into_iter().try_for_each::<_, Result<()>>( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps we could extract this as a function instead of duplicating it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracked by #1801 |
||
|(array, start, length, expected)| { | ||
let array = GenericStringArray::<O>::from(array); | ||
let result = substring_by_char(&array, start, length)?; | ||
assert_eq!(array.len(), result.len()); | ||
let expected = GenericStringArray::<O>::from(expected); | ||
assert_eq!(expected, result); | ||
Ok(()) | ||
}, | ||
)?; | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn without_nulls_string_by_char() -> Result<()> { | ||
without_nulls_generic_string_by_char::<i32>() | ||
} | ||
|
||
#[test] | ||
fn without_nulls_large_string_by_char() -> Result<()> { | ||
without_nulls_generic_string_by_char::<i64>() | ||
} | ||
|
||
fn generic_string_by_char_with_non_zero_offset<O: OffsetSizeTrait>() -> Result<()> { | ||
let values = "S→T = Πx:S.T"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be easier to follow to just construct the regular array, and then call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tracked by #1801 |
||
let offsets = &[ | ||
O::zero(), | ||
O::from_usize(values.char_indices().nth(3).map(|(pos, _)| pos).unwrap()) | ||
.unwrap(), | ||
O::from_usize(values.char_indices().nth(6).map(|(pos, _)| pos).unwrap()) | ||
.unwrap(), | ||
O::from_usize(values.len()).unwrap(), | ||
]; | ||
// set the first and third element to be valid | ||
let bitmap = [0b101_u8]; | ||
|
||
let data = ArrayData::builder(GenericStringArray::<O>::get_data_type()) | ||
.len(2) | ||
.add_buffer(Buffer::from_slice_ref(offsets)) | ||
.add_buffer(Buffer::from(values)) | ||
.null_bit_buffer(Some(Buffer::from(bitmap))) | ||
.offset(1) | ||
.build()?; | ||
// array is `[null, "Πx:S.T"]` | ||
let array = GenericStringArray::<O>::from(data); | ||
// result is `[null, "x:S.T"]` | ||
let result = substring_by_char(&array, 1, None)?; | ||
let expected = GenericStringArray::<O>::from(vec![None, Some("x:S.T")]); | ||
assert_eq!(result, expected); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn string_with_non_zero_offset_by_char() -> Result<()> { | ||
generic_string_by_char_with_non_zero_offset::<i32>() | ||
} | ||
|
||
#[test] | ||
fn large_string_with_non_zero_offset_by_char() -> Result<()> { | ||
generic_string_by_char_with_non_zero_offset::<i64>() | ||
} | ||
|
||
#[test] | ||
fn dictionary() -> Result<()> { | ||
_dictionary::<Int8Type>()?; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Allocating a string temporary, only to copy out of it, is likely a significant portion of the slow-down. That combined with the null handling.
This could definitely be handled as a separate PR, but you might want to consider doing something like (not tested).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tracked by #1800