Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
delete unchecked fn
update doc

Signed-off-by: remzi <13716567376yh@gmail.com>
  • Loading branch information
HaoYang670 committed Mar 29, 2022
1 parent 79ea718 commit cd6029e
Showing 1 changed file with 27 additions and 35 deletions.
62 changes: 27 additions & 35 deletions arrow/src/array/array_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,50 +78,39 @@ impl<OffsetSize: StringOffsetSizeTrait> GenericStringArray<OffsetSize> {
self.data.buffers()[1].clone()
}

/// Returns the number of chars in the string at index.
/// # Assumption
/// The value stored at `i` must be a valid utf-8 string,
/// otherwise you will get an unexpected result.
/// Returns the number of chars in the string at index `i`.
/// # Panics
/// If an invalid utf-8 byte is found, the function will panic.
/// However, only the leading byte of each char is inspected, not every byte,
/// so you might get an unexpected result if the string is in invalid utf-8 format.
/// # Performance
/// This function has `O(n)` time complexity where `n` is the string length.
/// If you can make sure that all chars in string are in the range `U+0x0000` ~ `U+0x0080`,
/// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
/// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
/// # Safety
/// Caller is responsible for ensuring that index is within the array bounds.
pub unsafe fn num_chars_unchecked(&self, i: usize) -> usize {
let start = self.value_offsets().get_unchecked(i).to_usize().unwrap();
let end = self
.value_offsets()
.get_unchecked(i + 1)
.to_usize()
.unwrap();
pub fn num_chars(&self, i: usize) -> usize {
let offsets = self.value_offsets();
let start = offsets[i].to_usize().unwrap();
let end = offsets[i + 1].to_usize().unwrap();
let chars = &self.data.buffers()[1].as_slice()[start..end];

let mut char_iter = chars.iter();
let mut length: usize = 0;
while let Some(prefix) = char_iter.next() {
let ones = prefix.leading_ones() as usize;
if ones > 0 {
char_iter.nth(ones - 2);
}
match ones {
0 => {}
2..=4 => {
char_iter.nth(ones - 2);
}
_ => {
panic!("invalid utf-8 format");
}
};
length += 1;
}
length
}

/// Returns the number of chars in the string at index `i`.
/// # Assumption
/// The value stored at `i` must be a valid utf-8 string,
/// otherwise you will get an unexpected result.
/// # Panics
/// Panics with the message "StringArray out of bounds access" if `i` is not
/// less than `self.data.len()`.
/// # Performance
/// This function has `O(n)` time complexity where `n` is the string length.
/// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
/// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
pub fn num_chars(&self, i: usize) -> usize {
// Validate the index up front; the unchecked variant performs no bounds check itself.
assert!(i < self.data.len(), "StringArray out of bounds access");
// SAFETY: `i` was just verified to be less than `self.data.len()`, which is the
// in-bounds requirement `num_chars_unchecked` places on its caller.
unsafe { self.num_chars_unchecked(i) }
}

/// Returns the element at index
/// # Safety
/// caller is responsible for ensuring that index is within the array bounds
Expand Down Expand Up @@ -451,9 +440,9 @@ mod tests {

#[test]
fn test_large_string_array_from_u8_slice() {
let values: Vec<&str> = vec!["hello", "", "parquet"];
let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];

// Array data: ["hello", "", "parquet"]
// Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
let string_array = LargeStringArray::from(values);

assert_eq!(3, string_array.len());
Expand All @@ -462,10 +451,13 @@ mod tests {
assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
assert_eq!("", string_array.value(1));
assert_eq!("", unsafe { string_array.value_unchecked(1) });
assert_eq!("parquet", string_array.value(2));
assert_eq!("parquet", unsafe { string_array.value_unchecked(2) });
assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
string_array.value_unchecked(2)
});
assert_eq!(5, string_array.value_offsets()[2]);
assert_eq!(7, string_array.value_length(2));
assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
assert_eq!(8, string_array.num_chars(2));
for i in 0..3 {
assert!(string_array.is_valid(i));
assert!(!string_array.is_null(i));
Expand Down

0 comments on commit cd6029e

Please sign in to comment.