diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs index 0b9b9d67e6c4..1b488bfaab9b 100644 --- a/arrow/src/array/array_string.rs +++ b/arrow/src/array/array_string.rs @@ -78,50 +78,39 @@ impl GenericStringArray { self.data.buffers()[1].clone() } - /// Returns the number of chars in the string at index. - /// # Assumption - /// The value stored at `i` must be a valid utf-8 string, - /// otherwise you will get an unexpected result. + /// Returns the number of chars in the string at index `i`. + /// # Panics + /// If an invalid utf-8 byte is found, the function will panic. + /// However, this function does not check every byte, so you might + /// get an unexpected result if the string is not valid utf-8. /// # Performance /// This function has `O(n)` time complexity where `n` is the string length. - /// If you can make sure that all chars in string are in the range `U+0x0000` ~ `U+0x0080`, + /// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`, /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity. - /// # Safety - /// Caller is responsible for ensuring that index is within the array bounds. 
- pub unsafe fn num_chars_unchecked(&self, i: usize) -> usize { - let start = self.value_offsets().get_unchecked(i).to_usize().unwrap(); - let end = self - .value_offsets() - .get_unchecked(i + 1) - .to_usize() - .unwrap(); + pub fn num_chars(&self, i: usize) -> usize { + let offsets = self.value_offsets(); + let start = offsets[i].to_usize().unwrap(); + let end = offsets[i + 1].to_usize().unwrap(); let chars = &self.data.buffers()[1].as_slice()[start..end]; let mut char_iter = chars.iter(); let mut length: usize = 0; while let Some(prefix) = char_iter.next() { let ones = prefix.leading_ones() as usize; - if ones > 0 { - char_iter.nth(ones - 2); - } + match ones { + 0 => {} + 2..=4 => { + char_iter.nth(ones - 2); + } + _ => { + panic!("invalid utf-8 format"); + } + }; length += 1; } length } - /// Returns the number of chars in the string at index. - /// # Assumption - /// The value stored at `i` must be a valid utf-8 string, - /// otherwise you will get an unexpected result. - /// # Performance - /// This function has `O(n)` time complexity where `n` is the string length. - /// If you can make sure that all chars in string are in the range `U+0x0000` ~ `U+0x0080`, - /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity. 
- pub fn num_chars(&self, i: usize) -> usize { - assert!(i < self.data.len(), "StringArray out of bounds access"); - unsafe { self.num_chars_unchecked(i) } - } - /// Returns the element at index /// # Safety /// caller is responsible for ensuring that index is within the array bounds @@ -451,9 +440,9 @@ mod tests { #[test] fn test_large_string_array_from_u8_slice() { - let values: Vec<&str> = vec!["hello", "", "parquet"]; + let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]; - // Array data: ["hello", "", "parquet"] + // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"] let string_array = LargeStringArray::from(values); assert_eq!(3, string_array.len()); @@ -462,10 +451,13 @@ mod tests { assert_eq!("hello", unsafe { string_array.value_unchecked(0) }); assert_eq!("", string_array.value(1)); assert_eq!("", unsafe { string_array.value_unchecked(1) }); - assert_eq!("parquet", string_array.value(2)); - assert_eq!("parquet", unsafe { string_array.value_unchecked(2) }); + assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2)); + assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe { + string_array.value_unchecked(2) + }); assert_eq!(5, string_array.value_offsets()[2]); - assert_eq!(7, string_array.value_length(2)); + assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1 + assert_eq!(8, string_array.num_chars(2)); for i in 0..3 { assert!(string_array.is_valid(i)); assert!(!string_array.is_null(i));