Patch summary (free-form preamble; text before the first "diff --git" line
is not part of any hunk).

Adds `impl From<Vec<&[u8]>> for BinaryArray`, which builds the offsets and
values buffers directly from byte slices, plus a unit test for it. The Utf8
arms of `filter` and `limit` in compute/array_ops.rs then collect `&[u8]`
values via `BinaryArray::value` and pass them to the new conversion, instead
of collecting `String`s via `get_string` and re-borrowing them as `&str`.

NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which angle-bracketed generic parameters had been stripped (e.g. `From>`,
`downcast_ref::()`). The generics (`Vec<&'a [u8]>`, `Vec<&'a str>`,
`ListArray`, `List<u8>`, `BinaryArray`, `Vec<String>`, `Result<ArrayRef>`)
and the indentation below are reconstructed from context; confirm against
the upstream commit before applying.

diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs
index dc0a5090ee8a7..4f010fba3a8df 100644
--- a/rust/arrow/src/array.rs
+++ b/rust/arrow/src/array.rs
@@ -627,6 +627,26 @@ impl<'a> From<Vec<&'a str>> for BinaryArray {
     }
 }
 
+impl<'a> From<Vec<&'a [u8]>> for BinaryArray {
+    fn from(v: Vec<&[u8]>) -> Self {
+        let mut offsets = vec![];
+        let mut values = vec![];
+        let mut length_so_far = 0;
+        offsets.push(length_so_far);
+        for s in &v {
+            length_so_far += s.len() as i32;
+            offsets.push(length_so_far as i32);
+            values.extend_from_slice(s);
+        }
+        let array_data = ArrayData::builder(DataType::Utf8)
+            .len(v.len())
+            .add_buffer(Buffer::from(offsets.to_byte_slice()))
+            .add_buffer(Buffer::from(&values[..]))
+            .build();
+        BinaryArray::from(array_data)
+    }
+}
+
 /// Creates a `BinaryArray` from `List<u8>` array
 impl From<ListArray> for BinaryArray {
     fn from(v: ListArray) -> Self {
@@ -1155,6 +1175,36 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_binary_array_from_u8_slice() {
+        let values: Vec<&[u8]> = vec![
+            &[b'h', b'e', b'l', b'l', b'o'],
+            &[],
+            &[ b'p', b'a', b'r', b'q', b'u', b'e', b't']
+        ];
+
+        // Array data: ["hello", "", "parquet"]
+        let binary_array = BinaryArray::from(values);
+
+        assert_eq!(3, binary_array.len());
+        assert_eq!(0, binary_array.null_count());
+        assert_eq!([b'h', b'e', b'l', b'l', b'o'], binary_array.value(0));
+        assert_eq!("hello", binary_array.get_string(0));
+        assert_eq!([] as [u8; 0], binary_array.value(1));
+        assert_eq!("", binary_array.get_string(1));
+        assert_eq!(
+            [b'p', b'a', b'r', b'q', b'u', b'e', b't'],
+            binary_array.value(2)
+        );
+        assert_eq!("parquet", binary_array.get_string(2));
+        assert_eq!(5, binary_array.value_offset(2));
+        assert_eq!(7, binary_array.value_length(2));
+        for i in 0..3 {
+            assert!(binary_array.is_valid(i));
+            assert!(!binary_array.is_null(i));
+        }
+    }
+
     #[test]
     #[should_panic(
         expected = "BinaryArray can only be created from List<u8> arrays, mismatched \
diff --git a/rust/arrow/src/compute/array_ops.rs b/rust/arrow/src/compute/array_ops.rs
index 9725e32bd986c..144783401f616 100644
--- a/rust/arrow/src/compute/array_ops.rs
+++ b/rust/arrow/src/compute/array_ops.rs
@@ -236,17 +236,14 @@ pub fn filter(array: &Array, filter: &BooleanArray) -> Result<ArrayRef> {
         DataType::Float64 => filter_array!(array, filter, Float64Array),
         DataType::Boolean => filter_array!(array, filter, BooleanArray),
         DataType::Utf8 => {
-            //TODO: this is inefficient and we should improve the Arrow impl to help make
-            // this more concise
             let b = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-            let mut values: Vec<String> = Vec::with_capacity(b.len());
+            let mut values: Vec<&[u8]> = Vec::with_capacity(b.len());
             for i in 0..b.len() {
                 if filter.value(i) {
-                    values.push(b.get_string(i));
+                    values.push(b.value(i));
                 }
             }
-            let tmp: Vec<&str> = values.iter().map(|s| s.as_str()).collect();
-            Ok(Arc::new(BinaryArray::from(tmp)))
+            Ok(Arc::new(BinaryArray::from(values)))
         }
         other => Err(ArrowError::ComputeError(format!(
             "filter not supported for {:?}",
@@ -288,14 +285,12 @@ pub fn limit(array: &Array, num_rows_to_read: usize) -> Result<ArrayRef> {
         DataType::Float64 => limit_array!(array, num_rows_to_read, Float64Array),
         DataType::Boolean => limit_array!(array, num_rows_to_read, BooleanArray),
         DataType::Utf8 => {
-            //TODO: this is inefficient and we should improve the Arrow impl to help make this more concise
             let b = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-            let mut values: Vec<String> = Vec::with_capacity(num_rows_to_read as usize);
+            let mut values: Vec<&[u8]> = Vec::with_capacity(num_rows_to_read as usize);
             for i in 0..num_rows_to_read {
-                values.push(b.get_string(i));
+                values.push(b.value(i));
             }
-            let tmp: Vec<&str> = values.iter().map(|s| s.as_str()).collect();
-            Ok(Arc::new(BinaryArray::from(tmp)))
+            Ok(Arc::new(BinaryArray::from(values)))
         }
         other => Err(ArrowError::ComputeError(format!(
             "limit not supported for {:?}",