diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index aa332ae1f5b..f469b0a1da9 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -92,7 +92,10 @@ pub trait DataTypeExt { impl DataTypeExt for DataType { fn is_binary_like(&self) -> bool { use DataType::*; - matches!(self, Utf8 | Binary | LargeUtf8 | LargeBinary) + matches!( + self, + Utf8 | Binary | LargeUtf8 | LargeBinary | Utf8View | BinaryView + ) } fn is_struct(&self) -> bool { @@ -462,7 +465,11 @@ pub fn iter_str_array(arr: &dyn Array) -> Box> match arr.data_type() { DataType::Utf8 => Box::new(arr.as_string::().iter()), DataType::LargeUtf8 => Box::new(arr.as_string::().iter()), - _ => panic!("Expecting Utf8 or LargeUtf8, found {:?}", arr.data_type()), + DataType::Utf8View => Box::new(arr.as_string_view().iter()), + _ => panic!( + "Expecting Utf8, LargeUtf8 or Utf8View, found {:?}", + arr.data_type() + ), } } @@ -1558,7 +1565,7 @@ impl BufferExt for arrow_buffer::Buffer { mod tests { use super::*; use arrow_array::{new_empty_array, new_null_array, ListArray, StringArray}; - use arrow_array::{Float32Array, Int32Array, StructArray}; + use arrow_array::{Float32Array, Int32Array, StringViewArray, StructArray}; use arrow_buffer::OffsetBuffer; #[test] @@ -2523,4 +2530,66 @@ mod tests { &Int32Array::from(vec![1, 2]) as &dyn Array ); } + + #[test] + fn test_iter_str_array_utf8view_with_nulls() { + // Test Utf8View iteration with nulls + let values: Vec> = vec![ + Some("hello"), + None, + Some("world"), + None, + Some(""), + Some("test"), + ]; + let array = StringViewArray::from(values); + let mut iter = iter_str_array(&array); + + assert_eq!(iter.next(), Some(Some("hello"))); + assert_eq!(iter.next(), Some(None)); + assert_eq!(iter.next(), Some(Some("world"))); + assert_eq!(iter.next(), Some(None)); + assert_eq!(iter.next(), Some(Some(""))); + assert_eq!(iter.next(), Some(Some("test"))); + assert_eq!(iter.next(), None); + } + + #[test] + fn 
/// A `Utf8View` array holding only nulls must yield one `None` per slot and then stop.
#[test]
fn test_iter_str_array_all_nulls_utf8view() {
    let array = StringViewArray::from(vec![None::<&str>; 3]);

    // Draining the iterator checks both the per-slot values and the total slot count.
    let yielded: Vec<Option<&str>> = iter_str_array(&array).collect();
    assert_eq!(yielded, vec![None, None, None]);
}
Some(LargeBinary), "json" => Some(LargeBinary), "date32:day" => Some(Date32), diff --git a/rust/lance-core/src/datatypes/field.rs b/rust/lance-core/src/datatypes/field.rs index a091632c524..e3f266399e4 100644 --- a/rust/lance-core/src/datatypes/field.rs +++ b/rust/lance-core/src/datatypes/field.rs @@ -691,7 +691,17 @@ impl Field { if (dt.is_primitive() && other_dt.is_primitive()) || (dt.is_binary_like() && other_dt.is_binary_like()) => { - if dt != other_dt { + // View types (Utf8View, BinaryView) are stored as their non-view counterparts. + // Treat Utf8View as compatible with Utf8, and BinaryView as compatible with Binary. + let types_match = match (&dt, &other_dt) { + (DataType::Utf8View, DataType::Utf8) | (DataType::Utf8, DataType::Utf8View) => { + true + } + (DataType::BinaryView, DataType::Binary) + | (DataType::Binary, DataType::BinaryView) => true, + _ => dt == other_dt, + }; + if !types_match { return Err(Error::Schema { message: format!( "Attempt to project field by different types: {} and {}", diff --git a/rust/lance-datafusion/src/expr.rs b/rust/lance-datafusion/src/expr.rs index d7e6ce7a56a..c485cf82755 100644 --- a/rust/lance-datafusion/src/expr.rs +++ b/rust/lance-datafusion/src/expr.rs @@ -218,11 +218,13 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option match ty { DataType::Utf8 => Some(value.clone()), DataType::LargeUtf8 => Some(ScalarValue::LargeUtf8(val.clone())), + DataType::Utf8View => Some(ScalarValue::Utf8View(val.clone())), _ => None, }, ScalarValue::LargeUtf8(val) => match ty { DataType::Utf8 => Some(ScalarValue::Utf8(val.clone())), DataType::LargeUtf8 => Some(value.clone()), + DataType::Utf8View => Some(ScalarValue::Utf8View(val.clone())), _ => None, }, ScalarValue::Boolean(_) => match ty { @@ -408,6 +410,7 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option match ty { DataType::Binary => Some(ScalarValue::Binary(value.clone())), DataType::LargeBinary => 
Some(ScalarValue::LargeBinary(value.clone())), + DataType::BinaryView => Some(ScalarValue::BinaryView(value.clone())), DataType::FixedSizeBinary(len) => { if let Some(value) = value { if value.len() == *len as usize { diff --git a/rust/lance-encoding/src/data.rs b/rust/lance-encoding/src/data.rs index 8828673326f..fcbe369a7be 100644 --- a/rust/lance-encoding/src/data.rs +++ b/rust/lance-encoding/src/data.rs @@ -23,7 +23,7 @@ use arrow_array::{ cast::AsArray, new_empty_array, new_null_array, types::{ArrowDictionaryKeyType, UInt16Type, UInt32Type, UInt64Type, UInt8Type}, - Array, ArrayRef, OffsetSizeTrait, UInt64Array, + Array, ArrayRef, BinaryViewArray, OffsetSizeTrait, StringViewArray, UInt64Array, }; use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder, NullBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -583,6 +583,34 @@ impl VariableWidthBlock { fn into_arrow(self, data_type: DataType, validate: bool) -> Result { let data_buffer = self.data.into_buffer(); let offsets_buffer = self.offsets.into_buffer(); + + // Handle conversion from i32 to i64 offsets for compatibility. + // This is needed when reading legacy data that was stored with 32-bit offsets + // but is now being read as LargeUtf8/LargeBinary (which use 64-bit offsets). + // Note: For Utf8View/BinaryView types, this conversion is not triggered since + // we store them with 64-bit offsets (as seen in arrow_view_to_data_block). 
+ let offsets_buffer = if self.bits_per_offset == 32 + && (data_type == DataType::LargeUtf8 || data_type == DataType::LargeBinary) + { + // Convert i32 offsets to i64 + let num_offsets = offsets_buffer.len() / 4; + let i32_slice = offsets_buffer.as_slice(); + let i64_offsets: Vec = (0..num_offsets) + .map(|i| { + let bytes = [ + i32_slice[i * 4], + i32_slice[i * 4 + 1], + i32_slice[i * 4 + 2], + i32_slice[i * 4 + 3], + ]; + i32::from_le_bytes(bytes) as i64 + }) + .collect(); + arrow_buffer::Buffer::from_vec(i64_offsets) + } else { + offsets_buffer + }; + let builder = ArrayDataBuilder::new(data_type) .add_buffer(offsets_buffer) .add_buffer(data_buffer) @@ -1224,6 +1252,82 @@ fn arrow_binary_to_data_block( }) } +fn arrow_view_to_data_block(arrays: &[ArrayRef], _num_values: u64) -> DataBlock { + // Convert view arrays directly to DataBlock using 64-bit offsets + // This avoids the overhead of creating intermediate LargeStringArray/LargeBinaryArray + let data_type = arrays[0].data_type(); + + // Pre-allocate vectors to avoid repeated reallocations + // all_offsets needs num_values + 1 elements (initial offset + one per value) + let total_values: usize = arrays.iter().map(|arr| arr.len()).sum(); + let mut all_offsets = Vec::with_capacity(total_values + 1); + // Pre-allocate data buffer with estimated capacity (sum of non-null values) + let estimated_data_size: usize = arrays + .iter() + .map(|arr| { + // Get the total buffer size for this array as an estimate + arr.to_data() + .buffers() + .iter() + .map(|b| b.len()) + .sum::() + }) + .sum(); + let mut all_data = Vec::with_capacity(estimated_data_size); + let mut cumulative_offset = 0i64; + + // Start with initial offset of 0 + all_offsets.push(0); + + match data_type { + DataType::BinaryView => { + for arr in arrays { + let view_arr = arr.as_any().downcast_ref::().unwrap(); + for i in 0..view_arr.len() { + if view_arr.is_null(i) { + // For null values, the offset doesn't advance + all_offsets.push(cumulative_offset); 
+ } else { + let val = view_arr.value(i); + cumulative_offset += val.len() as i64; + all_offsets.push(cumulative_offset); + all_data.extend_from_slice(val); + } + } + } + } + DataType::Utf8View => { + for arr in arrays { + let view_arr = arr.as_any().downcast_ref::().unwrap(); + for i in 0..view_arr.len() { + if view_arr.is_null(i) { + // For null values, the offset doesn't advance + all_offsets.push(cumulative_offset); + } else { + let val = view_arr.value(i); + cumulative_offset += val.len() as i64; + all_offsets.push(cumulative_offset); + all_data.extend_from_slice(val.as_bytes()); + } + } + } + } + _ => unreachable!(), + } + + // Use the actual number of values processed (offsets.len() - 1) + // instead of the passed num_values parameter to ensure consistency + let num_values = (all_offsets.len() - 1) as u64; + + DataBlock::VariableWidth(VariableWidthBlock { + data: LanceBuffer::from(arrow_buffer::Buffer::from_vec(all_data)), + offsets: LanceBuffer::from(arrow_buffer::Buffer::from_vec(all_offsets)), + bits_per_offset: 64, + num_values, + block_info: BlockInfo::new(), + }) +} + fn encode_flat_data(arrays: &[ArrayRef], num_values: u64) -> LanceBuffer { let bytes_per_value = arrays[0].data_type().byte_width(); let mut buffer = Vec::with_capacity(num_values as usize * bytes_per_value); @@ -1464,7 +1568,7 @@ impl DataBlock { let mut encoded = match data_type { DataType::Binary | DataType::Utf8 => arrow_binary_to_data_block(arrays, num_values, 32), DataType::BinaryView | DataType::Utf8View => { - todo!() + arrow_view_to_data_block(arrays, num_values) } DataType::LargeBinary | DataType::LargeUtf8 => { arrow_binary_to_data_block(arrays, num_values, 64) @@ -1635,8 +1739,8 @@ mod tests { use arrow_array::{ make_array, new_null_array, types::{Int32Type, Int8Type}, - ArrayRef, DictionaryArray, Int8Array, LargeBinaryArray, StringArray, UInt16Array, - UInt8Array, + ArrayRef, BinaryViewArray, DictionaryArray, Int8Array, LargeBinaryArray, StringArray, + StringViewArray, 
/// Null slots in a string view array must repeat the previous offset and add no data bytes.
#[test]
fn test_string_view_to_data_block_with_nulls() {
    let input: ArrayRef = Arc::new(StringViewArray::from(vec![
        Some("hello"),
        None,
        Some("world"),
        None,
        Some(""),
    ]));

    let block = arrow_view_to_data_block(&[input], 5);
    assert_eq!(block.num_values(), 5);

    let variable = block.as_variable_width().unwrap();
    assert_eq!(variable.bits_per_offset, 64);
    // Only the two non-empty strings contribute bytes.
    assert_eq!(variable.data, LanceBuffer::copy_slice(b"helloworld"));
    // Offsets stay flat across the null slots and the trailing empty string.
    assert_eq!(
        variable.offsets,
        LanceBuffer::reinterpret_vec(vec![0i64, 5, 5, 10, 10, 10])
    );
}
/// Several string view arrays must be concatenated into one block with continuous offsets.
#[test]
fn test_string_view_to_data_block_multiple_arrays() {
    let chunks: Vec<ArrayRef> = [vec!["hello", "world"], vec!["foo", "bar"]]
        .into_iter()
        .map(|strings| Arc::new(StringViewArray::from(strings)) as ArrayRef)
        .collect();

    let block = arrow_view_to_data_block(&chunks, 4);
    assert_eq!(block.num_values(), 4);

    let variable = block.as_variable_width().unwrap();
    assert_eq!(variable.bits_per_offset, 64);
    assert_eq!(variable.data, LanceBuffer::copy_slice(b"helloworldfoobar"));
    // Offsets keep counting across the array boundary instead of restarting at zero.
    assert_eq!(
        variable.offsets,
        LanceBuffer::reinterpret_vec(vec![0i64, 5, 10, 13, 16])
    );
}
/// Null slots in a binary view array must repeat the previous offset and add no data bytes.
#[test]
fn test_binary_view_to_data_block_with_nulls() {
    let slots: Vec<Option<&[u8]>> = vec![
        Some(b"hello"),
        None,
        Some(b"world"),
        None,
        Some(b""),
        Some(b"data"),
    ];
    let input: ArrayRef = Arc::new(BinaryViewArray::from(slots));

    let block = arrow_view_to_data_block(&[input], 6);
    assert_eq!(block.num_values(), 6);

    let variable = block.as_variable_width().unwrap();
    assert_eq!(variable.bits_per_offset, 64);
    assert_eq!(variable.data, LanceBuffer::copy_slice(b"helloworlddata"));
    // The offset only advances on slots that hold at least one byte.
    assert_eq!(
        variable.offsets,
        LanceBuffer::reinterpret_vec(vec![0i64, 5, 5, 10, 10, 10, 14])
    );
}
/// An all-null string view array must produce an empty data buffer and all-zero offsets.
#[test]
fn test_string_view_to_data_block_all_nulls() {
    let input: ArrayRef = Arc::new(StringViewArray::from(vec![None::<&str>; 3]));

    let block = arrow_view_to_data_block(&[input], 3);
    assert_eq!(block.num_values(), 3);

    let variable = block.as_variable_width().unwrap();
    assert_eq!(variable.bits_per_offset, 64);
    assert_eq!(variable.data, LanceBuffer::copy_slice(b""));
    // Leading zero plus one unchanged offset per null slot.
    assert_eq!(
        variable.offsets,
        LanceBuffer::reinterpret_vec(vec![0i64; 4])
    );
}
/// The variable-length binary and string Arrow types: the 32-bit-offset, 64-bit-offset and
/// view-based variant of each.
// NOTE(review): how this table is consulted is not visible in this chunk — confirm against
// its use sites before relying on element order.
const BINARY_DATATYPES: [DataType; 6] = [
    DataType::Binary,
    DataType::LargeBinary,
    DataType::BinaryView,
    DataType::Utf8,
    DataType::LargeUtf8,
    DataType::Utf8View,
];
non-view counterparts. + // We need to decode as the non-view type and then convert back to the view type. + let decode_type = match self.data_type { + DataType::Utf8View => DataType::LargeUtf8, + DataType::BinaryView => DataType::LargeBinary, + _ => self.data_type.clone(), + }; + + let array = make_array(block.into_arrow(decode_type, self.should_validate)?); + + // Convert from non-view array to view array if needed + let array: ArrayRef = match self.data_type { + DataType::Utf8View => { + let string_arr = array.as_string::(); + Arc::new(StringViewArray::from(string_arr)) as ArrayRef + } + DataType::BinaryView => { + let binary_arr = array.as_binary::(); + Arc::new(BinaryViewArray::from(binary_arr)) as ArrayRef + } + _ => array, + }; // This is a bit of a hack to work around https://github.com/apache/arrow-rs/issues/6302 // @@ -531,3 +552,260 @@ impl FieldEncoder for PrimitiveFieldEncoder { std::future::ready(Ok(vec![EncodedColumn::default()])).boxed() } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::{ + Array, ArrayRef, BinaryViewArray, LargeBinaryArray, LargeStringArray, StringViewArray, + }; + use arrow_schema::DataType; + + /// Test that view types are correctly encoded and decoded + /// This test simulates full encode-decode cycle for view types + #[tokio::test] + async fn test_utf8view_encode_decode_cycle() { + use crate::data::DataBlock; + + // Create test data with Utf8View type + let values = vec!["hello", "world", "test", "", "view"]; + let utf8_view_array = StringViewArray::from(values); + + // Test encoding to DataBlock + let arrays: Vec = vec![Arc::new(utf8_view_array)]; + let num_values = arrays[0].len() as u64; + let block = DataBlock::from_arrays(&arrays, num_values); + + // Verify of block was created successfully + assert_eq!(block.num_values(), 5); + + // Test decoding back to LargeUtf8 (storage format) + let var_width = block.clone().as_variable_width().unwrap(); + assert_eq!(var_width.bits_per_offset, 64); + + // 
/// Round-trips a `BinaryView` array: encode to a `DataBlock`, decode as `LargeBinary`, then
/// convert back to `BinaryView` with the same `From<&LargeBinaryArray>` conversion the
/// primitive field decoder applies.
///
/// Nothing here awaits, so this is a plain synchronous test.
#[test]
fn test_binaryview_encode_decode_cycle() {
    use crate::data::DataBlock;

    let expected: Vec<&[u8]> = vec![
        &b"hello"[..],
        &b"world"[..],
        &b"test"[..],
        &b""[..],
        &b"view"[..],
    ];
    let expected_slots: Vec<Option<&[u8]>> = expected.iter().copied().map(Some).collect();

    // Encode the view array into a data block.
    let arrays: Vec<ArrayRef> = vec![Arc::new(BinaryViewArray::from(expected.clone()))];
    let num_values = arrays[0].len() as u64;
    let block = DataBlock::from_arrays(&arrays, num_values);
    assert_eq!(block.num_values(), 5);

    // View arrays are encoded with 64-bit offsets.
    let var_width = block.clone().as_variable_width().unwrap();
    assert_eq!(var_width.bits_per_offset, 64);

    // Decode into LargeBinary, the offset-based type the decoder targets for BinaryView.
    let array_data = block.into_arrow(DataType::LargeBinary, true).unwrap();
    let large_binary = LargeBinaryArray::from(array_data);
    assert_eq!(large_binary.null_count(), 0);
    assert_eq!(large_binary.iter().collect::<Vec<_>>(), expected_slots);

    // Convert back to a view array the same way the decoder does.
    let view_array = BinaryViewArray::from(&large_binary);
    assert_eq!(view_array.data_type(), &DataType::BinaryView);
    assert_eq!(view_array.null_count(), 0);
    assert_eq!(view_array.iter().collect::<Vec<_>>(), expected_slots);
}
/// Round-trips a `Utf8View` array that mixes long strings (beyond the 12-byte inline view
/// limit) with short and empty ones: encode to a `DataBlock`, decode as `LargeUtf8`, then
/// convert back with the same `From<&LargeStringArray>` conversion the primitive field
/// decoder applies.
///
/// Full string contents are compared, not just lengths, so corrupted bytes in a long value
/// are caught. Nothing here awaits, so this is a plain synchronous test.
#[test]
fn test_utf8view_large_string_encode_decode() {
    use crate::data::DataBlock;

    let long_a = "a".repeat(1000);
    let long_x = "x".repeat(500);
    let expected: Vec<&str> = vec![long_a.as_str(), "medium", "small", "", long_x.as_str()];
    let expected_slots: Vec<Option<&str>> = expected.iter().copied().map(Some).collect();

    // Encode the view array into a data block.
    let arrays: Vec<ArrayRef> = vec![Arc::new(StringViewArray::from(expected.clone()))];
    let num_values = arrays[0].len() as u64;
    let block = DataBlock::from_arrays(&arrays, num_values);
    assert_eq!(block.num_values(), 5);

    // View arrays are encoded with 64-bit offsets.
    let var_width = block.clone().as_variable_width().unwrap();
    assert_eq!(var_width.bits_per_offset, 64);

    // Decode into LargeUtf8, the offset-based type the decoder targets for Utf8View.
    let array_data = block.into_arrow(DataType::LargeUtf8, true).unwrap();
    let large_utf8 = LargeStringArray::from(array_data);
    assert_eq!(large_utf8.null_count(), 0);
    assert_eq!(large_utf8.iter().collect::<Vec<_>>(), expected_slots);

    // Convert back to a view array the same way the decoder does.
    let view_array = StringViewArray::from(&large_utf8);
    assert_eq!(view_array.data_type(), &DataType::Utf8View);
    assert_eq!(view_array.null_count(), 0);
    assert_eq!(view_array.iter().collect::<Vec<_>>(), expected_slots);
}
fn test_multiple_arrays_utf8view_encode_decode() { + use crate::data::DataBlock; + + // Create multiple StringView arrays to test concatenation + let values1 = vec!["hello", "world"]; + let values2 = vec!["foo", "bar"]; + let values3 = vec!["test", "data"]; + + let array1 = StringViewArray::from(values1); + let array2 = StringViewArray::from(values2); + let array3 = StringViewArray::from(values3); + + let arrays: Vec = vec![Arc::new(array1), Arc::new(array2), Arc::new(array3)]; + let num_values = arrays.iter().map(|a| a.len()).sum::() as u64; + let block = DataBlock::from_arrays(&arrays, num_values); + + // Verify the block was created successfully + assert_eq!(block.num_values(), 6); + + // Decode to LargeUtf8 + let decoded_type = DataType::LargeUtf8; + let var_width = block.clone().as_variable_width().unwrap(); + assert_eq!(var_width.bits_per_offset, 64); + + let array_data = block.into_arrow(decoded_type, true).unwrap(); + let large_utf8 = LargeStringArray::from(array_data); + + // Verify all values were concatenated correctly + assert_eq!(large_utf8.len(), 6); + assert_eq!(large_utf8.value(0), "hello"); + assert_eq!(large_utf8.value(1), "world"); + assert_eq!(large_utf8.value(2), "foo"); + assert_eq!(large_utf8.value(3), "bar"); + assert_eq!(large_utf8.value(4), "test"); + assert_eq!(large_utf8.value(5), "data"); + + // Convert back to Utf8View + let converted_back: StringViewArray = large_utf8.iter().collect(); + assert_eq!(converted_back.len(), 6); + assert_eq!(converted_back.value(0), "hello"); + assert_eq!(converted_back.value(1), "world"); + assert_eq!(converted_back.value(2), "foo"); + assert_eq!(converted_back.value(3), "bar"); + assert_eq!(converted_back.value(4), "test"); + assert_eq!(converted_back.value(5), "data"); + } +} diff --git a/rust/lance-index/src/scalar/bloomfilter.rs b/rust/lance-index/src/scalar/bloomfilter.rs index 3057323b5da..9facc4b7f58 100644 --- a/rust/lance-index/src/scalar/bloomfilter.rs +++
b/rust/lance-index/src/scalar/bloomfilter.rs @@ -286,6 +286,9 @@ impl BloomFilterIndex { datafusion_common::ScalarValue::LargeUtf8(Some(val)) => { Ok(sbbf.check(val.as_str())) } + datafusion_common::ScalarValue::Utf8View(Some(val)) => { + Ok(sbbf.check(val.as_str())) + } // Binary types datafusion_common::ScalarValue::Binary(Some(val)) => { Ok(sbbf.check(val.as_slice())) @@ -293,6 +296,9 @@ impl BloomFilterIndex { datafusion_common::ScalarValue::LargeBinary(Some(val)) => { Ok(sbbf.check(val.as_slice())) } + datafusion_common::ScalarValue::BinaryView(Some(val)) => { + Ok(sbbf.check(val.as_slice())) + } // Date and time types datafusion_common::ScalarValue::Date32(Some(val)) => Ok(sbbf.check(val)), datafusion_common::ScalarValue::Date64(Some(val)) => Ok(sbbf.check(val)), @@ -358,6 +364,9 @@ impl BloomFilterIndex { datafusion_common::ScalarValue::LargeUtf8(Some(val)) => { sbbf.check(val.as_str()) } + datafusion_common::ScalarValue::Utf8View(Some(val)) => { + sbbf.check(val.as_str()) + } // Binary types datafusion_common::ScalarValue::Binary(Some(val)) => { sbbf.check(val.as_slice()) @@ -365,6 +374,9 @@ impl BloomFilterIndex { datafusion_common::ScalarValue::LargeBinary(Some(val)) => { sbbf.check(val.as_slice()) } + datafusion_common::ScalarValue::BinaryView(Some(val)) => { + sbbf.check(val.as_slice()) + } // Date and time types datafusion_common::ScalarValue::Date32(Some(val)) => sbbf.check(val), datafusion_common::ScalarValue::Date64(Some(val)) => sbbf.check(val), @@ -757,6 +769,18 @@ impl BloomFilterProcessor { has_null } + fn process_string_view_array(sbbf: &mut Sbbf, array: &arrow_array::StringViewArray) -> bool { + let mut has_null = false; + for i in 0..array.len() { + if array.is_valid(i) { + sbbf.insert(array.value(i)); + } else { + has_null = true; + } + } + has_null + } + fn process_binary_array(sbbf: &mut Sbbf, array: &arrow_array::BinaryArray) -> bool { let mut has_null = false; for i in 0..array.len() { @@ -780,6 +804,18 @@ impl BloomFilterProcessor { } 
has_null } + + fn process_binary_view_array(sbbf: &mut Sbbf, array: &arrow_array::BinaryViewArray) -> bool { + let mut has_null = false; + for i in 0..array.len() { + if array.is_valid(i) { + sbbf.insert(array.value(i)); + } else { + has_null = true; + } + } + has_null + } } impl ZoneProcessor for BloomFilterProcessor { @@ -971,6 +1007,18 @@ impl ZoneProcessor for BloomFilterProcessor { .unwrap(); Self::process_large_string_array(sbbf, typed_array) } + DataType::Utf8View => { + let typed_array = array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::invalid_input( + "Expected StringViewArray for Utf8View type", + location!(), + ) + })?; + Self::process_string_view_array(sbbf, typed_array) + } DataType::Binary => { let typed_array = array .as_any() @@ -985,6 +1033,18 @@ impl ZoneProcessor for BloomFilterProcessor { .unwrap(); Self::process_large_binary_array(sbbf, typed_array) } + DataType::BinaryView => { + let typed_array = array + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::invalid_input( + "Expected BinaryViewArray for BinaryView type", + location!(), + ) + })?; + Self::process_binary_view_array(sbbf, typed_array) + } _ => { return Err(Error::InvalidInput { source: format!( @@ -1077,9 +1137,11 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { // String types | DataType::Utf8 | DataType::LargeUtf8 + | DataType::Utf8View // Binary types | DataType::Binary | DataType::LargeBinary + | DataType::BinaryView // Date and time types | DataType::Date32 | DataType::Date64 @@ -1091,7 +1153,7 @@ impl ScalarIndexPlugin for BloomFilterIndexPlugin { _ => { return Err(Error::InvalidInput { source: format!( - "Bloom filter index does not support data type: {:?}. Supported types: Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64, Utf8, LargeUtf8, Binary, LargeBinary, Date32, Date64, Time32, Time64, Timestamp", + "Bloom filter index does not support data type: {:?}. 
Supported types: Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float32, Float64, Utf8, LargeUtf8, Utf8View, Binary, LargeBinary, BinaryView, Date32, Date64, Time32, Time64, Timestamp", field.data_type() ).into(), location: location!(), @@ -2179,4 +2241,152 @@ mod tests { _ => panic!("Expected AtMost search result from bloomfilter"), } } + + #[tokio::test] + async fn test_string_view_bloomfilter_index() { + let tmpdir = TempObjDir::default(); + let test_store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // Create string view data + let string_values: Vec = (0..200).map(|i| format!("value_{:03}", i)).collect(); + let string_data = arrow_array::StringViewArray::from_iter_values(string_values.iter()); + let schema = Arc::new(Schema::new(vec![Field::new( + VALUE_COLUMN_NAME, + DataType::Utf8View, + false, + )])); + let data = RecordBatch::try_new(schema.clone(), vec![Arc::new(string_data)]).unwrap(); + let data_stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema, + stream::once(std::future::ready(Ok(data))), + )); + let data_stream = add_row_addr(data_stream); + + BloomFilterIndexPlugin::train_bloomfilter_index( + data_stream, + test_store.as_ref(), + Some(BloomFilterIndexBuilderParams::new(100, 0.01)), // ~1% false positive rate + ) + .await + .unwrap(); + + // Load the index + let index = BloomFilterIndex::load(test_store.clone(), None, &LanceCache::no_cache()) + .await + .expect("Failed to load BloomFilterIndex"); + + // Should have 2 zones since we have 200 rows and zone size is 100 + assert_eq!(index.zones.len(), 2); + + // Test search for a value in the first zone + let query = BloomFilterQuery::Equals(ScalarValue::Utf8View(Some("value_050".to_string()))); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should match the first zone + let mut expected = RowAddrTreeMap::new(); + 
expected.insert_range(0..100); + assert_eq!(result, SearchResult::at_most(expected)); + + // Test search for a value in the second zone + let query = BloomFilterQuery::Equals(ScalarValue::Utf8View(Some("value_150".to_string()))); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should match the second zone + let mut expected = RowAddrTreeMap::new(); + expected.insert_range(100..200); + assert_eq!(result, SearchResult::at_most(expected)); + + // Test search for a value that doesn't exist + let query = + BloomFilterQuery::Equals(ScalarValue::Utf8View(Some("nonexistent_value".to_string()))); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should return empty since bloom filter correctly filters out this value + assert_eq!(result, SearchResult::at_most(RowAddrTreeMap::new())); + + // Test IsIn query with string view values + let query = BloomFilterQuery::IsIn(vec![ + ScalarValue::Utf8View(Some("value_025".to_string())), // First zone + ScalarValue::Utf8View(Some("value_175".to_string())), // Second zone + ScalarValue::Utf8View(Some("nonexistent".to_string())), // Not present + ]); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should match both zones + let mut expected = RowAddrTreeMap::new(); + expected.insert_range(0..200); + assert_eq!(result, SearchResult::at_most(expected)); + } + + #[tokio::test] + async fn test_binary_view_bloomfilter_index() { + let tmpdir = TempObjDir::default(); + let test_store = Arc::new(LanceIndexStore::new( + Arc::new(ObjectStore::local()), + tmpdir.clone(), + Arc::new(LanceCache::no_cache()), + )); + + // Create binary view data + let binary_values: Vec> = (0..100) + .map(|i| vec![i as u8, (i + 1) as u8, (i + 2) as u8]) + .collect(); + let binary_data = arrow_array::BinaryViewArray::from_iter_values(binary_values.iter()); + let schema = Arc::new(Schema::new(vec![Field::new( + VALUE_COLUMN_NAME, + DataType::BinaryView, + false, + )])); + 
let data = RecordBatch::try_new(schema.clone(), vec![Arc::new(binary_data)]).unwrap(); + let data_stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema, + stream::once(std::future::ready(Ok(data))), + )); + let data_stream = add_row_addr(data_stream); + + BloomFilterIndexPlugin::train_bloomfilter_index( + data_stream, + test_store.as_ref(), + Some(BloomFilterIndexBuilderParams::new(50, 0.05)), + ) + .await + .unwrap(); + + // Load the index + let index = BloomFilterIndex::load(test_store.clone(), None, &LanceCache::no_cache()) + .await + .expect("Failed to load BloomFilterIndex"); + + // Should have 2 zones since we have 100 rows and zone size is 50 + assert_eq!(index.zones.len(), 2); + + // Test search for a value in the first zone + let query = BloomFilterQuery::Equals(ScalarValue::BinaryView(Some(vec![25, 26, 27]))); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should match the first zone + let mut expected = RowAddrTreeMap::new(); + expected.insert_range(0..50); + assert_eq!(result, SearchResult::at_most(expected)); + + // Test search for a value in the second zone + let query = BloomFilterQuery::Equals(ScalarValue::BinaryView(Some(vec![75, 76, 77]))); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should match the second zone + let mut expected = RowAddrTreeMap::new(); + expected.insert_range(50..100); + assert_eq!(result, SearchResult::at_most(expected)); + + // Test search for a value that doesn't exist + let query = BloomFilterQuery::Equals(ScalarValue::BinaryView(Some(vec![255, 254, 253]))); + let result = index.search(&query, &NoOpMetricsCollector).await.unwrap(); + + // Should return empty since bloom filter correctly filters out this value + assert_eq!(result, SearchResult::at_most(RowAddrTreeMap::new())); + } } diff --git a/rust/lance/tests/query/primitives.rs b/rust/lance/tests/query/primitives.rs index b2b8b9db5c1..77f63a5b824 100644 --- 
a/rust/lance/tests/query/primitives.rs +++ b/rust/lance/tests/query/primitives.rs @@ -253,7 +253,7 @@ async fn test_query_timestamp(#[case] data_type: DataType) { #[rstest::rstest] #[case::utf8(DataType::Utf8)] #[case::large_utf8(DataType::LargeUtf8)] -// #[case::string_view(DataType::Utf8View)] // TODO: https://github.com/lancedb/lance/issues/5172 +#[case::utf8_view(DataType::Utf8View)] async fn test_query_string(#[case] data_type: DataType) { // Create arrays that include empty strings let string_values = vec![ @@ -310,7 +310,7 @@ async fn test_query_string(#[case] data_type: DataType) { #[rstest::rstest] #[case::binary(DataType::Binary)] #[case::large_binary(DataType::LargeBinary)] -// #[case::binary_view(DataType::BinaryView)] // TODO: https://github.com/lancedb/lance/issues/5172 +#[case::binary_view(DataType::BinaryView)] async fn test_query_binary(#[case] data_type: DataType) { // Create arrays that include empty binary let binary_values = vec![