Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 72 additions & 3 deletions rust/lance-arrow/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ pub trait DataTypeExt {
impl DataTypeExt for DataType {
/// Reports whether this data type is one of the string/binary variants
/// enumerated in the `matches!` pattern below, which includes the
/// `Utf8View` and `BinaryView` view types.
fn is_binary_like(&self) -> bool {
use DataType::*;
// NOTE(review): the single-line `matches!` directly below looks like the
// pre-change side of this diff hunk, superseded by the multi-line form that
// follows it and adds the view types — confirm only one survives in the file.
matches!(self, Utf8 | Binary | LargeUtf8 | LargeBinary)
matches!(
self,
Utf8 | Binary | LargeUtf8 | LargeBinary | Utf8View | BinaryView
)
}

fn is_struct(&self) -> bool {
Expand Down Expand Up @@ -462,7 +465,11 @@ pub fn iter_str_array(arr: &dyn Array) -> Box<dyn Iterator<Item = Option<&str>>
match arr.data_type() {
DataType::Utf8 => Box::new(arr.as_string::<i32>().iter()),
DataType::LargeUtf8 => Box::new(arr.as_string::<i64>().iter()),
_ => panic!("Expecting Utf8 or LargeUtf8, found {:?}", arr.data_type()),
DataType::Utf8View => Box::new(arr.as_string_view().iter()),
_ => panic!(
"Expecting Utf8, LargeUtf8 or Utf8View, found {:?}",
arr.data_type()
),
}
}

Expand Down Expand Up @@ -1558,7 +1565,7 @@ impl BufferExt for arrow_buffer::Buffer {
mod tests {
use super::*;
use arrow_array::{new_empty_array, new_null_array, ListArray, StringArray};
use arrow_array::{Float32Array, Int32Array, StructArray};
use arrow_array::{Float32Array, Int32Array, StringViewArray, StructArray};
use arrow_buffer::OffsetBuffer;

#[test]
Expand Down Expand Up @@ -2523,4 +2530,66 @@ mod tests {
&Int32Array::from(vec![1, 2]) as &dyn Array
);
}

#[test]
fn test_iter_str_array_utf8view_with_nulls() {
    // A Utf8View array that interleaves values, nulls and an empty string must
    // be yielded by `iter_str_array` element-for-element, in order, and the
    // iterator must stop after the last element.
    let expected: Vec<Option<&str>> = vec![
        Some("hello"),
        None,
        Some("world"),
        None,
        Some(""),
        Some("test"),
    ];
    let array = StringViewArray::from(expected.clone());

    let actual: Vec<Option<&str>> = iter_str_array(&array).collect();

    assert_eq!(actual, expected);
}

#[test]
fn test_iter_str_array_utf8view_large_strings() {
    // Mixes long (1000- and 500-byte) values with short and empty ones so that
    // strings of very different sizes are read back through the same iterator.
    let large_str = "a".repeat(1000);
    let x_repeated = "x".repeat(500);
    let values = vec![
        large_str.as_str(),
        "medium",
        "small",
        "",
        x_repeated.as_str(),
    ];
    let array = StringViewArray::from(values.clone());

    let actual: Vec<Option<&str>> = iter_str_array(&array).collect();
    let expected: Vec<Option<&str>> = values.iter().copied().map(Some).collect();

    // Check the element count first so a length mismatch fails with a short
    // message instead of dumping kilobyte-sized strings.
    assert_eq!(actual.len(), expected.len());
    // Compare full contents, not just `len()`: a value of the right length but
    // the wrong bytes must fail the test.
    assert_eq!(actual, expected);
}

#[test]
fn test_iter_str_array_all_nulls_utf8view() {
    // An all-null Utf8View array must yield one `None` item per slot and then
    // terminate.
    const NULL_COUNT: usize = 3;
    let nulls: Vec<Option<&str>> = vec![None; NULL_COUNT];
    let array = StringViewArray::from(nulls);
    let mut iter = iter_str_array(&array);

    for _ in 0..NULL_COUNT {
        assert_eq!(iter.next(), Some(None));
    }
    assert_eq!(iter.next(), None);
}
}
4 changes: 4 additions & 0 deletions rust/lance-core/src/datatypes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ impl TryFrom<&DataType> for LogicalType {
DataType::Binary => "binary".to_string(),
DataType::LargeUtf8 => "large_string".to_string(),
DataType::LargeBinary => "large_binary".to_string(),
DataType::Utf8View => "string_view".to_string(),
DataType::BinaryView => "binary_view".to_string(),
DataType::Date32 => "date32:day".to_string(),
DataType::Date64 => "date64:ms".to_string(),
DataType::Time32(tu) => format!("time32:{}", timeunit_to_str(tu)),
Expand Down Expand Up @@ -254,6 +256,8 @@ impl TryFrom<&LogicalType> for DataType {
"binary" => Some(Binary),
"large_string" => Some(LargeUtf8),
"large_binary" => Some(LargeBinary),
"string_view" => Some(Utf8View),
"binary_view" => Some(BinaryView),
BLOB_LOGICAL_TYPE => Some(LargeBinary),
"json" => Some(LargeBinary),
"date32:day" => Some(Date32),
Expand Down
12 changes: 11 additions & 1 deletion rust/lance-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -691,7 +691,17 @@ impl Field {
if (dt.is_primitive() && other_dt.is_primitive())
|| (dt.is_binary_like() && other_dt.is_binary_like()) =>
{
if dt != other_dt {
// View types (Utf8View, BinaryView) are stored as their non-view counterparts.
// Treat Utf8View as compatible with Utf8, and BinaryView as compatible with Binary.
let types_match = match (&dt, &other_dt) {
(DataType::Utf8View, DataType::Utf8) | (DataType::Utf8, DataType::Utf8View) => {
true
}
(DataType::BinaryView, DataType::Binary)
| (DataType::Binary, DataType::BinaryView) => true,
_ => dt == other_dt,
};
if !types_match {
return Err(Error::Schema {
message: format!(
"Attempt to project field by different types: {} and {}",
Expand Down
3 changes: 3 additions & 0 deletions rust/lance-datafusion/src/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,11 +218,13 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option<ScalarVa
ScalarValue::Utf8(val) => match ty {
DataType::Utf8 => Some(value.clone()),
DataType::LargeUtf8 => Some(ScalarValue::LargeUtf8(val.clone())),
DataType::Utf8View => Some(ScalarValue::Utf8View(val.clone())),
_ => None,
},
ScalarValue::LargeUtf8(val) => match ty {
DataType::Utf8 => Some(ScalarValue::Utf8(val.clone())),
DataType::LargeUtf8 => Some(value.clone()),
DataType::Utf8View => Some(ScalarValue::Utf8View(val.clone())),
_ => None,
},
ScalarValue::Boolean(_) => match ty {
Expand Down Expand Up @@ -408,6 +410,7 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option<ScalarVa
ScalarValue::Binary(value) => match ty {
DataType::Binary => Some(ScalarValue::Binary(value.clone())),
DataType::LargeBinary => Some(ScalarValue::LargeBinary(value.clone())),
DataType::BinaryView => Some(ScalarValue::BinaryView(value.clone())),
DataType::FixedSizeBinary(len) => {
if let Some(value) = value {
if value.len() == *len as usize {
Expand Down
Loading