-
Notifications
You must be signed in to change notification settings - Fork 847
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve performance of DictionaryArray::try_new() #1435
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -693,7 +693,7 @@ impl ArrayData { | |
// At the moment, constructing a DictionaryArray will also check this | ||
if !DataType::is_dictionary_key_type(key_type) { | ||
return Err(ArrowError::InvalidArgumentError(format!( | ||
"Dictionary values must be integer, but was {}", | ||
"Dictionary key type must be integer, but was {}", | ||
key_type | ||
))); | ||
} | ||
|
@@ -926,8 +926,8 @@ impl ArrayData { | |
/// | ||
/// 1. Null count is correct | ||
/// 2. All offsets are valid | ||
/// 3. All String data is valid UTF-8 | ||
/// 3. All dictionary offsets are valid | ||
/// 3. All String data is valid UTF-8 | ||
/// 4. All dictionary offsets are valid | ||
/// | ||
/// Does not (yet) check | ||
/// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85) | ||
|
@@ -949,68 +949,68 @@ impl ArrayData { | |
))); | ||
} | ||
|
||
self.validate_dictionary_offset()?; | ||
|
||
// validate all children recursively | ||
self.child_data | ||
.iter() | ||
.enumerate() | ||
.try_for_each(|(i, child_data)| { | ||
child_data.validate_full().map_err(|e| { | ||
ArrowError::InvalidArgumentError(format!( | ||
"{} child #{} invalid: {}", | ||
self.data_type, i, e | ||
)) | ||
}) | ||
})?; | ||
|
||
Ok(()) | ||
} | ||
|
||
pub fn validate_dictionary_offset(&self) -> Result<()> { | ||
match &self.data_type { | ||
DataType::Utf8 => { | ||
self.validate_utf8::<i32>()?; | ||
} | ||
DataType::LargeUtf8 => { | ||
self.validate_utf8::<i64>()?; | ||
} | ||
DataType::Binary => { | ||
self.validate_offsets_full::<i32>(self.buffers[1].len())?; | ||
} | ||
DataType::Utf8 => self.validate_utf8::<i32>(), | ||
DataType::LargeUtf8 => self.validate_utf8::<i64>(), | ||
DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()), | ||
DataType::LargeBinary => { | ||
self.validate_offsets_full::<i64>(self.buffers[1].len())?; | ||
self.validate_offsets_full::<i64>(self.buffers[1].len()) | ||
} | ||
DataType::List(_) | DataType::Map(_, _) => { | ||
let child = &self.child_data[0]; | ||
self.validate_offsets_full::<i32>(child.len + child.offset)?; | ||
self.validate_offsets_full::<i32>(child.len + child.offset) | ||
} | ||
DataType::LargeList(_) => { | ||
let child = &self.child_data[0]; | ||
self.validate_offsets_full::<i64>(child.len + child.offset)?; | ||
self.validate_offsets_full::<i64>(child.len + child.offset) | ||
} | ||
DataType::Union(_, _) => { | ||
// Validate Union Array as part of implementing new Union semantics | ||
// See comments in `ArrayData::validate()` | ||
// https://github.com/apache/arrow-rs/issues/85 | ||
// | ||
// TODO file follow on ticket for full union validation | ||
Ok(()) | ||
} | ||
DataType::Dictionary(key_type, _value_type) => { | ||
let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap(); | ||
let max_value = dictionary_length - 1; | ||
match key_type.as_ref() { | ||
DataType::UInt8 => self.check_bounds::<u8>(max_value)?, | ||
DataType::UInt16 => self.check_bounds::<u16>(max_value)?, | ||
DataType::UInt32 => self.check_bounds::<u32>(max_value)?, | ||
DataType::UInt64 => self.check_bounds::<u64>(max_value)?, | ||
DataType::Int8 => self.check_bounds::<i8>(max_value)?, | ||
DataType::Int16 => self.check_bounds::<i16>(max_value)?, | ||
DataType::Int32 => self.check_bounds::<i32>(max_value)?, | ||
DataType::Int64 => self.check_bounds::<i64>(max_value)?, | ||
DataType::UInt8 => self.check_bounds::<u8>(max_value), | ||
DataType::UInt16 => self.check_bounds::<u16>(max_value), | ||
DataType::UInt32 => self.check_bounds::<u32>(max_value), | ||
DataType::UInt64 => self.check_bounds::<u64>(max_value), | ||
DataType::Int8 => self.check_bounds::<i8>(max_value), | ||
DataType::Int16 => self.check_bounds::<i16>(max_value), | ||
DataType::Int32 => self.check_bounds::<i32>(max_value), | ||
DataType::Int64 => self.check_bounds::<i64>(max_value), | ||
_ => unreachable!(), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think "the dictionary validation logic" is only the logic inside the `DataType::Dictionary` match arm. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The validation logic for the other data types is not for dictionary offsets. |
||
} | ||
} | ||
_ => { | ||
// No extra validation check required for other types | ||
Ok(()) | ||
} | ||
}; | ||
|
||
// validate all children recursively | ||
self.child_data | ||
.iter() | ||
.enumerate() | ||
.try_for_each(|(i, child_data)| { | ||
child_data.validate_full().map_err(|e| { | ||
ArrowError::InvalidArgumentError(format!( | ||
"{} child #{} invalid: {}", | ||
self.data_type, i, e | ||
)) | ||
}) | ||
})?; | ||
|
||
Ok(()) | ||
} | ||
} | ||
|
||
/// Calls the `validate(item_index, range)` function for each of | ||
|
@@ -1736,7 +1736,7 @@ mod tests { | |
|
||
// Test creating a dictionary with a non integer type | ||
#[test] | ||
#[should_panic(expected = "Dictionary values must be integer, but was Utf8")] | ||
#[should_panic(expected = "Dictionary key type must be integer, but was Utf8")] | ||
fn test_non_int_dictionary() { | ||
let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]); | ||
let data_type = | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.