Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Moved is_ordered from Field to DataType::Dictionary #711

Merged
merged 1 commit into from
Dec 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion arrow-parquet-integration-testing/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ fn main() -> Result<()> {
.fields()
.iter()
.map(|x| match x.data_type() {
DataType::Dictionary(_, _) => Encoding::RleDictionary,
DataType::Dictionary(..) => Encoding::RleDictionary,
DataType::Utf8 | DataType::LargeUtf8 => {
if utf8_encoding == "delta" {
Encoding::DeltaLengthByteArray
Expand Down
5 changes: 3 additions & 2 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ impl<K: DictionaryKey> DictionaryArray<K> {

/// The canonical method to create a new [`DictionaryArray`].
pub fn from_data(keys: PrimitiveArray<K>, values: Arc<dyn Array>) -> Self {
let data_type = DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()));
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()), false);

Self {
data_type,
Expand Down Expand Up @@ -165,7 +166,7 @@ impl<K: DictionaryKey> DictionaryArray<K> {
impl<K: DictionaryKey> DictionaryArray<K> {
pub(crate) fn get_child(data_type: &DataType) -> &DataType {
match data_type {
DataType::Dictionary(_, values) => values.as_ref(),
DataType::Dictionary(_, values, _) => values.as_ref(),
DataType::Extension(_, inner, _) => Self::get_child(inner),
_ => panic!("DictionaryArray must be initialized with DataType::Dictionary"),
}
Expand Down
12 changes: 10 additions & 2 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for D
impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand All @@ -44,7 +48,11 @@ impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
pub fn new() -> Self {
let values = M::default();
Self {
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand Down
2 changes: 1 addition & 1 deletion src/array/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> Strin
};
dyn_display!(array, ListArray<i64>, f)
}
Dictionary(key_type, _) => match_integer_type!(key_type, |$T| {
Dictionary(key_type, ..) => match_integer_type!(key_type, |$T| {
let a = array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
Expand Down
2 changes: 1 addition & 1 deletion src/array/ord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result<DynComparato
(LargeUtf8, LargeUtf8) => compare_string::<i64>(left, right),
(Binary, Binary) => compare_binary::<i32>(left, right),
(LargeBinary, LargeBinary) => compare_binary::<i64>(left, right),
(Dictionary(key_type_lhs, _), Dictionary(key_type_rhs, _)) => {
(Dictionary(key_type_lhs, ..), Dictionary(key_type_rhs, ..)) => {
match (key_type_lhs, key_type_rhs) {
(IntegerType::UInt8, IntegerType::UInt8) => dyn_dict!(u8, left, right),
(IntegerType::UInt16, IntegerType::UInt16) => dyn_dict!(u16, left, right),
Expand Down
2 changes: 1 addition & 1 deletion src/compute/arithmetics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ pub fn neg(array: &dyn Array) -> Box<dyn Array> {

/// Whether [`neg`] is supported for a given [`DataType`]
pub fn can_neg(data_type: &DataType) -> bool {
if let DataType::Dictionary(_, values) = data_type.to_logical_type() {
if let DataType::Dictionary(_, values, _) = data_type.to_logical_type() {
return can_neg(values.as_ref());
}

Expand Down
2 changes: 1 addition & 1 deletion src/compute/cast/dictionary_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ pub(super) fn dictionary_cast_dyn<K: DictionaryKey>(
let values = array.values();

match to_type {
DataType::Dictionary(to_keys_type, to_values_type) => {
DataType::Dictionary(to_keys_type, to_values_type, _) => {
let values = cast(values.as_ref(), to_values_type, options)?.into();

// create the appropriate array type
Expand Down
82 changes: 13 additions & 69 deletions src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,40 +80,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
match (from_type, to_type) {
(
Null,
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
)
| (
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
Null,
) => true,
(Struct(_), _) => false,
Expand All @@ -127,11 +99,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(List(list_from), LargeList(list_to)) if list_from == list_to => true,
(LargeList(list_from), List(list_to)) if list_from == list_to => true,
(_, List(list_to)) => can_cast_types(from_type, list_to.data_type()),
(Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => {
(Dictionary(_, from_value_type, _), Dictionary(_, to_value_type, _)) => {
can_cast_types(from_value_type, to_value_type)
}
(Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
(_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type),
(Dictionary(_, value_type, _), _) => can_cast_types(value_type, to_type),
(_, Dictionary(_, value_type, _)) => can_cast_types(from_type, value_type),

(_, Boolean) => is_numeric(from_type),
(Boolean, _) => {
Expand Down Expand Up @@ -376,40 +348,12 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
match (from_type, to_type) {
(
Null,
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
)
| (
Boolean
| Int8
| UInt8
| Int16
| UInt16
| Int32
| UInt32
| Float32
| Date32
| Time32(_)
| Int64
| UInt64
| Float64
| Date64
| List(_)
| Dictionary(_, _),
Boolean | Int8 | UInt8 | Int16 | UInt16 | Int32 | UInt32 | Float32 | Date32 | Time32(_)
| Int64 | UInt64 | Float64 | Date64 | List(_) | Dictionary(..),
Null,
) => Ok(new_null_array(to_type.clone(), array.len())),
(Struct(_), _) => Err(ArrowError::NotYetImplemented(
Expand Down Expand Up @@ -449,10 +393,10 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
Ok(Box::new(list_array))
}

(Dictionary(index_type, _), _) => match_integer_type!(index_type, |$T| {
(Dictionary(index_type, ..), _) => match_integer_type!(index_type, |$T| {
dictionary_cast_dyn::<$T>(array, to_type, options)
}),
(_, Dictionary(index_type, value_type)) => match_integer_type!(index_type, |$T| {
(_, Dictionary(index_type, value_type, _)) => match_integer_type!(index_type, |$T| {
cast_to_dictionary::<$T>(array, value_type, options)
}),
(_, Boolean) => match from_type {
Expand Down
4 changes: 2 additions & 2 deletions src/compute/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ pub fn sort_to_indices<I: Index>(
))),
}
}
DataType::Dictionary(key_type, value_type) => match value_type.as_ref() {
DataType::Dictionary(key_type, value_type, _) => match value_type.as_ref() {
DataType::Utf8 => Ok(sort_dict::<I, i32>(values, key_type, options, limit)),
DataType::LargeUtf8 => Ok(sort_dict::<I, i64>(values, key_type, options, limit)),
t => Err(ArrowError::NotYetImplemented(format!(
Expand Down Expand Up @@ -282,7 +282,7 @@ pub fn can_sort(data_type: &DataType) -> bool {
| DataType::UInt64
)
}
DataType::Dictionary(_, value_type) => {
DataType::Dictionary(_, value_type, _) => {
matches!(*value_type.as_ref(), DataType::Utf8 | DataType::LargeUtf8)
}
_ => false,
Expand Down
2 changes: 1 addition & 1 deletion src/compute/take/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,6 @@ pub fn can_take(data_type: &DataType) -> bool {
| DataType::Struct(_)
| DataType::List(_)
| DataType::LargeList(_)
| DataType::Dictionary(_, _)
| DataType::Dictionary(..)
)
}
26 changes: 2 additions & 24 deletions src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ pub struct Field {
pub nullable: bool,
/// The dictionary id of this field (currently unused)
pub dict_id: i64,
/// Whether the dictionary's values are ordered
pub dict_is_ordered: bool,
/// A map of key-value pairs containing additional custom meta data.
pub metadata: Option<BTreeMap<String, String>>,
}
Expand All @@ -44,7 +42,6 @@ impl std::hash::Hash for Field {
self.name.hash(state);
self.data_type.hash(state);
self.nullable.hash(state);
self.dict_is_ordered.hash(state);
self.metadata.hash(state);
}
}
Expand All @@ -54,7 +51,6 @@ impl PartialEq for Field {
self.name == other.name
&& self.data_type == other.data_type
&& self.nullable == other.nullable
&& self.dict_is_ordered == other.dict_is_ordered
&& self.metadata == other.metadata
}
}
Expand All @@ -67,7 +63,6 @@ impl Field {
data_type,
nullable,
dict_id: 0,
dict_is_ordered: false,
metadata: None,
}
}
Expand All @@ -78,14 +73,12 @@ impl Field {
data_type: DataType,
nullable: bool,
dict_id: i64,
dict_is_ordered: bool,
) -> Self {
Field {
name: name.into(),
data_type,
nullable,
dict_id,
dict_is_ordered,
metadata: None,
}
}
Expand All @@ -98,7 +91,6 @@ impl Field {
data_type: self.data_type,
nullable: self.nullable,
dict_id: self.dict_id,
dict_is_ordered: self.dict_is_ordered,
metadata: Some(metadata),
}
}
Expand Down Expand Up @@ -143,16 +135,7 @@ impl Field {
#[inline]
pub const fn dict_id(&self) -> Option<i64> {
match self.data_type {
DataType::Dictionary(_, _) => Some(self.dict_id),
_ => None,
}
}

/// Returns whether this [`Field`]'s dictionary is ordered, if this is a dictionary type.
#[inline]
pub const fn dict_is_ordered(&self) -> Option<bool> {
match self.data_type {
DataType::Dictionary(_, _) => Some(self.dict_is_ordered),
DataType::Dictionary(_, _, _) => Some(self.dict_id),
_ => None,
}
}
Expand Down Expand Up @@ -197,11 +180,6 @@ impl Field {
"Fail to merge schema Field due to conflicting dict_id".to_string(),
));
}
if from.dict_is_ordered != self.dict_is_ordered {
return Err(ArrowError::InvalidArgumentError(
"Fail to merge schema Field due to conflicting dict_is_ordered".to_string(),
));
}
match &mut self.data_type {
DataType::Struct(nested_fields) => match &from.data_type {
DataType::Struct(from_nested_fields) => {
Expand Down Expand Up @@ -270,7 +248,7 @@ impl Field {
| DataType::Interval(_)
| DataType::LargeList(_)
| DataType::List(_)
| DataType::Dictionary(_, _)
| DataType::Dictionary(_, _, _)
| DataType::FixedSizeList(_, _)
| DataType::FixedSizeBinary(_)
| DataType::Utf8
Expand Down
4 changes: 2 additions & 2 deletions src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ pub enum DataType {
///
/// This type is mostly used to represent low cardinality string
/// arrays or a limited set of primitive types as integers.
Dictionary(IntegerType, Box<DataType>),
Dictionary(IntegerType, Box<DataType>, bool),
/// Decimal value with precision and scale
/// precision is the number of digits in the number and
/// scale is the number of decimal places.
Expand Down Expand Up @@ -261,7 +261,7 @@ impl DataType {
Struct(_) => PhysicalType::Struct,
Union(_, _, _) => PhysicalType::Union,
Map(_, _) => PhysicalType::Map,
Dictionary(key, _) => PhysicalType::Dictionary(*key),
Dictionary(key, _, _) => PhysicalType::Dictionary(*key),
Extension(_, key, _) => key.to_physical_type(),
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/ffi/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ fn create_dictionary(
field: &Field,
parent: Arc<ArrowArray>,
) -> Result<Option<ArrowArrayChild<'static>>> {
if let DataType::Dictionary(_, values) = field.data_type() {
if let DataType::Dictionary(_, values, _) = field.data_type() {
let field = Field::new("", values.as_ref().clone(), true);
assert!(!array.dictionary.is_null());
let array = unsafe { &*array.dictionary };
Expand Down
9 changes: 5 additions & 4 deletions src/ffi/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ impl Ffi_ArrowSchema {
.collect::<Box<_>>();
let n_children = children_ptr.len() as i64;

let dictionary = if let DataType::Dictionary(_, values) = field.data_type() {
flags += field.dict_is_ordered().unwrap_or_default() as i64;
let dictionary = if let DataType::Dictionary(_, values, is_ordered) = field.data_type() {
flags += *is_ordered as i64;
// we do not store field info in the dict values, so can't recover it all :(
let field = Field::new("", values.as_ref().clone(), true);
Some(Box::new(Ffi_ArrowSchema::new(&field)))
Expand Down Expand Up @@ -214,7 +214,8 @@ pub(crate) unsafe fn to_field(schema: &Ffi_ArrowSchema) -> Result<Field> {
let data_type = if let Some(dictionary) = dictionary {
let indices = to_integer_type(schema.format())?;
let values = to_field(dictionary)?;
DataType::Dictionary(indices, Box::new(values.data_type().clone()))
let is_ordered = schema.flags & 1 == 1;
DataType::Dictionary(indices, Box::new(values.data_type().clone()), is_ordered)
} else {
to_data_type(schema)?
};
Expand Down Expand Up @@ -449,7 +450,7 @@ fn to_format(data_type: &DataType) -> String {
r
}
DataType::Map(_, _) => "+m".to_string(),
DataType::Dictionary(index, _) => to_format(&(*index).into()),
DataType::Dictionary(index, _, _) => to_format(&(*index).into()),
DataType::Extension(_, inner, _) => to_format(inner.as_ref()),
}
}
Expand Down
1 change: 1 addition & 0 deletions src/io/avro/read/nested.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ impl FixedItemsUtf8Dictionary {
data_type: DataType::Dictionary(
IntegerType::Int32,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<i32>::with_capacity(capacity),
values,
Expand Down
Loading