Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Simplified dictionary indexes.
Browse files (browse the repository at this point in the history)
  • Loading branch information
jorgecarleitao committed Nov 7, 2021
1 parent f146097 commit 29edd98
Show file tree
Hide file tree
Showing 38 changed files with 272 additions and 507 deletions.
47 changes: 30 additions & 17 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ use std::sync::Arc;

use crate::{
bitmap::Bitmap,
datatypes::DataType,
datatypes::{DataType, IntegerType},
scalar::{new_scalar, Scalar},
types::{NativeType, NaturalDataType},
types::NativeType,
};

mod ffi;
Expand All @@ -16,19 +16,35 @@ pub use mutable::*;
use super::{new_empty_array, primitive::PrimitiveArray, Array};

/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
pub trait DictionaryKey:
NativeType + NaturalDataType + num_traits::NumCast + num_traits::FromPrimitive
{
pub trait DictionaryKey: NativeType + num_traits::NumCast + num_traits::FromPrimitive {
/// The corresponding [`IntegerType`] of this key
const KEY_TYPE: IntegerType;
}

impl DictionaryKey for i8 {}
impl DictionaryKey for i16 {}
impl DictionaryKey for i32 {}
impl DictionaryKey for i64 {}
impl DictionaryKey for u8 {}
impl DictionaryKey for u16 {}
impl DictionaryKey for u32 {}
impl DictionaryKey for u64 {}
impl DictionaryKey for i8 {
const KEY_TYPE: IntegerType = IntegerType::Int8;
}
impl DictionaryKey for i16 {
const KEY_TYPE: IntegerType = IntegerType::Int16;
}
impl DictionaryKey for i32 {
const KEY_TYPE: IntegerType = IntegerType::Int32;
}
impl DictionaryKey for i64 {
const KEY_TYPE: IntegerType = IntegerType::Int64;
}
impl DictionaryKey for u8 {
const KEY_TYPE: IntegerType = IntegerType::UInt8;
}
impl DictionaryKey for u16 {
const KEY_TYPE: IntegerType = IntegerType::UInt16;
}
impl DictionaryKey for u32 {
const KEY_TYPE: IntegerType = IntegerType::UInt32;
}
impl DictionaryKey for u64 {
const KEY_TYPE: IntegerType = IntegerType::UInt64;
}

/// An [`Array`] whose values are encoded by keys. This [`Array`] is useful when the cardinality of
/// values is low compared to the length of the [`Array`].
Expand Down Expand Up @@ -59,10 +75,7 @@ impl<K: DictionaryKey> DictionaryArray<K> {

/// The canonical method to create a new [`DictionaryArray`].
pub fn from_data(keys: PrimitiveArray<K>, values: Arc<dyn Array>) -> Self {
let data_type = DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
);
let data_type = DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()));

Self {
data_type,
Expand Down
10 changes: 2 additions & 8 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,7 @@ impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for D
impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(
Box::new(K::DATA_TYPE),
Box::new(values.data_type().clone()),
),
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand All @@ -47,10 +44,7 @@ impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
pub fn new() -> Self {
let values = M::default();
Self {
data_type: DataType::Dictionary(
Box::new(K::DATA_TYPE),
Box::new(values.data_type().clone()),
),
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand Down
32 changes: 9 additions & 23 deletions src/array/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,6 @@ macro_rules! dyn_primitive {
}};
}

macro_rules! dyn_dict {
($array:expr, $ty:ty) => {{
let a = $array
.as_any()
.downcast_ref::<DictionaryArray<$ty>>()
.unwrap();
let keys = a.keys();
let display = get_display(a.values().as_ref());
Box::new(move |row: usize| display(keys.value(row) as usize))
}};
}

/// Returns a function of index returning the string representation of the _value_ of `array`.
/// This does not take nulls into account.
pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> String + 'a> {
Expand Down Expand Up @@ -170,17 +158,15 @@ pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> Strin
};
dyn_display!(array, ListArray<i64>, f)
}
Dictionary(key_type, _) => match key_type.as_ref() {
DataType::Int8 => dyn_dict!(array, i8),
DataType::Int16 => dyn_dict!(array, i16),
DataType::Int32 => dyn_dict!(array, i32),
DataType::Int64 => dyn_dict!(array, i64),
DataType::UInt8 => dyn_dict!(array, u8),
DataType::UInt16 => dyn_dict!(array, u16),
DataType::UInt32 => dyn_dict!(array, u32),
DataType::UInt64 => dyn_dict!(array, u64),
_ => unreachable!(),
},
Dictionary(key_type, _) => match_integer_type!(key_type, |$T| {
let a = array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
.unwrap();
let keys = a.keys();
let display = get_display(a.values().as_ref());
Box::new(move |row: usize| display(keys.value(row) as usize))
}),
Map(_, _) => todo!(),
Struct(_) => {
let a = array.as_any().downcast_ref::<StructArray>().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool {
struct_::equal(lhs, rhs)
}
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
dictionary::equal::<$T>(lhs, rhs)
Expand Down
2 changes: 1 addition & 1 deletion src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren
Union => ffi_dyn!(array, UnionArray),
Map => ffi_dyn!(array, MapArray),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
let array = array.as_any().downcast_ref::<DictionaryArray<$T>>().unwrap();
(
array.offset().unwrap(),
Expand Down
36 changes: 15 additions & 21 deletions src/array/growable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,6 @@ macro_rules! dyn_growable {
}};
}

macro_rules! dyn_dict_growable {
($ty:ty, $arrays:expr, $use_validity:expr, $capacity:expr) => {{
let arrays = $arrays
.iter()
.map(|array| {
array
.as_any()
.downcast_ref::<DictionaryArray<$ty>>()
.unwrap()
})
.collect::<Vec<_>>();
Box::new(dictionary::GrowableDictionary::<$ty>::new(
&arrays,
$use_validity,
$capacity,
))
}};
}

/// Creates a new [`Growable`] from an arbitrary number of [`Array`]s.
/// # Panics
/// This function panics iff
Expand Down Expand Up @@ -132,8 +113,21 @@ pub fn make_growable<'a>(
),
Union | Map => todo!(),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
dyn_dict_growable!($T, arrays, use_validity, capacity)
match_integer_type!(key_type, |$T| {
let arrays = arrays
.iter()
.map(|array| {
array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
.unwrap()
})
.collect::<Vec<_>>();
Box::new(dictionary::GrowableDictionary::<$T>::new(
&arrays,
use_validity,
capacity,
))
})
}
}
Expand Down
29 changes: 6 additions & 23 deletions src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,28 +174,11 @@ macro_rules! fmt_dyn {
}};
}

macro_rules! with_match_dictionary_key_type {(
macro_rules! match_integer_type {(
$key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
match $key_type {
DataType::Int8 => __with_ty__! { i8 },
DataType::Int16 => __with_ty__! { i16 },
DataType::Int32 => __with_ty__! { i32 },
DataType::Int64 => __with_ty__! { i64 },
DataType::UInt8 => __with_ty__! { u8 },
DataType::UInt16 => __with_ty__! { u16 },
DataType::UInt32 => __with_ty__! { u32 },
DataType::UInt64 => __with_ty__! { u64 },
_ => ::core::unreachable!("A dictionary key type can only be of integer types"),
}
})}

macro_rules! with_match_physical_dictionary_key_type {(
$key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
use crate::datatypes::DictionaryIndexType::*;
use crate::datatypes::IntegerType::*;
match $key_type {
Int8 => __with_ty__! { i8 },
Int16 => __with_ty__! { i16 },
Expand Down Expand Up @@ -251,7 +234,7 @@ impl Display for dyn Array {
Struct => fmt_dyn!(self, StructArray, f),
Union => fmt_dyn!(self, UnionArray, f),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
fmt_dyn!(self, DictionaryArray::<$T>, f)
})
}
Expand Down Expand Up @@ -281,7 +264,7 @@ pub fn new_empty_array(data_type: DataType) -> Box<dyn Array> {
Union => Box::new(UnionArray::new_empty(data_type)),
Map => Box::new(MapArray::new_empty(data_type)),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
Box::new(DictionaryArray::<$T>::new_empty(data_type))
})
}
Expand Down Expand Up @@ -311,7 +294,7 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box<dyn Array> {
Union => Box::new(UnionArray::new_null(data_type, length)),
Map => Box::new(MapArray::new_null(data_type, length)),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
Box::new(DictionaryArray::<$T>::new_null(data_type, length))
})
}
Expand Down Expand Up @@ -349,7 +332,7 @@ pub fn clone(array: &dyn Array) -> Box<dyn Array> {
Union => clone_dyn!(array, UnionArray),
Map => clone_dyn!(array, MapArray),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
clone_dyn!(array, DictionaryArray::<$T>)
})
}
Expand Down
18 changes: 9 additions & 9 deletions src/array/ord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,15 +216,15 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result<DynComparato
(Binary, Binary) => compare_binary::<i32>(left, right),
(LargeBinary, LargeBinary) => compare_binary::<i64>(left, right),
(Dictionary(key_type_lhs, _), Dictionary(key_type_rhs, _)) => {
match (key_type_lhs.as_ref(), key_type_rhs.as_ref()) {
(UInt8, UInt8) => dyn_dict!(u8, left, right),
(UInt16, UInt16) => dyn_dict!(u16, left, right),
(UInt32, UInt32) => dyn_dict!(u32, left, right),
(UInt64, UInt64) => dyn_dict!(u64, left, right),
(Int8, Int8) => dyn_dict!(i8, left, right),
(Int16, Int16) => dyn_dict!(i16, left, right),
(Int32, Int32) => dyn_dict!(i32, left, right),
(Int64, Int64) => dyn_dict!(i64, left, right),
match (key_type_lhs, key_type_rhs) {
(IntegerType::UInt8, IntegerType::UInt8) => dyn_dict!(u8, left, right),
(IntegerType::UInt16, IntegerType::UInt16) => dyn_dict!(u16, left, right),
(IntegerType::UInt32, IntegerType::UInt32) => dyn_dict!(u32, left, right),
(IntegerType::UInt64, IntegerType::UInt64) => dyn_dict!(u64, left, right),
(IntegerType::Int8, IntegerType::Int8) => dyn_dict!(i8, left, right),
(IntegerType::Int16, IntegerType::Int16) => dyn_dict!(i16, left, right),
(IntegerType::Int32, IntegerType::Int32) => dyn_dict!(i32, left, right),
(IntegerType::Int64, IntegerType::Int64) => dyn_dict!(i64, left, right),
(lhs, _) => {
return Err(ArrowError::InvalidArgumentError(format!(
"Dictionaries do not support keys of type {:?}",
Expand Down
18 changes: 6 additions & 12 deletions src/compute/aggregate/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,6 @@ macro_rules! dyn_binary {
}};
}

macro_rules! dyn_dict {
($array:expr, $ty:ty) => {{
let array = $array
.as_any()
.downcast_ref::<DictionaryArray<$ty>>()
.unwrap();
estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref())
}};
}

/// Returns the total (heap) allocated size of the array in bytes.
/// # Implementation
/// This estimation is the sum of the size of its buffers, validity, including nested arrays.
Expand Down Expand Up @@ -106,8 +96,12 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize {
.sum::<usize>();
types + offsets + fields
}
Dictionary(key_type) => with_match_physical_dictionary_key_type!(key_type, |$T| {
dyn_dict!(array, $T)
Dictionary(key_type) => match_integer_type!(key_type, |$T| {
let array = array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
.unwrap();
estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref())
}),
Map => {
let array = array.as_any().downcast_ref::<MapArray>().unwrap();
Expand Down
5 changes: 3 additions & 2 deletions src/compute/cast/dictionary_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,9 @@ pub(super) fn dictionary_cast_dyn<K: DictionaryKey>(
let values = cast(values.as_ref(), to_values_type, options)?.into();

// create the appropriate array type
with_match_dictionary_key_type!(to_keys_type.as_ref(), |$T| {
key_cast!(keys, values, array, to_keys_type, $T)
let data_type = (*to_keys_type).into();
match_integer_type!(to_keys_type, |$T| {
key_cast!(keys, values, array, &data_type, $T)
})
}
_ => unpack_dictionary::<K>(keys, values.as_ref(), to_type, options),
Expand Down
31 changes: 6 additions & 25 deletions src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,31 +375,12 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
Ok(Box::new(list_array))
}

(Dictionary(index_type, _), _) => match **index_type {
DataType::Int8 => dictionary_cast_dyn::<i8>(array, to_type, options),
DataType::Int16 => dictionary_cast_dyn::<i16>(array, to_type, options),
DataType::Int32 => dictionary_cast_dyn::<i32>(array, to_type, options),
DataType::Int64 => dictionary_cast_dyn::<i64>(array, to_type, options),
DataType::UInt8 => dictionary_cast_dyn::<u8>(array, to_type, options),
DataType::UInt16 => dictionary_cast_dyn::<u16>(array, to_type, options),
DataType::UInt32 => dictionary_cast_dyn::<u32>(array, to_type, options),
DataType::UInt64 => dictionary_cast_dyn::<u64>(array, to_type, options),
_ => unreachable!(),
},
(_, Dictionary(index_type, value_type)) => match **index_type {
DataType::Int8 => cast_to_dictionary::<i8>(array, value_type, options),
DataType::Int16 => cast_to_dictionary::<i16>(array, value_type, options),
DataType::Int32 => cast_to_dictionary::<i32>(array, value_type, options),
DataType::Int64 => cast_to_dictionary::<i64>(array, value_type, options),
DataType::UInt8 => cast_to_dictionary::<u8>(array, value_type, options),
DataType::UInt16 => cast_to_dictionary::<u16>(array, value_type, options),
DataType::UInt32 => cast_to_dictionary::<u32>(array, value_type, options),
DataType::UInt64 => cast_to_dictionary::<u64>(array, value_type, options),
_ => Err(ArrowError::NotYetImplemented(format!(
"Casting from type {:?} to dictionary type {:?} not supported",
from_type, to_type,
))),
},
(Dictionary(index_type, _), _) => match_integer_type!(index_type, |$T| {
dictionary_cast_dyn::<$T>(array, to_type, options)
}),
(_, Dictionary(index_type, value_type)) => match_integer_type!(index_type, |$T| {
cast_to_dictionary::<$T>(array, value_type, options)
}),
(_, Boolean) => match from_type {
UInt8 => primitive_to_boolean_dyn::<u8>(array, to_type.clone()),
UInt16 => primitive_to_boolean_dyn::<u16>(array, to_type.clone()),
Expand Down
Loading

0 comments on commit 29edd98

Please sign in to comment.