Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Simplified dictionary indexes #584

Merged
merged 1 commit into from
Nov 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 30 additions & 17 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ use std::sync::Arc;

use crate::{
bitmap::Bitmap,
datatypes::DataType,
datatypes::{DataType, IntegerType},
scalar::{new_scalar, Scalar},
types::{NativeType, NaturalDataType},
types::NativeType,
};

mod ffi;
Expand All @@ -16,19 +16,35 @@ pub use mutable::*;
use super::{new_empty_array, primitive::PrimitiveArray, Array};

/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
pub trait DictionaryKey:
NativeType + NaturalDataType + num_traits::NumCast + num_traits::FromPrimitive
{
pub trait DictionaryKey: NativeType + num_traits::NumCast + num_traits::FromPrimitive {
/// The corresponding [`IntegerType`] of this key
const KEY_TYPE: IntegerType;
}

impl DictionaryKey for i8 {}
impl DictionaryKey for i16 {}
impl DictionaryKey for i32 {}
impl DictionaryKey for i64 {}
impl DictionaryKey for u8 {}
impl DictionaryKey for u16 {}
impl DictionaryKey for u32 {}
impl DictionaryKey for u64 {}
impl DictionaryKey for i8 {
const KEY_TYPE: IntegerType = IntegerType::Int8;
}
impl DictionaryKey for i16 {
const KEY_TYPE: IntegerType = IntegerType::Int16;
}
impl DictionaryKey for i32 {
const KEY_TYPE: IntegerType = IntegerType::Int32;
}
impl DictionaryKey for i64 {
const KEY_TYPE: IntegerType = IntegerType::Int64;
}
impl DictionaryKey for u8 {
const KEY_TYPE: IntegerType = IntegerType::UInt8;
}
impl DictionaryKey for u16 {
const KEY_TYPE: IntegerType = IntegerType::UInt16;
}
impl DictionaryKey for u32 {
const KEY_TYPE: IntegerType = IntegerType::UInt32;
}
impl DictionaryKey for u64 {
const KEY_TYPE: IntegerType = IntegerType::UInt64;
}

/// An [`Array`] whose values are encoded by keys. This [`Array`] is useful when the cardinality of
/// values is low compared to the length of the [`Array`].
Expand Down Expand Up @@ -59,10 +75,7 @@ impl<K: DictionaryKey> DictionaryArray<K> {

/// The canonical method to create a new [`DictionaryArray`].
pub fn from_data(keys: PrimitiveArray<K>, values: Arc<dyn Array>) -> Self {
let data_type = DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
);
let data_type = DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()));

Self {
data_type,
Expand Down
10 changes: 2 additions & 8 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,7 @@ impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for D
impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(
Box::new(K::DATA_TYPE),
Box::new(values.data_type().clone()),
),
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand All @@ -47,10 +44,7 @@ impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
pub fn new() -> Self {
let values = M::default();
Self {
data_type: DataType::Dictionary(
Box::new(K::DATA_TYPE),
Box::new(values.data_type().clone()),
),
data_type: DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone())),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
Expand Down
32 changes: 9 additions & 23 deletions src/array/display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,6 @@ macro_rules! dyn_primitive {
}};
}

macro_rules! dyn_dict {
($array:expr, $ty:ty) => {{
let a = $array
.as_any()
.downcast_ref::<DictionaryArray<$ty>>()
.unwrap();
let keys = a.keys();
let display = get_display(a.values().as_ref());
Box::new(move |row: usize| display(keys.value(row) as usize))
}};
}

/// Returns a function of index returning the string representation of the _value_ of `array`.
/// This does not take nulls into account.
pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> String + 'a> {
Expand Down Expand Up @@ -170,17 +158,15 @@ pub fn get_value_display<'a>(array: &'a dyn Array) -> Box<dyn Fn(usize) -> Strin
};
dyn_display!(array, ListArray<i64>, f)
}
Dictionary(key_type, _) => match key_type.as_ref() {
DataType::Int8 => dyn_dict!(array, i8),
DataType::Int16 => dyn_dict!(array, i16),
DataType::Int32 => dyn_dict!(array, i32),
DataType::Int64 => dyn_dict!(array, i64),
DataType::UInt8 => dyn_dict!(array, u8),
DataType::UInt16 => dyn_dict!(array, u16),
DataType::UInt32 => dyn_dict!(array, u32),
DataType::UInt64 => dyn_dict!(array, u64),
_ => unreachable!(),
},
Dictionary(key_type, _) => match_integer_type!(key_type, |$T| {
let a = array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
.unwrap();
let keys = a.keys();
let display = get_display(a.values().as_ref());
Box::new(move |row: usize| display(keys.value(row) as usize))
}),
Map(_, _) => todo!(),
Struct(_) => {
let a = array.as_any().downcast_ref::<StructArray>().unwrap();
Expand Down
2 changes: 1 addition & 1 deletion src/array/equal/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ pub fn equal(lhs: &dyn Array, rhs: &dyn Array) -> bool {
struct_::equal(lhs, rhs)
}
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
let lhs = lhs.as_any().downcast_ref().unwrap();
let rhs = rhs.as_any().downcast_ref().unwrap();
dictionary::equal::<$T>(lhs, rhs)
Expand Down
2 changes: 1 addition & 1 deletion src/array/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ pub fn offset_buffers_children_dictionary(array: &dyn Array) -> BuffersChildren
Union => ffi_dyn!(array, UnionArray),
Map => ffi_dyn!(array, MapArray),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
let array = array.as_any().downcast_ref::<DictionaryArray<$T>>().unwrap();
(
array.offset().unwrap(),
Expand Down
36 changes: 15 additions & 21 deletions src/array/growable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,6 @@ macro_rules! dyn_growable {
}};
}

macro_rules! dyn_dict_growable {
($ty:ty, $arrays:expr, $use_validity:expr, $capacity:expr) => {{
let arrays = $arrays
.iter()
.map(|array| {
array
.as_any()
.downcast_ref::<DictionaryArray<$ty>>()
.unwrap()
})
.collect::<Vec<_>>();
Box::new(dictionary::GrowableDictionary::<$ty>::new(
&arrays,
$use_validity,
$capacity,
))
}};
}

/// Creates a new [`Growable`] from an arbitrary number of [`Array`]s.
/// # Panics
/// This function panics iff
Expand Down Expand Up @@ -132,8 +113,21 @@ pub fn make_growable<'a>(
),
Union | Map => todo!(),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
dyn_dict_growable!($T, arrays, use_validity, capacity)
match_integer_type!(key_type, |$T| {
let arrays = arrays
.iter()
.map(|array| {
array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
.unwrap()
})
.collect::<Vec<_>>();
Box::new(dictionary::GrowableDictionary::<$T>::new(
&arrays,
use_validity,
capacity,
))
})
}
}
Expand Down
29 changes: 6 additions & 23 deletions src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,28 +174,11 @@ macro_rules! fmt_dyn {
}};
}

macro_rules! with_match_dictionary_key_type {(
macro_rules! match_integer_type {(
$key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
match $key_type {
DataType::Int8 => __with_ty__! { i8 },
DataType::Int16 => __with_ty__! { i16 },
DataType::Int32 => __with_ty__! { i32 },
DataType::Int64 => __with_ty__! { i64 },
DataType::UInt8 => __with_ty__! { u8 },
DataType::UInt16 => __with_ty__! { u16 },
DataType::UInt32 => __with_ty__! { u32 },
DataType::UInt64 => __with_ty__! { u64 },
_ => ::core::unreachable!("A dictionary key type can only be of integer types"),
}
})}

macro_rules! with_match_physical_dictionary_key_type {(
$key_type:expr, | $_:tt $T:ident | $($body:tt)*
) => ({
macro_rules! __with_ty__ {( $_ $T:ident ) => ( $($body)* )}
use crate::datatypes::DictionaryIndexType::*;
use crate::datatypes::IntegerType::*;
match $key_type {
Int8 => __with_ty__! { i8 },
Int16 => __with_ty__! { i16 },
Expand Down Expand Up @@ -251,7 +234,7 @@ impl Display for dyn Array {
Struct => fmt_dyn!(self, StructArray, f),
Union => fmt_dyn!(self, UnionArray, f),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
fmt_dyn!(self, DictionaryArray::<$T>, f)
})
}
Expand Down Expand Up @@ -281,7 +264,7 @@ pub fn new_empty_array(data_type: DataType) -> Box<dyn Array> {
Union => Box::new(UnionArray::new_empty(data_type)),
Map => Box::new(MapArray::new_empty(data_type)),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
Box::new(DictionaryArray::<$T>::new_empty(data_type))
})
}
Expand Down Expand Up @@ -311,7 +294,7 @@ pub fn new_null_array(data_type: DataType, length: usize) -> Box<dyn Array> {
Union => Box::new(UnionArray::new_null(data_type, length)),
Map => Box::new(MapArray::new_null(data_type, length)),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
Box::new(DictionaryArray::<$T>::new_null(data_type, length))
})
}
Expand Down Expand Up @@ -349,7 +332,7 @@ pub fn clone(array: &dyn Array) -> Box<dyn Array> {
Union => clone_dyn!(array, UnionArray),
Map => clone_dyn!(array, MapArray),
Dictionary(key_type) => {
with_match_physical_dictionary_key_type!(key_type, |$T| {
match_integer_type!(key_type, |$T| {
clone_dyn!(array, DictionaryArray::<$T>)
})
}
Expand Down
18 changes: 9 additions & 9 deletions src/array/ord.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,15 +216,15 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result<DynComparato
(Binary, Binary) => compare_binary::<i32>(left, right),
(LargeBinary, LargeBinary) => compare_binary::<i64>(left, right),
(Dictionary(key_type_lhs, _), Dictionary(key_type_rhs, _)) => {
match (key_type_lhs.as_ref(), key_type_rhs.as_ref()) {
(UInt8, UInt8) => dyn_dict!(u8, left, right),
(UInt16, UInt16) => dyn_dict!(u16, left, right),
(UInt32, UInt32) => dyn_dict!(u32, left, right),
(UInt64, UInt64) => dyn_dict!(u64, left, right),
(Int8, Int8) => dyn_dict!(i8, left, right),
(Int16, Int16) => dyn_dict!(i16, left, right),
(Int32, Int32) => dyn_dict!(i32, left, right),
(Int64, Int64) => dyn_dict!(i64, left, right),
match (key_type_lhs, key_type_rhs) {
(IntegerType::UInt8, IntegerType::UInt8) => dyn_dict!(u8, left, right),
(IntegerType::UInt16, IntegerType::UInt16) => dyn_dict!(u16, left, right),
(IntegerType::UInt32, IntegerType::UInt32) => dyn_dict!(u32, left, right),
(IntegerType::UInt64, IntegerType::UInt64) => dyn_dict!(u64, left, right),
(IntegerType::Int8, IntegerType::Int8) => dyn_dict!(i8, left, right),
(IntegerType::Int16, IntegerType::Int16) => dyn_dict!(i16, left, right),
(IntegerType::Int32, IntegerType::Int32) => dyn_dict!(i32, left, right),
(IntegerType::Int64, IntegerType::Int64) => dyn_dict!(i64, left, right),
(lhs, _) => {
return Err(ArrowError::InvalidArgumentError(format!(
"Dictionaries do not support keys of type {:?}",
Expand Down
18 changes: 6 additions & 12 deletions src/compute/aggregate/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,6 @@ macro_rules! dyn_binary {
}};
}

macro_rules! dyn_dict {
($array:expr, $ty:ty) => {{
let array = $array
.as_any()
.downcast_ref::<DictionaryArray<$ty>>()
.unwrap();
estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref())
}};
}

/// Returns the total (heap) allocated size of the array in bytes.
/// # Implementation
/// This estimation is the sum of the size of its buffers, validity, including nested arrays.
Expand Down Expand Up @@ -106,8 +96,12 @@ pub fn estimated_bytes_size(array: &dyn Array) -> usize {
.sum::<usize>();
types + offsets + fields
}
Dictionary(key_type) => with_match_physical_dictionary_key_type!(key_type, |$T| {
dyn_dict!(array, $T)
Dictionary(key_type) => match_integer_type!(key_type, |$T| {
let array = array
.as_any()
.downcast_ref::<DictionaryArray<$T>>()
.unwrap();
estimated_bytes_size(array.keys()) + estimated_bytes_size(array.values().as_ref())
}),
Map => {
let array = array.as_any().downcast_ref::<MapArray>().unwrap();
Expand Down
5 changes: 3 additions & 2 deletions src/compute/cast/dictionary_to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,9 @@ pub(super) fn dictionary_cast_dyn<K: DictionaryKey>(
let values = cast(values.as_ref(), to_values_type, options)?.into();

// create the appropriate array type
with_match_dictionary_key_type!(to_keys_type.as_ref(), |$T| {
key_cast!(keys, values, array, to_keys_type, $T)
let data_type = (*to_keys_type).into();
match_integer_type!(to_keys_type, |$T| {
key_cast!(keys, values, array, &data_type, $T)
})
}
_ => unpack_dictionary::<K>(keys, values.as_ref(), to_type, options),
Expand Down
31 changes: 6 additions & 25 deletions src/compute/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,31 +375,12 @@ pub fn cast(array: &dyn Array, to_type: &DataType, options: CastOptions) -> Resu
Ok(Box::new(list_array))
}

(Dictionary(index_type, _), _) => match **index_type {
DataType::Int8 => dictionary_cast_dyn::<i8>(array, to_type, options),
DataType::Int16 => dictionary_cast_dyn::<i16>(array, to_type, options),
DataType::Int32 => dictionary_cast_dyn::<i32>(array, to_type, options),
DataType::Int64 => dictionary_cast_dyn::<i64>(array, to_type, options),
DataType::UInt8 => dictionary_cast_dyn::<u8>(array, to_type, options),
DataType::UInt16 => dictionary_cast_dyn::<u16>(array, to_type, options),
DataType::UInt32 => dictionary_cast_dyn::<u32>(array, to_type, options),
DataType::UInt64 => dictionary_cast_dyn::<u64>(array, to_type, options),
_ => unreachable!(),
},
(_, Dictionary(index_type, value_type)) => match **index_type {
DataType::Int8 => cast_to_dictionary::<i8>(array, value_type, options),
DataType::Int16 => cast_to_dictionary::<i16>(array, value_type, options),
DataType::Int32 => cast_to_dictionary::<i32>(array, value_type, options),
DataType::Int64 => cast_to_dictionary::<i64>(array, value_type, options),
DataType::UInt8 => cast_to_dictionary::<u8>(array, value_type, options),
DataType::UInt16 => cast_to_dictionary::<u16>(array, value_type, options),
DataType::UInt32 => cast_to_dictionary::<u32>(array, value_type, options),
DataType::UInt64 => cast_to_dictionary::<u64>(array, value_type, options),
_ => Err(ArrowError::NotYetImplemented(format!(
"Casting from type {:?} to dictionary type {:?} not supported",
from_type, to_type,
))),
},
(Dictionary(index_type, _), _) => match_integer_type!(index_type, |$T| {
dictionary_cast_dyn::<$T>(array, to_type, options)
}),
(_, Dictionary(index_type, value_type)) => match_integer_type!(index_type, |$T| {
cast_to_dictionary::<$T>(array, value_type, options)
}),
(_, Boolean) => match from_type {
UInt8 => primitive_to_boolean_dyn::<u8>(array, to_type.clone()),
UInt16 => primitive_to_boolean_dyn::<u16>(array, to_type.clone()),
Expand Down
Loading