Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Improved dictionary (#1137)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorgecarleitao authored Jul 5, 2022
1 parent d87b38b commit 78a2a63
Show file tree
Hide file tree
Showing 36 changed files with 668 additions and 231 deletions.
16 changes: 10 additions & 6 deletions src/array/dictionary/ffi.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{
array::{FromFfi, PrimitiveArray, ToFfi},
error::Result,
error::Error,
ffi,
};

Expand All @@ -25,16 +25,20 @@ unsafe impl<K: DictionaryKey> ToFfi for DictionaryArray<K> {
}

impl<K: DictionaryKey, A: ffi::ArrowArrayRef> FromFfi<A> for DictionaryArray<K> {
unsafe fn try_from_ffi(array: A) -> Result<Self> {
unsafe fn try_from_ffi(array: A) -> Result<Self, Error> {
// keys: similar to PrimitiveArray, but the datatype is the inner one
let validity = unsafe { array.validity() }?;
let values = unsafe { array.buffer::<K>(1) }?;

let data_type = K::PRIMITIVE.into();
let keys = PrimitiveArray::<K>::try_new(data_type, values, validity)?;
let values = array.dictionary()?.unwrap();
let data_type = array.data_type().clone();

let keys = PrimitiveArray::<K>::try_new(K::PRIMITIVE.into(), values, validity)?;
let values = array
.dictionary()?
.ok_or_else(|| Error::oos("Dictionary Array must contain a dictionary in ffi"))?;
let values = ffi::try_from(values)?;

Ok(DictionaryArray::<K>::from_data(keys, values))
// safety: that every key is a valid index into `values` is the assumption of this trait
DictionaryArray::<K>::try_new_unchecked(data_type, keys, values)
}
}
2 changes: 1 addition & 1 deletion src/array/dictionary/fmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub fn write_value<K: DictionaryKey, W: Write>(
let values = array.values();

if keys.is_valid(index) {
let key = keys.value(index).to_usize().unwrap();
let key = array.key_value(index);
get_display(values.as_ref(), null)(f, key)
} else {
write!(f, "{}", null)
Expand Down
17 changes: 1 addition & 16 deletions src/array/dictionary/iterator.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::bitmap::utils::{zip_validity, ZipValidity};
use crate::bitmap::utils::ZipValidity;
use crate::scalar::Scalar;
use crate::trusted_len::TrustedLen;

Expand Down Expand Up @@ -66,18 +66,3 @@ impl<'a, K: DictionaryKey> IntoIterator for &'a DictionaryArray<K> {
self.iter()
}
}

impl<'a, K: DictionaryKey> DictionaryArray<K> {
    /// Returns a [`ZipIter`] that combines [`DictionaryValuesIter`] with the
    /// validity of the keys (when the keys have one).
    pub fn iter(&'a self) -> ZipIter<'a, K> {
        let values = DictionaryValuesIter::new(self);
        let validity = self.keys.validity().as_ref().map(|bitmap| bitmap.iter());
        zip_validity(values, validity)
    }

    /// Returns a [`ValuesIter`] over this array, built from
    /// [`DictionaryValuesIter`] without consulting the validity of the keys.
    pub fn values_iter(&'a self) -> ValuesIter<'a, K> {
        DictionaryValuesIter::new(self)
    }
}
235 changes: 199 additions & 36 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
use std::hint::unreachable_unchecked;

use crate::{
bitmap::Bitmap,
bitmap::{
utils::{zip_validity, ZipValidity},
Bitmap,
},
datatypes::{DataType, IntegerType},
error::Error,
scalar::{new_scalar, Scalar},
trusted_len::TrustedLen,
types::NativeType,
};

Expand All @@ -13,12 +20,23 @@ pub use iterator::*;
pub use mutable::*;

use super::{new_empty_array, primitive::PrimitiveArray, Array};
use crate::scalar::NullScalar;
use super::{new_null_array, specification::check_indexes};

/// Trait denoting [`NativeType`]s that can be used as keys of a dictionary.
pub trait DictionaryKey: NativeType + num_traits::NumCast + num_traits::FromPrimitive {
pub trait DictionaryKey: NativeType + TryInto<usize> + TryFrom<usize> {
/// The corresponding [`IntegerType`] of this key
const KEY_TYPE: IntegerType;

/// Converts this key into a `usize` without a checked failure path.
/// # Safety
/// The caller _must_ have checked that the value can be casted to `usize`.
/// The failing branch of the conversion is declared unreachable, so calling
/// this on a value that does not fit in a `usize` is undefined behavior.
#[inline]
unsafe fn as_usize(self) -> usize {
    if let Ok(index) = self.try_into() {
        index
    } else {
        // SAFETY: the caller guarantees the conversion cannot fail.
        unreachable_unchecked()
    }
}
}

impl DictionaryKey for i8 {
Expand Down Expand Up @@ -46,47 +64,166 @@ impl DictionaryKey for u64 {
const KEY_TYPE: IntegerType = IntegerType::UInt64;
}

/// An [`Array`] whose values are encoded by keys. This [`Array`] is useful when the cardinality of
/// An [`Array`] whose values are stored as indices. This [`Array`] is useful when the cardinality of
/// values is low compared to the length of the [`Array`].
///
/// # Safety
/// This struct guarantees that each item of [`DictionaryArray::keys`] is castable to `usize` and
/// its value is smaller than [`DictionaryArray::values`]`.len()`. In other words, you can safely
/// use `unchecked` calls to retrieve the values.
#[derive(Clone)]
pub struct DictionaryArray<K: DictionaryKey> {
data_type: DataType,
keys: PrimitiveArray<K>,
values: Box<dyn Array>,
}

/// Validates that `data_type` can describe a dictionary whose keys are of
/// `key_type` and whose values are of `values_data_type`.
///
/// # Errors
/// Returns an out-of-spec [`Error`] when:
/// * the logical type of `data_type` is not `DataType::Dictionary`
/// * the integer type declared in `data_type` differs from `key_type`
/// * the logical type of the values declared in `data_type` differs from the
///   logical type of `values_data_type`
///
/// The third field of `DataType::Dictionary` is not inspected.
fn check_data_type(
    key_type: IntegerType,
    data_type: &DataType,
    values_data_type: &DataType,
) -> Result<(), Error> {
    // Anything that is not logically a dictionary is rejected up-front.
    let (key, value) = match data_type.to_logical_type() {
        DataType::Dictionary(key, value, _) => (key, value),
        _ => {
            return Err(Error::oos(
                "DictionaryArray must be initialized with logical DataType::Dictionary",
            ))
        }
    };

    if *key != key_type {
        return Err(Error::oos(
            "DictionaryArray must be initialized with a DataType::Dictionary whose integer is compatible to its keys",
        ));
    }

    // Values are compared by logical type on both sides.
    if value.as_ref().to_logical_type() != values_data_type.to_logical_type() {
        return Err(Error::oos(
            "DictionaryArray must be initialized with a DataType::Dictionary whose value is equal to its values",
        ));
    }

    Ok(())
}

impl<K: DictionaryKey> DictionaryArray<K> {
/// Creates a new [`DictionaryArray`] from its parts, validating all of them.
/// # Implementation
/// This function is `O(N)` where `N` is the length of keys
/// # Errors
/// This function errors iff
/// * the `data_type`'s logical type is not a `DataType::Dictionary`
/// * the `data_type`'s keys is not compatible with `keys`
/// * the `data_type`'s values's data_type is not equal with `values.data_type()`
/// * any of the keys's values is not represented in `usize` or is `>= values.len()`
pub fn try_new(
    data_type: DataType,
    keys: PrimitiveArray<K>,
    values: Box<dyn Array>,
) -> Result<Self, Error> {
    // The data type is validated before the keys are scanned.
    check_data_type(K::KEY_TYPE, &data_type, values.data_type())?;

    // Every key slot is checked against the length of the dictionary values.
    let dictionary_len = values.len();
    check_indexes(keys.values(), dictionary_len)?;

    let array = Self {
        data_type,
        keys,
        values,
    };
    Ok(array)
}

/// Creates a new [`DictionaryArray`] from `keys` and `values`, deriving its
/// [`DataType`] from `values.data_type()` via `Self::default_data_type`.
/// # Implementation
/// This function is `O(N)` where `N` is the length of keys
/// # Errors
/// This function errors iff
/// * any of the keys's values is not represented in `usize` or is `>= values.len()`
pub fn try_from_keys(keys: PrimitiveArray<K>, values: Box<dyn Array>) -> Result<Self, Error> {
    let values_type = values.data_type().clone();
    Self::try_new(Self::default_data_type(values_type), keys, values)
}

/// Creates a new [`DictionaryArray`] from its parts, validating only the
/// `data_type`; the keys are not inspected.
/// # Errors
/// This function errors iff
/// * the `data_type`'s logical type is not a `DataType::Dictionary`
/// * the `data_type`'s keys is not compatible with `keys`
/// * the `data_type`'s values's data_type is not equal with `values.data_type()`
/// # Safety
/// The caller must ensure that every keys's values is represented in `usize` and is `< values.len()`
pub unsafe fn try_new_unchecked(
    data_type: DataType,
    keys: PrimitiveArray<K>,
    values: Box<dyn Array>,
) -> Result<Self, Error> {
    // Only the data type is verified here; key bounds are the caller's contract.
    check_data_type(K::KEY_TYPE, &data_type, values.data_type()).map(|_| Self {
        data_type,
        keys,
        values,
    })
}

/// Returns a new empty [`DictionaryArray`].
pub fn new_empty(data_type: DataType) -> Self {
let values = Self::get_child(&data_type);
let values = Self::try_get_child(&data_type).unwrap();
let values = new_empty_array(values.clone());
let data_type = K::PRIMITIVE.into();
Self::from_data(PrimitiveArray::<K>::new_empty(data_type), values)
Self::try_new(
data_type,
PrimitiveArray::<K>::new_empty(K::PRIMITIVE.into()),
values,
)
.unwrap()
}

/// Returns an [`DictionaryArray`] whose all elements are null
#[inline]
pub fn new_null(data_type: DataType, length: usize) -> Self {
let values = Self::get_child(&data_type);
let data_type = K::PRIMITIVE.into();
Self::from_data(
PrimitiveArray::<K>::new_null(data_type, length),
new_empty_array(values.clone()),
let values = Self::try_get_child(&data_type).unwrap();
let values = new_null_array(values.clone(), 1);
Self::try_new(
data_type,
PrimitiveArray::<K>::new_null(K::PRIMITIVE.into(), length),
values,
)
.unwrap()
}

/// The canonical method to create a new [`DictionaryArray`].
pub fn from_data(keys: PrimitiveArray<K>, values: Box<dyn Array>) -> Self {
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(values.data_type().clone()), false);
/// Returns an iterator of `Option<Box<dyn Scalar>>`, pairing each item of
/// [`DictionaryValuesIter`] with the validity of the keys.
/// # Implementation
/// This function will allocate a new [`Scalar`] per item and is usually not performant.
/// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.
pub fn iter(&self) -> ZipValidity<Box<dyn Scalar>, DictionaryValuesIter<K>> {
    let scalars = DictionaryValuesIter::new(self);
    let validity = self.keys.validity().as_ref().map(|bitmap| bitmap.iter());
    zip_validity(scalars, validity)
}

Self {
data_type,
keys,
values,
/// Returns an iterator of `Box<dyn Scalar>`, i.e. a [`DictionaryValuesIter`]
/// over this array. Unlike `iter`, the items are not wrapped in `Option`.
/// # Implementation
/// This function will allocate a new [`Scalar`] per item and is usually not performant.
/// Consider calling `keys_iter` and `values`, downcasting `values`, and iterating over that.
pub fn values_iter(&self) -> DictionaryValuesIter<K> {
DictionaryValuesIter::new(self)
}

/// Returns the [`DataType`] of this [`DictionaryArray`], i.e. the `data_type`
/// it was constructed with (validated by `check_data_type` in `try_new` and
/// `try_new_unchecked`).
#[inline]
pub fn data_type(&self) -> &DataType {
&self.data_type
}

/// Returns whether the values of this [`DictionaryArray`] are ordered, as
/// recorded in the third field of its logical `DataType::Dictionary`.
#[inline]
pub fn is_ordered(&self) -> bool {
    if let DataType::Dictionary(_, _, ordered) = self.data_type.to_logical_type() {
        *ordered
    } else {
        // `try_new` and `try_new_unchecked` reject any other logical type.
        unreachable!()
    }
}

/// Builds the [`DataType`] used when none is given explicitly: a
/// `DataType::Dictionary` keyed by `K::KEY_TYPE`, holding `values_datatype`,
/// whose third field (the flag read by `is_ordered`) is `false`.
pub(crate) fn default_data_type(values_datatype: DataType) -> DataType {
DataType::Dictionary(K::KEY_TYPE, Box::new(values_datatype), false)
}

/// Creates a new [`DictionaryArray`] by slicing the existing [`DictionaryArray`].
/// # Panics
/// iff `offset + length > self.len()`.
Expand Down Expand Up @@ -124,10 +261,7 @@ impl<K: DictionaryKey> DictionaryArray<K> {
pub fn set_validity(&mut self, validity: Option<Bitmap>) {
self.keys.set_validity(validity);
}
}

// accessors
impl<K: DictionaryKey> DictionaryArray<K> {
/// Returns the length of this array
#[inline]
pub fn len(&self) -> usize {
Expand All @@ -147,21 +281,46 @@ impl<K: DictionaryKey> DictionaryArray<K> {
&self.keys
}

/// Returns an iterator of the keys' values of the [`DictionaryArray`] as `usize`.
/// Every slot yields a plain `usize`; items are not wrapped in `Option`.
#[inline]
pub fn keys_values_iter(&self) -> impl TrustedLen<Item = usize> + Clone + '_ {
    let raw_keys = self.keys.values_iter();
    // SAFETY: invariant of the struct - every key is castable to `usize`.
    raw_keys.map(|key| unsafe { key.as_usize() })
}

/// Returns an iterator of the keys of the [`DictionaryArray`] as `Option<usize>`:
/// each `Some` key yielded by the underlying keys iterator is converted to `usize`,
/// each `None` is passed through.
#[inline]
pub fn keys_iter(&self) -> impl TrustedLen<Item = Option<usize>> + Clone + '_ {
    let optional_keys = self.keys.iter();
    // SAFETY: invariant of the struct - every key is castable to `usize`.
    optional_keys.map(|slot| slot.map(|key| unsafe { key.as_usize() }))
}

/// Returns the key stored at position `index` of the [`DictionaryArray`] as `usize`.
/// The validity of the slot is not consulted.
/// # Panics
/// This function panics iff `index >= self.len()`
#[inline]
pub fn key_value(&self, index: usize) -> usize {
    // Bounds-checked read of the raw key.
    let key = self.keys.values()[index];
    // SAFETY: invariant of the struct - every key is castable to `usize`.
    unsafe { key.as_usize() }
}

/// Returns the values of the [`DictionaryArray`], i.e. the boxed array that
/// the keys index into (see the `# Safety` invariant of the struct).
#[inline]
pub fn values(&self) -> &Box<dyn Array> {
&self.values
}

/// Returns the value of the [`DictionaryArray`] at position `i`.
/// # Implementation
/// This function will allocate a new [`Scalar`] and is usually not performant.
/// Consider calling `keys` and `values`, downcasting `values`, and iterating over that.
/// # Panic
/// This function panics iff `index >= self.len()`
#[inline]
pub fn value(&self, index: usize) -> Box<dyn Scalar> {
if self.keys.is_null(index) {
Box::new(NullScalar::new())
} else {
let index = self.keys.value(index).to_usize().unwrap();
new_scalar(self.values.as_ref(), index)
}
// safety - invariant of this struct
let index = unsafe { self.keys.value(index).as_usize() };
new_scalar(self.values.as_ref(), index)
}

/// Boxes self into a [`Box<dyn Array>`].
Expand All @@ -173,15 +332,16 @@ impl<K: DictionaryKey> DictionaryArray<K> {
pub fn arced(self) -> std::sync::Arc<dyn Array> {
std::sync::Arc::new(self)
}
}

impl<K: DictionaryKey> DictionaryArray<K> {
pub(crate) fn get_child(data_type: &DataType) -> &DataType {
match data_type {
pub(crate) fn try_get_child(data_type: &DataType) -> Result<&DataType, Error> {
Ok(match data_type.to_logical_type() {
DataType::Dictionary(_, values, _) => values.as_ref(),
DataType::Extension(_, inner, _) => Self::get_child(inner),
_ => panic!("DictionaryArray must be initialized with DataType::Dictionary"),
}
_ => {
return Err(Error::oos(
"Dictionaries must be initialized with DataType::Dictionary",
))
}
})
}
}

Expand Down Expand Up @@ -213,12 +373,15 @@ impl<K: DictionaryKey> Array for DictionaryArray<K> {
// Boxes the result of the inherent `DictionaryArray::slice`. Inherent methods
// take precedence over trait methods, so this call is not recursive.
fn slice(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice(offset, length))
}

// Boxes the result of `self.slice_unchecked(offset, length)`.
// NOTE(review): presumably this resolves to an inherent `slice_unchecked` on
// `DictionaryArray` (not visible here) whose safety contract the caller must
// uphold - confirm against the inherent impl.
unsafe fn slice_unchecked(&self, offset: usize, length: usize) -> Box<dyn Array> {
Box::new(self.slice_unchecked(offset, length))
}

// Clones this array, calls `with_validity(validity)` on the clone and boxes
// the result; `self` is left untouched.
// NOTE(review): presumably the call on the clone resolves to an inherent,
// by-value `with_validity` (not visible here) - confirm.
fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
Box::new(self.clone().with_validity(validity))
}

// Returns a boxed clone of this array as a `Box<dyn Array>`.
fn to_boxed(&self) -> Box<dyn Array> {
Box::new(self.clone())
}
Expand Down
Loading

0 comments on commit 78a2a63

Please sign in to comment.