Skip to content

Commit

Permalink
ArrayData Enumeration for Remaining Layouts (apache#3769)
Browse files Browse the repository at this point in the history
* Add StructArrayData

* Add ListArrayData

* Add DictionaryArrayData

* Format

* Add FixedSizeBinaryArrayData

* Add UnionArrayData

* Docs

* Add FixedSizeListArrayData

* Derive Debug and Clone

* Add RunArrayData

* Review feedback
  • Loading branch information
tustvold authored Mar 1, 2023
1 parent d440c24 commit 7852e76
Show file tree
Hide file tree
Showing 9 changed files with 819 additions and 18 deletions.
80 changes: 75 additions & 5 deletions arrow-data/src/data/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ mod private {
}

/// Types backed by a variable length slice of bytes
pub trait Bytes: private::BytesSealed {
pub trait Bytes: private::BytesSealed + std::fmt::Debug {
const TYPE: BytesType;
}

Expand Down Expand Up @@ -195,6 +195,7 @@ impl private::BytesOffsetSealed for i64 {
}

/// An enumeration of the types of [`ArrayDataBytesOffset`]
#[derive(Debug, Clone)]
pub enum ArrayDataBytes {
Binary(ArrayDataBytesOffset<[u8]>),
Utf8(ArrayDataBytesOffset<str>),
Expand All @@ -217,18 +218,29 @@ impl ArrayDataBytes {
}

/// An enumeration of [`BytesArrayData`] over its offset type
///
/// Each variant carries a [`BytesArrayData`] for the same byte type `B`,
/// differing only in the integer type used for the offsets
#[derive(Debug)]
pub enum ArrayDataBytesOffset<B>
where
    B: Bytes + ?Sized,
{
    /// Data whose offsets are `i32`
    Small(BytesArrayData<i32, B>),
    /// Data whose offsets are `i64`
    Large(BytesArrayData<i64, B>),
}

// Written by hand rather than derived: `#[derive(Clone)]` would add a
// `B: Clone` bound, which an unsized `B` such as `str` or `[u8]` cannot meet
impl<B> Clone for ArrayDataBytesOffset<B>
where
    B: Bytes + ?Sized,
{
    /// Clones the wrapped [`BytesArrayData`], preserving the variant
    fn clone(&self) -> Self {
        match self {
            Self::Small(inner) => Self::Small(inner.clone()),
            Self::Large(inner) => Self::Large(inner.clone()),
        }
    }
}

impl<O, B> From<BytesArrayData<O, B>> for ArrayDataBytes
where
    O: BytesOffset,
    B: Bytes + ?Sized,
{
    /// Converts `value` into an [`ArrayDataBytes`] by applying the offset
    /// type's `upcast` followed by the byte type's `upcast`
    fn from(value: BytesArrayData<O, B>) -> Self {
        let by_offset = O::upcast(value);
        B::upcast(by_offset)
    }
}

/// ArrayData for arrays of [`Bytes`]
/// ArrayData for [variable-sized arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) of [`Bytes`]
#[derive(Debug)]
pub struct BytesArrayData<O: BytesOffset, B: Bytes + ?Sized> {
data_type: DataType,
nulls: Option<NullBuffer>,
Expand All @@ -237,13 +249,25 @@ pub struct BytesArrayData<O: BytesOffset, B: Bytes + ?Sized> {
phantom: PhantomData<B>,
}

impl<O: BytesOffset, B: Bytes> BytesArrayData<O, B> {
impl<O: BytesOffset, B: Bytes + ?Sized> Clone for BytesArrayData<O, B> {
fn clone(&self) -> Self {
Self {
data_type: self.data_type.clone(),
nulls: self.nulls.clone(),
offsets: self.offsets.clone(),
values: self.values.clone(),
phantom: Default::default(),
}
}
}

impl<O: BytesOffset, B: Bytes + ?Sized> BytesArrayData<O, B> {
/// Creates a new [`BytesArrayData`]
///
/// # Safety
///
/// - Each consecutive window of `offsets` must identify a valid slice of `values`
/// - `nulls.len() == offsets.len() + 1`
/// - `nulls.len() == offsets.len() - 1`
/// - `data_type` must be valid for this layout
pub unsafe fn new_unchecked(
data_type: DataType,
Expand All @@ -270,7 +294,7 @@ impl<O: BytesOffset, B: Bytes> BytesArrayData<O, B> {

/// Returns the offsets
#[inline]
pub fn value_offsets(&self) -> &[O] {
pub fn offsets(&self) -> &[O] {
&self.offsets
}

Expand All @@ -286,3 +310,49 @@ impl<O: BytesOffset, B: Bytes> BytesArrayData<O, B> {
&self.data_type
}
}

/// ArrayData for [fixed-size arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout) of bytes
#[derive(Clone, Debug)]
pub struct FixedSizeBinaryArrayData {
    /// The data type of this array
    data_type: DataType,
    /// The null buffer, if any
    nulls: Option<NullBuffer>,
    /// The raw byte data
    values: Buffer,
}

impl FixedSizeBinaryArrayData {
    /// Assembles a [`FixedSizeBinaryArrayData`] from its parts without
    /// performing any validation
    ///
    /// # Safety
    ///
    /// - `data_type` must be valid for this layout
    /// - `nulls.len() == values.len() / element_size`
    pub unsafe fn new_unchecked(
        data_type: DataType,
        values: Buffer,
        nulls: Option<NullBuffer>,
    ) -> Self {
        // Arguments are stored as given; upholding the contract above is the
        // caller's responsibility
        Self {
            values,
            nulls,
            data_type,
        }
    }

    /// Borrows the raw byte data as a slice
    #[inline]
    pub fn values(&self) -> &[u8] {
        &self.values
    }

    /// Borrows the null buffer, or `None` if there is none
    #[inline]
    pub fn null_buffer(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }

    /// Borrows the data type of this array
    #[inline]
    pub fn data_type(&self) -> &DataType {
        &self.data_type
    }
}
174 changes: 174 additions & 0 deletions arrow-data/src/data/dictionary.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::data::types::DictionaryKeyType;
use crate::ArrayData;
use arrow_buffer::buffer::{NullBuffer, ScalarBuffer};
use arrow_buffer::ArrowNativeType;
use arrow_schema::DataType;

mod private {
    use super::*;

    /// Conversions between [`ArrayDataDictionary`] and [`DictionaryArrayData`]
    ///
    /// This trait is declared inside a private module and is a supertrait of
    /// [`DictionaryKey`], so code that cannot name this module cannot
    /// implement [`DictionaryKey`] for further types
    pub trait DictionaryKeySealed {
        /// Downcast a borrowed [`ArrayDataDictionary`] to a borrowed
        /// [`DictionaryArrayData`], if possible
        fn downcast_ref(data: &ArrayDataDictionary) -> Option<&DictionaryArrayData<Self>>
        where
            Self: DictionaryKey;

        /// Downcast an owned [`ArrayDataDictionary`] to an owned
        /// [`DictionaryArrayData`], if possible
        fn downcast(data: ArrayDataDictionary) -> Option<DictionaryArrayData<Self>>
        where
            Self: DictionaryKey;

        /// Cast [`DictionaryArrayData`] to [`ArrayDataDictionary`]
        fn upcast(v: DictionaryArrayData<Self>) -> ArrayDataDictionary
        where
            Self: DictionaryKey;
    }
}

/// Native types that can be used as the key of a dictionary array
pub trait DictionaryKey: ArrowNativeType + private::DictionaryKeySealed {
    /// The [`DictionaryKeyType`] associated with this key type
    const TYPE: DictionaryKeyType;
}

// Implements `DictionaryKey` and `private::DictionaryKeySealed` for the
// native type `$t`, pairing it with the variant named `$v` of both
// `DictionaryKeyType` and `ArrayDataDictionary`
macro_rules! dictionary {
    ($t:ty,$v:ident) => {
        impl DictionaryKey for $t {
            const TYPE: DictionaryKeyType = DictionaryKeyType::$v;
        }

        impl private::DictionaryKeySealed for $t {
            fn downcast_ref(
                data: &ArrayDataDictionary,
            ) -> Option<&DictionaryArrayData<Self>> {
                // Only the variant paired with this key type yields a value
                if let ArrayDataDictionary::$v(inner) = data {
                    Some(inner)
                } else {
                    None
                }
            }

            fn downcast(data: ArrayDataDictionary) -> Option<DictionaryArrayData<Self>> {
                if let ArrayDataDictionary::$v(inner) = data {
                    Some(inner)
                } else {
                    None
                }
            }

            fn upcast(v: DictionaryArrayData<Self>) -> ArrayDataDictionary {
                ArrayDataDictionary::$v(v)
            }
        }
    };
}

// One invocation per key type, covering every variant of `ArrayDataDictionary`
dictionary!(i8, Int8);
dictionary!(i16, Int16);
dictionary!(i32, Int32);
dictionary!(i64, Int64);
dictionary!(u8, UInt8);
dictionary!(u16, UInt16);
dictionary!(u32, UInt32);
dictionary!(u64, UInt64);

/// An enumeration of [`DictionaryArrayData`] over its key type
#[derive(Clone, Debug)]
pub enum ArrayDataDictionary {
    /// Dictionary data with `i8` keys
    Int8(DictionaryArrayData<i8>),
    /// Dictionary data with `i16` keys
    Int16(DictionaryArrayData<i16>),
    /// Dictionary data with `i32` keys
    Int32(DictionaryArrayData<i32>),
    /// Dictionary data with `i64` keys
    Int64(DictionaryArrayData<i64>),
    /// Dictionary data with `u8` keys
    UInt8(DictionaryArrayData<u8>),
    /// Dictionary data with `u16` keys
    UInt16(DictionaryArrayData<u16>),
    /// Dictionary data with `u32` keys
    UInt32(DictionaryArrayData<u32>),
    /// Dictionary data with `u64` keys
    UInt64(DictionaryArrayData<u64>),
}

impl ArrayDataDictionary {
/// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`]
pub fn downcast_ref<K: DictionaryKey>(&self) -> Option<&DictionaryArrayData<K>> {
K::downcast_ref(self)
}

/// Downcast this [`ArrayDataDictionary`] to the corresponding [`DictionaryArrayData`]
pub fn downcast<K: DictionaryKey>(self) -> Option<DictionaryArrayData<K>> {
K::downcast(self)
}
}

impl<K: DictionaryKey> From<DictionaryArrayData<K>> for ArrayDataDictionary {
fn from(value: DictionaryArrayData<K>) -> Self {
K::upcast(value)
}
}

/// ArrayData for [dictionary arrays](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout)
#[derive(Clone, Debug)]
pub struct DictionaryArrayData<K>
where
    K: DictionaryKey,
{
    /// The data type of this array
    data_type: DataType,
    /// The null buffer, if any
    nulls: Option<NullBuffer>,
    /// The dictionary keys
    keys: ScalarBuffer<K>,
    /// The boxed child data
    child: Box<ArrayData>,
}

impl<K: DictionaryKey> DictionaryArrayData<K> {
    /// Create a new [`DictionaryArrayData`] without validating its parts
    ///
    /// # Safety
    ///
    /// - `data_type` must be valid for this layout
    /// - `child` must have a type matching `data_type`
    /// - all values in `keys` must be `0 <= v < child.len()` or be a null according to `nulls`
    /// - `nulls`, if present, must have the same length as `keys`
    pub unsafe fn new_unchecked(
        data_type: DataType,
        keys: ScalarBuffer<K>,
        nulls: Option<NullBuffer>,
        child: ArrayData,
    ) -> Self {
        Self {
            data_type,
            nulls,
            keys,
            // The child is stored behind a `Box`, matching the field type
            child: Box::new(child),
        }
    }

    /// Returns the null buffer if any
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }

    /// Returns the keys
    #[inline]
    pub fn keys(&self) -> &[K] {
        &self.keys
    }

    /// Returns the child data
    #[inline]
    pub fn child(&self) -> &ArrayData {
        self.child.as_ref()
    }

    /// Returns the data type of this array
    #[inline]
    pub fn data_type(&self) -> &DataType {
        &self.data_type
    }
}
Loading

0 comments on commit 7852e76

Please sign in to comment.