ARROW-8426: [Rust] [Parquet] - Add more support for converting Dicts
This adds more support for:

- When converting Arrow -> Parquet and the data contains an Arrow Dictionary,
materialize the dictionary values and send them to Parquet, which dictionary-encodes
them (or not) according to its own writer settings. Converting an Arrow Dictionary
directly to Parquet DictEncoding is deliberately not supported, and right now only
String dictionaries are handled.
- When converting Parquet -> Arrow, notice when the Arrow schema metadata stored
in a Parquet file has a Dictionary type and convert the data back to an Arrow
dictionary (again, String dictionaries only for now). A round-trip sketch follows this list.
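
As a rough illustration of the intended round trip (not code from this commit), here is a minimal sketch. It assumes an arrow/parquet release where `ArrowWriter`, `SerializedFileReader`, and `ParquetFileArrowReader` exist under these paths (the modules have moved between releases); the file name and values are arbitrary:

use std::{fs::File, sync::Arc};

use arrow::array::DictionaryArray;
use arrow::datatypes::{DataType, Field, Int32Type, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader};
use parquet::file::reader::SerializedFileReader;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A String dictionary array: "a" and "c" are stored once, keys index into them.
    let dict: DictionaryArray<Int32Type> = vec!["a", "c", "a"].into_iter().collect();

    let field = Field::new_dict(
        "d",
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        false, // nullable
        0,     // dict_id
        false, // dict_is_ordered
    );
    let schema = Arc::new(Schema::new(vec![field]));
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict)])?;

    // Arrow -> Parquet: the dictionary values are materialized; whether the column
    // is dictionary-encoded on disk is up to the writer's own properties.
    let mut writer = ArrowWriter::try_new(File::create("dict.parquet")?, schema, None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Parquet -> Arrow: the Arrow schema stored in the file metadata records the
    // Dictionary type, so the reader rebuilds a DictionaryArray.
    let file = SerializedFileReader::new(File::open("dict.parquet")?)?;
    let mut reader = ParquetFileArrowReader::new(Arc::new(file));
    let read_back = reader.get_record_reader(1024)?.next().unwrap()?;
    assert_eq!(batch.schema(), read_back.schema());
    Ok(())
}

The key point is that the Parquet writer only ever sees plain String values, while the reader relies on the embedded Arrow schema to restore the Dictionary type.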

I'm not sure if this is in a good enough state to merge yet, please let me know @nevi-me!

Closes apache#8402 from carols10cents/dict

Lead-authored-by: Carol (Nichols || Goulding) <carol.nichols@gmail.com>
Co-authored-by: Neville Dipale <nevilledips@gmail.com>
Co-authored-by: Jake Goulding <jake.goulding@gmail.com>
Signed-off-by: Neville Dipale <nevilledips@gmail.com>
3 people authored and GeorgeAp committed Jun 7, 2021
1 parent fe5fcff commit d6e3ff8
Showing 6 changed files with 615 additions and 401 deletions.
57 changes: 56 additions & 1 deletion rust/arrow/src/array/data.rs
@@ -29,7 +29,7 @@ use crate::util::bit_util;
 /// A generic representation of Arrow array data which encapsulates common attributes and
 /// operations for Arrow arrays. Specific operations for different array types (e.g.,
 /// primitive, list, struct) are implemented in `Array`.
-#[derive(PartialEq, Debug, Clone)]
+#[derive(Debug, Clone)]
 pub struct ArrayData {
     /// The data type for this array data
     data_type: DataType,
@@ -209,6 +209,61 @@ impl ArrayData {
     }
 }
 
+impl PartialEq for ArrayData {
+    fn eq(&self, other: &Self) -> bool {
+        assert_eq!(
+            self.data_type(),
+            other.data_type(),
+            "Data types not the same"
+        );
+        assert_eq!(self.len(), other.len(), "Lengths not the same");
+        // TODO: when adding tests for this, test that we can compare with arrays that have offsets
+        assert_eq!(self.offset(), other.offset(), "Offsets not the same");
+        assert_eq!(self.null_count(), other.null_count());
+        // compare buffers excluding padding
+        let self_buffers = self.buffers();
+        let other_buffers = other.buffers();
+        assert_eq!(self_buffers.len(), other_buffers.len());
+        self_buffers.iter().zip(other_buffers).for_each(|(s, o)| {
+            compare_buffer_regions(
+                s,
+                self.offset(), // TODO mul by data length
+                o,
+                other.offset(), // TODO mul by data len
+            );
+        });
+        // assert_eq!(self.buffers(), other.buffers());
+
+        assert_eq!(self.child_data(), other.child_data());
+        // arrays without nulls can skip the null bitmap, so only compare the bitmaps if there are nulls
+        if self.null_count() != 0 || other.null_count() != 0 {
+            compare_buffer_regions(
+                self.null_buffer().unwrap(),
+                self.offset(),
+                other.null_buffer().unwrap(),
+                other.offset(),
+            )
+        }
+        true
+    }
+}
+
+/// A helper to compare regions of two buffers.
+/// Compares up to the length of the shorter buffer.
+fn compare_buffer_regions(
+    left: &Buffer,
+    left_offset: usize,
+    right: &Buffer,
+    right_offset: usize,
+) {
+    // for convenience, assume the buffer lengths only differ when one has padding,
+    // so take the shorter length and discard the padding from the longer buffer
+    let shorter_len = left.len().min(right.len());
+    let s_sliced = left.bit_slice(left_offset, shorter_len);
+    let o_sliced = right.bit_slice(right_offset, shorter_len);
+    assert_eq!(s_sliced, o_sliced);
+}
+
 /// Builder for `ArrayData` type
 #[derive(Debug)]
 pub struct ArrayDataBuilder {
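
The manual `PartialEq` above compares buffer regions instead of whole buffers because logically equal arrays need not match byte-for-byte once offsets or padding are involved. A small sketch of the failure mode it guards against (assuming the `arrow` crate; values are arbitrary):

use arrow::array::{Array, Int32Array};

fn main() {
    let parent = Int32Array::from(vec![0, 1, 2, 3]);
    let sliced = parent.slice(1, 2); // logically [1, 2], but shares the parent's buffer
    let fresh = Int32Array::from(vec![1, 2]); // logically [1, 2], in its own buffer

    // Both hold the same logical values, yet `sliced`'s underlying buffer still
    // contains [0, 1, 2, 3] at a non-zero offset; naive buffer equality would
    // report them as different.
    assert_eq!(sliced.len(), fresh.len());
}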
30 changes: 23 additions & 7 deletions rust/arrow/src/ipc/convert.rs
@@ -641,17 +641,23 @@ pub(crate) fn get_fb_dictionary<'a: 'b, 'b>(
     fbb: &mut FlatBufferBuilder<'a>,
 ) -> WIPOffset<ipc::DictionaryEncoding<'b>> {
     // We assume that the dictionary index type (as an integer) has already been
-    // validated elsewhere, and can safely assume we are dealing with signed
-    // integers
+    // validated elsewhere, and can safely assume we are dealing with integers
     let mut index_builder = ipc::IntBuilder::new(fbb);
-    index_builder.add_is_signed(true);
 
     match *index_type {
-        Int8 => index_builder.add_bitWidth(8),
-        Int16 => index_builder.add_bitWidth(16),
-        Int32 => index_builder.add_bitWidth(32),
-        Int64 => index_builder.add_bitWidth(64),
+        Int8 | Int16 | Int32 | Int64 => index_builder.add_is_signed(true),
+        UInt8 | UInt16 | UInt32 | UInt64 => index_builder.add_is_signed(false),
+        _ => {}
+    }
+
+    match *index_type {
+        Int8 | UInt8 => index_builder.add_bitWidth(8),
+        Int16 | UInt16 => index_builder.add_bitWidth(16),
+        Int32 | UInt32 => index_builder.add_bitWidth(32),
+        Int64 | UInt64 => index_builder.add_bitWidth(64),
         _ => {}
     }
 
     let index_builder = index_builder.finish();
 
     let mut builder = ipc::DictionaryEncodingBuilder::new(fbb);
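
Stripped of the flatbuffers builder, the two `match` blocks implement a simple (signedness, bit width) mapping per index type. A standalone sketch of that mapping (the function name is invented for illustration):

use arrow::datatypes::DataType;

/// Returns (is_signed, bit_width) for a dictionary index type, or None if the
/// type is not a valid integer index.
fn index_type_params(index_type: &DataType) -> Option<(bool, usize)> {
    use DataType::*;
    match index_type {
        Int8 => Some((true, 8)),
        Int16 => Some((true, 16)),
        Int32 => Some((true, 32)),
        Int64 => Some((true, 64)),
        UInt8 => Some((false, 8)),
        UInt16 => Some((false, 16)),
        UInt32 => Some((false, 32)),
        UInt64 => Some((false, 64)),
        _ => None,
    }
}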
@@ -773,6 +779,16 @@ mod tests {
                 123,
                 true,
             ),
+            Field::new_dict(
+                "dictionary<uint8, uint32>",
+                DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::UInt32),
+                ),
+                true,
+                123,
+                true,
+            ),
         ],
         md,
     );
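
The new test field exercises the unsigned-index path. For reference, a minimal sketch of building a dictionary array with unsigned keys (assuming the `arrow` crate; values are arbitrary):

use arrow::array::{Array, DictionaryArray};
use arrow::datatypes::UInt8Type;

fn main() {
    // Unsigned key types are valid dictionary indices too; the IPC schema
    // now records is_signed = false for them.
    let dict: DictionaryArray<UInt8Type> = vec!["x", "y", "x"].into_iter().collect();
    assert_eq!(dict.len(), 3);
}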