ARROW-8426: [Rust] [Parquet] - Add more support for converting Dicts
This adds more support for:

- When converting Arrow -> Parquet and the data contains an Arrow Dictionary,
materialize the dictionary values and send them to Parquet, which dictionary-encodes
them (or not) according to its own writer settings. Converting an Arrow Dictionary
directly to Parquet DictEncoding is deliberately not supported, and right now only
String dictionaries are handled.
- When converting Parquet -> Arrow, notice when the Arrow schema metadata stored
in a Parquet file has a Dictionary type and convert the data back to an Arrow
dictionary (again, String dictionaries only for now). A round-trip sketch follows this list.
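
As a rough illustration of the intended round trip (not code from this commit), here is a minimal sketch. It assumes an arrow/parquet release where `ArrowWriter`, `SerializedFileReader`, and `ParquetFileArrowReader` exist under these paths (the modules have moved between releases); the file name and values are arbitrary:

use std::{fs::File, sync::Arc};

use arrow::array::DictionaryArray;
use arrow::datatypes::{DataType, Field, Int32Type, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader};
use parquet::file::reader::SerializedFileReader;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A String dictionary array: "a" and "c" are stored once, keys index into them.
    let dict: DictionaryArray<Int32Type> = vec!["a", "c", "a"].into_iter().collect();

    let field = Field::new_dict(
        "d",
        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
        false, // nullable
        0,     // dict_id
        false, // dict_is_ordered
    );
    let schema = Arc::new(Schema::new(vec![field]));
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict)])?;

    // Arrow -> Parquet: the dictionary values are materialized; whether the column
    // is dictionary-encoded on disk is up to the writer's own properties.
    let mut writer = ArrowWriter::try_new(File::create("dict.parquet")?, schema, None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Parquet -> Arrow: the Arrow schema stored in the file metadata records the
    // Dictionary type, so the reader rebuilds a DictionaryArray.
    let file = SerializedFileReader::new(File::open("dict.parquet")?)?;
    let mut reader = ParquetFileArrowReader::new(Arc::new(file));
    let read_back = reader.get_record_reader(1024)?.next().unwrap()?;
    assert_eq!(batch.schema(), read_back.schema());
    Ok(())
}

The key point is that the Parquet writer only ever sees plain String values, while the reader relies on the embedded Arrow schema to restore the Dictionary type.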

I'm not sure if this is in a good enough state to merge yet, please let me know @nevi-me!

Closes apache#8402 from carols10cents/dict

Lead-authored-by: Carol (Nichols || Goulding) <carol.nichols@gmail.com>
Co-authored-by: Neville Dipale <nevilledips@gmail.com>
Co-authored-by: Jake Goulding <jake.goulding@gmail.com>
Signed-off-by: Neville Dipale <nevilledips@gmail.com>
3 people authored and GeorgeAp committed Jun 7, 2021
1 parent fe5fcff commit d6e3ff8
Showing 6 changed files with 615 additions and 401 deletions.
57 changes: 56 additions & 1 deletion rust/arrow/src/array/data.rs
@@ -29,7 +29,7 @@ use crate::util::bit_util;
 /// A generic representation of Arrow array data which encapsulates common attributes and
 /// operations for Arrow arrays. Specific operations for different array types (e.g.,
 /// primitive, list, struct) are implemented in `Array`.
-#[derive(PartialEq, Debug, Clone)]
+#[derive(Debug, Clone)]
 pub struct ArrayData {
     /// The data type for this array data
     data_type: DataType,
@@ -209,6 +209,61 @@ impl ArrayData {
     }
 }
 
+impl PartialEq for ArrayData {
+    fn eq(&self, other: &Self) -> bool {
+        assert_eq!(
+            self.data_type(),
+            other.data_type(),
+            "Data types not the same"
+        );
+        assert_eq!(self.len(), other.len(), "Lengths not the same");
+        // TODO: when adding tests for this, test that we can compare with arrays that have offsets
+        assert_eq!(self.offset(), other.offset(), "Offsets not the same");
+        assert_eq!(self.null_count(), other.null_count());
+        // compare buffers excluding padding
+        let self_buffers = self.buffers();
+        let other_buffers = other.buffers();
+        assert_eq!(self_buffers.len(), other_buffers.len());
+        self_buffers.iter().zip(other_buffers).for_each(|(s, o)| {
+            compare_buffer_regions(
+                s,
+                self.offset(), // TODO mul by data length
+                o,
+                other.offset(), // TODO mul by data len
+            );
+        });
+        // assert_eq!(self.buffers(), other.buffers());
+
+        assert_eq!(self.child_data(), other.child_data());
+        // arrays without nulls can skip the null bitmap, so only compare the bitmaps if there are nulls
+        if self.null_count() != 0 || other.null_count() != 0 {
+            compare_buffer_regions(
+                self.null_buffer().unwrap(),
+                self.offset(),
+                other.null_buffer().unwrap(),
+                other.offset(),
+            )
+        }
+        true
+    }
+}
+
+/// A helper to compare regions of two buffers.
+/// Compares up to the length of the shorter buffer.
+fn compare_buffer_regions(
+    left: &Buffer,
+    left_offset: usize,
+    right: &Buffer,
+    right_offset: usize,
+) {
+    // for convenience, assume the buffer lengths only differ when one has padding,
+    // so take the shorter length and discard the padding from the longer buffer
+    let shorter_len = left.len().min(right.len());
+    let s_sliced = left.bit_slice(left_offset, shorter_len);
+    let o_sliced = right.bit_slice(right_offset, shorter_len);
+    assert_eq!(s_sliced, o_sliced);
+}
+
 /// Builder for `ArrayData` type
 #[derive(Debug)]
 pub struct ArrayDataBuilder {
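
The manual `PartialEq` above compares buffer regions instead of whole buffers because logically equal arrays need not match byte-for-byte once offsets or padding are involved. A small sketch of the failure mode it guards against (assuming the `arrow` crate; values are arbitrary):

use arrow::array::{Array, Int32Array};

fn main() {
    let parent = Int32Array::from(vec![0, 1, 2, 3]);
    let sliced = parent.slice(1, 2); // logically [1, 2], but shares the parent's buffer
    let fresh = Int32Array::from(vec![1, 2]); // logically [1, 2], in its own buffer

    // Both hold the same logical values, yet `sliced`'s underlying buffer still
    // contains [0, 1, 2, 3] at a non-zero offset; naive buffer equality would
    // report them as different.
    assert_eq!(sliced.len(), fresh.len());
}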
30 changes: 23 additions & 7 deletions rust/arrow/src/ipc/convert.rs
@@ -641,17 +641,23 @@ pub(crate) fn get_fb_dictionary<'a: 'b, 'b>(
     fbb: &mut FlatBufferBuilder<'a>,
 ) -> WIPOffset<ipc::DictionaryEncoding<'b>> {
     // We assume that the dictionary index type (as an integer) has already been
-    // validated elsewhere, and can safely assume we are dealing with signed
-    // integers
+    // validated elsewhere, and can safely assume we are dealing with integers
     let mut index_builder = ipc::IntBuilder::new(fbb);
-    index_builder.add_is_signed(true);
 
     match *index_type {
-        Int8 => index_builder.add_bitWidth(8),
-        Int16 => index_builder.add_bitWidth(16),
-        Int32 => index_builder.add_bitWidth(32),
-        Int64 => index_builder.add_bitWidth(64),
+        Int8 | Int16 | Int32 | Int64 => index_builder.add_is_signed(true),
+        UInt8 | UInt16 | UInt32 | UInt64 => index_builder.add_is_signed(false),
+        _ => {}
+    }
+
+    match *index_type {
+        Int8 | UInt8 => index_builder.add_bitWidth(8),
+        Int16 | UInt16 => index_builder.add_bitWidth(16),
+        Int32 | UInt32 => index_builder.add_bitWidth(32),
+        Int64 | UInt64 => index_builder.add_bitWidth(64),
         _ => {}
     }
 
     let index_builder = index_builder.finish();
 
     let mut builder = ipc::DictionaryEncodingBuilder::new(fbb);
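
Stripped of the flatbuffers builder, the two `match` blocks implement a simple (signedness, bit width) mapping per index type. A standalone sketch of that mapping (the function name is invented for illustration):

use arrow::datatypes::DataType;

/// Returns (is_signed, bit_width) for a dictionary index type, or None if the
/// type is not a valid integer index.
fn index_type_params(index_type: &DataType) -> Option<(bool, usize)> {
    use DataType::*;
    match index_type {
        Int8 => Some((true, 8)),
        Int16 => Some((true, 16)),
        Int32 => Some((true, 32)),
        Int64 => Some((true, 64)),
        UInt8 => Some((false, 8)),
        UInt16 => Some((false, 16)),
        UInt32 => Some((false, 32)),
        UInt64 => Some((false, 64)),
        _ => None,
    }
}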
@@ -773,6 +779,16 @@ mod tests {
                 123,
                 true,
             ),
+            Field::new_dict(
+                "dictionary<uint8, uint32>",
+                DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::UInt32),
+                ),
+                true,
+                123,
+                true,
+            ),
         ],
         md,
     );
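
The new test field exercises the unsigned-index path. For reference, a minimal sketch of building a dictionary array with unsigned keys (assuming the `arrow` crate; values are arbitrary):

use arrow::array::{Array, DictionaryArray};
use arrow::datatypes::UInt8Type;

fn main() {
    // Unsigned key types are valid dictionary indices too; the IPC schema
    // now records is_signed = false for them.
    let dict: DictionaryArray<UInt8Type> = vec!["x", "y", "x"].into_iter().collect();
    assert_eq!(dict.len(), 3);
}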