Skip to content
This repository has been archived by the owner on Feb 18, 2024. It is now read-only.

Commit

Permalink
Added support to read dict-encoded required primitive types from parq…
Browse files Browse the repository at this point in the history
…uet (#402)
  • Loading branch information
Dandandan authored Sep 13, 2021
1 parent 227ab3b commit ce3b0e9
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 0 deletions.
36 changes: 36 additions & 0 deletions src/io/parquet/read/primitive/basic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,32 @@ fn read_dict_buffer_optional<T, A, F>(
}
}

/// Decodes a dictionary-encoded page of a required (non-nullable) primitive column.
///
/// `indices_buffer` starts with a single byte holding the bit width of the
/// dictionary indices, followed by the hybrid RLE/bit-packed indices themselves.
/// Every decoded index is looked up in the values of `dict`, converted with `op`,
/// and appended to `values`. Afterwards `validity` is extended with `additional`
/// entries set to `true`, since a required column has no nulls.
///
/// # Panics
///
/// Panics if `indices_buffer` is empty, or if a decoded index falls outside the
/// dictionary values.
fn read_dict_buffer_required<T, A, F>(
    indices_buffer: &[u8],
    additional: usize,
    dict: &PrimitivePageDict<T>,
    values: &mut MutableBuffer<A>,
    validity: &mut MutableBitmap,
    op: F,
) where
    T: NativeType,
    A: ArrowNativeType,
    F: Fn(T) -> A,
{
    let lookup = dict.values();

    // SPEC: Data page format: the bit width used to encode the entry ids stored as 1 byte (max bit width = 32),
    // SPEC: followed by the values encoded using RLE/Bit packed described above (with the given bit width).
    let num_bits = u32::from(indices_buffer[0]);
    let encoded = &indices_buffer[1..];

    // The decoder is handed `additional` as its length argument.
    let decoder = hybrid_rle::HybridRleDecoder::new(encoded, num_bits, additional);

    // Resolve each index against the dictionary and convert it to the output type.
    let converted = decoder.map(|id| op(lookup[id as usize]));
    values.extend(converted);

    // Required column: every slot is valid.
    validity.extend_constant(additional, true);
}

fn read_nullable<T, A, F>(
validity_buffer: &[u8],
values_buffer: &[u8],
Expand Down Expand Up @@ -170,6 +196,16 @@ where
op,
)
}
(Encoding::PlainDictionary | Encoding::RleDictionary, Some(dict), false) => {
read_dict_buffer_required(
values_buffer,
additional,
dict.as_any().downcast_ref().unwrap(),
values,
validity,
op,
)
}
// it can happen that there is a dictionary but the encoding is plain because
// it fell back.
(Encoding::Plain, _, true) => read_nullable(
Expand Down
10 changes: 10 additions & 0 deletions tests/it/io/parquet/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,16 @@ fn v1_int64_nullable_dict() -> Result<()> {
test_pyarrow_integration(0, 1, "basic", true, false)
}

/// Runs the pyarrow integration check for the dictionary-encoded, required
/// int64 case of the "basic" fixture using version 2.
#[test]
fn v2_int64_required_dict() -> Result<()> {
    // NOTE(review): local names are inferred from the sibling tests' names and
    // arguments; confirm against the `test_pyarrow_integration` signature.
    let column = 0;
    let version = 2;
    let use_dict = true;
    let required = true;
    test_pyarrow_integration(column, version, "basic", use_dict, required)
}

/// Runs the pyarrow integration check for the dictionary-encoded, required
/// int64 case of the "basic" fixture using version 1.
#[test]
fn v1_int64_required_dict() -> Result<()> {
    // NOTE(review): local names are inferred from the sibling tests' names and
    // arguments; confirm against the `test_pyarrow_integration` signature.
    let column = 0;
    let version = 1;
    let use_dict = true;
    let required = true;
    test_pyarrow_integration(column, version, "basic", use_dict, required)
}

#[test]
fn v2_utf8_nullable() -> Result<()> {
test_pyarrow_integration(2, 2, "basic", false, false)
Expand Down

0 comments on commit ce3b0e9

Please sign in to comment.