Skip to content

Commit

Permalink
Parquet: fix and test inconsistent Boolean encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Aug 1, 2023
1 parent 6b1c723 commit c0cd7e0
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 2 deletions.
5 changes: 4 additions & 1 deletion cpp/src/parquet/encoding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEnco
throw ParquetException("direct put to boolean from " + values.type()->ToString() +
" not supported");
}
// Put arrow array cannot mix with PlainEncoder<BooleanType>::PutImpl.
DCHECK_EQ(0, bit_writer_.bytes_written());

const auto& data = checked_cast<const ::arrow::BooleanArray&>(values);
if (data.null_count() == 0) {
Expand All @@ -354,6 +356,7 @@ class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEnco
sink_.length(), n_valid);

for (int64_t i = 0; i < data.length(); i++) {
// Only valid boolean data will call `writer.Next`.
if (data.IsValid(i)) {
if (data.Value(i)) {
writer.Set();
Expand All @@ -365,7 +368,7 @@ class PlainEncoder<BooleanType> : public EncoderImpl, virtual public BooleanEnco
}
writer.Finish();
}
sink_.UnsafeAdvance(data.length());
sink_.UnsafeAdvance(data.length() - data.null_count());
}

private:
Expand Down
35 changes: 34 additions & 1 deletion cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) {
decoder->SetData(num_values, buf->data(), static_cast<int>(buf->size()));

typename EncodingTraits<ByteArrayType>::Accumulator acc;
acc.builder.reset(new ::arrow::StringBuilder);
acc.builder = std::make_unique<::arrow::StringBuilder>();
ASSERT_EQ(num_values,
decoder->DecodeArrow(static_cast<int>(values->length()),
static_cast<int>(values->null_count()),
Expand Down Expand Up @@ -665,6 +665,33 @@ class EncodingAdHocTyped : public ::testing::Test {
::arrow::AssertArraysEqual(*values, *result, /*verbose=*/true);
}

void PlainTwice(int seed) {
auto values_single = GetValues(seed);
auto encoder = MakeTypedEncoder<ParquetType>(
Encoding::PLAIN, /*use_dictionary=*/false, column_descr());
auto decoder = MakeTypedDecoder<ParquetType>(Encoding::PLAIN, column_descr());

ASSERT_NO_THROW(encoder->Put(*values_single));
ASSERT_NO_THROW(encoder->Put(*values_single));
auto buf = encoder->FlushValues();

EXPECT_OK_AND_ASSIGN(auto values,
::arrow::Concatenate({values_single, values_single}));
decoder->SetData(static_cast<int>(values->length()), buf->data(),
static_cast<int>(buf->size()));

BuilderType acc(arrow_type(), ::arrow::default_memory_pool());
ASSERT_EQ(values->length() - values->null_count(),
decoder->DecodeArrow(static_cast<int>(values->length()),
static_cast<int>(values->null_count()),
values->null_bitmap_data(), values->offset(), &acc));

std::shared_ptr<::arrow::Array> result;
ASSERT_OK(acc.Finish(&result));
ASSERT_EQ(100, result->length());
::arrow::AssertArraysEqual(*values, *result, /*verbose=*/true);
}

void ByteStreamSplit(int seed) {
if (!std::is_same<ParquetType, FloatType>::value &&
!std::is_same<ParquetType, DoubleType>::value) {
Expand Down Expand Up @@ -882,6 +909,12 @@ TYPED_TEST(EncodingAdHocTyped, PlainArrowDirectPut) {
}
}

TYPED_TEST(EncodingAdHocTyped, PlainArrowDirectPut2) {
for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
this->PlainTwice(seed);
}
}

TYPED_TEST(EncodingAdHocTyped, ByteStreamSplitArrowDirectPut) {
for (auto seed : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}) {
this->ByteStreamSplit(seed);
Expand Down

0 comments on commit c0cd7e0

Please sign in to comment.