Skip to content

Commit

Permalink
PARQUET-708: account for "worst case scenario" in MaxBufferSize for b…
Browse files Browse the repository at this point in the history
…it_width > 1

Author: Uwe L. Korn <uwelk@xhochy.com>

Closes apache#155 from xhochy/parquet-708 and squashes the following commits:

a537717 [Uwe L. Korn] Test for more bit widths
f346435 [Uwe L. Korn] PARQUET-708: account for "worst case scenario" in MaxBufferSize for bit_width > 1

Change-Id: I687aa41b8c589de3daf79236a64d7b4c99c39929
  • Loading branch information
xhochy authored and wesm committed Sep 6, 2016
1 parent 1db05b2 commit 9367102
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 2 deletions.
32 changes: 32 additions & 0 deletions cpp/src/parquet/column/levels-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,36 @@ TEST(TestLevelEncoder, MinimumBufferSize) {
ASSERT_EQ(kNumToEncode, encode_count);
}

TEST(TestLevelEncoder, MinimumBufferSize2) {
  // PARQUET-708
  // Checks that a buffer sized by LevelEncoder::MaxBufferSize is large enough
  // to encode every level when the input is meant to alternate between short
  // literal and repeated runs:
  //   LiteralRun(size=8)
  //   RepeatedRun(size=8)
  //   LiteralRun(size=8)
  //   ...
  // The same level sequence is encoded once for each bit width from 1 to 8.
  const int kNumToEncode = 1024;

  // Every group of 16 levels is seven 0s followed by nine 1s. The first eight
  // values (00000001) are intended to force a literal run, and the remaining
  // eight 1s a repeated run.
  std::vector<int16_t> levels;
  for (int idx = 0; idx < kNumToEncode; ++idx) {
    const bool in_zero_prefix = (idx % 16) < 7;
    levels.push_back(in_zero_prefix ? 0 : 1);
  }

  for (int bit_width = 1; bit_width <= 8; ++bit_width) {
    // Allocate exactly the advertised maximum, so an underestimate shows up
    // as fewer values encoded than requested.
    const auto max_buffer_size =
        LevelEncoder::MaxBufferSize(Encoding::RLE, bit_width, kNumToEncode);
    std::vector<uint8_t> output(max_buffer_size);

    LevelEncoder encoder;
    encoder.Init(Encoding::RLE, bit_width, kNumToEncode, output.data(), output.size());

    int encode_count = encoder.Encode(kNumToEncode, levels.data());
    ASSERT_EQ(kNumToEncode, encode_count);
  }
}

} // namespace parquet
8 changes: 6 additions & 2 deletions cpp/src/parquet/util/rle-encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,12 @@ class RleEncoder {

/// Returns the maximum byte size it could take to encode 'num_values'.
static int MaxBufferSize(int bit_width, int num_values) {
int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0);
int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN);
// For a bit_width > 1, the worst case is the repetition of "literal run of length 8
// and then a repeated run of length 8".
// 8 values per smallest run, 8 bits per byte
// int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8);
int bytes_per_run = bit_width;
int num_runs = BitUtil::Ceil(num_values, 8);
int literal_max_size = num_runs + num_runs * bytes_per_run;

// In the very worst case scenario, the data is a concatenation of repeated
Expand Down

0 comments on commit 9367102

Please sign in to comment.