diff --git a/cpp/src/parquet/column/levels-test.cc b/cpp/src/parquet/column/levels-test.cc index cb7cc3863cdf6..126873c0e9877 100644 --- a/cpp/src/parquet/column/levels-test.cc +++ b/cpp/src/parquet/column/levels-test.cc @@ -206,4 +206,36 @@ TEST(TestLevelEncoder, MinimumBufferSize) { ASSERT_EQ(kNumToEncode, encode_count); } +TEST(TestLevelEncoder, MinimumBufferSize2) { + // PARQUET-708 + // Test the worst case for bit_width=2 consisting of + // LiteralRun(size=8) + // RepeatedRun(size=8) + // LiteralRun(size=8) + // ... + const int kNumToEncode = 1024; + + std::vector<int16_t> levels; + for (int i = 0; i < kNumToEncode; ++i) { + // This forces a literal run of 00000001 + // followed by eight 1s + if ((i % 16) < 7) { + levels.push_back(0); + } else { + levels.push_back(1); + } + } + + for (int bit_width = 1; bit_width <= 8; bit_width++) { + std::vector<uint8_t> output( + LevelEncoder::MaxBufferSize(Encoding::RLE, bit_width, kNumToEncode)); + + LevelEncoder encoder; + encoder.Init(Encoding::RLE, bit_width, kNumToEncode, output.data(), output.size()); + int encode_count = encoder.Encode(kNumToEncode, levels.data()); + + ASSERT_EQ(kNumToEncode, encode_count); + } +} + } // namespace parquet diff --git a/cpp/src/parquet/util/rle-encoding.h b/cpp/src/parquet/util/rle-encoding.h index 15fd5504ba17f..8fa1c416d23d3 100644 --- a/cpp/src/parquet/util/rle-encoding.h +++ b/cpp/src/parquet/util/rle-encoding.h @@ -171,8 +171,12 @@ class RleEncoder { /// Returns the maximum byte size it could take to encode 'num_values'. static int MaxBufferSize(int bit_width, int num_values) { - int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0); - int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN); + // For a bit_width > 1, the worst case is the repetition of "literal run of length 8 + // and then a repeated run of length 8".
+ // 8 values per smallest run, 8 bits per byte + // int bytes_per_run = BitUtil::Ceil(bit_width * 8, 8); + int bytes_per_run = bit_width; + int num_runs = BitUtil::Ceil(num_values, 8); int literal_max_size = num_runs + num_runs * bytes_per_run; // In the very worst case scenario, the data is a concatenation of repeated