Skip to content

Commit

Permalink
PARQUET-446: Hide Thrift compiled headers and Boost from public API, …
Browse files Browse the repository at this point in the history
…#include scrubbing

This completes the work I started in PARQUET-442. It also resolves PARQUET-277, since no Boost headers are included in the public API anymore.

I've also scrubbed the #includes using Google's Clang-based include-what-you-use (IWYU) tool. PARQUET-522 can also be resolved once this is merged.

Author: Wes McKinney <wes@cloudera.com>

Closes apache#49 from wesm/PARQUET-446 and squashes the following commits:

e805a0c [Wes McKinney] Use int64_t for scanner batch sizes
503b1c1 [Wes McKinney] Fix mixed-up include guard names
4c02d2b [Wes McKinney] Refactor monolithic encodings/encodings.h
9e28fc3 [Wes McKinney] Finished IWYU pass. Some imported Impala code left unchanged for now
6d4af8e [Wes McKinney] Some initial IWYU
5be40d6 [Wes McKinney] Remove outdated TODO
2e39062 [Wes McKinney] Remove any boost #include dependencies
07059ca [Wes McKinney] Remove serialized-page.* files, move serialized-page-test to parquet/file
9458b36 [Wes McKinney] Add more headers to parquet.h public API
b4b0412 [Wes McKinney] Remove Thrift compiled headers from the public API and from general use outside of deserialization-related internal headers and code paths. Add a unit test to enforce this

Change-Id: Ib12b600cf6318e9da514b39fd8db2225bd6c9e09
  • Loading branch information
wesm authored and julienledem committed Feb 15, 2016
1 parent 7c6e60d commit 52fe079
Show file tree
Hide file tree
Showing 53 changed files with 753 additions and 597 deletions.
1 change: 1 addition & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ install(FILES
types.h
DESTINATION include/parquet)

ADD_PARQUET_TEST(public-api-test)
ADD_PARQUET_TEST(reader-test)
2 changes: 0 additions & 2 deletions cpp/src/parquet/column/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,8 @@ install(FILES
page.h
levels.h
reader.h
serialized-page.h
scanner.h
DESTINATION include/parquet/column)

ADD_PARQUET_TEST(column-reader-test)
ADD_PARQUET_TEST(levels-test)
ADD_PARQUET_TEST(serialized-page-test)
10 changes: 4 additions & 6 deletions cpp/src/parquet/column/column-reader-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
// specific language governing permissions and limitations
// under the License.

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <memory>
#include <string>
#include <vector>

Expand All @@ -28,15 +28,13 @@
#include "parquet/column/page.h"
#include "parquet/column/reader.h"
#include "parquet/column/test-util.h"

#include "parquet/util/output.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema/types.h"
#include "parquet/util/test-common.h"

using std::string;
using std::vector;
using std::shared_ptr;
using parquet::FieldRepetitionType;
using parquet::SchemaElement;

namespace parquet_cpp {

Expand Down
124 changes: 60 additions & 64 deletions cpp/src/parquet/column/levels-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,98 +15,94 @@
// specific language governing permissions and limitations
// under the License.

#include <cstdlib>
#include <iostream>
#include <sstream>
#include <cstdint>
#include <string>
#include <vector>

#include <gtest/gtest.h>

#include "parquet/thrift/parquet_types.h"
#include "parquet/column/levels.h"
#include "parquet/types.h"

using std::string;

namespace parquet_cpp {

class TestLevels : public ::testing::Test {
public:
int GenerateLevels(int min_repeat_factor, int max_repeat_factor,
int max_level, std::vector<int16_t>& input_levels) {
int total_count = 0;
// for each repetition count up to max_repeat_factor
for (int repeat = min_repeat_factor; repeat <= max_repeat_factor; repeat++) {
// repeat count increases by a factor of 2 on every iteration
int repeat_count = (1 << repeat);
// generate levels for repetition count up to the maximum level
int value = 0;
int bwidth = 0;
while (value <= max_level) {
for (int i = 0; i < repeat_count; i++) {
input_levels[total_count++] = value;
}
value = (2 << bwidth) - 1;
bwidth++;
int GenerateLevels(int min_repeat_factor, int max_repeat_factor,
int max_level, std::vector<int16_t>& input_levels) {
int total_count = 0;
// for each repetition count up to max_repeat_factor
for (int repeat = min_repeat_factor; repeat <= max_repeat_factor; repeat++) {
// repeat count increases by a factor of 2 on every iteration
int repeat_count = (1 << repeat);
// generate levels for repetition count up to the maximum level
int value = 0;
int bwidth = 0;
while (value <= max_level) {
for (int i = 0; i < repeat_count; i++) {
input_levels[total_count++] = value;
}
value = (2 << bwidth) - 1;
bwidth++;
}
return total_count;
}
return total_count;
}

void VerifyLevelsEncoding(parquet::Encoding::type encoding, int max_level,
std::vector<int16_t>& input_levels) {
LevelEncoder encoder;
LevelDecoder decoder;
int levels_count = 0;
std::vector<int16_t> output_levels;
std::vector<uint8_t> bytes;
int num_levels = input_levels.size();
output_levels.resize(num_levels);
bytes.resize(2 * num_levels);
ASSERT_EQ(num_levels, output_levels.size());
ASSERT_EQ(2 * num_levels, bytes.size());
// start encoding and decoding
if (encoding == parquet::Encoding::RLE) {
// leave space to write the rle length value
encoder.Init(encoding, max_level, num_levels,
bytes.data() + sizeof(uint32_t), bytes.size());

levels_count = encoder.Encode(num_levels, input_levels.data());
(reinterpret_cast<uint32_t*>(bytes.data()))[0] = encoder.len();

} else {
encoder.Init(encoding, max_level, num_levels,
bytes.data(), bytes.size());
levels_count = encoder.Encode(num_levels, input_levels.data());
}
void VerifyLevelsEncoding(Encoding::type encoding, int max_level,
std::vector<int16_t>& input_levels) {
LevelEncoder encoder;
LevelDecoder decoder;
int levels_count = 0;
std::vector<int16_t> output_levels;
std::vector<uint8_t> bytes;
int num_levels = input_levels.size();
output_levels.resize(num_levels);
bytes.resize(2 * num_levels);
ASSERT_EQ(num_levels, output_levels.size());
ASSERT_EQ(2 * num_levels, bytes.size());
// start encoding and decoding
if (encoding == Encoding::RLE) {
// leave space to write the rle length value
encoder.Init(encoding, max_level, num_levels,
bytes.data() + sizeof(uint32_t), bytes.size());

levels_count = encoder.Encode(num_levels, input_levels.data());
(reinterpret_cast<uint32_t*>(bytes.data()))[0] = encoder.len();

} else {
encoder.Init(encoding, max_level, num_levels,
bytes.data(), bytes.size());
levels_count = encoder.Encode(num_levels, input_levels.data());
}

ASSERT_EQ(num_levels, levels_count);
ASSERT_EQ(num_levels, levels_count);

decoder.Init(encoding, max_level, num_levels, bytes.data());
levels_count = decoder.Decode(num_levels, output_levels.data());
decoder.Init(encoding, max_level, num_levels, bytes.data());
levels_count = decoder.Decode(num_levels, output_levels.data());

ASSERT_EQ(num_levels, levels_count);
ASSERT_EQ(num_levels, levels_count);

for (int i = 0; i < num_levels; i++) {
EXPECT_EQ(input_levels[i], output_levels[i]);
}
for (int i = 0; i < num_levels; i++) {
EXPECT_EQ(input_levels[i], output_levels[i]);
}
};
}

TEST(TestLevels, TestEncodeDecodeLevels) {
// test levels with maximum bit-width from 1 to 8
// increase the repetition count for each iteration by a factor of 2

// test levels with maximum bit-width from 1 to 8
// increase the repetition count for each iteration by a factor of 2
TEST_F(TestLevels, TestEncodeDecodeLevels) {
int min_repeat_factor = 0;
int max_repeat_factor = 7; // 128
int max_bit_width = 8;
std::vector<int16_t> input_levels;
parquet::Encoding::type encodings[2] = {parquet::Encoding::RLE,
parquet::Encoding::BIT_PACKED};
Encoding::type encodings[2] = {Encoding::RLE, Encoding::BIT_PACKED};

// for each encoding
for (int encode = 0; encode < 2; encode++) {
parquet::Encoding::type encoding = encodings[encode];
Encoding::type encoding = encodings[encode];
// BIT_PACKED requires a sequence of at least 8
if (encoding == parquet::Encoding::BIT_PACKED) min_repeat_factor = 3;
if (encoding == Encoding::BIT_PACKED) min_repeat_factor = 3;

// for each maximum bit-width
for (int bit_width = 1; bit_width <= max_bit_width; bit_width++) {
Expand Down
29 changes: 16 additions & 13 deletions cpp/src/parquet/column/levels.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
#ifndef PARQUET_COLUMN_LEVELS_H
#define PARQUET_COLUMN_LEVELS_H

#include <memory>

#include "parquet/exception.h"
#include "parquet/thrift/parquet_types.h"
#include "parquet/encodings/encodings.h"
#include "parquet/types.h"
#include "parquet/util/rle-encoding.h"

namespace parquet_cpp {
Expand All @@ -30,16 +31,16 @@ class LevelEncoder {
LevelEncoder() {}

// Initialize the LevelEncoder.
void Init(parquet::Encoding::type encoding, int16_t max_level,
void Init(Encoding::type encoding, int16_t max_level,
int num_buffered_values, uint8_t* data, int data_size) {
bit_width_ = BitUtil::Log2(max_level + 1);
encoding_ = encoding;
switch (encoding) {
case parquet::Encoding::RLE: {
case Encoding::RLE: {
rle_encoder_.reset(new RleEncoder(data, data_size, bit_width_));
break;
}
case parquet::Encoding::BIT_PACKED: {
case Encoding::BIT_PACKED: {
int num_bytes = BitUtil::Ceil(num_buffered_values * bit_width_, 8);
bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
break;
Expand All @@ -56,7 +57,7 @@ class LevelEncoder {
throw ParquetException("Level encoders are not initialized.");
}

if (encoding_ == parquet::Encoding::RLE) {
if (encoding_ == Encoding::RLE) {
for (size_t i = 0; i < batch_size; ++i) {
if (!rle_encoder_->Put(*(levels + i))) {
break;
Expand All @@ -78,14 +79,16 @@ class LevelEncoder {
}

int32_t len() {
assert(encoding_ == parquet::Encoding::RLE);
if (encoding_ != Encoding::RLE) {
throw ParquetException("Only implemented for RLE encoding");
}
return rle_length_;
}

private:
int bit_width_;
int rle_length_;
parquet::Encoding::type encoding_;
Encoding::type encoding_;
std::unique_ptr<RleEncoder> rle_encoder_;
std::unique_ptr<BitWriter> bit_packed_encoder_;
};
Expand All @@ -96,20 +99,20 @@ class LevelDecoder {
LevelDecoder() {}

// Initialize the LevelDecoder and return the number of bytes consumed
size_t Init(parquet::Encoding::type encoding, int16_t max_level,
size_t Init(Encoding::type encoding, int16_t max_level,
int num_buffered_values, const uint8_t* data) {
uint32_t num_bytes = 0;
uint32_t total_bytes = 0;
bit_width_ = BitUtil::Log2(max_level + 1);
encoding_ = encoding;
switch (encoding) {
case parquet::Encoding::RLE: {
case Encoding::RLE: {
num_bytes = *reinterpret_cast<const uint32_t*>(data);
const uint8_t* decoder_data = data + sizeof(uint32_t);
rle_decoder_.reset(new RleDecoder(decoder_data, num_bytes, bit_width_));
return sizeof(uint32_t) + num_bytes;
}
case parquet::Encoding::BIT_PACKED: {
case Encoding::BIT_PACKED: {
num_bytes = BitUtil::Ceil(num_buffered_values * bit_width_, 8);
bit_packed_decoder_.reset(new BitReader(data, num_bytes));
return num_bytes;
Expand All @@ -127,7 +130,7 @@ class LevelDecoder {
throw ParquetException("Level decoders are not initialized.");
}

if (encoding_ == parquet::Encoding::RLE) {
if (encoding_ == Encoding::RLE) {
for (size_t i = 0; i < batch_size; ++i) {
if (!rle_decoder_->Get(levels + i)) {
break;
Expand All @@ -147,7 +150,7 @@ class LevelDecoder {

private:
int bit_width_;
parquet::Encoding::type encoding_;
Encoding::type encoding_;
std::unique_ptr<RleDecoder> rle_decoder_;
std::unique_ptr<BitReader> bit_packed_decoder_;
};
Expand Down
Loading

0 comments on commit 52fe079

Please sign in to comment.