Skip to content

Commit

Permalink
PARQUET-503: Reenable parquet 2.0 encoding implementations.
Browse files Browse the repository at this point in the history
Author: Nong Li <nongli@gmail.com>

Closes apache#35 from nongli/parquet-503 and squashes the following commits:

cb2a4e1 [Nong Li] PARQUET-503: Reenable parquet 2.0 encoding implementations.

Change-Id: Id3801ddb44164bcc63adc3ee83250d33c1d7e191
  • Loading branch information
nongli authored and julienledem committed Feb 2, 2016
1 parent 41eed7e commit 88e5e0e
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 29 deletions.
10 changes: 5 additions & 5 deletions cpp/src/parquet/encodings/delta-bit-pack-encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
using Decoder<TYPE>::num_values_;

void InitBlock() {
uint64_t block_size;
int32_t block_size;
if (!decoder_.GetVlqInt(&block_size)) ParquetException::EofException();
if (!decoder_.GetVlqInt(&num_mini_blocks_)) ParquetException::EofException();
if (!decoder_.GetVlqInt(&values_current_block_)) {
Expand Down Expand Up @@ -104,17 +104,17 @@ class DeltaBitPackDecoder : public Decoder<TYPE> {
}

BitReader decoder_;
uint64_t values_current_block_;
uint64_t num_mini_blocks_;
int32_t values_current_block_;
int32_t num_mini_blocks_;
uint64_t values_per_mini_block_;
uint64_t values_current_mini_block_;

int64_t min_delta_;
int32_t min_delta_;
int mini_block_idx_;
std::vector<uint8_t> delta_bit_widths_;
int delta_bit_width_;

int64_t last_value_;
int32_t last_value_;
};
} // namespace parquet_cpp

Expand Down
8 changes: 3 additions & 5 deletions cpp/src/parquet/encodings/encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,8 @@ class Encoder {

#include "parquet/encodings/plain-encoding.h"
#include "parquet/encodings/dictionary-encoding.h"

// The encoding tools changed and these are missing the ZigZag functions
// #include "parquet/encodings/delta-bit-pack-encoding.h"
// #include "parquet/encodings/delta-length-byte-array-encoding.h"
// #include "parquet/encodings/delta-byte-array-encoding.h"
#include "parquet/encodings/delta-bit-pack-encoding.h"
#include "parquet/encodings/delta-length-byte-array-encoding.h"
#include "parquet/encodings/delta-byte-array-encoding.h"

#endif // PARQUET_ENCODINGS_ENCODINGS_H
12 changes: 7 additions & 5 deletions cpp/src/parquet/util/bit-stream-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,10 @@ class BitWriter {
/// room. The value is written byte aligned.
/// For more details on vlq:
/// en.wikipedia.org/wiki/Variable-length_quantity
bool PutVlqInt(int32_t v);
bool PutVlqInt(uint32_t v);

/// Writes `v` zigzag encoded, as a vlq int.
bool PutZigZagVlqInt(int32_t v);

/// Get a pointer to the next aligned byte and advance the underlying buffer
/// by num_bytes.
Expand Down Expand Up @@ -135,17 +138,16 @@ class BitReader {
/// the buffer.
bool GetVlqInt(int32_t* v);

/// Reads a zigzag encoded int into `v`.
bool GetZigZagVlqInt(int32_t* v);

/// Number of bytes remaining in the stream. The byte currently being read is
/// counted as consumed, so a partially read byte may still hold unread bits.
int bytes_left() {
  // Round the bit offset up to whole bytes and add it to the byte offset.
  const auto consumed = byte_offset_ + BitUtil::Ceil(bit_offset_, 8);
  return max_bytes_ - consumed;
}

/// Maximum byte length of a vlq encoded int
static const int MAX_VLQ_BYTE_LEN = 5;

// TODO(nongli): implementations to be fixed given changes in Impala
// bool GetZigZagVlqInt(int64_t* v);
// bool PutZigZagVlqInt(int32_t v);

private:
const uint8_t* buffer_;
int max_bytes_;
Expand Down
26 changes: 12 additions & 14 deletions cpp/src/parquet/util/bit-stream-utils.inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) {
return true;
}

inline bool BitWriter::PutVlqInt(int32_t v) {
inline bool BitWriter::PutVlqInt(uint32_t v) {
bool result = true;
while ((v & 0xFFFFFF80) != 0L) {
result &= PutAligned<uint8_t>((v & 0x7F) | 0x80, 1);
Expand Down Expand Up @@ -152,20 +152,18 @@ inline bool BitReader::GetVlqInt(int32_t* v) {
return true;
}

// TODO(nongli): review/test these implementations given divergence in Impala
// functions

// inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
// uint32_t u = (v << 1) ^ (v >> 31);
// return PutVlqInt(u);
// }
/// Zigzag encodes `v` and writes the result with PutVlqInt.
///
/// Zigzag maps signed values to unsigned ones so that values of small
/// magnitude get small encodings: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
///
/// @param v the signed value to encode.
/// @return whatever PutVlqInt returns for the encoded value.
inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
  // Do the shift on the unsigned representation: left-shifting a negative
  // signed value (or one whose result overflows) is undefined behavior before
  // C++20, and right-shifting a negative value is implementation-defined.
  const uint32_t bits = static_cast<uint32_t>(v);
  // All ones when v is negative, zero otherwise.
  const uint32_t sign_mask = (v < 0) ? 0xFFFFFFFFu : 0u;
  return PutVlqInt((bits << 1) ^ sign_mask);
}

// inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
// uint64_t u;
// if (!GetVlqInt(&u)) return false;
// *reinterpret_cast<uint64_t*>(v) = (u >> 1) ^ -(u & 1);
// return true;
// }
/// Reads a vlq int with GetVlqInt and undoes the zigzag mapping.
///
/// @param v receives the decoded value; it is not written when the read fails.
/// @return false if GetVlqInt fails, true otherwise.
inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
  int32_t raw;
  if (GetVlqInt(&raw)) {
    const uint32_t encoded = static_cast<uint32_t>(raw);
    // The low bit selects the sign: negating it (unsigned) yields all ones
    // for an odd encoding and zero for an even one.
    const uint32_t sign_mask = -(encoded & 1);
    uint32_t* const out = reinterpret_cast<uint32_t*>(v);
    *out = (encoded >> 1) ^ sign_mask;
    return true;
  }
  return false;
}

} // namespace parquet_cpp

Expand Down
19 changes: 19 additions & 0 deletions cpp/src/parquet/util/bit-util-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <gtest/gtest.h>

#include "parquet/util/bit-util.h"
#include "parquet/util/bit-stream-utils.inline.h"
#include "parquet/util/cpu-info.h"

namespace parquet_cpp {
Expand Down Expand Up @@ -161,4 +162,22 @@ TEST(BitUtil, RoundUpDown) {
EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1);
}

void TestZigZag(int32_t v) {
uint8_t buffer[BitReader::MAX_VLQ_BYTE_LEN];
BitWriter writer(buffer, sizeof(buffer));
BitReader reader(buffer, sizeof(buffer));
writer.PutZigZagVlqInt(v);
int32_t result;
EXPECT_TRUE(reader.GetZigZagVlqInt(&result));
EXPECT_EQ(v, result);
}

// Round-trips zero, small values of both signs, and the extremes of int32_t
// through the zigzag vlq writer and reader.
TEST(BitStreamUtil, ZigZag) {
  TestZigZag(0);
  TestZigZag(1);
  TestZigZag(-1);
  TestZigZag(std::numeric_limits<int32_t>::max());
  TestZigZag(-std::numeric_limits<int32_t>::max());
  // min() is not -max(): it has no positive counterpart and its zigzag
  // encoding is all ones, so exercise it explicitly.
  TestZigZag(std::numeric_limits<int32_t>::min());
}

} // namespace parquet_cpp

0 comments on commit 88e5e0e

Please sign in to comment.