Skip to content

Commit

Permalink
PARQUET-769: Add support for Brotli compression
Browse files Browse the repository at this point in the history
Author: Uwe L. Korn <uwelk@xhochy.com>

Closes apache#194 from xhochy/PARQUET-769 and squashes the following commits:

aad390f [Uwe L. Korn] Pass buffer sizes also as in parameter
9847171 [Uwe L. Korn] make format
855250d [Uwe L. Korn] make format
40e93de [Uwe L. Korn] Add FindBrotli
47b9d03 [Uwe L. Korn] PARQUET-769: Add support for Brotli compression

Change-Id: I2f6b1c03f0b7e7d83f64d0e34859eb09f2702838
  • Loading branch information
xhochy authored and wesm committed Nov 26, 2016
1 parent 187bedb commit 776e399
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 2 deletions.
10 changes: 10 additions & 0 deletions cpp/src/parquet/column/column-writer-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,11 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithSnappyCompression) {
Encoding::PLAIN, Compression::SNAPPY, false, false, LARGE_SIZE);
}

// Runs the shared required-column writer check with PLAIN encoding and the
// Brotli codec. Both boolean switches are off.
// NOTE(review): the meaning of the two booleans is defined by
// TestRequiredWithSettings, which is not visible here -- confirm there.
TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithBrotliCompression) {
  const auto encoding = Encoding::PLAIN;
  const auto codec = Compression::BROTLI;
  this->TestRequiredWithSettings(encoding, codec, false, false, LARGE_SIZE);
}

TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithGzipCompression) {
this->TestRequiredWithSettings(
Encoding::PLAIN, Compression::GZIP, false, false, LARGE_SIZE);
Expand All @@ -274,6 +279,11 @@ TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndSnappyCompression) {
Encoding::PLAIN, Compression::SNAPPY, false, true, LARGE_SIZE);
}

// Runs the shared required-column writer check with PLAIN encoding and the
// Brotli codec, with the second boolean switch turned on.
// NOTE(review): judging by the test name the second boolean presumably
// enables statistics -- confirm against TestRequiredWithSettings.
TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndBrotliCompression) {
  const auto encoding = Encoding::PLAIN;
  const auto codec = Compression::BROTLI;
  this->TestRequiredWithSettings(encoding, codec, false, true, LARGE_SIZE);
}

TYPED_TEST(TestPrimitiveWriter, RequiredPlainWithStatsAndGzipCompression) {
this->TestRequiredWithSettings(
Encoding::PLAIN, Compression::GZIP, false, true, LARGE_SIZE);
Expand Down
53 changes: 53 additions & 0 deletions cpp/src/parquet/compression/brotli-codec.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cstdint>
#include <cstdlib>
#include <brotli/decode.h>
#include <brotli/encode.h>

#include "parquet/compression/codec.h"
#include "parquet/exception.h"

namespace parquet {

// Decompresses a complete Brotli stream from `input` into `output_buffer`.
//
//   input_len     - number of compressed bytes readable at `input`
//   input         - compressed data
//   output_len    - capacity of `output_buffer`, in bytes
//   output_buffer - destination for the decompressed bytes
//
// Throws ParquetException when a length is negative or when the Brotli
// decoder reports anything other than success.
void BrotliCodec::Decompress(
    int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) {
  // The Brotli API takes size_t lengths. A negative int64_t would silently
  // wrap to an enormous unsigned value, so reject it before converting.
  if (input_len < 0 || output_len < 0) {
    throw parquet::ParquetException("Negative buffer length passed to brotli codec.");
  }

  // Passed by pointer: holds the buffer capacity going in; the decoder may
  // overwrite it (the resulting value is not used here).
  size_t output_size = static_cast<size_t>(output_len);
  const auto status = BrotliDecoderDecompress(
      static_cast<size_t>(input_len), input, &output_size, output_buffer);
  if (status != BROTLI_DECODER_RESULT_SUCCESS) {
    throw parquet::ParquetException("Corrupt brotli compressed data.");
  }
}

// Returns the output-buffer size that BrotliEncoderMaxCompressedSize reports
// for an input of `input_len` bytes. `input` itself is not inspected.
int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) {
  const auto upper_bound = BrotliEncoderMaxCompressedSize(input_len);
  return upper_bound;
}

// Compresses `input_len` bytes from `input` into `output_buffer` in a single
// call to the Brotli encoder.
//
//   input_len         - number of bytes to compress
//   input             - uncompressed data
//   output_buffer_len - capacity of `output_buffer`, in bytes
//   output_buffer     - destination for the compressed bytes
//
// Returns the size value the encoder stores back through its output-size
// argument. Throws ParquetException when a length is negative or when the
// encoder reports failure.
int64_t BrotliCodec::Compress(int64_t input_len, const uint8_t* input,
    int64_t output_buffer_len, uint8_t* output_buffer) {
  // TODO: Make quality configurable. We use 8 as a default as it is the best
  // trade-off for Parquet workload
  constexpr int kDefaultQuality = 8;

  // The Brotli API takes size_t lengths. A negative int64_t would silently
  // wrap to an enormous unsigned value, so reject it before converting.
  if (input_len < 0 || output_buffer_len < 0) {
    throw parquet::ParquetException("Negative buffer length passed to brotli codec.");
  }

  // Passed by pointer: buffer capacity going in, updated by the encoder.
  size_t output_len = static_cast<size_t>(output_buffer_len);
  const auto ok = BrotliEncoderCompress(kDefaultQuality, BROTLI_DEFAULT_WINDOW,
      BROTLI_DEFAULT_MODE, static_cast<size_t>(input_len), input, &output_len,
      output_buffer);
  if (ok == BROTLI_FALSE) {
    throw parquet::ParquetException("Brotli compression failure.");
  }
  return static_cast<int64_t>(output_len);
}

} // namespace parquet
4 changes: 4 additions & 0 deletions cpp/src/parquet/compression/codec-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ TEST(TestCompressors, Snappy) {
CheckCodec<SnappyCodec>();
}

// Exercises the shared CheckCodec routine against the Brotli codec.
TEST(TestCompressors, Brotli) {
  using CodecUnderTest = BrotliCodec;
  CheckCodec<CodecUnderTest>();
}

// Exercises the shared CheckCodec routine against the GZip codec.
TEST(TestCompressors, GZip) {
  using CodecUnderTest = GZipCodec;
  CheckCodec<CodecUnderTest>();
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/compression/codec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) {
ParquetException::NYI("LZO codec not implemented");
break;
case Compression::BROTLI:
ParquetException::NYI("BROTLI codec not implemented");
result.reset(new BrotliCodec());
break;
default:
ParquetException::NYI("Unrecognized codec");
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/parquet/compression/codec.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ class SnappyCodec : public Codec {
virtual const char* name() const { return "snappy"; }
};

// Codec implementation for Brotli. Overrides the Codec entry points; the
// out-of-line definitions live in the Brotli codec source file.
class BrotliCodec : public Codec {
 public:
  // Short identifier for this codec.
  const char* name() const override { return "brotli"; }

  // Size of the output buffer to provide to Compress() for `input_len` bytes.
  int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;

  // Compresses `input` into `output_buffer` and returns the resulting size.
  int64_t Compress(int64_t input_len, const uint8_t* input,
      int64_t output_buffer_len, uint8_t* output_buffer) override;

  // Decompresses `input` into `output_buffer`, which holds `output_len` bytes.
  void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
      uint8_t* output_buffer) override;
};

// GZip codec.
class GZipCodec : public Codec {
public:
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/parquet/file/file-deserialize-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ TEST_F(TestPageSerde, TestFailLargePageHeaders) {
}

TEST_F(TestPageSerde, Compression) {
Compression::type codec_types[2] = {Compression::GZIP, Compression::SNAPPY};
Compression::type codec_types[3] = {
Compression::GZIP, Compression::SNAPPY, Compression::BROTLI};

// This is a dummy number
data_page_header_.num_values = 32;
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/parquet/file/file-serialize-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ TYPED_TEST(TestSerialize, SmallFileSnappy) {
this->FileSerializeTest(Compression::SNAPPY);
}

// Runs the shared file serialization check with the Brotli codec selected.
TYPED_TEST(TestSerialize, SmallFileBrotli) {
  const auto codec = Compression::BROTLI;
  this->FileSerializeTest(codec);
}

// Runs the shared file serialization check with the GZIP codec selected.
TYPED_TEST(TestSerialize, SmallFileGzip) {
  const auto codec = Compression::GZIP;
  this->FileSerializeTest(codec);
}
Expand Down

0 comments on commit 776e399

Please sign in to comment.