Skip to content

Commit

Permalink
PARQUET-547: Refactor templates to all be based on DataType structs
Browse files Browse the repository at this point in the history
Author: Wes McKinney <wesm@apache.org>

Closes apache#91 from wesm/PARQUET-547 and squashes the following commits:

97b8b9a [Wes McKinney] Refactor templates to all be based on DataType subclasses

Change-Id: I5fb05d2ec3892016b7bccf7c0349bae56e3214e0
  • Loading branch information
wesm committed Apr 30, 2016
1 parent dded816 commit ff2017a
Show file tree
Hide file tree
Showing 16 changed files with 149 additions and 153 deletions.
30 changes: 15 additions & 15 deletions cpp/src/parquet/column/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ ColumnReader::ColumnReader(const ColumnDescriptor* descr,
num_decoded_values_(0),
allocator_(allocator) {}

template <int TYPE>
void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {
template <typename DType>
void TypedColumnReader<DType>::ConfigureDictionary(const DictionaryPage* page) {
int encoding = static_cast<int>(page->encoding());
if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
page->encoding() == Encoding::PLAIN) {
Expand All @@ -51,7 +51,7 @@ void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {

if (page->encoding() == Encoding::PLAIN_DICTIONARY ||
page->encoding() == Encoding::PLAIN) {
PlainDecoder<TYPE> dictionary(descr_);
PlainDecoder<DType> dictionary(descr_);
dictionary.SetData(page->num_values(), page->data(), page->size());

// The dictionary is fully decoded during DictionaryDecoder::Init, so the
Expand All @@ -60,7 +60,7 @@ void TypedColumnReader<TYPE>::ConfigureDictionary(const DictionaryPage* page) {
// TODO(wesm): investigate whether this all-or-nothing decoding of the
// dictionary makes sense and whether performance can be improved

auto decoder = std::make_shared<DictionaryDecoder<TYPE> >(descr_, allocator_);
auto decoder = std::make_shared<DictionaryDecoder<DType> >(descr_, allocator_);
decoder->SetDict(&dictionary);
decoders_[encoding] = decoder;
} else {
Expand All @@ -77,8 +77,8 @@ static bool IsDictionaryIndexEncoding(const Encoding::type& e) {
e == Encoding::PLAIN_DICTIONARY;
}

template <int TYPE>
bool TypedColumnReader<TYPE>::ReadNewPage() {
template <typename DType>
bool TypedColumnReader<DType>::ReadNewPage() {
// Loop until we find the next data page.
const uint8_t* buffer;

Expand Down Expand Up @@ -147,7 +147,7 @@ bool TypedColumnReader<TYPE>::ReadNewPage() {
} else {
switch (encoding) {
case Encoding::PLAIN: {
std::shared_ptr<DecoderType> decoder(new PlainDecoder<TYPE>(descr_));
std::shared_ptr<DecoderType> decoder(new PlainDecoder<DType>(descr_));
decoders_[static_cast<int>(encoding)] = decoder;
current_decoder_ = decoder.get();
break;
Expand Down Expand Up @@ -227,13 +227,13 @@ std::shared_ptr<ColumnReader> ColumnReader::Make(
// ----------------------------------------------------------------------
// Instantiate templated classes

template class TypedColumnReader<Type::BOOLEAN>;
template class TypedColumnReader<Type::INT32>;
template class TypedColumnReader<Type::INT64>;
template class TypedColumnReader<Type::INT96>;
template class TypedColumnReader<Type::FLOAT>;
template class TypedColumnReader<Type::DOUBLE>;
template class TypedColumnReader<Type::BYTE_ARRAY>;
template class TypedColumnReader<Type::FIXED_LEN_BYTE_ARRAY>;
template class TypedColumnReader<BooleanType>;
template class TypedColumnReader<Int32Type>;
template class TypedColumnReader<Int64Type>;
template class TypedColumnReader<Int96Type>;
template class TypedColumnReader<FloatType>;
template class TypedColumnReader<DoubleType>;
template class TypedColumnReader<ByteArrayType>;
template class TypedColumnReader<FLBAType>;

} // namespace parquet
30 changes: 15 additions & 15 deletions cpp/src/parquet/column/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,10 +102,10 @@ class ColumnReader {
};

// API to read values from a single column. This is the main client-facing API.
template <int TYPE>
template <typename DType>
class TypedColumnReader : public ColumnReader {
public:
typedef typename type_traits<TYPE>::value_type T;
typedef typename DType::c_type T;

TypedColumnReader(const ColumnDescriptor* schema,
std::unique_ptr<PageReader> pager,
Expand All @@ -131,7 +131,7 @@ class TypedColumnReader : public ColumnReader {
T* values, int64_t* values_read);

private:
typedef Decoder<TYPE> DecoderType;
typedef Decoder<DType> DecoderType;

// Advance to the next data page
virtual bool ReadNewPage();
Expand All @@ -153,14 +153,14 @@ class TypedColumnReader : public ColumnReader {
};


template <int TYPE>
inline int64_t TypedColumnReader<TYPE>::ReadValues(int64_t batch_size, T* out) {
template <typename DType>
inline int64_t TypedColumnReader<DType>::ReadValues(int64_t batch_size, T* out) {
int64_t num_decoded = current_decoder_->Decode(out, batch_size);
return num_decoded;
}

template <int TYPE>
inline int64_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_levels,
template <typename DType>
inline int64_t TypedColumnReader<DType>::ReadBatch(int batch_size, int16_t* def_levels,
int16_t* rep_levels, T* values, int64_t* values_read) {
// HasNext invokes ReadNewPage
if (!HasNext()) {
Expand Down Expand Up @@ -208,14 +208,14 @@ inline int64_t TypedColumnReader<TYPE>::ReadBatch(int batch_size, int16_t* def_l
}


typedef TypedColumnReader<Type::BOOLEAN> BoolReader;
typedef TypedColumnReader<Type::INT32> Int32Reader;
typedef TypedColumnReader<Type::INT64> Int64Reader;
typedef TypedColumnReader<Type::INT96> Int96Reader;
typedef TypedColumnReader<Type::FLOAT> FloatReader;
typedef TypedColumnReader<Type::DOUBLE> DoubleReader;
typedef TypedColumnReader<Type::BYTE_ARRAY> ByteArrayReader;
typedef TypedColumnReader<Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayReader;
typedef TypedColumnReader<BooleanType> BoolReader;
typedef TypedColumnReader<Int32Type> Int32Reader;
typedef TypedColumnReader<Int64Type> Int64Reader;
typedef TypedColumnReader<Int96Type> Int96Reader;
typedef TypedColumnReader<FloatType> FloatReader;
typedef TypedColumnReader<DoubleType> DoubleReader;
typedef TypedColumnReader<ByteArrayType> ByteArrayReader;
typedef TypedColumnReader<FLBAType> FixedLenByteArrayReader;

} // namespace parquet

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/parquet/column/scanner-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ class TestFlatScanner : public ::testing::Test {
}

void CheckResults(int batch_size, const ColumnDescriptor *d) {
TypedScanner<Type::type_num>* scanner =
reinterpret_cast<TypedScanner<Type::type_num>* >(scanner_.get());
TypedScanner<Type>* scanner =
reinterpret_cast<TypedScanner<Type>* >(scanner_.get());
T val;
bool is_null = false;
int16_t def_level;
Expand Down Expand Up @@ -243,8 +243,8 @@ TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) {
data_buffer_, pages_);
num_levels_ = 1 * 100;
InitScanner(&d);
TypedScanner<FLBAType::type_num>* scanner =
reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
TypedScanner<FLBAType>* scanner =
reinterpret_cast<TypedScanner<FLBAType>* >(scanner_.get());
ASSERT_EQ(10, scanner->descr()->type_precision());
ASSERT_EQ(2, scanner->descr()->type_scale());
ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length());
Expand All @@ -258,8 +258,8 @@ TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) {
data_buffer_, pages_);
num_levels_ = 1 * 100;
InitScanner(&d);
TypedScanner<FLBAType::type_num>* scanner =
reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
TypedScanner<FLBAType>* scanner =
reinterpret_cast<TypedScanner<FLBAType>* >(scanner_.get());
scanner->SetBatchSize(batch_size);
std::stringstream ss_fail;
for (int i = 0; i < num_levels_; i++) {
Expand Down
46 changes: 23 additions & 23 deletions cpp/src/parquet/column/scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,17 +91,17 @@ class Scanner {
};


template <int TYPE>
template <typename DType>
class TypedScanner : public Scanner {
public:
typedef typename type_traits<TYPE>::value_type T;
typedef typename DType::c_type T;

explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
MemoryAllocator* allocator = default_allocator()) :
Scanner(reader, batch_size, allocator) {
typed_reader_ = static_cast<TypedColumnReader<TYPE>*>(reader.get());
int value_byte_size = type_traits<TYPE>::value_byte_size;
typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader.get());
int value_byte_size = type_traits<DType::type_num>::value_byte_size;
value_buffer_.Resize(batch_size_ * value_byte_size);
values_ = reinterpret_cast<T*>(&value_buffer_[0]);
}
Expand Down Expand Up @@ -183,7 +183,7 @@ class TypedScanner : public Scanner {
}

if (is_null) {
std::string null_fmt = format_fwf<Type::BYTE_ARRAY>(width);
std::string null_fmt = format_fwf<ByteArrayType>(width);
snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
} else {
FormatValue(&val, buffer, sizeof(buffer), width);
Expand All @@ -193,55 +193,55 @@ class TypedScanner : public Scanner {

private:
// The ownership of this object is expressed through the reader_ variable in the base
TypedColumnReader<TYPE>* typed_reader_;
TypedColumnReader<DType>* typed_reader_;

inline void FormatValue(void* val, char* buffer, int bufsize, int width);

T* values_;
};


template <int TYPE>
inline void TypedScanner<TYPE>::FormatValue(void* val, char* buffer,
template <typename DType>
inline void TypedScanner<DType>::FormatValue(void* val, char* buffer,
int bufsize, int width) {
std::string fmt = format_fwf<TYPE>(width);
std::string fmt = format_fwf<DType>(width);
snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
}

template <>
inline void TypedScanner<Type::INT96>::FormatValue(
inline void TypedScanner<Int96Type>::FormatValue(
void* val, char* buffer, int bufsize, int width) {
std::string fmt = format_fwf<Type::INT96>(width);
std::string fmt = format_fwf<Int96Type>(width);
std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

template <>
inline void TypedScanner<Type::BYTE_ARRAY>::FormatValue(
inline void TypedScanner<ByteArrayType>::FormatValue(
void* val, char* buffer, int bufsize, int width) {
std::string fmt = format_fwf<Type::BYTE_ARRAY>(width);
std::string fmt = format_fwf<ByteArrayType>(width);
std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

template <>
inline void TypedScanner<Type::FIXED_LEN_BYTE_ARRAY>::FormatValue(
inline void TypedScanner<FLBAType>::FormatValue(
void* val, char* buffer, int bufsize, int width) {
std::string fmt = format_fwf<Type::FIXED_LEN_BYTE_ARRAY>(width);
std::string fmt = format_fwf<FLBAType>(width);
std::string result = FixedLenByteArrayToString(
*reinterpret_cast<FixedLenByteArray*>(val),
descr()->type_length());
snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
}

typedef TypedScanner<Type::BOOLEAN> BoolScanner;
typedef TypedScanner<Type::INT32> Int32Scanner;
typedef TypedScanner<Type::INT64> Int64Scanner;
typedef TypedScanner<Type::INT96> Int96Scanner;
typedef TypedScanner<Type::FLOAT> FloatScanner;
typedef TypedScanner<Type::DOUBLE> DoubleScanner;
typedef TypedScanner<Type::BYTE_ARRAY> ByteArrayScanner;
typedef TypedScanner<Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayScanner;
typedef TypedScanner<BooleanType> BoolScanner;
typedef TypedScanner<Int32Type> Int32Scanner;
typedef TypedScanner<Int64Type> Int64Scanner;
typedef TypedScanner<Int96Type> Int96Scanner;
typedef TypedScanner<FloatType> FloatScanner;
typedef TypedScanner<DoubleType> DoubleScanner;
typedef TypedScanner<ByteArrayType> ByteArrayScanner;
typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;

} // namespace parquet

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/parquet/column/test-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class DataPageBuilder {

void AppendValues(const ColumnDescriptor *d, const vector<T>& values,
Encoding::type encoding = Encoding::PLAIN) {
PlainEncoder<Type::type_num> encoder(d);
PlainEncoder<Type> encoder(d);
encoder.Encode(&values[0], values.size(), sink_);

num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
Expand Down Expand Up @@ -195,7 +195,7 @@ void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor *d,
if (encoding != Encoding::PLAIN) {
ParquetException::NYI("only plain encoding currently implemented");
}
PlainEncoder<Type::BOOLEAN> encoder(d);
PlainEncoder<BooleanType> encoder(d);
encoder.Encode(values, values.size(), sink_);

num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
Expand Down
23 changes: 11 additions & 12 deletions cpp/src/parquet/column/writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,14 @@ int64_t ColumnWriter::Close() {
// ----------------------------------------------------------------------
// TypedColumnWriter

template <int TYPE>
TypedColumnWriter<TYPE>::TypedColumnWriter(const ColumnDescriptor* schema,
template <typename Type>
TypedColumnWriter<Type>::TypedColumnWriter(const ColumnDescriptor* schema,
std::unique_ptr<PageWriter> pager, int64_t expected_rows,
MemoryAllocator* allocator) :
ColumnWriter(schema, std::move(pager), expected_rows, allocator) {
// TODO(PARQUET-590) Get encoder type from WriterProperties
current_encoder_ = std::unique_ptr<EncoderType>(
new PlainEncoder<TYPE>(schema, allocator));
new PlainEncoder<Type>(schema, allocator));
}

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -170,14 +170,13 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(
// ----------------------------------------------------------------------
// Instantiate templated classes

template class TypedColumnWriter<Type::BOOLEAN>;
template class TypedColumnWriter<Type::INT32>;
template class TypedColumnWriter<Type::INT64>;
template class TypedColumnWriter<Type::INT96>;
template class TypedColumnWriter<Type::FLOAT>;
template class TypedColumnWriter<Type::DOUBLE>;
template class TypedColumnWriter<Type::BYTE_ARRAY>;
template class TypedColumnWriter<Type::FIXED_LEN_BYTE_ARRAY>;

template class TypedColumnWriter<BooleanType>;
template class TypedColumnWriter<Int32Type>;
template class TypedColumnWriter<Int64Type>;
template class TypedColumnWriter<Int96Type>;
template class TypedColumnWriter<FloatType>;
template class TypedColumnWriter<DoubleType>;
template class TypedColumnWriter<ByteArrayType>;
template class TypedColumnWriter<FLBAType>;

} // namespace parquet
32 changes: 15 additions & 17 deletions cpp/src/parquet/column/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ class ColumnWriter {
};

// API to write values to a single column. This is the main client-facing API.
template <int TYPE>
template <typename DType>
class TypedColumnWriter : public ColumnWriter {
public:
typedef typename type_traits<TYPE>::value_type T;
typedef typename DType::c_type T;

TypedColumnWriter(const ColumnDescriptor* schema,
std::unique_ptr<PageWriter> pager, int64_t expected_rows,
Expand All @@ -116,7 +116,7 @@ class TypedColumnWriter : public ColumnWriter {
T* values);

private:
typedef Encoder<TYPE> EncoderType;
typedef Encoder<DType> EncoderType;

// Write values to a temporary buffer before they are encoded into pages
void WriteValues(int64_t num_values, T* values);
Expand All @@ -135,8 +135,8 @@ class TypedColumnWriter : public ColumnWriter {
// See also: parquet-column/../column/impl/ColumnWriteStoreV2.java:sizeCheck
const int64_t PAGE_VALUE_COUNT = 1000;

template <int TYPE>
inline void TypedColumnWriter<TYPE>::WriteBatch(int64_t num_values, int16_t* def_levels,
template <typename DType>
inline void TypedColumnWriter<DType>::WriteBatch(int64_t num_values, int16_t* def_levels,
int16_t* rep_levels, T* values) {
int64_t values_to_write = 0;

Expand Down Expand Up @@ -185,22 +185,20 @@ inline void TypedColumnWriter<TYPE>::WriteBatch(int64_t num_values, int16_t* def
}
}

template <int TYPE>
void TypedColumnWriter<TYPE>::WriteValues(int64_t num_values, T* values) {
template <typename DType>
void TypedColumnWriter<DType>::WriteValues(int64_t num_values, T* values) {
current_encoder_->Encode(values, num_values, values_sink_.get());
}


typedef TypedColumnWriter<Type::BOOLEAN> BoolWriter;
typedef TypedColumnWriter<Type::INT32> Int32Writer;
typedef TypedColumnWriter<Type::INT64> Int64Writer;
typedef TypedColumnWriter<Type::INT96> Int96Writer;
typedef TypedColumnWriter<Type::FLOAT> FloatWriter;
typedef TypedColumnWriter<Type::DOUBLE> DoubleWriter;
typedef TypedColumnWriter<Type::BYTE_ARRAY> ByteArrayWriter;
typedef TypedColumnWriter<Type::FIXED_LEN_BYTE_ARRAY> FixedLenByteArrayWriter;
typedef TypedColumnWriter<BooleanType> BoolWriter;
typedef TypedColumnWriter<Int32Type> Int32Writer;
typedef TypedColumnWriter<Int64Type> Int64Writer;
typedef TypedColumnWriter<Int96Type> Int96Writer;
typedef TypedColumnWriter<FloatType> FloatWriter;
typedef TypedColumnWriter<DoubleType> DoubleWriter;
typedef TypedColumnWriter<ByteArrayType> ByteArrayWriter;
typedef TypedColumnWriter<FLBAType> FixedLenByteArrayWriter;

} // namespace parquet

#endif // PARQUET_COLUMN_READER_H

Loading

0 comments on commit ff2017a

Please sign in to comment.