diff --git a/cpp/src/parquet/column/column-reader-test.cc b/cpp/src/parquet/column/column-reader-test.cc index 855669a9fc76d..6cd8925cfe70e 100644 --- a/cpp/src/parquet/column/column-reader-test.cc +++ b/cpp/src/parquet/column/column-reader-test.cc @@ -155,5 +155,70 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) { ExecuteDict(num_pages, levels_per_page, &descr); } +TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { + max_def_level_ = 0; + max_rep_level_ = 0; + NodePtr type = schema::Int32("a", Repetition::REQUIRED); + const ColumnDescriptor descr(type, max_def_level_, max_rep_level_); + shared_ptr dummy = std::make_shared(); + + shared_ptr dict_page = std::make_shared(dummy, + 0, Encoding::PLAIN); + shared_ptr data_page = MakeDataPage(&descr, {}, 0, + Encoding::RLE_DICTIONARY, {}, 0, {}, 0, {}, 0); + pages_.push_back(dict_page); + pages_.push_back(data_page); + InitReader(&descr); + // Tests Dict : PLAIN, Data : RLE_DICTIONARY + ASSERT_NO_THROW(reader_->HasNext()); + pages_.clear(); + + dict_page = std::make_shared(dummy, + 0, Encoding::PLAIN_DICTIONARY); + data_page = MakeDataPage(&descr, {}, 0, + Encoding::PLAIN_DICTIONARY, {}, 0, {}, 0, {}, 0); + pages_.push_back(dict_page); + pages_.push_back(data_page); + InitReader(&descr); + // Tests Dict : PLAIN_DICTIONARY, Data : PLAIN_DICTIONARY + ASSERT_NO_THROW(reader_->HasNext()); + pages_.clear(); + + data_page = MakeDataPage(&descr, {}, 0, + Encoding::RLE_DICTIONARY, {}, 0, {}, 0, {}, 0); + pages_.push_back(data_page); + InitReader(&descr); + // Tests dictionary page must occur before data page + ASSERT_THROW(reader_->HasNext(), ParquetException); + pages_.clear(); + + dict_page = std::make_shared(dummy, + 0, Encoding::DELTA_BYTE_ARRAY); + pages_.push_back(dict_page); + InitReader(&descr); + // Tests only RLE_DICTIONARY is supported + ASSERT_THROW(reader_->HasNext(), ParquetException); + pages_.clear(); + + shared_ptr dict_page1 = std::make_shared(dummy, + 0, Encoding::PLAIN_DICTIONARY); + shared_ptr dict_page2 = std::make_shared(dummy, + 0, Encoding::PLAIN); + pages_.push_back(dict_page1); + pages_.push_back(dict_page2); + InitReader(&descr); + // Column cannot have more than one dictionary + ASSERT_THROW(reader_->HasNext(), ParquetException); + pages_.clear(); + + data_page = MakeDataPage(&descr, {}, 0, + Encoding::DELTA_BYTE_ARRAY, {}, 0, {}, 0, {}, 0); + pages_.push_back(data_page); + InitReader(&descr); + // unsupported encoding + ASSERT_THROW(reader_->HasNext(), ParquetException); + pages_.clear(); +} + } // namespace test } // namespace parquet_cpp diff --git a/cpp/src/parquet/column/reader.cc b/cpp/src/parquet/column/reader.cc index 2885ebeabfc8b..bf76d4cc78eed 100644 --- a/cpp/src/parquet/column/reader.cc +++ b/cpp/src/parquet/column/reader.cc @@ -37,26 +37,35 @@ ColumnReader::ColumnReader(const ColumnDescriptor* descr, template void TypedColumnReader::ConfigureDictionary(const DictionaryPage* page) { - int encoding = static_cast(Encoding::RLE_DICTIONARY); + int encoding = static_cast(page->encoding()); + if (page->encoding() == Encoding::PLAIN_DICTIONARY || + page->encoding() == Encoding::PLAIN) { + encoding = static_cast(Encoding::RLE_DICTIONARY); + } auto it = decoders_.find(encoding); if (it != decoders_.end()) { throw ParquetException("Column cannot have more than one dictionary."); } - PlainDecoder dictionary(descr_); - dictionary.SetData(page->num_values(), page->data(), page->size()); - - // The dictionary is fully decoded during DictionaryDecoder::Init, so the - // DictionaryPage buffer is no longer required after this step - // - // TODO(wesm): investigate whether this all-or-nothing decoding of the - // dictionary makes sense and whether performance can be improved - - auto decoder = std::make_shared >(descr_); - decoder->SetDict(&dictionary); + if (page->encoding() == Encoding::PLAIN_DICTIONARY || + page->encoding() == Encoding::PLAIN) { + PlainDecoder dictionary(descr_); + dictionary.SetData(page->num_values(), page->data(), page->size()); + + // The dictionary is fully decoded during DictionaryDecoder::Init, so the + // DictionaryPage buffer is no longer required after this step + // + // TODO(wesm): investigate whether this all-or-nothing decoding of the + // dictionary makes sense and whether performance can be improved + + auto decoder = std::make_shared >(descr_); + decoder->SetDict(&dictionary); + decoders_[encoding] = decoder; + } else { + ParquetException::NYI("only plain dictionary encoding has been implemented"); + } - decoders_[encoding] = decoder; current_decoder_ = decoders_[encoding].get(); } @@ -130,6 +139,9 @@ bool TypedColumnReader::ReadNewPage() { auto it = decoders_.find(static_cast(encoding)); if (it != decoders_.end()) { + if (encoding == Encoding::RLE_DICTIONARY) { + DCHECK(current_decoder_->encoding() == Encoding::RLE_DICTIONARY); + } current_decoder_ = it->second.get(); } else { switch (encoding) { diff --git a/cpp/src/parquet/column/scanner-test.cc b/cpp/src/parquet/column/scanner-test.cc index b52a993f3013c..1d7579e4fc55d 100644 --- a/cpp/src/parquet/column/scanner-test.cc +++ b/cpp/src/parquet/column/scanner-test.cc @@ -217,6 +217,12 @@ TEST_F(TestFLBAFlatScanner, TestDictScanner) { Encoding::RLE_DICTIONARY); } +TEST_F(TestFLBAFlatScanner, TestPlainDictScanner) { + this->ExecuteAll(num_pages, num_levels_per_page, batch_size, FLBA_LENGTH, + Encoding::PLAIN_DICTIONARY); +} + + //PARQUET 502 TEST_F(TestFlatFLBAScanner, TestSmallBatch) { NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, diff --git a/cpp/src/parquet/column/test-util.h b/cpp/src/parquet/column/test-util.h index c9b08c2a8b035..4d10a4249009e 100644 --- a/cpp/src/parquet/column/test-util.h +++ b/cpp/src/parquet/column/test-util.h @@ -444,7 +444,8 @@ static int MakePages(const ColumnDescriptor *d, int num_pages, int levels_per_pa InitValues(num_values, values, buffer); PaginatePlain(d, values, def_levels, max_def_level, rep_levels, max_rep_level, levels_per_page, values_per_page, pages); - } else if (encoding == Encoding::RLE_DICTIONARY) { + } else if (encoding == Encoding::RLE_DICTIONARY + || encoding == Encoding::PLAIN_DICTIONARY) { // Calls InitValues and repeats the data InitDictValues(num_values, levels_per_page, values, buffer); PaginateDict(d, values, def_levels, max_def_level,