Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reading empty Parquet DataPage #10121

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions velox/dwio/parquet/reader/PageReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,9 @@ void PageReader::prepareDataPageV2(const PageHeader& pageHeader, int64_t row) {
}
auto levelsSize = repeatLength + defineLength;
pageData_ += levelsSize;
if (pageHeader.data_page_header_v2.__isset.is_compressed ||
pageHeader.data_page_header_v2.is_compressed) {
if (pageHeader.data_page_header_v2.__isset.is_compressed &&
pageHeader.data_page_header_v2.is_compressed &&
(pageHeader.compressed_page_size - levelsSize > 0)) {
pageData_ = decompressData(
pageData_,
pageHeader.compressed_page_size - levelsSize,
Expand Down
Binary file not shown.
98 changes: 56 additions & 42 deletions velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class ParquetReaderTest : public ParquetTestBase {
const RowTypePtr& rowType) {
const std::string sample(getExampleFilePath(fileName));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

RowReaderOptions rowReaderOpts;
Expand All @@ -56,7 +56,7 @@ class ParquetReaderTest : public ParquetTestBase {
FilterMap filters,
const RowVectorPtr& expected) {
const auto filePath(getExampleFilePath(fileName));
facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(filePath, readerOpts);
assertReadWithReaderAndFilters(
std::move(reader), fileName, fileSchema, std::move(filters), expected);
Expand All @@ -71,7 +71,7 @@ TEST_F(ParquetReaderTest, parseSample) {
// b: [1.0..20.0]
const std::string sample(getExampleFilePath("sample.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 20ULL);

Expand Down Expand Up @@ -111,7 +111,7 @@ TEST_F(ParquetReaderTest, parseUnannotatedList) {
// }
const std::string sample(getExampleFilePath("unannotated_list.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(sample, readerOpts);

EXPECT_EQ(reader->numberOfRows(), 22ULL);
Expand Down Expand Up @@ -166,7 +166,7 @@ TEST_F(ParquetReaderTest, parseUnannotatedMap) {
const std::string filename("unnotated_map.parquet");
const std::string sample(getExampleFilePath(filename));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

auto type = reader->typeWithId();
Expand Down Expand Up @@ -205,7 +205,7 @@ TEST_F(ParquetReaderTest, parseLegacyListWithMultipleChildren) {
const std::string filename("listmultiplechildren.parquet");
const std::string sample(getExampleFilePath(filename));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

auto type = reader->typeWithId();
Expand Down Expand Up @@ -247,7 +247,7 @@ TEST_F(ParquetReaderTest, parseLegacyListWithMultipleChildren) {
TEST_F(ParquetReaderTest, parseSampleRange1) {
const std::string sample(getExampleFilePath("sample.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(sample, readerOpts);

auto rowReaderOpts = getReaderOpts(sampleSchema());
Expand All @@ -266,7 +266,7 @@ TEST_F(ParquetReaderTest, parseSampleRange1) {
TEST_F(ParquetReaderTest, parseSampleRange2) {
const std::string sample(getExampleFilePath("sample.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(sample, readerOpts);

auto rowReaderOpts = getReaderOpts(sampleSchema());
Expand All @@ -285,7 +285,7 @@ TEST_F(ParquetReaderTest, parseSampleRange2) {
TEST_F(ParquetReaderTest, parseSampleEmptyRange) {
const std::string sample(getExampleFilePath("sample.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(sample, readerOpts);

auto rowReaderOpts = getReaderOpts(sampleSchema());
Expand All @@ -303,7 +303,7 @@ TEST_F(ParquetReaderTest, parseReadAsLowerCase) {
// 2 rows.
const std::string upper(getExampleFilePath("upper.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
readerOptions.setFileColumnNamesReadAsLowerCase(true);
auto reader = createReader(upper, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 2ULL);
Expand Down Expand Up @@ -337,7 +337,7 @@ TEST_F(ParquetReaderTest, parseRowMapArrayReadAsLowerCase) {
// +-----------------------+
const std::string upper(getExampleFilePath("upper_complex.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
readerOptions.setFileColumnNamesReadAsLowerCase(true);
auto reader = createReader(upper, readerOptions);

Expand Down Expand Up @@ -380,7 +380,7 @@ TEST_F(ParquetReaderTest, parseEmpty) {
// 0 rows.
const std::string empty(getExampleFilePath("empty.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(empty, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 0ULL);

Expand All @@ -402,7 +402,7 @@ TEST_F(ParquetReaderTest, parseInt) {
// bigint: [1000 .. 1009]
const std::string sample(getExampleFilePath("int.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(sample, readerOpts);

EXPECT_EQ(reader->numberOfRows(), 10ULL);
Expand Down Expand Up @@ -437,7 +437,7 @@ TEST_F(ParquetReaderTest, parseUnsignedInt1) {
// uint64: [18446744073709551615, 2000000000000000000, 3000000000000000000]
const std::string sample(getExampleFilePath("uint.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

EXPECT_EQ(reader->numberOfRows(), 3ULL);
Expand All @@ -457,9 +457,8 @@ TEST_F(ParquetReaderTest, parseUnsignedInt1) {
{TINYINT(), SMALLINT(), INTEGER(), BIGINT()});

RowReaderOptions rowReaderOpts;
rowReaderOpts.select(
std::make_shared<facebook::velox::dwio::common::ColumnSelector>(
rowType, rowType->names()));
rowReaderOpts.select(std::make_shared<dwio::common::ColumnSelector>(
rowType, rowType->names()));
rowReaderOpts.setScanSpec(makeScanSpec(rowType));
auto rowReader = reader->createRowReader(rowReaderOpts);

Expand Down Expand Up @@ -541,7 +540,7 @@ TEST_F(ParquetReaderTest, parseDate) {
// date: [1969-12-27 .. 1970-01-20]
const std::string sample(getExampleFilePath("date.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

EXPECT_EQ(reader->numberOfRows(), 25ULL);
Expand All @@ -568,7 +567,7 @@ TEST_F(ParquetReaderTest, parseRowMapArray) {
// ARRAY(INTEGER)) c1) c)
const std::string sample(getExampleFilePath("row_map_array.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

EXPECT_EQ(reader->numberOfRows(), 1ULL);
Expand Down Expand Up @@ -601,7 +600,7 @@ TEST_F(ParquetReaderTest, parseRowMapArray) {
TEST_F(ParquetReaderTest, projectNoColumns) {
// This is the case for count(*).
auto rowType = ROW({}, {});
facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
auto reader = createReader(getExampleFilePath("sample.parquet"), readerOpts);
RowReaderOptions rowReaderOpts;
rowReaderOpts.setScanSpec(makeScanSpec(rowType));
Expand All @@ -625,7 +624,7 @@ TEST_F(ParquetReaderTest, parseIntDecimal) {
// a: [11.11, 11.11, 22.22, 22.22, 33.33, 33.33]
// b: [11.11, 11.11, 22.22, 22.22, 33.33, 33.33]
auto rowType = ROW({"a", "b"}, {DECIMAL(7, 2), DECIMAL(14, 2)});
facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
const std::string decimal_dict(getExampleFilePath("decimal_dict.parquet"));

auto reader = createReader(decimal_dict, readerOpts);
Expand Down Expand Up @@ -673,7 +672,7 @@ TEST_F(ParquetReaderTest, parseMapKeyValueAsMap) {

const std::string sample(getExampleFilePath("map_key_value.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 1ULL);

Expand Down Expand Up @@ -725,9 +724,8 @@ TEST_F(ParquetReaderTest, readSampleBigintRangeFilter) {
TEST_F(ParquetReaderTest, readSampleBigintValuesUsingBitmaskFilter) {
// Read sample.parquet with the int filter "a in 16, 17, 18, 19, 20".
std::vector<int64_t> values{16, 17, 18, 19, 20};
auto bigintBitmaskFilter =
std::make_unique<facebook::velox::common::BigintValuesUsingBitmask>(
16, 20, std::move(values), false);
auto bigintBitmaskFilter = std::make_unique<common::BigintValuesUsingBitmask>(
16, 20, std::move(values), false);
FilterMap filters;
filters.insert({"a", std::move(bigintBitmaskFilter)});
auto expected = makeRowVector({
Expand Down Expand Up @@ -956,7 +954,7 @@ TEST_F(ParquetReaderTest, filterRowGroups) {
// decimal_no_ColumnMetadata.parquet has one column a: DECIMAL(9,1). It
// doesn't have ColumnMetaData, and rowGroups_[0].columns[0].file_offset is 0.
auto rowType = ROW({"_c0"}, {DECIMAL(9, 1)});
facebook::velox::dwio::common::ReaderOptions readerOpts{leafPool_.get()};
dwio::common::ReaderOptions readerOpts{leafPool_.get()};
const std::string decimal_dict(
getExampleFilePath("decimal_no_ColumnMetadata.parquet"));

Expand All @@ -972,7 +970,7 @@ TEST_F(ParquetReaderTest, parseLongTagged) {
// This is a case for reading a long with an annotation.
const std::string sample(getExampleFilePath("tagged_long.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

EXPECT_EQ(reader->numberOfRows(), 4ULL);
Expand All @@ -990,7 +988,7 @@ TEST_F(ParquetReaderTest, preloadSmallFile) {
auto file = std::make_shared<LocalReadFile>(sample);
auto input = std::make_unique<BufferedInput>(file, *leafPool_);

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader =
std::make_unique<ParquetReader>(std::move(input), readerOptions);

Expand All @@ -1002,10 +1000,8 @@ TEST_F(ParquetReaderTest, preloadSmallFile) {
// Ensure the input is small parquet file.
const auto fileSize = file->size();
ASSERT_TRUE(
fileSize <= facebook::velox::dwio::common::ReaderOptions::
kDefaultFilePreloadThreshold ||
fileSize <= facebook::velox::dwio::common::ReaderOptions::
kDefaultFooterEstimatedSize);
fileSize <= dwio::common::ReaderOptions::kDefaultFilePreloadThreshold ||
fileSize <= dwio::common::ReaderOptions::kDefaultFooterEstimatedSize);

// Check the whole file already loaded.
ASSERT_EQ(file->bytesRead(), fileSize);
Expand All @@ -1026,7 +1022,7 @@ TEST_F(ParquetReaderTest, prefetchRowGroups) {
const std::string sample(getExampleFilePath("multiple_row_groups.parquet"));
const int numRowGroups = 4;

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
// Disable preload of file.
readerOptions.setFilePreloadThreshold(0);

Expand All @@ -1035,10 +1031,7 @@ TEST_F(ParquetReaderTest, prefetchRowGroups) {
// 4: Exactly as total number of row groups.
// 10: More than total number of row groups.
const std::vector<int> numPrefetchRowGroups{
facebook::velox::dwio::common::ReaderOptions::kDefaultPrefetchRowGroups,
2,
4,
10};
dwio::common::ReaderOptions::kDefaultPrefetchRowGroups, 2, 4, 10};
for (auto numPrefetch : numPrefetchRowGroups) {
readerOptions.setPrefetchRowGroups(numPrefetch);

Expand Down Expand Up @@ -1082,7 +1075,7 @@ TEST_F(ParquetReaderTest, testEmptyRowGroups) {
// empty_row_groups.parquet contains empty row groups
const std::string sample(getExampleFilePath("empty_row_groups.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 5ULL);

Expand All @@ -1108,7 +1101,7 @@ TEST_F(ParquetReaderTest, testEnumType) {
// enum_type.parquet contains 1 column (ENUM) with 3 rows.
const std::string sample(getExampleFilePath("enum_type.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 3ULL);

Expand All @@ -1133,7 +1126,7 @@ TEST_F(ParquetReaderTest, readVarbinaryFromFLBA) {
const std::string filename("varbinary_flba.parquet");
const std::string sample(getExampleFilePath(filename));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);

auto type = reader->typeWithId();
Expand Down Expand Up @@ -1161,7 +1154,7 @@ TEST_F(ParquetReaderTest, testV2PageWithZeroMaxDefRep) {
// v2_page.parquet contains 5 rows.
const std::string sample(getExampleFilePath("v2_page.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 5ULL);

Expand All @@ -1185,7 +1178,7 @@ TEST_F(ParquetReaderTest, testV2PageWithZeroMaxDefRep) {
TEST_F(ParquetReaderTest, testLzoDataPage) {
const std::string sample(getExampleFilePath("lzo.parquet"));

facebook::velox::dwio::common::ReaderOptions readerOptions{leafPool_.get()};
dwio::common::ReaderOptions readerOptions{leafPool_.get()};
auto reader = createReader(sample, readerOptions);
EXPECT_EQ(reader->numberOfRows(), 23'547ULL);

Expand Down Expand Up @@ -1215,3 +1208,24 @@ TEST_F(ParquetReaderTest, testLzoDataPage) {
.str(),
"31232");
}

// Reads empty_v2datapage.parquet end to end: checks the reported row count and
// schema, then compares the scanned rows against an expected single REAL
// column built with nullEvery(1) (presumably every row is null; confirm
// against the nullEvery helper).
TEST_F(ParquetReaderTest, testEmptyV2DataPage) {
  const std::string path = getExampleFilePath("empty_v2datapage.parquet");

  // Open the file and validate the footer-level metadata first.
  dwio::common::ReaderOptions options{leafPool_.get()};
  auto parquetReader = createReader(path, options);
  EXPECT_EQ(parquetReader->numberOfRows(), 30001ULL);

  auto schema = ROW({"test"}, {REAL()});
  EXPECT_EQ(*(parquetReader->typeWithId()->type()), *schema);

  // Scan the single column through a row reader.
  auto rowOptions = getReaderOpts(schema);
  rowOptions.setScanSpec(makeScanSpec(schema));
  auto rows = parquetReader->createRowReader(rowOptions);

  // Expected data: 30001 entries whose null flags come from nullEvery(1).
  auto values = makeFlatVector<float>(
      30001, [](auto /*row*/) { return 1; }, nullEvery(1));
  auto expected = makeRowVector({values});

  assertReadWithReaderAndExpected(schema, *rows, expected, *leafPool_);
}
Loading