Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parsing array struct when repetition_type is 'REPEATED' #10753

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions velox/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,43 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
maxDefine,
isOptional,
isRepeated);
} else {
// Row type
// To support list backward compatibility, need create a new row type
// instance and set all the fields as its children.
auto childrenRowType =
createRowType(children, isFileColumnNamesReadAsLowerCase());
std::vector<std::unique_ptr<ParquetTypeWithId::TypeWithId>>
rowChildren;
// In this legacy case, there is no middle layer between "array"
// node and the children nodes. Below creates this dummy middle
// layer to mimic the non-legacy case and fill the gap.
rowChildren.emplace_back(std::make_unique<ParquetTypeWithId>(
childrenRowType,
std::move(children),
curSchemaIdx,
maxSchemaElementIdx,
ParquetTypeWithId::kNonLeaf,
"dummy",
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine,
isOptional,
isRepeated));
return std::make_unique<ParquetTypeWithId>(
TypeFactory<TypeKind::ARRAY>::create(childrenRowType),
std::move(rowChildren),
curSchemaIdx,
maxSchemaElementIdx,
ParquetTypeWithId::kNonLeaf, // columnIdx,
std::move(name),
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine,
isOptional,
isRepeated);
}
} else {
// Row type
Expand Down
Binary file not shown.
50 changes: 50 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,56 @@ TEST_F(ParquetReaderTest, parseMapKeyValueAsMap) {
assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_);
}

// Reads a sample file whose last column is a repeated struct and checks that
// the reader exposes it as ARRAY<ROW<someId>> and that the first struct's
// 'someId' field can be read back.
//
// Checks that guard a subsequent index or pointer dereference are fatal
// (ASSERT_*), so a schema or decoding regression is reported as a test failure
// rather than an out-of-range access or null dereference.
TEST_F(ParquetReaderTest, parseRowArrayTest) {
  // schema:
  // optionalPrimitive:int
  // requiredPrimitive:int
  // repeatedPrimitive:array<int>
  // optionalMessage:struct<someId:int>
  // requiredMessage:struct<someId:int>
  // repeatedMessage:array<struct<someId:int>>
  const std::string sample(
      getExampleFilePath("proto-struct-with-array.parquet"));

  dwio::common::ReaderOptions readerOptions{leafPool_.get()};
  auto reader = createReader(sample, readerOptions);
  EXPECT_EQ(reader->numberOfRows(), 1ULL);
  auto type = reader->typeWithId();
  // Fatal: childAt(5) below requires all six top-level columns to be present.
  ASSERT_EQ(type->size(), 6ULL);
  auto col6_type = type->childAt(5);
  // Fatal: childAt(0) below is only meaningful for the array's element type.
  ASSERT_EQ(col6_type->type()->kind(), TypeKind::ARRAY);
  auto col6_1_type = col6_type->childAt(0);
  EXPECT_EQ(col6_1_type->type()->kind(), TypeKind::ROW);

  auto outputRowType =
      ROW({"optionalPrimitive",
           "requiredPrimitive",
           "repeatedPrimitive",
           "optionalMessage",
           "requiredMessage",
           "repeatedMessage"},
          {INTEGER(),
           INTEGER(),
           ARRAY(INTEGER()),
           ROW({"someId"}, {INTEGER()}),
           ROW({"someId"}, {INTEGER()}),
           ARRAY(ROW({"someId"}, {INTEGER()}))});
  auto rowReaderOpts = getReaderOpts(outputRowType);
  rowReaderOpts.setScanSpec(makeScanSpec(outputRowType));
  auto rowReader = reader->createRowReader(rowReaderOpts);
  VectorPtr result = BaseVector::create(outputRowType, 0, &*leafPool_);

  ASSERT_TRUE(rowReader->next(1, result));
  // data: 10, 9, <empty>, null, {9}, 2 elements starting at 0 {{9}, {10}}

  // Each as<...>() cast may yield nullptr when the vector has a different
  // encoding or type, so verify every step before dereferencing it.
  auto* topLevelRow = result->as<RowVector>();
  ASSERT_NE(topLevelRow, nullptr);
  auto* structArray = topLevelRow->childAt(5)->as<ArrayVector>();
  ASSERT_NE(structArray, nullptr);
  auto* structElements = structArray->elements()->as<RowVector>();
  ASSERT_NE(structElements, nullptr);
  auto* someIds = structElements->childAt(0)->asFlatVector<int32_t>();
  ASSERT_NE(someIds, nullptr);
  // Fatal: valueAt(0) below needs at least one decoded element.
  ASSERT_GT(someIds->size(), 0);

  auto structEle = someIds->valueAt(0);
  EXPECT_EQ(structEle, 9);
}

TEST_F(ParquetReaderTest, readSampleBigintRangeFilter) {
// Read sample.parquet with the int filter "a BETWEEN 16 AND 20".
FilterMap filters;
Expand Down
Loading