diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index 9200886e8eb4..e36e305fe943 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -490,6 +490,43 @@ std::unique_ptr ReaderBase::getParquetColumnInfo( maxDefine, isOptional, isRepeated); + } else { + // Row type + // To support list backward compatibility, need create a new row type + // instance and set all the fields as its children. + auto childrenRowType = + createRowType(children, isFileColumnNamesReadAsLowerCase()); + std::vector> + rowChildren; + // In this legacy case, there is no middle layer between "array" + // node and the children nodes. Below creates this dummy middle + // layer to mimic the non-legacy case and fill the gap. + rowChildren.emplace_back(std::make_unique( + childrenRowType, + std::move(children), + curSchemaIdx, + maxSchemaElementIdx, + ParquetTypeWithId::kNonLeaf, + "dummy", + std::nullopt, + std::nullopt, + maxRepeat, + maxDefine, + isOptional, + isRepeated)); + return std::make_unique( + TypeFactory::create(childrenRowType), + std::move(rowChildren), + curSchemaIdx, + maxSchemaElementIdx, + ParquetTypeWithId::kNonLeaf, // columnIdx, + std::move(name), + std::nullopt, + std::nullopt, + maxRepeat, + maxDefine, + isOptional, + isRepeated); } } else { // Row type diff --git a/velox/dwio/parquet/tests/examples/proto-struct-with-array.parquet b/velox/dwio/parquet/tests/examples/proto-struct-with-array.parquet new file mode 100644 index 000000000000..325a8370ad20 Binary files /dev/null and b/velox/dwio/parquet/tests/examples/proto-struct-with-array.parquet differ diff --git a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp index b3a5ea7ade89..67da2935efc5 100644 --- a/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetReaderTest.cpp @@ -711,6 +711,56 @@ 
TEST_F(ParquetReaderTest, parseMapKeyValueAsMap) {
   assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_);
 }
 
+// Reads proto-struct-with-array.parquet and verifies that the repeated group
+// 'repeatedMessage' is surfaced as ARRAY(ROW(someId)) both in the file schema
+// and in the materialized result, and that its first element decodes to 9.
+TEST_F(ParquetReaderTest, parseRowArrayTest) {
+  // schema:
+  // optionalPrimitive:int
+  // requiredPrimitive:int
+  // repeatedPrimitive:array<int>
+  // optionalMessage:struct<someId:int>
+  // requiredMessage:struct<someId:int>
+  // repeatedMessage:array<struct<someId:int>>
+  const std::string sample(
+      getExampleFilePath("proto-struct-with-array.parquet"));
+
+  dwio::common::ReaderOptions readerOptions{leafPool_.get()};
+  auto reader = createReader(sample, readerOptions);
+  EXPECT_EQ(reader->numberOfRows(), 1ULL);
+
+  // The structural checks below are ASSERTs because each following childAt()
+  // call is only valid when the preceding expectation holds.
+  auto type = reader->typeWithId();
+  ASSERT_EQ(type->size(), 6ULL);
+  auto repeatedMessageType = type->childAt(5);
+  ASSERT_EQ(repeatedMessageType->type()->kind(), TypeKind::ARRAY);
+  auto repeatedMessageElementType = repeatedMessageType->childAt(0);
+  EXPECT_EQ(repeatedMessageElementType->type()->kind(), TypeKind::ROW);
+
+  auto outputRowType =
+      ROW({"optionalPrimitive",
+           "requiredPrimitive",
+           "repeatedPrimitive",
+           "optionalMessage",
+           "requiredMessage",
+           "repeatedMessage"},
+          {INTEGER(),
+           INTEGER(),
+           ARRAY(INTEGER()),
+           ROW({"someId"}, {INTEGER()}),
+           ROW({"someId"}, {INTEGER()}),
+           ARRAY(ROW({"someId"}, {INTEGER()}))});
+  auto rowReaderOpts = getReaderOpts(outputRowType);
+  rowReaderOpts.setScanSpec(makeScanSpec(outputRowType));
+  auto rowReader = reader->createRowReader(rowReaderOpts);
+  VectorPtr result = BaseVector::create(outputRowType, 0, &*leafPool_);
+
+  ASSERT_TRUE(rowReader->next(1, result));
+  // data: 10, 9, <empty>, null, {9}, 2 elements starting at 0 {{9}, {10}}
+  // Guard every downcast so that an unexpected vector type fails the test
+  // instead of dereferencing a null pointer.
+  auto* resultRow = result->as<RowVector>();
+  ASSERT_NE(resultRow, nullptr);
+  auto* structArray = resultRow->childAt(5)->as<ArrayVector>();
+  ASSERT_NE(structArray, nullptr);
+  auto* structElements = structArray->elements()->as<RowVector>();
+  ASSERT_NE(structElements, nullptr);
+  auto* someIdValues = structElements->childAt(0)->asFlatVector<int32_t>();
+  ASSERT_NE(someIdValues, nullptr);
+  EXPECT_EQ(someIdValues->valueAt(0), 9);
+}
+
 TEST_F(ParquetReaderTest, readSampleBigintRangeFilter) {
   // Read sample.parquet with the int filter "a BETWEEN 16 AND 20".
   FilterMap filters;