[SPARK-15639][SPARK-16321][SQL] Push down filter at RowGroups level for parquet reader #13701
@@ -32,6 +32,7 @@ import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
+import org.apache.spark.util.{AccumulatorContext, LongAccumulator}

 /**
  * A test suite that tests Parquet filter2 API based filter pushdown optimization.
@@ -368,73 +369,75 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext

   test("SPARK-11103: Filter applied on merged Parquet schema with new column fails") {
     import testImplicits._

-    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true",
-      SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true") {
-      withTempPath { dir =>
-        val pathOne = s"${dir.getCanonicalPath}/table1"
-        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(pathOne)
-        val pathTwo = s"${dir.getCanonicalPath}/table2"
-        (1 to 3).map(i => (i, i.toString)).toDF("c", "b").write.parquet(pathTwo)
-
-        // If the "c = 1" filter gets pushed down, this query will throw an exception which
-        // Parquet emits. This is a Parquet issue (PARQUET-389).
-        val df = spark.read.parquet(pathOne, pathTwo).filter("c = 1").selectExpr("c", "b", "a")
-        checkAnswer(
-          df,
-          Row(1, "1", null))
-
-        // The fields "a" and "c" only exist in one Parquet file.
-        assert(df.schema("a").metadata.getBoolean(StructType.metadataKeyForOptionalField))
-        assert(df.schema("c").metadata.getBoolean(StructType.metadataKeyForOptionalField))
-
-        val pathThree = s"${dir.getCanonicalPath}/table3"
-        df.write.parquet(pathThree)
-
-        // We will remove the temporary metadata when writing Parquet file.
-        val schema = spark.read.parquet(pathThree).schema
-        assert(schema.forall(!_.metadata.contains(StructType.metadataKeyForOptionalField)))
-
-        val pathFour = s"${dir.getCanonicalPath}/table4"
-        val dfStruct = sparkContext.parallelize(Seq((1, 1))).toDF("a", "b")
-        dfStruct.select(struct("a").as("s")).write.parquet(pathFour)
-
-        val pathFive = s"${dir.getCanonicalPath}/table5"
-        val dfStruct2 = sparkContext.parallelize(Seq((1, 1))).toDF("c", "b")
-        dfStruct2.select(struct("c").as("s")).write.parquet(pathFive)
-
-        // If the "s.c = 1" filter gets pushed down, this query will throw an exception which
-        // Parquet emits.
-        val dfStruct3 = spark.read.parquet(pathFour, pathFive).filter("s.c = 1")
-          .selectExpr("s")
-        checkAnswer(dfStruct3, Row(Row(null, 1)))
-
-        // The fields "s.a" and "s.c" only exist in one Parquet file.
-        val field = dfStruct3.schema("s").dataType.asInstanceOf[StructType]
-        assert(field("a").metadata.getBoolean(StructType.metadataKeyForOptionalField))
-        assert(field("c").metadata.getBoolean(StructType.metadataKeyForOptionalField))
-
-        val pathSix = s"${dir.getCanonicalPath}/table6"
-        dfStruct3.write.parquet(pathSix)
-
-        // We will remove the temporary metadata when writing Parquet file.
-        val forPathSix = spark.read.parquet(pathSix).schema
-        assert(forPathSix.forall(!_.metadata.contains(StructType.metadataKeyForOptionalField)))
-
-        // sanity test: make sure optional metadata field is not wrongly set.
-        val pathSeven = s"${dir.getCanonicalPath}/table7"
-        (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(pathSeven)
-        val pathEight = s"${dir.getCanonicalPath}/table8"
-        (4 to 6).map(i => (i, i.toString)).toDF("a", "b").write.parquet(pathEight)
-
-        val df2 = spark.read.parquet(pathSeven, pathEight).filter("a = 1").selectExpr("a", "b")
-        checkAnswer(
-          df2,
-          Row(1, "1"))
-
-        // The fields "a" and "b" exist in both two Parquet files. No metadata is set.
-        assert(!df2.schema("a").metadata.contains(StructType.metadataKeyForOptionalField))
-        assert(!df2.schema("b").metadata.contains(StructType.metadataKeyForOptionalField))
| Seq("true", "false").map { vectorized => | ||
| withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true", | ||
| SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key -> "true", | ||
| SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { | ||
| withTempPath { dir => | ||
| val pathOne = s"${dir.getCanonicalPath}/table1" | ||
| (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(pathOne) | ||
| val pathTwo = s"${dir.getCanonicalPath}/table2" | ||
| (1 to 3).map(i => (i, i.toString)).toDF("c", "b").write.parquet(pathTwo) | ||
|
|
||
| // If the "c = 1" filter gets pushed down, this query will throw an exception which | ||
| // Parquet emits. This is a Parquet issue (PARQUET-389). | ||
| val df = spark.read.parquet(pathOne, pathTwo).filter("c = 1").selectExpr("c", "b", "a") | ||
| checkAnswer( | ||
| df, | ||
| Row(1, "1", null)) | ||
|
|
||
| // The fields "a" and "c" only exist in one Parquet file. | ||
| assert(df.schema("a").metadata.getBoolean(StructType.metadataKeyForOptionalField)) | ||
| assert(df.schema("c").metadata.getBoolean(StructType.metadataKeyForOptionalField)) | ||
|
|
||
| val pathThree = s"${dir.getCanonicalPath}/table3" | ||
| df.write.parquet(pathThree) | ||
|
|
||
| // We will remove the temporary metadata when writing Parquet file. | ||
| val schema = spark.read.parquet(pathThree).schema | ||
| assert(schema.forall(!_.metadata.contains(StructType.metadataKeyForOptionalField))) | ||
|
|
||
| val pathFour = s"${dir.getCanonicalPath}/table4" | ||
| val dfStruct = sparkContext.parallelize(Seq((1, 1))).toDF("a", "b") | ||
| dfStruct.select(struct("a").as("s")).write.parquet(pathFour) | ||
|
|
||
| val pathFive = s"${dir.getCanonicalPath}/table5" | ||
| val dfStruct2 = sparkContext.parallelize(Seq((1, 1))).toDF("c", "b") | ||
| dfStruct2.select(struct("c").as("s")).write.parquet(pathFive) | ||
|
|
||
| // If the "s.c = 1" filter gets pushed down, this query will throw an exception which | ||
| // Parquet emits. | ||
| val dfStruct3 = spark.read.parquet(pathFour, pathFive).filter("s.c = 1") | ||
| .selectExpr("s") | ||
| checkAnswer(dfStruct3, Row(Row(null, 1))) | ||
|
|
||
| // The fields "s.a" and "s.c" only exist in one Parquet file. | ||
| val field = dfStruct3.schema("s").dataType.asInstanceOf[StructType] | ||
| assert(field("a").metadata.getBoolean(StructType.metadataKeyForOptionalField)) | ||
| assert(field("c").metadata.getBoolean(StructType.metadataKeyForOptionalField)) | ||
|
|
||
| val pathSix = s"${dir.getCanonicalPath}/table6" | ||
| dfStruct3.write.parquet(pathSix) | ||
|
|
||
| // We will remove the temporary metadata when writing Parquet file. | ||
| val forPathSix = spark.read.parquet(pathSix).schema | ||
| assert(forPathSix.forall(!_.metadata.contains(StructType.metadataKeyForOptionalField))) | ||
|
|
||
| // sanity test: make sure optional metadata field is not wrongly set. | ||
| val pathSeven = s"${dir.getCanonicalPath}/table7" | ||
| (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(pathSeven) | ||
| val pathEight = s"${dir.getCanonicalPath}/table8" | ||
| (4 to 6).map(i => (i, i.toString)).toDF("a", "b").write.parquet(pathEight) | ||
|
|
||
| val df2 = spark.read.parquet(pathSeven, pathEight).filter("a = 1").selectExpr("a", "b") | ||
| checkAnswer( | ||
| df2, | ||
| Row(1, "1")) | ||
|
|
||
| // The fields "a" and "b" exist in both two Parquet files. No metadata is set. | ||
| assert(!df2.schema("a").metadata.contains(StructType.metadataKeyForOptionalField)) | ||
| assert(!df2.schema("b").metadata.contains(StructType.metadataKeyForOptionalField)) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -527,4 +530,32 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext
       assert(df.filter("_1 IS NOT NULL").count() === 4)
     }
   }
+
| test("Fiters should be pushed down for vectorized Parquet reader at row group level") { | ||
|
Contributor: What about the non-vectorized reader?

Member (Author): For the non-vectorized reader, we use Parquet's own ParquetRecordReader.

Contributor: @viirya

Member (Author): We already have a test for the pushed-down filters with ParquetRecordReader, but it works at the individual record level. If you mean the row-group level: because ParquetRecordReader doesn't expose a row-group count, I don't think we can check whether the filter is applied at the row-group level. Besides, that is really the functionality of ParquetRecordReader itself, so it should be unit-tested in the Parquet project.

Member (Author): Yeah, since ParquetRecordReader also reads the pushed-down filters from the Configuration, I think this also fixes SPARK-16321.
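To make the mechanism discussed in this thread concrete, here is a minimal, self-contained sketch (not code from this PR) of how a pushed-down predicate travels through the Hadoop Configuration that both the vectorized reader and Parquet's own ParquetRecordReader consult. The column name and predicate are illustrative only.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.filter2.predicate.FilterApi
import org.apache.parquet.hadoop.ParquetInputFormat

object FilterPushdownSketch {
  def main(args: Array[String]): Unit = {
    val hadoopConf = new Configuration()

    // Illustrative Parquet predicate corresponding to a SQL filter like "a < 100".
    val predicate = FilterApi.lt(FilterApi.intColumn("a"), Integer.valueOf(100))

    // The planning side stores the predicate in the per-task Configuration.
    ParquetInputFormat.setFilterPredicate(hadoopConf, predicate)

    // Any reader built from that Configuration (the vectorized reader or Parquet's
    // ParquetRecordReader) can retrieve it and skip row groups whose statistics
    // cannot satisfy the predicate.
    val filter = ParquetInputFormat.getFilter(hadoopConf)
    println(filter)
  }
}
```

Because the predicate is set once on the Configuration, both code paths benefit from it, which is the point of the SPARK-16321 remark above.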
+    import testImplicits._
+
+    withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "true",
+      SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
+      withTempPath { dir =>
+        val path = s"${dir.getCanonicalPath}/table"
+        (1 to 1024).map(i => (101, i)).toDF("a", "b").write.parquet(path)
+
+        Seq(("true", (x: Long) => x == 0), ("false", (x: Long) => x > 0)).map { case (push, func) =>
+          withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> push) {
+            val accu = new LongAccumulator
+            accu.register(sparkContext, Some("numRowGroups"))
+
+            val df = spark.read.parquet(path).filter("a < 100")
+            df.foreachPartition(_.foreach(v => accu.add(0)))
Contributor: What does this test? Shouldn't …

Member (Author): Here we force this trivial foreach function to reference the accumulator without changing it (it only adds 0), so the executor side can see it.
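As a side note, the named-accumulator pattern described here can be illustrated with a small, self-contained sketch. It uses the public SparkContext.longAccumulator API rather than the internal register and AccumulatorContext calls in the test, and the jobs are illustrative only.

```scala
import org.apache.spark.sql.SparkSession

object NamedAccumulatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("accumulator-sketch").getOrCreate()
    val sc = spark.sparkContext

    // Driver side: create a named accumulator, analogous to the test's "numRowGroups".
    val numRowGroups = sc.longAccumulator("numRowGroups")

    // Referencing the accumulator inside a closure ships it to the executors; adding 0
    // leaves its value unchanged but makes the named accumulator visible there, which
    // is what the test's trivial foreachPartition call achieves.
    sc.parallelize(1 to 1024, 4).foreach(_ => numRowGroups.add(0))

    // Executor-side code that holds a reference (in the PR, the Parquet reader looks
    // the accumulator up by name) can then record the real metric.
    sc.parallelize(Seq(1, 1), 2).foreach(_ => numRowGroups.add(1))

    // Back on the driver, the merged value reflects the executors' updates.
    println(s"numRowGroups = ${numRowGroups.value}") // 2

    spark.stop()
  }
}
```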
+            df.collect
+
+            val numRowGroups = AccumulatorContext.lookForAccumulatorByName("numRowGroups")
+            assert(numRowGroups.isDefined)
+            assert(func(numRowGroups.get.asInstanceOf[LongAccumulator].value))
+            AccumulatorContext.remove(accu.id)
+          }
+        }
+      }
+    }
+  }
 }
Reviewer: Is this change related?

Member (Author): A regression test is added for the optional column (one that exists in only part of the Parquet files). Previously the test covered only the non-vectorized Parquet reader; now it also covers the vectorized reader. This PR had related changes here before, but they were removed, as discussed in the folded conversation above.
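For context, a small, self-contained sketch (not part of the PR) of the merged-schema scenario the regression test exercises: two Parquet files share column "b" but each has a column the other lacks, and the query filters on a column that exists in only one file. Paths and data are illustrative.

```scala
import org.apache.spark.sql.SparkSession

object MergedSchemaFilterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("merged-schema-sketch").getOrCreate()
    import spark.implicits._

    val base = "/tmp/merged-schema-sketch" // illustrative location
    (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.mode("overwrite").parquet(s"$base/t1")
    (1 to 3).map(i => (i, i.toString)).toDF("c", "b").write.mode("overwrite").parquet(s"$base/t2")

    // Column "c" exists only in t2, so it is an "optional" column of the merged schema.
    // The filter on "c" must not be pushed into t1's files in a way that breaks the scan.
    val df = spark.read
      .option("mergeSchema", "true")
      .parquet(s"$base/t1", s"$base/t2")
      .filter("c = 1")
      .selectExpr("c", "b", "a")

    // Expected result: a single Row(1, "1", null); rows from t1 carry no value for "c".
    df.show()

    spark.stop()
  }
}
```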