[SPARK-36136][SQL][TESTS] Refactor PruneFileSourcePartitionsSuite etc to a different package #33350
Changes from all commits:
```diff
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.spark.sql.hive.execution
+package org.apache.spark.sql.execution.datasources
 
 import org.scalatest.matchers.should.Matchers._
 
@@ -24,18 +24,19 @@ import org.apache.spark.sql.{QueryTest, Row}
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan}
-import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
 import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
 import org.apache.spark.sql.functions.broadcast
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.StructType
 
-class PruneFileSourcePartitionsSuite extends PrunePartitionSuiteBase {
+class PruneFileSourcePartitionsSuite extends PrunePartitionSuiteBase with SharedSparkSession {
 
   override def format: String = "parquet"
 
@@ -45,35 +46,27 @@ class PruneFileSourcePartitionsSuite extends PrunePartitionSuiteBase {
 
   test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
     withTable("test") {
-      withTempDir { dir =>
-        sql(
-          s"""
-            |CREATE EXTERNAL TABLE test(i int)
-            |PARTITIONED BY (p int)
-            |STORED AS parquet
-            |LOCATION '${dir.toURI}'""".stripMargin)
-
-        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
-        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)
-
-        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
-          tableMeta.partitionColumnNames.contains(f.name)
-        })
-        val relation = HadoopFsRelation(
-          location = catalogFileIndex,
-          partitionSchema = tableMeta.partitionSchema,
-          dataSchema = dataSchema,
-          bucketSpec = None,
-          fileFormat = new ParquetFileFormat(),
-          options = Map.empty)(sparkSession = spark)
-
-        val logicalRelation = LogicalRelation(relation, tableMeta)
-        val query = Project(Seq(Symbol("i"), Symbol("p")),
-          Filter(Symbol("p") === 1, logicalRelation)).analyze
-
-        val optimized = Optimize.execute(query)
-        assert(optimized.missingInput.isEmpty)
-      }
+      spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("test")
+      val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
+      val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)
+
+      val dataSchema = StructType(tableMeta.schema.filterNot { f =>
+        tableMeta.partitionColumnNames.contains(f.name)
+      })
+      val relation = HadoopFsRelation(
+        location = catalogFileIndex,
+        partitionSchema = tableMeta.partitionSchema,
+        dataSchema = dataSchema,
+        bucketSpec = None,
+        fileFormat = new ParquetFileFormat(),
+        options = Map.empty)(sparkSession = spark)
+
+      val logicalRelation = LogicalRelation(relation, tableMeta)
+      val query = Project(Seq(Symbol("id"), Symbol("p")),
+        Filter(Symbol("p") === 1, logicalRelation)).analyze
+
+      val optimized = Optimize.execute(query)
+      assert(optimized.missingInput.isEmpty)
     }
   }
 
@@ -142,6 +135,10 @@ class PruneFileSourcePartitionsSuite extends PrunePartitionSuiteBase {
     }
   }
 
+  protected def collectPartitionFiltersFn(): PartialFunction[SparkPlan, Seq[Expression]] = {
+    case scan: FileSourceScanExec => scan.partitionFilters
+  }
+
   override def getScanExecPartitionSize(plan: SparkPlan): Long = {
     plan.collectFirst {
       case p: FileSourceScanExec => p.selectedPartitions.length
```
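The substantive change in the test above is how the `test` table gets created: the old suite issued Hive DDL (`CREATE EXTERNAL TABLE ... STORED AS parquet`), while the new one writes a data source table through the DataFrame API. A minimal, self-contained sketch of the contrast (a hypothetical standalone program, not part of the PR; the object name, master setting, and paths are placeholders):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical standalone sketch, not part of the PR.
object TableCreationSketch extends App {
  val spark = SparkSession.builder()
    .master("local[*]")
    .appName("table-creation-sketch")
    .getOrCreate()

  // New path (data source table): no Hive classes involved, so the suite can
  // run on a plain SharedSparkSession instead of the Hive test harness.
  spark.range(10).selectExpr("id", "id % 3 as p")
    .write.partitionBy("p").saveAsTable("test")

  // Old path (Hive external table), shown for contrast. It needs a session
  // built with .enableHiveSupport(), which is why the suite used to live in
  // the hive module:
  //
  //   spark.sql(
  //     """CREATE EXTERNAL TABLE test(i int)
  //       |PARTITIONED BY (p int)
  //       |STORED AS parquet
  //       |LOCATION '/tmp/test'""".stripMargin)

  // Either way, the table metadata is visible through the external catalog,
  // which is all the test needs to hand-build its HadoopFsRelation.
  val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
  assert(tableMeta.partitionColumnNames == Seq("p"))

  spark.stop()
}
```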
so it's not only moving the package, but also changing some tests to not use Hive tables but use data source tables instead?
Yes, since we're moving `PruneFileSourcePartitionsSuite` out of the `hive` package, we need to remove the Hive dependency here too.

As commented in the other thread, to me it's OK to switch to using a data source table here. I also dug through the history of the change, and it seems that at the time this test was added (in #15569), data source tables didn't use HMS to store table metadata by default (that was added later in #15515), but instead used `ListingFileCatalog` (?). Maybe it was for testing purposes that we created a Hive table here but then constructed a `LogicalRelation` to feed into the `PruneFileSourcePartitions` rule?

Let me know if you see any concern here @cloud-fan, since you are the main author of this test and the related code :)
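For context on the thread above: `Optimize.execute(query)` in the test comes from the suite's shared base. A sketch of what such a single-rule executor presumably looks like (an assumption for illustration; the actual definition lives in `PrunePartitionSuiteBase` and is not shown in this diff). The package declaration matters because `PruneFileSourcePartitions` is `private[sql]`:

```scala
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.RuleExecutor

// A single-batch executor that applies only the rule under test, once.
// Feeding the hand-built LogicalRelation through it means any change in the
// plan's output attributes (checked via `optimized.missingInput`) can be
// attributed to PruneFileSourcePartitions alone.
object Optimize extends RuleExecutor[LogicalPlan] {
  val batches =
    Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
}
```

Isolating one rule in a dedicated `RuleExecutor` is the usual pattern in Catalyst optimizer tests: it avoids interference from the dozens of other batches the full optimizer would run.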