PruneFileSourcePartitions.scala
@@ -26,6 +26,19 @@ import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, FileScan}
import org.apache.spark.sql.types.StructType

/**
 * Prune the partitions of file-source-based tables using partition filters. Currently, this
 * rule is applied to [[HadoopFsRelation]] with [[CatalogFileIndex]] and to
 * [[DataSourceV2ScanRelation]] with [[FileScan]].
 *
 * For [[HadoopFsRelation]], the location will be replaced by a pruned file index and the
 * corresponding statistics will be updated. The partition filters will be kept in the filters
 * of the returned logical plan.
 *
 * For [[DataSourceV2ScanRelation]], both partition filters and data filters will be added to
 * its underlying [[FileScan]], and the partition filters will be removed from the filters of
 * the returned logical plan.
 */
private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {

private def getPartitionKeyFiltersAndDataFilters(
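A minimal sketch (not part of this diff; it assumes a local Spark session, and the table name `events` and its columns are hypothetical) of how the rule's effect can be observed: filtering a partitioned catalog table on its partition column lets PruneFileSourcePartitions swap the [[CatalogFileIndex]] for a pruned file index, so the scan only lists the matching partitions.

import org.apache.spark.sql.SparkSession

object PruneFileSourcePartitionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("prune-file-source-partitions-sketch")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Hypothetical catalog table partitioned by `dt`.
    Seq((1, "2020-01-01"), (2, "2020-01-02"))
      .toDF("id", "dt")
      .write
      .partitionBy("dt")
      .mode("overwrite")
      .saveAsTable("events")

    // `dt = '2020-01-01'` is a partition filter: after this rule runs, the scan's
    // file index covers only the matching partition, while the partition filter is
    // kept in the filters of the returned logical plan, as described above.
    spark.table("events").filter($"dt" === "2020-01-01").explain(true)

    spark.stop()
  }
}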
PruneHiveTablePartitions.scala
@@ -30,6 +30,14 @@ import org.apache.spark.sql.execution.datasources.DataSourceStrategy
import org.apache.spark.sql.internal.SQLConf

/**
 * Prune Hive table partitions using the partition filters on [[HiveTableRelation]]. The pruned
 * partitions will be kept in [[HiveTableRelation.prunedPartitions]], and the statistics of the
 * Hive table relation will be updated based on the pruned partitions.
 *
 * This rule is executed in the optimization phase, so the statistics can be updated before
 * physical planning, which is useful for some Spark strategies, e.g.
 * [[org.apache.spark.sql.execution.SparkStrategies.JoinSelection]].
 *
 * TODO: merge this with PruneFileSourcePartitions after we completely make Hive a data source.
 */
private[sql] class PruneHiveTablePartitions(session: SparkSession)
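A minimal sketch (assuming a local session with Hive support; the table `sales` and its columns are hypothetical) of why updating statistics during optimization matters: the optimized plan's size estimate reflects only the surviving partitions, which can push it under spark.sql.autoBroadcastJoinThreshold and let JoinSelection pick a broadcast join.

import org.apache.spark.sql.SparkSession

object PruneHiveTablePartitionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("prune-hive-table-partitions-sketch")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Hypothetical Hive table partitioned by `dt`.
    spark.sql(
      "CREATE TABLE IF NOT EXISTS sales (amount DOUBLE) " +
        "PARTITIONED BY (dt STRING) STORED AS PARQUET")

    val pruned = spark.sql("SELECT * FROM sales WHERE dt = '2020-01-01'")
    // Because this rule runs in the optimizer, the optimized plan's statistics
    // already reflect only the pruned partitions before physical planning, where
    // JoinSelection consults them.
    println(pruned.queryExecution.optimizedPlan.stats.sizeInBytes)

    spark.stop()
  }
}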