Spark: Add Files perf improvement by pushing down the partition filter to the Spark/Hive catalog #2777
Changes from all commits
First file (`SparkTableUtil`):

```diff
@@ -80,6 +80,7 @@
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
 import scala.Function2;
 import scala.Option;
+import scala.Predef;
 import scala.Some;
 import scala.Tuple2;
 import scala.collection.JavaConverters;
```
```diff
@@ -140,7 +141,7 @@ public static Dataset<Row> partitionDFByFilter(SparkSession spark, String table,
   public static List<SparkPartition> getPartitions(SparkSession spark, String table) {
     try {
       TableIdentifier tableIdent = spark.sessionState().sqlParser().parseTableIdentifier(table);
-      return getPartitions(spark, tableIdent);
+      return getPartitions(spark, tableIdent, null);
     } catch (ParseException e) {
       throw SparkExceptionUtil.toUncheckedException(e, "Unable to parse table identifier: %s", table);
     }
```
```diff
@@ -151,15 +152,23 @@ public static List<SparkPartition> getPartitions(SparkSession spark, String tabl
    *
    * @param spark a Spark session
    * @param tableIdent a table identifier
+   * @param partitionFilter partition filter, or null if no filter
    * @return all table's partitions
    */
-  public static List<SparkPartition> getPartitions(SparkSession spark, TableIdentifier tableIdent) {
+  public static List<SparkPartition> getPartitions(SparkSession spark, TableIdentifier tableIdent,
+                                                   Map<String, String> partitionFilter) {
     try {
       SessionCatalog catalog = spark.sessionState().catalog();
       CatalogTable catalogTable = catalog.getTableMetadata(tableIdent);
 
-      Seq<CatalogTablePartition> partitions = catalog.listPartitions(tableIdent, Option.empty());
+      Option<scala.collection.immutable.Map<String, String>> scalaPartitionFilter;
+      if (partitionFilter != null && !partitionFilter.isEmpty()) {
+        scalaPartitionFilter = Option.apply(JavaConverters.mapAsScalaMapConverter(partitionFilter).asScala()
+            .toMap(Predef.conforms()));
+      } else {
+        scalaPartitionFilter = Option.empty();
+      }
+      Seq<CatalogTablePartition> partitions = catalog.listPartitions(tableIdent, scalaPartitionFilter);
       return JavaConverters
           .seqAsJavaListConverter(partitions)
           .asJava()
```

> **Author (Member), on the `Predef.conforms()` conversion:** The Scala API requires an immutable map, hence this extra step.
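For readers less familiar with Java/Scala interop, here is a minimal, self-contained sketch of that conversion step (the class and method names are illustrative, not from the PR): `asScala()` only produces a mutable wrapper around the Java map, while `SessionCatalog.listPartitions` expects a `scala.collection.immutable.Map`, so `toMap(Predef.conforms())` materializes an immutable copy.

```java
// Minimal sketch (names are illustrative): converting a java.util.Map into
// the Option<scala.collection.immutable.Map<...>> that Spark's
// SessionCatalog.listPartitions expects.
import java.util.HashMap;
import java.util.Map;
import scala.Option;
import scala.Predef;
import scala.collection.JavaConverters;

public class ScalaFilterConversion {

  static Option<scala.collection.immutable.Map<String, String>> toScalaFilter(
      Map<String, String> partitionFilter) {
    if (partitionFilter == null || partitionFilter.isEmpty()) {
      // No filter: the catalog lists every partition.
      return Option.empty();
    }
    // asScala() is only a mutable view of the Java map; toMap(Predef.conforms())
    // copies it into the immutable Map required by the Scala API.
    return Option.apply(
        JavaConverters.mapAsScalaMapConverter(partitionFilter).asScala()
            .toMap(Predef.conforms()));
  }

  public static void main(String[] args) {
    Map<String, String> filter = new HashMap<>();
    filter.put("dept", "hr");
    System.out.println(toScalaFilter(filter)); // Some(Map(dept -> hr))
    System.out.println(toScalaFilter(null));   // None
  }
}
```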
```diff
@@ -375,14 +384,11 @@ public static void importSparkTable(SparkSession spark, TableIdentifier sourceTa
       if (Objects.equal(spec, PartitionSpec.unpartitioned())) {
         importUnpartitionedSparkTable(spark, sourceTableIdentWithDB, targetTable);
       } else {
-        List<SparkPartition> sourceTablePartitions = getPartitions(spark, sourceTableIdent);
+        List<SparkPartition> sourceTablePartitions = getPartitions(spark, sourceTableIdent,
+            partitionFilter);
         Preconditions.checkArgument(!sourceTablePartitions.isEmpty(),
             "Cannot find any partitions in table %s", sourceTableIdent);
-        List<SparkPartition> filteredPartitions = filterPartitions(sourceTablePartitions, partitionFilter);
-        Preconditions.checkArgument(!filteredPartitions.isEmpty(),
-            "Cannot find any partitions which match the given filter. Partition filter is %s",
-            MAP_JOINER.join(partitionFilter));
-        importSparkPartitions(spark, filteredPartitions, targetTable, spec, stagingDir);
+        importSparkPartitions(spark, sourceTablePartitions, targetTable, spec, stagingDir);
       }
     } catch (AnalysisException e) {
       throw SparkExceptionUtil.toUncheckedException(
```
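For contrast, a rough sketch of the client-side filtering this hunk deletes. The body of the removed `filterPartitions` helper is not shown in this diff, so the version below is my reconstruction, and `getValues()` is assumed to expose the partition's key/value map: previously every partition was fetched from the catalog and then matched against the filter in memory, whereas with the pushdown the catalog never returns the non-matching partitions at all.

```java
// Assumed shape of the removed client-side step: keep only the partitions
// whose values contain every (key, value) pair of the filter map.
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;

final class ClientSideFilterSketch {

  static List<SparkPartition> filterPartitions(List<SparkPartition> partitions,
                                               Map<String, String> partitionFilter) {
    if (partitionFilter.isEmpty()) {
      return partitions; // empty map means no filter
    }
    return partitions.stream()
        .filter(p -> p.getValues().entrySet().containsAll(partitionFilter.entrySet()))
        .collect(Collectors.toList());
  }
}
```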
Second file (`add_files` procedure tests):
```diff
@@ -318,6 +318,26 @@ public void addFilteredPartitionsToPartitioned() {
         sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
   }
 
+  @Test
+  public void addFilteredPartitionsToPartitioned2() {
+    createCompositePartitionedTable("parquet");
+
+    String createIceberg =
+        "CREATE TABLE %s (id Integer, name String, dept String, subdept String) USING iceberg " +
+        "PARTITIONED BY (id, dept)";
+
+    sql(createIceberg, tableName);
+
+    Object result = scalarSql("CALL %s.system.add_files('%s', '`parquet`.`%s`', map('dept', 'hr'))",
+        catalogName, tableName, fileTableDir.getAbsolutePath());
+
+    Assert.assertEquals(6L, result);
+
+    assertEquals("Iceberg table contains correct data",
+        sql("SELECT id, name, dept, subdept FROM %s WHERE dept = 'hr' ORDER BY id", sourceTableName),
+        sql("SELECT id, name, dept, subdept FROM %s ORDER BY id", tableName));
+  }
+
   @Test
   public void addWeirdCaseHiveTable() {
     createWeirdCaseTable();
```

> **Reviewer (Member), on the new test:** I think we need a SparkTableUtil test for the new getPartitions code as well, unless that's a pain.
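A hedged sketch of the kind of SparkTableUtil-level test the reviewer is asking for. The table name, data, and Hive-enabled session setup are all assumptions of mine, not code from the PR: create a partitioned source table, then check that a filter passed to `getPartitions` prunes at the catalog.

```java
// Illustrative test sketch, not part of the PR. Assumes a local Spark
// session with Hive catalog support so that listPartitions is available.
import java.util.List;
import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;
import org.junit.Assert;
import org.junit.Test;

public class TestGetPartitionsFilter {

  @Test
  public void getPartitionsHonorsPartitionFilter() throws Exception {
    SparkSession spark = SparkSession.builder()
        .master("local[2]")
        .enableHiveSupport()
        .getOrCreate();

    spark.sql("CREATE TABLE src (id INT, dept STRING) USING parquet PARTITIONED BY (dept)");
    spark.sql("INSERT INTO src PARTITION (dept = 'hr') VALUES (1), (2)");
    spark.sql("INSERT INTO src PARTITION (dept = 'ops') VALUES (3)");

    TableIdentifier ident = spark.sessionState().sqlParser().parseTableIdentifier("src");

    // A null filter keeps the old behavior: every partition is returned.
    List<SparkPartition> all = SparkTableUtil.getPartitions(spark, ident, null);
    Assert.assertEquals(2, all.size());

    // With a filter, only the matching partition comes back from the catalog.
    Map<String, String> filter = ImmutableMap.of("dept", "hr");
    List<SparkPartition> hrOnly = SparkTableUtil.getPartitions(spark, ident, filter);
    Assert.assertEquals(1, hrOnly.size());
  }
}
```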
> **Reviewer (Member):** Do we benefit from having this as a Java Optional? Since we have to immediately convert it to Scala, maybe we should just pass a normal Map?
>
> **Reviewer (Member):** I think the `filterPartitions` function would just use an empty map as "no filter"?
>
> **Author:** Done.
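To make the trade-off in that thread concrete, a small sketch of the two parameter styles under discussion (all names are illustrative, not from the PR): a `java.util.Optional` wrapper versus a plain map where an empty map simply means "no filter".

```java
// Illustrative only: contrasting an Optional-wrapped filter with the
// plain-Map convention suggested above, where empty means "no filter".
import java.util.Collections;
import java.util.Map;
import java.util.Optional;

public class FilterParameterStyles {

  // Optional style: callers must wrap, and the method must unwrap.
  static void listWithOptional(Optional<Map<String, String>> filter) {
    Map<String, String> effective = filter.orElse(Collections.emptyMap());
    System.out.println(effective.isEmpty() ? "no filter" : "filter: " + effective);
  }

  // Plain-Map style: an empty map is the natural "no filter" value, so no
  // wrapper and no null checks are needed.
  static void listWithMap(Map<String, String> filter) {
    System.out.println(filter.isEmpty() ? "no filter" : "filter: " + filter);
  }

  public static void main(String[] args) {
    listWithOptional(Optional.empty());
    listWithOptional(Optional.of(Collections.singletonMap("dept", "hr")));
    listWithMap(Collections.emptyMap());
    listWithMap(Collections.singletonMap("dept", "hr"));
  }
}
```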