Skip to content

Commit df061fd

Browse files
cloud-fan
authored and gatorsmile committed
[SPARK-21457][SQL] ExternalCatalog.listPartitions should correctly handle partition values with dot
## What changes were proposed in this pull request?

When we list partitions from hive metastore with a partial partition spec, we are expecting exact matching according to the partition values. However, hive treats dot specially and matches any single character for dot. We should do an extra filter to drop unexpected partitions.

## How was this patch tested?

new regression test.

Author: Wenchen Fan <wenchen@databricks.com>

Closes #18671 from cloud-fan/hive.

(cherry picked from commit f18b905)
Signed-off-by: gatorsmile <gatorsmile@gmail.com>
1 parent 99ce551 commit df061fd

File tree

4 files changed

+35
-13
lines changed

4 files changed

+35
-13
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,18 @@ object ExternalCatalogUtils {
159159
}
160160
}
161161
}
162+
163+
/**
 * Returns true if `spec1` is a partial partition spec w.r.t. `spec2`, e.g. PARTITION (a=1) is a
 * partial partition spec w.r.t. PARTITION (a=1,b=2).
 *
 * A spec is "partial" when every (partitionColumn -> value) pair it contains also appears in
 * `spec2` with the same value.
 *
 * @param spec1 the candidate partial spec
 * @param spec2 the full spec it is checked against
 * @return true iff every entry of `spec1` is present with an equal value in `spec2`
 */
def isPartialPartitionSpec(
    spec1: TablePartitionSpec,
    spec2: TablePartitionSpec): Boolean = {
  spec1.forall {
    // Use Map.get + Option.contains so a column named in `spec1` but absent from `spec2`
    // makes the predicate false instead of throwing NoSuchElementException, as the
    // partial-apply form `spec2(partitionColumn)` would.
    case (partitionColumn, value) => spec2.get(partitionColumn).contains(value)
  }
}
162174
}
163175

164176
object CatalogUtils {

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -542,18 +542,6 @@ class InMemoryCatalog(
542542
}
543543
}
544544

545-
/**
546-
* Returns true if `spec1` is a partial partition spec w.r.t. `spec2`, e.g. PARTITION (a=1) is a
547-
* partial partition spec w.r.t. PARTITION (a=1,b=2).
548-
*/
549-
private def isPartialPartitionSpec(
550-
spec1: TablePartitionSpec,
551-
spec2: TablePartitionSpec): Boolean = {
552-
spec1.forall {
553-
case (partitionColumn, value) => spec2(partitionColumn) == value
554-
}
555-
}
556-
557545
override def listPartitionsByFilter(
558546
db: String,
559547
table: String,

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,18 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac
439439
assert(catalog.listPartitions("db2", "tbl2", Some(Map("a" -> "unknown"))).isEmpty)
440440
}
441441

442+
test("SPARK-21457: list partitions with special chars") {
  // Regression test: partition values containing '+' or '.' must be matched exactly when
  // listing by spec (Hive's metastore treats '.' as "any single character", so '.' and '+'
  // could otherwise cross-match).
  val catalog = newBasicCatalog()
  assert(catalog.listPartitions("db2", "tbl1").isEmpty)

  val plusPart = CatalogTablePartition(Map("a" -> "1", "b" -> "i+j"), storageFormat)
  val dotPart = CatalogTablePartition(Map("a" -> "1", "b" -> "i.j"), storageFormat)
  catalog.createPartitions("db2", "tbl1", Seq(plusPart, dotPart), ignoreIfExists = false)

  // Each full spec must return exactly its own partition and nothing else.
  Seq(plusPart, dotPart).foreach { p =>
    assert(catalog.listPartitions("db2", "tbl1", Some(p.spec)).map(_.spec) == Seq(p.spec))
  }
}
442454
test("list partitions by filter") {
443455
val tz = TimeZone.getDefault.getID
444456
val catalog = newBasicCatalog()

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1074,9 +1074,19 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
10741074
table: String,
10751075
partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient {
10761076
val partColNameMap = buildLowerCasePartColNameMap(getTable(db, table))
1077-
client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part =>
1077+
val res = client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part =>
10781078
part.copy(spec = restorePartitionSpec(part.spec, partColNameMap))
10791079
}
1080+
1081+
partialSpec match {
1082+
// This might be a bug of Hive: When the partition value inside the partial partition spec
1083+
// contains dot, and we ask Hive to list partitions w.r.t. the partial partition spec, Hive
1084+
// treats dot as matching any single character and may return more partitions than we
1085+
// expected. Here we do an extra filter to drop unexpected partitions.
1086+
case Some(spec) if spec.exists(_._2.contains(".")) =>
1087+
res.filter(p => isPartialPartitionSpec(spec, p.spec))
1088+
case _ => res
1089+
}
10801090
}
10811091

10821092
override def listPartitionsByFilter(

0 commit comments

Comments (0)