apache · ulysses-you · Feb 24, 2021 · Feb 24, 2021 · Feb 24, 2021 · c21
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -769,7 +769,9 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
       case InSet(child, values) if useAdvanced && values.size > inSetThreshold =>
         val dataType = child.dataType
-        val sortedValues = values.toSeq.sorted(TypeUtils.getInterpretedOrdering(dataType))
+        // Skipping null here is safe; see ExtractableLiterals for the detailed reasoning.
+        val sortedValues = values.filter(_ != null).toSeq
+          .sorted(TypeUtils.getInterpretedOrdering(dataType))
 object ExtractableLiterals { 
   def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { 
     // SPARK-24879: The Hive metastore filter parser does not support "null", but we still want 
     // to push down as many predicates as we can while still maintaining correctness. 
     // In SQL, the `IN` expression evaluates as follows: 
     //  > `1 in (2, NULL)` -> NULL 
     //  > `1 in (1, NULL)` -> true 
     //  > `1 in (2)` -> false 
     // Since Hive metastore filters are NULL-intolerant binary operations joined only by 
     // `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as 
     // `1 in (2)`. 
     // If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts 
     // pushing down these predicates, then this optimization will become incorrect and need 
     // to be changed. 
     val extractables = exprs 
         .filter { 
           case Literal(null, _) => false 
           case _ => true 
         }.map(ExtractableLiteral.unapply) 
 object ExtractableLiterals { 
   def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { 
     // SPARK-24879: The Hive metastore filter parser does not support "null", but we still want 
     // to push down as many predicates as we can while still maintaining correctness. 
     // In SQL, the `IN` expression evaluates as follows: 
     //  > `1 in (2, NULL)` -> NULL 
     //  > `1 in (1, NULL)` -> true 
     //  > `1 in (2)` -> false 
     // Since Hive metastore filters are NULL-intolerant binary operations joined only by 
     // `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as 
     // `1 in (2)`. 
     // If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts 
     // pushing down these predicates, then this optimization will become incorrect and need 
     // to be changed. 
     val extractables = exprs 
         .filter { 
           case Literal(null, _) => false 
           case _ => true 
         }.map(ExtractableLiteral.unapply) 
         convert(And(GreaterThanOrEqual(child, Literal(sortedValues.head, dataType)),
           LessThanOrEqual(child, Literal(sortedValues.last, dataType))))
 

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -179,5 +179,13 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest {
     }
   }
 
+  test("SPARK-34515: Fix NPE if InSet contains null value during getPartitionsByFilter") {
+    withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "2") {
+      val filter = InSet(a("p", IntegerType), Set(null, 1, 2)) // 3 values, one is null
+      val converted = shim.convertFilters(testTable, Seq(filter), conf.sessionLocalTimeZone)
+      assert(converted == "(p >= 1 and p <= 2)") // null is ignored; range is min..max
+    }
+  }
+
   private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
 }