diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 933384ed43e9..bc9d4cd7f418 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -598,6 +598,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiteral { def unapply(expr: Expression): Option[String] = expr match { + case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs. case Literal(value, _: IntegralType) => Some(value.toString) case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString)) case _ => None @@ -606,7 +607,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiterals { def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { - val extractables = exprs.map(ExtractableLiteral.unapply) + // SPARK-24879: The Hive metastore filter parser does not support "null", but we still want + // to push down as many predicates as we can while still maintaining correctness. + // In SQL, the `IN` expression evaluates as follows: + // > `1 in (2, NULL)` -> NULL + // > `1 in (1, NULL)` -> true + // > `1 in (2)` -> false + // Since Hive metastore filters are NULL-intolerant binary operations joined only by + // `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as + // `1 in (2)`. + // If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts + // pushing down these predicates, then this optimization will become incorrect and need + // to be changed. + val extractables = exprs + .filter { + case Literal(null, _) => false + case _ => true + }.map(ExtractableLiteral.unapply) if (extractables.nonEmpty && extractables.forall(_.isDefined)) { Some(extractables.map(_.get)) } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 19765695fbcb..2a4efd0cce6e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil, """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""") + filterTest("SPARK-24879 null literals should be ignored for IN constructs", + (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil, + "(intcol = 1)") + + // Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization + // will be applied by Catalyst, this filter converter does not need to account for this. + filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE", + (a("intcol", IntegerType) in Literal(null)) :: Nil, + "") + + filterTest("typecast null literals should not be pushed down in simple predicates", + (a("intcol", IntegerType) === Literal(null, IntegerType)) :: Nil, + "") + private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") {