Skip to content

Commit cf02b1a

Browse files
chenhao-db authored and cloud-fan committed
[SPARK-47569][SQL] Disallow comparing variant
### What changes were proposed in this pull request?

It adds type-checking rules to disallow comparing variant values (including grouping by a variant column). We may support comparing variant values in the future, but since we don't have a proper comparison implementation at this point, they should be disallowed on the user surface.

### How was this patch tested?

Unit tests.

Closes #45726 from chenhao-db/SPARK-47569.

Authored-by: Chenhao Li <chenhao.li@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 535734e commit cf02b1a

File tree

5 files changed

+53
-2
lines changed

5 files changed

+53
-2
lines changed

common/utils/src/main/resources/error/error-classes.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1390,6 +1390,12 @@
13901390
],
13911391
"sqlState" : "42805"
13921392
},
1393+
"GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE" : {
1394+
"message" : [
1395+
"The expression <sqlExpr> cannot be used as a grouping expression because its data type <dataType> is not an orderable data type."
1396+
],
1397+
"sqlState" : "42822"
1398+
},
13931399
"HLL_INVALID_INPUT_SKETCH_BUFFER" : {
13941400
"message" : [
13951401
"Invalid call to <function>; only valid HLL sketch buffers are supported as inputs (such as those produced by the `hll_sketch_agg` function)."

docs/sql-error-conditions.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -846,6 +846,12 @@ GROUP BY `<index>` refers to an expression `<aggExpr>` that contains an aggregat
846846

847847
GROUP BY position `<index>` is not in select list (valid range is [1, `<size>`]).
848848

849+
### GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE
850+
851+
[SQLSTATE: 42822](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
852+
853+
The expression `<sqlExpr>` cannot be used as a grouping expression because its data type `<dataType>` is not an orderable data type.
854+
849855
### HLL_INVALID_INPUT_SKETCH_BUFFER
850856

851857
[SQLSTATE: 22546](sql-error-conditions-sqlstates.html#class-22-data-exception)

sql/api/src/main/scala/org/apache/spark/sql/catalyst/expressions/OrderUtils.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,15 @@
1616
*/
1717
package org.apache.spark.sql.catalyst.expressions
1818

19-
import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, NullType, StructType, UserDefinedType}
19+
import org.apache.spark.sql.types.{ArrayType, AtomicType, DataType, NullType, StructType, UserDefinedType, VariantType}
2020

2121
object OrderUtils {
2222
/**
2323
* Returns true iff the data type can be ordered (i.e. can be sorted).
2424
*/
2525
def isOrderable(dataType: DataType): Boolean = dataType match {
2626
case NullType => true
27+
case VariantType => false
2728
case dt: AtomicType => true
2829
case struct: StructType => struct.fields.forall(f => isOrderable(f.dataType))
2930
case array: ArrayType => isOrderable(array.elementType)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExprUtils.scala

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
2828
import org.apache.spark.sql.catalyst.plans.logical.Aggregate
2929
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, CharVarcharUtils}
3030
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase, QueryExecutionErrors}
31-
import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType}
31+
import org.apache.spark.sql.types.{DataType, MapType, StringType, StructType, VariantType}
3232
import org.apache.spark.unsafe.types.UTF8String
3333

3434
object ExprUtils extends QueryErrorsBase {
@@ -193,6 +193,15 @@ object ExprUtils extends QueryErrorsBase {
193193
messageParameters = Map("sqlExpr" -> expr.sql))
194194
}
195195

196+
// Variant values cannot be compared yet, so reject any grouping expression whose data type
// contains a variant (at any nesting level).
197+
if (expr.dataType.existsRecursively(_.isInstanceOf[VariantType])) {
198+
expr.failAnalysis(
199+
errorClass = "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE",
200+
messageParameters = Map(
201+
"sqlExpr" -> toSQLExpr(expr),
202+
"dataType" -> toSQLType(expr.dataType)))
203+
}
204+
196205
if (!expr.deterministic) {
197206
// This is just a sanity check, our analysis rule PullOutNondeterministic should
198207
// already pull out those nondeterministic expressions and evaluate them in

sql/core/src/test/scala/org/apache/spark/sql/VariantSuite.scala

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,4 +269,33 @@ class VariantSuite extends QueryTest with SharedSparkSession {
269269
}
270270
}
271271
}
272+
273+
test("group/order/join variant are disabled") {
274+
var ex = intercept[AnalysisException] {
275+
spark.sql("select parse_json('') group by 1")
276+
}
277+
assert(ex.getErrorClass == "GROUP_EXPRESSION_TYPE_IS_NOT_ORDERABLE")
278+
279+
ex = intercept[AnalysisException] {
280+
spark.sql("select parse_json('') order by 1")
281+
}
282+
assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
283+
284+
ex = intercept[AnalysisException] {
285+
spark.sql("select parse_json('') sort by 1")
286+
}
287+
assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
288+
289+
ex = intercept[AnalysisException] {
290+
spark.sql("with t as (select 1 as a, parse_json('') as v) " +
291+
"select rank() over (partition by a order by v) from t")
292+
}
293+
assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
294+
295+
ex = intercept[AnalysisException] {
296+
spark.sql("with t as (select parse_json('') as v) " +
297+
"select t1.v from t as t1 join t as t2 on t1.v = t2.v")
298+
}
299+
assert(ex.getErrorClass == "DATATYPE_MISMATCH.INVALID_ORDERING_TYPE")
300+
}
272301
}

0 commit comments

Comments
 (0)