From c7fbb3eb47f864f4ffcb372da6a7db3b7f8471f2 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Thu, 28 Dec 2023 13:48:15 +0100 Subject: [PATCH 1/4] initial working version --- .../sql/catalyst/encoders/ExpressionEncoder.scala | 5 +---- .../analyzer-results/selectExcept.sql.out | 12 ++++++++++++ .../resources/sql-tests/inputs/selectExcept.sql | 1 + .../sql-tests/results/selectExcept.sql.out | 14 ++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 74d7a5e7a6757..3f538aea0fe36 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -324,10 +324,7 @@ case class ExpressionEncoder[T]( // same `BoundReference` to refer to the object, and throw exception if they don't. assert(serializer.forall(_.references.isEmpty), "serializer cannot reference any attributes.") assert(serializer.flatMap { ser => - val boundRefs = ser.collect { case b: BoundReference => b } - assert(boundRefs.nonEmpty, - "each serializer expression should contain at least one `BoundReference`") - boundRefs + ser.collect { case b: BoundReference => b } }.distinct.length <= 1, "all serializer expressions must use the same BoundReference.") /** diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out index 8643d40b886bd..48c7ad10ed788 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out @@ -121,6 +121,18 @@ Project [id#x, name#x, named_struct(f1, data#x.f1, s2, named_struct(f3, data#x.s +- LocalRelation [id#x, name#x, data#x] +-- !query +SELECT * EXCEPT (data.f1, data.s2) FROM tbl_view +-- !query analysis +Project [id#x, name#x, named_struct() AS data#x] ++- SubqueryAlias tbl_view + +- View (`tbl_view`, [id#x,name#x,data#x]) + +- Project [cast(id#x as int) AS id#x, cast(name#x as string) AS name#x, cast(data#x as struct>) AS data#x] + +- Project [id#x, name#x, data#x] + +- SubqueryAlias tbl_view + +- LocalRelation [id#x, name#x, data#x] + + -- !query SELECT * EXCEPT (id, name, data) FROM tbl_view -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql b/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql index e07e4f1117c29..08d56aeda0a8f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/selectExcept.sql @@ -20,6 +20,7 @@ SELECT * EXCEPT (data) FROM tbl_view; SELECT * EXCEPT (data.f1) FROM tbl_view; SELECT * EXCEPT (data.s2) FROM tbl_view; SELECT * EXCEPT (data.s2.f2) FROM tbl_view; +SELECT * EXCEPT (data.f1, data.s2) FROM tbl_view; -- EXCEPT all columns SELECT * EXCEPT (id, name, data) FROM tbl_view; -- EXCEPT special character names diff --git a/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out b/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out index 6f6ba9097342a..2621782342cce 100644 --- a/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/selectExcept.sql.out @@ -121,6 +121,20 @@ struct>> 70 name7 {"f1":7,"s2":{"f3":"g"}} +-- !query +SELECT * EXCEPT (data.f1, data.s2) FROM tbl_view +-- !query schema +struct> +-- !query output +10 name1 {} +20 name2 {} +30 name3 {} +40 name4 {} +50 name5 {} +60 name6 {} +70 name7 {} + + -- !query SELECT * EXCEPT (id, name, data) FROM tbl_view -- !query schema From 23b5cfbb69bc470c006feae5e1627bba7abc096b Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Thu, 28 Dec 2023 16:47:07 +0100 Subject: [PATCH 2/4] check for an empty struct --- .../sql/catalyst/encoders/ExpressionEncoder.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 3f538aea0fe36..546d905ebf683 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -324,9 +324,17 @@ case class ExpressionEncoder[T]( // same `BoundReference` to refer to the object, and throw exception if they don't. assert(serializer.forall(_.references.isEmpty), "serializer cannot reference any attributes.") assert(serializer.flatMap { ser => - ser.collect { case b: BoundReference => b } + val boundRefs = ser.collect { case b: BoundReference => b } + assert(boundRefs.nonEmpty || isEmptyStruct(ser), + "each serializer expression should contain at least one `BoundReference`") + boundRefs }.distinct.length <= 1, "all serializer expressions must use the same BoundReference.") + private def isEmptyStruct(expr: NamedExpression): Boolean = expr.dataType match { + case struct: StructType => struct.isEmpty + case _ => false + } + /** * Returns a new copy of this encoder, where the `deserializer` is resolved and bound to the * given schema. From 02776349db7a29c42be96f587e67b6cff93a3b66 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Fri, 29 Dec 2023 09:28:25 +0100 Subject: [PATCH 3/4] update the assert error msg --- .../apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 546d905ebf683..fd411ce02ffae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -326,7 +326,8 @@ case class ExpressionEncoder[T]( assert(serializer.flatMap { ser => val boundRefs = ser.collect { case b: BoundReference => b } assert(boundRefs.nonEmpty || isEmptyStruct(ser), - "each serializer expression should contain at least one `BoundReference`") + "each serializer expression should contain at least one `BoundReference` " + + "or be an empty struct") boundRefs }.distinct.length <= 1, "all serializer expressions must use the same BoundReference.") From fd55e42bee7cabffa1bafc444df61a5cb9b03896 Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Fri, 29 Dec 2023 16:16:47 +0100 Subject: [PATCH 4/4] update the error message --- .../spark/sql/catalyst/encoders/ExpressionEncoder.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index fd411ce02ffae..654f393936368 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -326,8 +326,10 @@ case class ExpressionEncoder[T]( assert(serializer.flatMap { ser => val boundRefs = ser.collect { case b: BoundReference => b } assert(boundRefs.nonEmpty || isEmptyStruct(ser), - "each serializer expression should contain at least one `BoundReference` " + - "or be an empty struct") + "each serializer expression should contain at least one `BoundReference` or it " + + "should be an empty struct. This is required to ensure that there is a reference point " + + "for the serialized object or that the serialized object is intentionally left empty." + ) boundRefs }.distinct.length <= 1, "all serializer expressions must use the same BoundReference.")