diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala
index f2cfcb984594d..003fa7646df20 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/Metadata.scala
@@ -90,6 +90,24 @@ sealed class Metadata private[types] (private[types] val map: Map[String, Any])
   /** Gets a Metadata array. */
   def getMetadataArray(key: String): Array[Metadata] = get(key)
 
+  /** Returns a copy of this Metadata with the given keys removed. */
+  def withKeysRemoved(keysToRemove: Seq[String]): Metadata = {
+    if (keysToRemove.isEmpty) {
+      this
+    } else {
+      new Metadata(this.map -- keysToRemove)
+    }
+  }
+
+  /** Returns a copy of this Metadata with the given key removed. */
+  def withKeyRemoved(keyToRemove: String): Metadata = {
+    if (map.contains(keyToRemove)) {
+      new Metadata(map - keyToRemove)
+    } else {
+      this
+    }
+  }
+
   /** Converts to its JSON representation. */
   def json: String = compact(render(jsonValue))
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala
index 5243e17afe5da..59d015b8ee139 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TableOutputResolver.scala
@@ -133,7 +133,8 @@ object TableOutputResolver extends SQLConfHelper with Logging {
       val canWriteExpr = canWrite(
         tableName, valueType, colType, byName = true, conf, addError, colPath)
       if (canWriteExpr) {
-        applyColumnMetadata(checkNullability(value, col, conf, colPath), col)
+        val nullsHandled = checkNullability(value, col, conf, colPath)
+        applyColumnMetadata(nullsHandled, col)
       } else {
         value
       }
@@ -222,12 +223,29 @@ object TableOutputResolver extends SQLConfHelper with Logging {
     val requiredMetadata = CharVarcharUtils.cleanMetadata(column.metadata)
 
     // Make sure that the result has the requiredMetadata and only that.
-    // If the expr is an Attribute or NamedLambdaVariable with the proper name and metadata,
-    // it should remain stable, but we do not trust that other NamedAttributes will
-    // remain stable (namely Alias).
+    //
+    // If the expr is a NamedLambdaVariable, it must be from our handling of structured
+    // array or map fields; the Alias will be added on the outer structured value.
+    //
+    // Even an Attribute with the proper name and metadata is not enough to prevent
+    // source query metadata leaking to the Write after rewrites, i.e.:
+    //   case a: Attribute if a.name == column.name && a.metadata == requiredMetadata => a
+    //
+    // The problem is that an Attribute can be replaced by what it refers to, for example:
+    //   Project AttrRef(metadata={}, exprId=2)
+    //     Project Alias(
+    //       cast(AttrRef(metadata={source_field_default_value}, exprId=1) as same_type),
+    //       exprId=2,
+    //       explicitMetadata=None)  -- metadata.isEmpty
+    // gets rewritten to:
+    //   Project Alias(
+    //     AttrRef(metadata={source_field_default_value}, exprId=1),
+    //     exprId=2,
+    //     explicitMetadata=None)  -- metadata.nonEmpty !!
+    //
+    // So we always add an Alias(expr, name, explicitMetadata = Some(requiredMetadata))
+    // to prevent expr from leaking the source query metadata into the Write.
     expr match {
-      case a: Attribute if a.name == column.name && a.metadata == requiredMetadata =>
-        a
       case v: NamedLambdaVariable if v.name == column.name && v.metadata == requiredMetadata =>
         v
       case _ =>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala
index b56281fa12ba8..bc02c4bc4572a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/resolver/AliasResolver.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.analysis.resolver
 import org.apache.spark.sql.catalyst.analysis.{AliasResolution, UnresolvedAlias}
 import org.apache.spark.sql.catalyst.expressions.{
   Alias,
+  AliasHelper,
   Expression,
   NamedExpression,
   OuterReference
@@ -31,7 +32,8 @@ import org.apache.spark.sql.errors.QueryCompilationErrors
  */
 class AliasResolver(expressionResolver: ExpressionResolver)
     extends TreeNodeResolver[UnresolvedAlias, Expression]
-    with ResolvesExpressionChildren {
+    with ResolvesExpressionChildren
+    with AliasHelper {
   private val scopes = expressionResolver.getNameScopes
   private val expressionResolutionContextStack =
     expressionResolver.getExpressionResolutionContextStack
@@ -115,30 +117,19 @@ class AliasResolver(expressionResolver: ExpressionResolver)
   *
   * Project[
   *   Alias("alias_2")(
-  *     Alias("alias_1")(id)
-  *   )
+  *     Alias("alias_1")(id1)
+  *   )(id2)
   * ]( ... )
   *
   * and after the `collapseAlias` call (removing the bottom one) it would be:
   *
   * Project[
-  *   Alias("alias_2")(id)
+  *   Alias("alias_2")(id2)
   * ]( ... )
   */
  private def collapseAlias(alias: Alias): Alias = alias.child match {
-    case innerAlias: Alias =>
-      val metadata = if (alias.metadata.isEmpty) {
-        None
-      } else {
-        Some(alias.metadata)
-      }
-      alias.copy(child = innerAlias.child)(
-        exprId = alias.exprId,
-        qualifier = alias.qualifier,
-        explicitMetadata = metadata,
-        nonInheritableMetadataKeys = alias.nonInheritableMetadataKeys
-      )
+    case _: Alias => mergeAliases(alias)
     case _ => alias
   }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
index c54b8162c6e3b..34393aaca7c67 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AliasHelper.scala
@@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions
 
+import scala.annotation.tailrec
+
 import org.apache.spark.sql.catalyst.analysis.MultiAlias
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project}
@@ -68,8 +70,8 @@ trait AliasHelper {
    * but keep the name of the outermost attribute.
    */
   protected def replaceAliasButKeepName(
-     expr: NamedExpression,
-     aliasMap: AttributeMap[Alias]): NamedExpression = {
+      expr: NamedExpression,
+      aliasMap: AttributeMap[Alias]): NamedExpression = {
     expr match {
       // We need to keep the `Alias` if we replace a top-level Attribute, so that it's still a
       // `NamedExpression`. We also need to keep the name of the original Attribute.
@@ -90,7 +92,7 @@ trait AliasHelper {
         case a: Alias if a.metadata != Metadata.empty => a
         case other => trimAliases(other)
       }
-    case a @ Alias(child, _) => trimAliases(child)
+    case Alias(child, _) => trimAliases(child)
     case MultiAlias(child, _) => trimAliases(child)
     case other => other.mapChildren(trimAliases)
   }
@@ -98,21 +100,74 @@ trait AliasHelper {
   protected def trimNonTopLevelAliases[T <: Expression](e: T): T = {
     val res = CurrentOrigin.withOrigin(e.origin) {
       e match {
-        case a: Alias =>
-          // Preserve the _effective_ metadata.
-          a.copy(child = trimAliases(a.child))(
-            exprId = a.exprId,
-            qualifier = a.qualifier,
-            explicitMetadata = Some(a.metadata),
-            nonInheritableMetadataKeys = Nil)
-        case a: MultiAlias =>
-          a.copy(child = trimAliases(a.child))
+        case a: Alias => mergeAndTrimAliases(a)
+        case a: MultiAlias => a.copy(child = trimAliases(a.child))
         case other => trimAliases(other)
       }
     }
-    res.copyTagsFrom(e)
-    res.asInstanceOf[T]
   }
+
+  /**
+   * Merges any stack of aliases under the top-level alias, and then
+   * drops any aliases deeper in the expression tree.
+   * So Alias1(Alias2(Alias3(Foo(Alias4(x))))) becomes
+   *   Alias5(Foo(x))
+   * where Alias5 preserves the metadata of Alias{1,2,3}
+   * and the name and exprId of Alias1.
+   * Alias4 is simply removed.
+   */
+  @tailrec
+  protected final def mergeAndTrimAliases(alias: Alias): Alias = {
+    alias.child match {
+      case _: Alias => mergeAndTrimAliases(mergeAliases(alias))
+      case other => alias.withNewChild(trimAliases(other))
+    }
+  }
+
+  /**
+   * Merges an Alias(Alias(x)) into Alias(x), preserving metadata.
+   *
+   * If the outer alias has explicit metadata,
+   *   it is preserved.
+   * Else if the inner alias has explicit metadata,
+   *   the result has explicit outer.metadata.
+   * Else both are deriving the metadata:
+   *   the result is deriving metadata,
+   *   with the union of non-inheritable keys.
+   *
+   * @param alias An Alias with a child Alias, Alias(Alias(x))
+   * @return The merged alias, Alias(x)
+   */
+  protected final def mergeAliases(alias: Alias): Alias = {
+    val child = alias.child.asInstanceOf[Alias]
+    var explicitMetadata = alias.explicitMetadata
+    var nonInheritableMetadataKeys = alias.nonInheritableMetadataKeys
+
+    if (explicitMetadata.isDefined) {
+      // Outer alias is explicit; we can ignore inner metadata.
+      // The outer nonInheritableMetadataKeys are irrelevant.
+      nonInheritableMetadataKeys = Nil
+    } else if (child.explicitMetadata.isDefined) {
+      // Inner alias is explicit; remove any outer non-inherits.
+      // We don't need nonInheritableMetadataKeys anymore.
+      explicitMetadata = Some(alias.metadata)
+      nonInheritableMetadataKeys = Nil
+    } else {
+      // Both are deriving: union the nonInheritableMetadataKeys.
+      val nonInheritSet = nonInheritableMetadataKeys.toSet
+      nonInheritableMetadataKeys = nonInheritableMetadataKeys ++
+        child.nonInheritableMetadataKeys.filterNot(nonInheritSet)
+    }
+    val res = CurrentOrigin.withOrigin(alias.origin) {
+      alias.copy(child = child.child)(
+        exprId = alias.exprId,
+        qualifier = alias.qualifier,
+        explicitMetadata = explicitMetadata,
+        nonInheritableMetadataKeys = nonInheritableMetadataKeys)
+    }
+    res.copyTagsFrom(alias)
+    res
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
index fb7c8ea3fe550..732fc9a02a1dd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala
@@ -255,6 +255,11 @@ case class Alias(child: Expression, name: String)(
     s"${child.sql} AS $qualifierPrefix${quoteIfNeeded(name)}"
   }
 
+  // Copies this alias with a new child expression.
+  def withNewChild(newChild: Expression): Alias = {
+    withNewChildInternal(newChild)
+  }
+
   override protected def withNewChildInternal(newChild: Expression): Alias =
     copy(child = newChild)(exprId, qualifier, explicitMetadata, nonInheritableMetadataKeys)
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 19a99f6fa8507..fc65c24afcb8f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.trees.AlwaysProcess
 import org.apache.spark.sql.catalyst.trees.TreePattern._
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
+import org.apache.spark.sql.catalyst.util.AUTO_GENERATED_ALIAS
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.internal.SQLConf
@@ -619,11 +620,28 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
       // If the alias name is different from attribute name, we can't strip it either, or we
       // may accidentally change the output schema name of the root plan.
       case a @ Alias(attr: Attribute, name)
-          if (a.metadata == attr.metadata) &&
-            name == attr.name &&
-            !excludeList.contains(attr) &&
-            !excludeList.contains(a) =>
-        attr
+          if !excludeList.contains(attr) &&
+            !excludeList.contains(a) &&
+            name == attr.name =>
+
+        val metadata = a.metadata
+        var attrMetadata = attr.metadata
+        if (metadata == attrMetadata) {
+          // The alias is truly redundant, remove it.
+          attr
+        } else if (attr.metadata.contains(AUTO_GENERATED_ALIAS)) {
+          attrMetadata = attr.metadata.withKeyRemoved(AUTO_GENERATED_ALIAS)
+          if (metadata == attrMetadata) {
+            // The AUTO_GENERATED_ALIAS is not being propagated to a view here, so it is OK to remove it.
+            // With that key removed, the alias is now redundant, remove it.
+ attr.withMetadata(metadata) + } else { + a + } + } else { + a + } + case a => a } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/V2WriteAnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/V2WriteAnalysisSuite.scala index 2ed7c612d3f33..3b4da14c7b158 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/V2WriteAnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/V2WriteAnalysisSuite.scala @@ -332,7 +332,7 @@ abstract class V2WriteAnalysisSuiteBase extends AnalysisTest { ArrayType(new StructType().add("y", "int").add("x", "byte")), hasTransform = true) - withSQLConf("spark.sql.preserveCharVarcharTypeInfo" -> "true") { + withSQLConf(SQLConf.PRESERVE_CHAR_VARCHAR_TYPE_INFO.key -> "true") { // exact match on VARCHAR does not need transform assertArrayField(ArrayType(VarcharType(7)), ArrayType(VarcharType(7)), hasTransform = false) // VARCHAR length increase could avoid transform @@ -512,7 +512,8 @@ abstract class V2WriteAnalysisSuiteBase extends AnalysisTest { val y = query.output.last val parsedPlan = byName(table, query) - val expectedPlan = byName(table, Project(Seq(Alias(X, "x")(), y), query)) + val expectedPlan = byName(table, + Project(Seq(Alias(X, "x")(), Alias(y, y.name)()), query)) assertNotResolved(parsedPlan) checkAnalysis(parsedPlan, expectedPlan, caseSensitive = false) @@ -529,7 +530,8 @@ abstract class V2WriteAnalysisSuiteBase extends AnalysisTest { val x = query.output.last val parsedPlan = byName(table, query) - val expectedPlan = byName(table, Project(Seq(x, y), query)) + val expectedPlan = byName(table, + Project(Seq(Alias(x, x.name)(), Alias(y, y.name)()), query)) assertNotResolved(parsedPlan) checkAnalysis(parsedPlan, expectedPlan) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out index bb0fef48abf7d..48bbd2d2f9828 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-command.sql.out @@ -57,14 +57,15 @@ WITH s AS (SELECT 43 AS col) INSERT INTO cte_tbl SELECT * FROM S -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/cte_tbl, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/cte_tbl], Append, `spark_catalog`.`default`.`cte_tbl`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/cte_tbl), [col] -+- WithCTE - :- CTERelationDef xxxx, false - : +- SubqueryAlias s - : +- Project [43 AS col#x] - : +- OneRowRelation - +- Project [col#x] - +- SubqueryAlias S - +- CTERelationRef xxxx, true, [col#x], false, false, 1, false ++- Project [col#x AS col#x] + +- WithCTE + :- CTERelationDef xxxx, false + : +- SubqueryAlias s + : +- Project [43 AS col#x] + : +- OneRowRelation + +- Project [col#x] + +- SubqueryAlias S + +- CTERelationRef xxxx, true, [col#x], false, false, 1, false -- !query @@ -79,14 +80,15 @@ Project [col#x] INSERT INTO cte_tbl WITH s AS (SELECT 44 AS col) SELECT * FROM s -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/cte_tbl, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/cte_tbl], Append, `spark_catalog`.`default`.`cte_tbl`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in 
comparison]/{warehouse_dir}/cte_tbl), [col] -+- WithCTE - :- CTERelationDef xxxx, false - : +- SubqueryAlias s - : +- Project [44 AS col#x] - : +- OneRowRelation - +- Project [col#x] - +- SubqueryAlias s - +- CTERelationRef xxxx, true, [col#x], false, false, 1, false ++- Project [col#x AS col#x] + +- WithCTE + :- CTERelationDef xxxx, false + : +- SubqueryAlias s + : +- Project [44 AS col#x] + : +- OneRowRelation + +- Project [col#x] + +- SubqueryAlias s + +- CTERelationRef xxxx, true, [col#x], false, false, 1, false -- !query @@ -111,15 +113,17 @@ INSERT INTO cte_tbl2 SELECT col -- !query analysis Union false, false :- InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/cte_tbl, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/cte_tbl], Append, `spark_catalog`.`default`.`cte_tbl`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/cte_tbl), [col] -: +- Project [col#x] -: +- SubqueryAlias s -: +- Project [45 AS col#x] -: +- OneRowRelation +: +- Project [col#x AS col#x] +: +- Project [col#x] +: +- SubqueryAlias s +: +- Project [45 AS col#x] +: +- OneRowRelation +- InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/cte_tbl2, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/cte_tbl2], Append, `spark_catalog`.`default`.`cte_tbl2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/cte_tbl2), [col] - +- Project [col#x] - +- SubqueryAlias s - +- Project [45 AS col#x] - +- OneRowRelation + +- Project [col#x AS col#x] + +- Project [col#x] + +- SubqueryAlias s + +- Project [45 AS col#x] + +- OneRowRelation -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out index 848cdce88b0ba..960c5dfee08c4 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out @@ -1076,20 +1076,21 @@ WITH RECURSIVE r(level) AS ( INSERT INTO rt SELECT * FROM r -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/rt, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/rt], Append, `spark_catalog`.`default`.`rt`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/rt), [level] -+- WithCTE - :- CTERelationDef xxxx, false - : +- SubqueryAlias r - : +- Project [col1#x AS level#x] - : +- UnionLoop xxxx - : :- LocalRelation [col1#x] - : +- Project [(level#x + 1) AS (level + 1)#x] - : +- Filter (level#x < 9) - : +- SubqueryAlias r - : +- Project [col1#x AS level#x] - : +- UnionLoopRef xxxx, [col1#x], false - +- Project [level#x] - +- SubqueryAlias r - +- CTERelationRef xxxx, true, [level#x], false, false, false ++- Project [level#x AS level#x] + +- WithCTE + :- CTERelationDef xxxx, false + : +- SubqueryAlias r + : +- Project [col1#x AS level#x] + : +- UnionLoop xxxx + : :- LocalRelation [col1#x] + : +- Project [(level#x + 1) AS (level + 1)#x] + : +- Filter (level#x < 9) + : +- SubqueryAlias r + : +- Project [col1#x AS level#x] + : +- UnionLoopRef xxxx, [col1#x], false + +- Project [level#x] + +- SubqueryAlias r + +- CTERelationRef xxxx, true, [level#x], false, false, false -- !query @@ -1136,13 +1137,15 @@ WithCTE : +- UnionLoopRef xxxx, [col1#x], false +- 
Union false, false :- InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/rt2, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/rt2], Append, `spark_catalog`.`default`.`rt2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/rt2), [level] - : +- Project [level#x] - : +- SubqueryAlias r - : +- CTERelationRef xxxx, true, [level#x], false, false, false + : +- Project [level#x AS level#x] + : +- Project [level#x] + : +- SubqueryAlias r + : +- CTERelationRef xxxx, true, [level#x], false, false, false +- InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/rt2, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/rt2], Append, `spark_catalog`.`default`.`rt2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/rt2), [level] - +- Project [level#x] - +- SubqueryAlias r - +- CTERelationRef xxxx, true, [level#x], false, false, false + +- Project [level#x AS level#x] + +- Project [level#x] + +- SubqueryAlias r + +- CTERelationRef xxxx, true, [level#x], false, false, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out index 8f0676f7733a4..4b730a0561b4a 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/identifier-clause.sql.out @@ -960,10 +960,11 @@ CreateDataSourceTableAsSelectCommand `spark_catalog`.`default`.`t2`, ErrorIfExis insert into identifier('t2') select my_col from (values (3) as (my_col)) group by 1 -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t2, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/t2], Append, `spark_catalog`.`default`.`t2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t2), [my_col] -+- Aggregate [my_col#x], [my_col#x] - +- SubqueryAlias __auto_generated_subquery_name - +- SubqueryAlias as - +- LocalRelation [my_col#x] ++- Project [my_col#x AS my_col#x] + +- Aggregate [my_col#x], [my_col#x] + +- SubqueryAlias __auto_generated_subquery_name + +- SubqueryAlias as + +- LocalRelation [my_col#x] -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-nulls-ordering.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-nulls-ordering.sql.out index 1eb699deac725..25831f8523877 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-nulls-ordering.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/order-by-nulls-ordering.sql.out @@ -10,7 +10,8 @@ INSERT INTO spark_10747 VALUES (6, 12, 10), (6, 11, 4), (6, 9, 10), (6, 15, 8), (6, 15, 8), (6, 7, 4), (6, 7, 8), (6, 13, null), (6, 10, null) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/spark_10747, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/spark_10747], Append, `spark_catalog`.`default`.`spark_10747`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/spark_10747), [col1, col2, col3] -+- LocalRelation [col1#x, col2#x, col3#x] ++- Project [col1#x AS col1#x, col2#x AS col2#x, col3#x AS col3#x] + +- LocalRelation 
[col1#x, col2#x, col3#x] -- !query @@ -148,7 +149,7 @@ INSERT INTO spark_10747_mix VALUES ('c', 3, 2.0, 2.00, null) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/spark_10747_mix, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/spark_10747_mix], Append, `spark_catalog`.`default`.`spark_10747_mix`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/spark_10747_mix), [col1, col2, col3, col4, col5] -+- Project [col1#x, col2#x, cast(col3#x as double) AS col3#x, cast(col4#x as decimal(10,2)) AS col4#x, cast(col5#x as decimal(20,1)) AS col5#x] ++- Project [col1#x AS col1#x, col2#x AS col2#x, cast(col3#x as double) AS col3#x, cast(col4#x as decimal(10,2)) AS col4#x, cast(col5#x as decimal(20,1)) AS col5#x] +- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x] diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/insert.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/insert.sql.out index 9f232433e1807..d6f2b7ed5d360 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/insert.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/insert.sql.out @@ -9,7 +9,7 @@ CreateDataSourceTableCommand `spark_catalog`.`default`.`inserttest`, false insert into inserttest values (NULL, 3, 'testing') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/inserttest, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/inserttest], Append, `spark_catalog`.`default`.`inserttest`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/inserttest), [col1, col2, col3] -+- Project [cast(col1#x as int) AS col1#x, col2#x, col3#x] ++- Project [cast(col1#x as int) AS col1#x, col2#x AS col2#x, col3#x AS col3#x] +- LocalRelation [col1#x, col2#x, col3#x] @@ -17,7 +17,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into inserttest values (NULL, 5, 'testing') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/inserttest, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/inserttest], Append, `spark_catalog`.`default`.`inserttest`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/inserttest), [col1, col2, col3] -+- Project [cast(col1#x as int) AS col1#x, col2#x, col3#x] ++- Project [cast(col1#x as int) AS col1#x, col2#x AS col2#x, col3#x AS col3#x] +- LocalRelation [col1#x, col2#x, col3#x] @@ -25,7 +25,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_d insert into inserttest values (NULL, 5, 'test') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/inserttest, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/inserttest], Append, `spark_catalog`.`default`.`inserttest`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/inserttest), [col1, col2, col3] -+- Project [cast(col1#x as int) AS col1#x, col2#x, col3#x] ++- Project [cast(col1#x as int) AS col1#x, col2#x AS col2#x, col3#x AS col3#x] +- LocalRelation [col1#x, col2#x, col3#x] @@ -33,7 +33,7 @@ InsertIntoHadoopFsRelationCommand file:[not included in 
comparison]/{warehouse_d insert into inserttest values (NULL, 7, 'testing') -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/inserttest, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/inserttest], Append, `spark_catalog`.`default`.`inserttest`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/inserttest), [col1, col2, col3] -+- Project [cast(col1#x as int) AS col1#x, col2#x, col3#x] ++- Project [cast(col1#x as int) AS col1#x, col2#x AS col2#x, col3#x AS col3#x] +- LocalRelation [col1#x, col2#x, col3#x] @@ -49,7 +49,8 @@ Project [col1#x, col2#x, col3#x] insert into inserttest values(30, 50, repeat('x', 10000)) -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/inserttest, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/inserttest], Append, `spark_catalog`.`default`.`inserttest`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/inserttest), [col1, col2, col3] -+- LocalRelation [col1#x, col2#x, col3#x] ++- Project [col1#x AS col1#x, col2#x AS col2#x, col3#x AS col3#x] + +- LocalRelation [col1#x, col2#x, col3#x] -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out index e02562e29835f..06602788f0bc3 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/selectExcept.sql.out @@ -171,13 +171,14 @@ INSERT INTO ids SELECT * EXCEPT (name, data) FROM tbl_view -- !query analysis InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/ids, false, CSV, [path=file:[not included in comparison]/{warehouse_dir}/ids], Append, `spark_catalog`.`default`.`ids`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/ids), [id] -+- Project [id#x] - +- SubqueryAlias tbl_view - +- View (`tbl_view`, [id#x, name#x, data#x]) - +- Project [cast(id#x as int) AS id#x, cast(name#x as string) AS name#x, cast(data#x as struct>) AS data#x] - +- Project [id#x, name#x, data#x] - +- SubqueryAlias tbl_view - +- LocalRelation [id#x, name#x, data#x] ++- Project [id#x AS id#x] + +- Project [id#x] + +- SubqueryAlias tbl_view + +- View (`tbl_view`, [id#x, name#x, data#x]) + +- Project [cast(id#x as int) AS id#x, cast(name#x as string) AS name#x, cast(data#x as struct>) AS data#x] + +- Project [id#x, name#x, data#x] + +- SubqueryAlias tbl_view + +- LocalRelation [id#x, name#x, data#x] -- !query diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out index 323084223d4bc..b46d43a626951 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/thetasketch.sql.out @@ -998,7 +998,7 @@ WithCTE : +- SubqueryAlias union_result : +- Aggregate [theta_union_agg(sketch#x, 16, 0, 0) AS union_sketch#x] : +- SubqueryAlias sketches -: +- CTERelationRef xxxx, true, [sketch_type#x, sketch#x], false, false, 4 +: +- CTERelationRef xxxx, true, [sketch_type#x, sketch#x], false, false, 4, false :- CTERelationDef xxxx, false : +- SubqueryAlias individual_sketches : +- Aggregate 
[theta_sketch_agg(col1#x, 12, 0, 0) AS sketch1#x, theta_sketch_agg(col2#x, 12, 0, 0) AS sketch2#x] @@ -1007,9 +1007,9 @@ WithCTE +- Project [theta_sketch_estimate(scalar-subquery#x []) AS union_estimate#xL, theta_sketch_estimate(theta_union(sketch1#x, sketch2#x, 15)) AS binary_union_estimate#xL, theta_sketch_estimate(theta_intersection(sketch1#x, sketch2#x)) AS intersection_estimate#xL, theta_sketch_estimate(theta_difference(sketch1#x, sketch2#x)) AS difference_estimate#xL] : +- Project [union_sketch#x] : +- SubqueryAlias union_result - : +- CTERelationRef xxxx, true, [union_sketch#x], false, false, 1 + : +- CTERelationRef xxxx, true, [union_sketch#x], false, false, 1, false +- SubqueryAlias individual_sketches - +- CTERelationRef xxxx, true, [sketch1#x, sketch2#x], false, false, 1 + +- CTERelationRef xxxx, true, [sketch1#x, sketch2#x], false, false, 1, false -- !query diff --git a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out index 1545157aa53bf..f2df635a5a4ff 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain-aqe.sql.out @@ -1189,9 +1189,10 @@ struct == Analyzed Logical Plan == InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/explain_temp5, false, [val#x], Parquet, [path=file:[not included in comparison]/{warehouse_dir}/explain_temp5], Append, `spark_catalog`.`default`.`explain_temp5`, org.apache.spark.sql.execution.datasources.CatalogFileIndex(file:[not included in comparison]/{warehouse_dir}/explain_temp5), [key, val] -+- Project [key#x, val#x] - +- SubqueryAlias spark_catalog.default.explain_temp4 - +- Relation spark_catalog.default.explain_temp4[key#x,val#x] parquet ++- Project [key#x AS key#x, val#x AS val#x] + +- Project [key#x, val#x] + +- SubqueryAlias spark_catalog.default.explain_temp4 + +- Relation spark_catalog.default.explain_temp4[key#x,val#x] parquet == Optimized Logical Plan == InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/explain_temp5, false, [val#x], Parquet, [path=file:[not included in comparison]/{warehouse_dir}/explain_temp5], Append, `spark_catalog`.`default`.`explain_temp5`, org.apache.spark.sql.execution.datasources.CatalogFileIndex(file:[not included in comparison]/{warehouse_dir}/explain_temp5), [key, val] diff --git a/sql/core/src/test/resources/sql-tests/results/explain.sql.out b/sql/core/src/test/resources/sql-tests/results/explain.sql.out index 898bd9097e43b..221a323b01bb2 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain.sql.out @@ -1081,9 +1081,10 @@ struct == Analyzed Logical Plan == InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/explain_temp5, false, [val#x], Parquet, [path=file:[not included in comparison]/{warehouse_dir}/explain_temp5], Append, `spark_catalog`.`default`.`explain_temp5`, org.apache.spark.sql.execution.datasources.CatalogFileIndex(file:[not included in comparison]/{warehouse_dir}/explain_temp5), [key, val] -+- Project [key#x, val#x] - +- SubqueryAlias spark_catalog.default.explain_temp4 - +- Relation spark_catalog.default.explain_temp4[key#x,val#x] parquet ++- Project [key#x AS key#x, val#x AS val#x] + +- Project [key#x, val#x] + +- SubqueryAlias spark_catalog.default.explain_temp4 + +- Relation spark_catalog.default.explain_temp4[key#x,val#x] parquet == Optimized Logical 
Plan == InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/explain_temp5, false, [val#x], Parquet, [path=file:[not included in comparison]/{warehouse_dir}/explain_temp5], Append, `spark_catalog`.`default`.`explain_temp5`, org.apache.spark.sql.execution.datasources.CatalogFileIndex(file:[not included in comparison]/{warehouse_dir}/explain_temp5), [key, val] diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 6678f9535fe0d..4fe49117ecc06 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -2012,12 +2012,17 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { withSQLConf(SQLConf.JSON_GENERATOR_WRITE_NULL_IF_WITH_DEFAULT_VALUE.key -> "false", SQLConf.JSON_GENERATOR_IGNORE_NULL_FIELDS.key -> "true") { withTable("t") { - sql("create table t (a struct default struct(42), b int) using json") - sql("insert into t values (cast(null as struct), null)") - // nulls should not be written for either field - checkAnswer(readTableAsText("t"), Row("{}")) + sql("""create table t ( + | a struct default struct(43), + | b int default 17, + | c struct) + | using json + |""".stripMargin) + sql("insert into t values (cast(null as struct), null, struct(5 as z))") + // nulls should not be written for a or b fields + checkAnswer(readTableAsText("t"), Row("{\"c\":{\"y\":5}}")) // default value is filled in for missing fields. - checkAnswer(spark.table("t"), Row(Row(42), null)) + checkAnswer(spark.table("t"), Row(Row(43), 17, Row(5))) } } // SPARK-52772 Should not pick up JSON DEFAULT from source diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index d58dec6f1126e..b82a69d526959 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -1312,7 +1312,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd .queryExecution.analyzed } - assertResult(1, "Duplicated project detected\n" + analyzedPlan) { + assertResult(2, "Duplicated project detected\n" + analyzedPlan) { analyzedPlan.collect { case i: InsertIntoHiveTable => i.query.collect { case p: Project => () }.size }.sum
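
A minimal usage sketch of the new Metadata.withKeyRemoved / withKeysRemoved helpers added in Metadata.scala above (not part of the patch). The key name "source_field_default_value" is a hypothetical stand-in for source-side metadata a query expression might carry; only the MetadataBuilder/Metadata calls themselves are real API.

import org.apache.spark.sql.types.MetadataBuilder

object MetadataRemovalSketch {
  def main(args: Array[String]): Unit = {
    // Build a Metadata value carrying a comment plus a hypothetical source-only key.
    val md = new MetadataBuilder()
      .putString("comment", "user id")
      .putLong("source_field_default_value", 42L) // hypothetical key, for illustration only
      .build()

    // withKeyRemoved returns a new Metadata without the key; the receiver is immutable.
    val cleaned = md.withKeyRemoved("source_field_default_value")
    assert(!cleaned.contains("source_field_default_value"))
    assert(cleaned.getString("comment") == "user id")

    // Removing an empty key list returns the same instance (no copy is made).
    assert(md.withKeysRemoved(Seq.empty) eq md)

    // Removing several keys at once.
    val stripped = md.withKeysRemoved(Seq("comment", "source_field_default_value"))
    assert(!stripped.contains("comment") && !stripped.contains("source_field_default_value"))
  }
}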