[SPARK-25313][SQL] Fix regression in FileFormatWriter output names #22320
Changes from all commits: bbd572c, 5bce8a0, 16bb457, 3c282ef, 98bf027, 538fea9, 45d2a20, 3ca072d, 4590c98
@@ -56,14 +56,14 @@ case class InsertIntoHadoopFsRelationCommand(
    mode: SaveMode,
    catalogTable: Option[CatalogTable],
    fileIndex: Option[FileIndex],
-   outputColumns: Seq[Attribute])
+   outputColumnNames: Seq[String])
  extends DataWritingCommand {
  import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName
Member:
Line 66:

Member (Author):
Oh, then we can use this method instead.
  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    // Most formats don't do well with duplicate columns, so lets not allow that
-   SchemaUtils.checkSchemaColumnNameDuplication(
-     query.schema,
+   SchemaUtils.checkColumnNameDuplication(
+     outputColumnNames,
      s"when inserting into $outputPath",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)
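For readers following the change above: once the command carries plain column names instead of resolved attributes, the duplicate check only needs strings and a case-sensitivity flag. Below is a minimal illustrative sketch of that idea; it is not Spark's actual `SchemaUtils` code, and the exception type and message format are assumptions.

```scala
// Illustrative only -- not the actual SchemaUtils implementation.
object NameDuplicationSketch {
  def checkColumnNameDuplication(
      columnNames: Seq[String],
      context: String,
      caseSensitiveAnalysis: Boolean): Unit = {
    // Normalize names when the session is case-insensitive (the default).
    val names =
      if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    val duplicates = names.groupBy(identity).collect {
      case (name, occurrences) if occurrences.size > 1 => name
    }
    if (duplicates.nonEmpty) {
      throw new IllegalArgumentException(
        s"Found duplicate column(s) $context: ${duplicates.mkString(", ")}")
    }
  }
}
```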

@@ -805,6 +805,80 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
    }
  }
| test("Insert overwrite table command should output correct schema: basic") { | ||
| withTable("tbl", "tbl2") { | ||
| withView("view1") { | ||
| val df = spark.range(10).toDF("id") | ||
|
Contributor:
Why is

Member (Author):
This is trivial... As the column name

Contributor:
"case sensitive"? How so, since Spark SQL is case-insensitive by default?

Contributor:
I think @gengliangwang meant case preserving, which is the behavior we are testing against.
        df.write.format("parquet").saveAsTable("tbl")
        spark.sql("CREATE VIEW view1 AS SELECT id FROM tbl")
        spark.sql("CREATE TABLE tbl2(ID long) USING parquet")
        spark.sql("INSERT OVERWRITE TABLE tbl2 SELECT ID FROM view1")
        val identifier = TableIdentifier("tbl2")
        val location = spark.sessionState.catalog.getTableMetadata(identifier).location.toString
        val expectedSchema = StructType(Seq(StructField("ID", LongType, true)))
        assert(spark.read.parquet(location).schema == expectedSchema)
        checkAnswer(spark.table("tbl2"), df)
      }
    }
  }

  test("Insert overwrite table command should output correct schema: complex") {
    withTable("tbl", "tbl2") {
      withView("view1") {
        val df = spark.range(10).map(x => (x, x.toInt, x.toInt)).toDF("col1", "col2", "col3")
        df.write.format("parquet").saveAsTable("tbl")
        spark.sql("CREATE VIEW view1 AS SELECT * FROM tbl")
        spark.sql("CREATE TABLE tbl2(COL1 long, COL2 int, COL3 int) USING parquet PARTITIONED " +
          "BY (COL2) CLUSTERED BY (COL3) INTO 3 BUCKETS")
        spark.sql("INSERT OVERWRITE TABLE tbl2 SELECT COL1, COL2, COL3 FROM view1")
        val identifier = TableIdentifier("tbl2")
        val location = spark.sessionState.catalog.getTableMetadata(identifier).location.toString
        val expectedSchema = StructType(Seq(
          StructField("COL1", LongType, true),
          StructField("COL3", IntegerType, true),
          StructField("COL2", IntegerType, true)))
        assert(spark.read.parquet(location).schema == expectedSchema)
        checkAnswer(spark.table("tbl2"), df)
      }
    }
  }
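A note on the expected column order in the test above, with a small sketch using the test's own `spark` and `location` values. My reading (an assumption, not stated in the PR): COL2 is the partition column, so it is encoded in the directory names rather than in the Parquet data files, and partition discovery appends it after the file columns COL1 and COL3 when the location is read back. The same ordering applies to the CTAS "complex" test further down.

```scala
// Explanatory sketch, not part of the PR. Reading the partitioned table location
// back yields the file columns first and the discovered partition column last.
spark.read.parquet(location).schema.fieldNames  // Array("COL1", "COL3", "COL2")
```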

  test("Create table as select command should output correct schema: basic") {
    withTable("tbl", "tbl2") {
      withView("view1") {
        val df = spark.range(10).toDF("id")
        df.write.format("parquet").saveAsTable("tbl")
        spark.sql("CREATE VIEW view1 AS SELECT id FROM tbl")
        spark.sql("CREATE TABLE tbl2 USING parquet AS SELECT ID FROM view1")
        val identifier = TableIdentifier("tbl2")
        val location = spark.sessionState.catalog.getTableMetadata(identifier).location.toString
        val expectedSchema = StructType(Seq(StructField("ID", LongType, true)))
        assert(spark.read.parquet(location).schema == expectedSchema)
        checkAnswer(spark.table("tbl2"), df)
      }
    }
  }

  test("Create table as select command should output correct schema: complex") {
    withTable("tbl", "tbl2") {
      withView("view1") {
        val df = spark.range(10).map(x => (x, x.toInt, x.toInt)).toDF("col1", "col2", "col3")
        df.write.format("parquet").saveAsTable("tbl")
        spark.sql("CREATE VIEW view1 AS SELECT * FROM tbl")
        spark.sql("CREATE TABLE tbl2 USING parquet PARTITIONED BY (COL2) " +
          "CLUSTERED BY (COL3) INTO 3 BUCKETS AS SELECT COL1, COL2, COL3 FROM view1")
        val identifier = TableIdentifier("tbl2")
        val location = spark.sessionState.catalog.getTableMetadata(identifier).location.toString
        val expectedSchema = StructType(Seq(
          StructField("COL1", LongType, true),
          StructField("COL3", IntegerType, true),
          StructField("COL2", IntegerType, true)))
        assert(spark.read.parquet(location).schema == expectedSchema)
        checkAnswer(spark.table("tbl2"), df)
      }
    }
  }

  test("use Spark jobs to list files") {
    withSQLConf(SQLConf.PARALLEL_PARTITION_DISCOVERY_THRESHOLD.key -> "1") {
      withTempDir { dir =>

@@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.command.DataWritingCommand
case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
-   outputColumns: Seq[Attribute],
+   outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

@@ -63,13 +63,14 @@ case class CreateHiveTableAsSelectCommand(
        query,
        overwrite = false,
        ifPartitionNotExists = false,
-       outputColumns = outputColumns).run(sparkSession, child)
+       outputColumnNames = outputColumnNames).run(sparkSession, child)
Contributor:
Can you remove one
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
-     catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false)
+     val schema = DataWritingCommand.logicalPlanSchemaWithNames(query, outputColumnNames)
+     catalog.createTable(tableDesc.copy(schema = schema), ignoreIfExists = false)
Member (Author):
The schema naming needs to be consistent with

      try {
        // Read back the metadata of the table which was created just now.

@@ -82,7 +83,7 @@ case class CreateHiveTableAsSelectCommand(
        query,
        overwrite = true,
        ifPartitionNotExists = false,
-       outputColumns = outputColumns).run(sparkSession, child)
+       outputColumnNames = outputColumnNames).run(sparkSession, child)
Contributor:
Why is this duplication needed here?

Contributor:
What's the duplication?

Contributor:
I feel it's better to specify parameters by name if the previous parameter is already specified by name, e.g.
    } catch {
      case NonFatal(e) =>
        // drop the created table.

@@ -69,7 +69,7 @@ case class InsertIntoHiveTable(
    query: LogicalPlan,
    overwrite: Boolean,
    ifPartitionNotExists: Boolean,
-   outputColumns: Seq[Attribute]) extends SaveAsHiveFile {
+   outputColumnNames: Seq[String]) extends SaveAsHiveFile {
Member:
For better test coverage, can you add tests for hive tables?

Member (Author):
No problem 👍

Member:
thanks!

  /**
   * Inserts all the rows in the table into Hive. Row objects are properly serialized with the

@@ -754,6 +754,54 @@ class HiveDDLSuite
    }
  }
| test("Insert overwrite Hive table should output correct schema") { | ||
| withSQLConf(CONVERT_METASTORE_PARQUET.key -> "false") { | ||
| withTable("tbl", "tbl2") { | ||
| withView("view1") { | ||
| spark.sql("CREATE TABLE tbl(id long)") | ||
| spark.sql("INSERT OVERWRITE TABLE tbl VALUES 4") | ||
|
Contributor:
I might be missing something, but why does this test use SQL statements, not the DataFrameWriter API, e.g.

Contributor:
We can, but it's important to keep the code style consistent with the existing code in the same file. In this test suite, SQL statements seem to be preferred.

(A hypothetical DataFrameWriter-based equivalent of the insert steps is sketched after this test.)
          spark.sql("CREATE VIEW view1 AS SELECT id FROM tbl")
          withTempPath { path =>
            sql(
              s"""
                 |CREATE TABLE tbl2(ID long) USING hive
                 |OPTIONS(fileFormat 'parquet')
                 |LOCATION '${path.toURI}'
               """.stripMargin)
            spark.sql("INSERT OVERWRITE TABLE tbl2 SELECT ID FROM view1")
            val expectedSchema = StructType(Seq(StructField("ID", LongType, true)))
            assert(spark.read.parquet(path.toString).schema == expectedSchema)
            checkAnswer(spark.table("tbl2"), Seq(Row(4)))
          }
        }
      }
    }
  }
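Following up on the reviewer's question in the thread above, a DataFrameWriter-based equivalent of the two insert-overwrite steps might look roughly like the sketch below. This is a hypothetical illustration (the reviewer's exact example is not shown in the thread), assuming the session's implicits are in scope as they are in these suites.

```scala
// Hypothetical alternative (not in the PR): the same inserts via the DataFrameWriter API.
import spark.implicits._

// Equivalent of "INSERT OVERWRITE TABLE tbl VALUES 4".
Seq(4L).toDF("id").write.mode("overwrite").insertInto("tbl")

// Equivalent of "INSERT OVERWRITE TABLE tbl2 SELECT ID FROM view1".
spark.table("view1").select("ID").write.mode("overwrite").insertInto("tbl2")
```

Note that `insertInto` resolves columns by position, so what matters for the written files is the target table's declared schema (`ID`), which is exactly what the test asserts.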

  test("Create Hive table as select should output correct schema") {
    withSQLConf(CONVERT_METASTORE_PARQUET.key -> "false") {
      withTable("tbl", "tbl2") {
        withView("view1") {
          spark.sql("CREATE TABLE tbl(id long)")
          spark.sql("INSERT OVERWRITE TABLE tbl VALUES 4")
          spark.sql("CREATE VIEW view1 AS SELECT id FROM tbl")
          withTempPath { path =>
            sql(
              s"""
                 |CREATE TABLE tbl2 USING hive
                 |OPTIONS(fileFormat 'parquet')
                 |LOCATION '${path.toURI}'
                 |AS SELECT ID FROM view1
               """.stripMargin)
            val expectedSchema = StructType(Seq(StructField("ID", LongType, true)))
            assert(spark.read.parquet(path.toString).schema == expectedSchema)
            checkAnswer(spark.table("tbl2"), Seq(Row(4)))
          }
        }
      }
    }
  }

  test("alter table partition - storage information") {
    sql("CREATE TABLE boxes (height INT, length INT) PARTITIONED BY (width INT)")
    sql("INSERT OVERWRITE TABLE boxes PARTITION (width=4) SELECT 4, 4")
query: LogicalPlan -> outputAttributes: Seq[Attribute] in the function argument, then drop the line above?

I think both are OK. The current way makes it easier to call this util function, and it is easier to understand what the parameter should be, while the way you suggest makes the argument carry minimal information.
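For context on the helper being discussed, here is a minimal sketch of what a utility like `DataWritingCommand.logicalPlanSchemaWithNames` could do, assuming it simply pairs the plan's output attributes with the caller-supplied, case-preserved names; the actual implementation in the PR may differ in details such as error handling.

```scala
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.types.{StructField, StructType}

object SchemaWithNamesSketch {
  // Sketch only: build a schema from the plan's output attributes, but use the
  // caller-supplied (case-preserved) column names instead of the attribute names.
  def logicalPlanSchemaWithNames(query: LogicalPlan, names: Seq[String]): StructType = {
    require(query.output.length == names.length,
      "The number of output column names must match the plan's output arity.")
    StructType(query.output.zip(names).map { case (attr, name) =>
      StructField(name, attr.dataType, attr.nullable, attr.metadata)
    })
  }
}
```

Either signature discussed above would work with this shape: passing the LogicalPlan keeps call sites short, while passing Seq[Attribute] would make the helper depend only on the attributes it actually reads.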