-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-16034][SQL] Checks the partition columns when calling dataFrame.write.mode("append").saveAsTable #13749
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -435,26 +435,25 @@ case class DataSource( | |
| // If we are appending to a table that already exists, make sure the partitioning matches | ||
| // up. If we fail to load the table for whatever reason, ignore the check. | ||
| if (mode == SaveMode.Append) { | ||
| val existingPartitionColumnSet = try { | ||
| Some( | ||
| resolveRelation() | ||
| .asInstanceOf[HadoopFsRelation] | ||
| .location | ||
| .partitionSpec() | ||
| .partitionColumns | ||
| .fieldNames | ||
| .toSet) | ||
| } catch { | ||
| case e: Exception => | ||
| None | ||
| } | ||
|
|
||
| existingPartitionColumnSet.foreach { ex => | ||
| if (ex.map(_.toLowerCase) != partitionColumns.map(_.toLowerCase()).toSet) { | ||
| throw new AnalysisException( | ||
| s"Requested partitioning does not equal existing partitioning: " + | ||
| s"$ex != ${partitionColumns.toSet}.") | ||
| } | ||
| val existingColumns = Try { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
| resolveRelation() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, the returned partitioning columns are user-provided instead of the existing dataset's partitioning columns.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also, this triggers a partitioning discovery. We should avoid it. |
||
| .asInstanceOf[HadoopFsRelation] | ||
| .location | ||
| .partitionSpec() | ||
| .partitionColumns | ||
| .fieldNames | ||
| .toSeq | ||
| }.getOrElse(Seq.empty[String]) | ||
| val sameColumns = | ||
| existingColumns.map(_.toLowerCase) == partitionColumns.map(_.toLowerCase) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, this is always case-insensitive resolution. |
||
| if (existingColumns.size > 0 && !sameColumns) { | ||
| throw new AnalysisException( | ||
| s"""Requested partitioning does not match existing partitioning. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add "Requested partitioning does not match existing partitioning for table $table"?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, updated |
||
| |Existing partitioning columns: | ||
| | ${existingColumns.mkString(", ")} | ||
| |Requested partitioning columns: | ||
| | ${partitionColumns.mkString(", ")} | ||
| |""".stripMargin) | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1317,4 +1317,28 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { | |
| assertUnsupported("TRUNCATE TABLE my_tab PARTITION (age=10)") | ||
| } | ||
|
|
||
| test("SPARK-16034 Partition columns should match when appending to existing data source tables") { | ||
| import testImplicits._ | ||
| val df = Seq((1, 2, 3)).toDF("a", "b", "c") | ||
| withTable("partitionedTable") { | ||
| df.write.mode("overwrite").partitionBy("a", "b").saveAsTable("partitionedTable") | ||
| // Misses some partition columns | ||
| intercept[AnalysisException] { | ||
| df.write.mode("append").partitionBy("a").saveAsTable("partitionedTable") | ||
| } | ||
| // Wrong order | ||
| intercept[AnalysisException] { | ||
| df.write.mode("append").partitionBy("b", "a").saveAsTable("partitionedTable") | ||
| } | ||
| // Partition columns not specified | ||
| intercept[AnalysisException] { | ||
| df.write.mode("append").saveAsTable("partitionedTable") | ||
| } | ||
| assert(sql("select * from partitionedTable").collect().size == 1) | ||
| // Inserts new data successfully when partition columns are correctly specified in | ||
| // partitionBy(...). | ||
| df.write.mode("append").partitionBy("a", "b").saveAsTable("partitionedTable") | ||
| assert(sql("select * from partitionedTable").collect().size == 2) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's better to check the answer. |
||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This log entry is mainly for catching the table name and mode, right?