[HUDI-7949] insert into hudi table with columns specified #11568
Changes from all commits:
TestInsertTable.scala:

```diff
@@ -2826,4 +2826,163 @@ class TestInsertTable extends HoodieSparkSqlTestBase
     spark.sessionState.conf.unsetConf("hoodie.datasource.insert.dup.policy")
     spark.sessionState.conf.unsetConf("hoodie.datasource.write.operation")
   }
+
+  test("Test insert into with special cols") {
```
> Contributor: special -> specified
```diff
+    withTempDir { tmp =>
+      if (HoodieSparkUtils.gteqSpark3_2) {
+        val targetTableA = generateTableName
+        val tablePathA = s"${tmp.getCanonicalPath}/$targetTableA"
+        if (HoodieSparkUtils.isSpark3_4) {
+          spark.sql("set spark.sql.defaultColumn.enabled = false")
+        }
+
+        spark.sql(
+          s"""
+             |create table if not exists $targetTableA (
+             |  id bigint,
+             |  name string,
+             |  price double
+             |) using hudi
+             |tblproperties (
+             |  primaryKey = 'id',
+             |  type = 'mor',
+             |  preCombineField = 'name'
+             |) location '$tablePathA'
+             |""".stripMargin)
+
+        spark.sql(s"insert into $targetTableA (id, price, name) values (1, 12.1, 'aaa')")
+
+        checkAnswer(s"select id, price, name from $targetTableA")(
+          Seq(1, 12.1, "aaa")
+        )
+
+        val targetTableB = generateTableName
+        val tablePathB = s"${tmp.getCanonicalPath}/$targetTableB"
+
+        spark.sql(
+          s"""
+             |create table if not exists $targetTableB (
+             |  id bigint,
+             |  name string,
+             |  price double,
+             |  day string,
+             |  hour string
+             |) using hudi
+             |tblproperties (
+             |  primaryKey = 'id',
+             |  type = 'mor',
+             |  preCombineField = 'name'
+             |) partitioned by (day, hour)
+             |location '$tablePathB'
+             |""".stripMargin)
+
+        spark.sql(s"insert into $targetTableB (id, day, price, name, hour) " +
+          s"values (2, '01', 12.2, 'bbb', '02')")
+
+        spark.sql(s"insert into $targetTableB (id, day, price, name, hour) " +
+          s"select id, '01' as dt, price, name, '03' as hour from $targetTableA")
+
+        spark.sql(s"insert into $targetTableB partition(day='02', hour) (id, hour, price, name) " +
+          s"values (3, '01', 12.3, 'ccc')")
+
+        spark.sql(s"insert into $targetTableB partition(day='02', hour='02') (id, price, name) " +
+          s"values (4, 12.4, 'ddd')")
+
+        checkAnswer(s"select id, price, name, day, hour from $targetTableB")(
+          Seq(2, 12.2, "bbb", "01", "02"),
+          Seq(1, 12.1, "aaa", "01", "03"),
+          Seq(3, 12.3, "ccc", "02", "01"),
+          Seq(4, 12.4, "ddd", "02", "02")
+        )
+
+        if (HoodieSparkUtils.isSpark3_4) {
+          spark.sql("set spark.sql.defaultColumn.enabled = true")
+          checkExceptionContain(s"insert into $targetTableB (id, day, price, name, hour) " +
+            s"select id, '01' as dt, price, name, '03' as hour from $targetTableA")(
+            "hudi not support specified cols when enable default columns")
+        }
+      }
+    }
+  }
```
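The last assertion pins down an interaction with Spark 3.4's default-column feature: while spark.sql.defaultColumn.enabled is set, Hudi rejects column-list inserts outright. A hypothetical sketch of the guard that error message implies (the names, conf lookup, and placement are assumptions for illustration, not the PR's actual code):

```scala
// Illustrative only: roughly the check implied by the error string above.
// `userSpecifiedCols` and the conf default are assumptions of this sketch.
if (userSpecifiedCols.nonEmpty &&
    spark.sessionState.conf.getConfString("spark.sql.defaultColumn.enabled", "true").toBoolean) {
  throw new org.apache.spark.sql.AnalysisException(
    "hudi not support specified cols when enable default columns")
}
```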
```diff
+
+  test("Test insert overwrite with special cols") {
```
> Contributor: ditto (special -> specified)
```diff
+    withTempDir { tmp =>
+      if (HoodieSparkUtils.gteqSpark3_2) {
+        val targetTableA = generateTableName
+        val tablePathA = s"${tmp.getCanonicalPath}/$targetTableA"
+        if (HoodieSparkUtils.isSpark3_4) {
+          spark.sql("set spark.sql.defaultColumn.enabled = false")
+        }
+
+        spark.sql(
+          s"""
+             |create table if not exists $targetTableA (
+             |  id bigint,
+             |  name string,
+             |  price double
+             |) using hudi
+             |tblproperties (
+             |  primaryKey = 'id',
+             |  type = 'mor',
+             |  preCombineField = 'name'
+             |) location '$tablePathA'
+             |""".stripMargin)
+
+        spark.sql(s"insert overwrite $targetTableA (id, price, name) values (1, 12.1, 'aaa')")
+
+        checkAnswer(s"select id, price, name from $targetTableA")(
+          Seq(1, 12.1, "aaa")
+        )
+
+        val targetTableB = generateTableName
+        val tablePathB = s"${tmp.getCanonicalPath}/$targetTableB"
+
+        spark.sql(
+          s"""
+             |create table if not exists $targetTableB (
+             |  id bigint,
+             |  name string,
+             |  price double,
+             |  day string,
+             |  hour string
+             |) using hudi
+             |tblproperties (
+             |  primaryKey = 'id',
+             |  type = 'mor',
+             |  preCombineField = 'name'
+             |) partitioned by (day, hour)
+             |location '$tablePathB'
+             |""".stripMargin)
+
+        spark.sql(s"insert overwrite $targetTableB (id, day, price, name, hour) " +
+          s"values (2, '01', 12.2, 'bbb', '02')")
+
+        checkAnswer(s"select id, price, name, day, hour from $targetTableB")(
+          Seq(2, 12.2, "bbb", "01", "02")
+        )
+
+        spark.sql(s"insert overwrite $targetTableB (id, day, price, name, hour) " +
+          s"select id, '01' as dt, price, name, '03' as hour from $targetTableA")
+
+        spark.sql(s"insert overwrite $targetTableB partition(day='02', hour) (id, hour, price, name) " +
+          s"values (3, '01', 12.3, 'ccc')")
+
+        spark.sql(s"insert overwrite $targetTableB partition(day='02', hour='02') (id, price, name) " +
+          s"values (4, 12.4, 'ddd')")
+
+        checkAnswer(s"select id, price, name, day, hour from $targetTableB")(
+          Seq(1, 12.1, "aaa", "01", "03"),
+          Seq(3, 12.3, "ccc", "02", "01"),
+          Seq(4, 12.4, "ddd", "02", "02")
+        )
+
+        if (HoodieSparkUtils.isSpark3_4) {
+          spark.sql("set spark.sql.defaultColumn.enabled = true")
+          checkExceptionContain(s"insert overwrite $targetTableB (id, day, price, name, hour) " +
+            s"select id, '01' as dt, price, name, '03' as hour from $targetTableA")(
+            "hudi not support specified cols when enable default columns")
+        }
+      }
+    }
+  }
 }
```
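Between them, the two tests exercise four column-list shapes. A condensed recap of the SQL forms used above (`t` stands in for the generated table names; the SELECT body is elided):

```scala
// Condensed from the tests above; illustrative, not additional test code.
spark.sql("insert into t (id, price, name) values (1, 12.1, 'aaa')")              // reordered column list
spark.sql("insert into t (id, day, price, name, hour) select /* ... */ from src") // column list over a SELECT
spark.sql("insert into t partition(day='02', hour) (id, hour, price, name) " +
  "values (3, '01', 12.3, 'ccc')")                                                // static + dynamic partition spec
spark.sql("insert into t partition(day='02', hour='02') (id, price, name) " +
  "values (4, 12.4, 'ddd')")                                                      // fully static partition spec
```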
HoodieSpark2CatalystPlanUtils.scala:

```diff
@@ -61,10 +61,10 @@ object HoodieSpark2CatalystPlanUtils extends HoodieCatalystPlansUtils
     Join(left, right, joinType, None)
   }

-  override def unapplyInsertIntoStatement(plan: LogicalPlan): Option[(LogicalPlan, Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] = {
+  override def unapplyInsertIntoStatement(plan: LogicalPlan): Option[(LogicalPlan, Seq[String], Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] = {
     plan match {
       case InsertIntoTable(table, partition, query, overwrite, ifPartitionNotExists) =>
-        Some((table, partition, query, overwrite, ifPartitionNotExists))
+        Some((table, Seq.empty, partition, query, overwrite, ifPartitionNotExists))
```
> Contributor: Do you think we should log some msg here?

> Author: I think it's unnecessary, because a column list is not supported at the SQL grammar level in Spark 2.
```diff
       case _ => None
     }
   }
```
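To illustrate the author's point (a hedged aside, recalling Spark 2 behavior rather than re-verifying it): the column-list clause does not parse at all on Spark 2, so this extractor can never observe one, and the `Seq.empty` padding is structurally safe.

```scala
// On a Spark 2.x session the column list fails at parse time, so
// unapplyInsertIntoStatement never sees user-specified columns:
spark.sql("insert into t (id, name) values (1, 'a')")  // ParseException on Spark 2.x
spark.sql("insert into t values (1, 'a')")             // parses fine
```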
|
|
@@ -129,4 +129,6 @@ object HoodieSpark2CatalystPlanUtils extends HoodieCatalystPlansUtils { | |
| override def unapplyShowIndexes(plan: LogicalPlan): Option[(LogicalPlan, Seq[Attribute])] = None | ||
|
|
||
| override def unapplyRefreshIndex(plan: LogicalPlan): Option[(LogicalPlan, String)] = None | ||
|
|
||
| override def createProjectForByNameQuery(lr: LogicalRelation, plan: LogicalPlan): Option[LogicalPlan] = None | ||
| } | ||
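Both overrides widen a contract declared in the shared HoodieCatalystPlansUtils trait. That file is not part of this excerpt; a sketch of the declaration, inferred from the override signatures above (the doc wording is an assumption):

```scala
// Inferred from the overrides in this diff; illustrative, not the PR's exact text.
trait HoodieCatalystPlansUtils {
  /**
   * Destructures an INSERT plan into
   * (table, userSpecifiedCols, partitionSpec, query, overwrite, ifPartitionNotExists).
   * `userSpecifiedCols` is Seq.empty when the Spark version has no column-list
   * syntax (Spark 2) or when the statement does not specify one.
   */
  def unapplyInsertIntoStatement(
      plan: LogicalPlan): Option[(LogicalPlan, Seq[String], Map[String, Option[String]],
        LogicalPlan, Boolean, Boolean)]

  /** Builds a by-name Project over the query for a LogicalRelation target,
   *  or None where the Spark version cannot support it. */
  def createProjectForByNameQuery(lr: LogicalRelation, plan: LogicalPlan): Option[LogicalPlan]
}
```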
HoodieSpark3CatalystPlanUtils.scala:

```diff
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, Join, J
 import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog}
 import org.apache.spark.sql.execution.{ExtendedMode, SimpleMode}
 import org.apache.spark.sql.execution.command.{CreateTableLikeCommand, ExplainCommand}
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
```
```diff
@@ -56,15 +57,6 @@ trait HoodieSpark3CatalystPlanUtils extends HoodieCatalystPlansUtils {
     Join(left, right, joinType, None, JoinHint.NONE)
   }

-  override def unapplyInsertIntoStatement(plan: LogicalPlan): Option[(LogicalPlan, Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] = {
-    plan match {
-      case insert: InsertIntoStatement =>
```
> Contributor: Why remove this impl?

> Author: Every subclass has its own impl, so I removed it.
```diff
-        Some((insert.table, insert.partitionSpec, insert.query, insert.overwrite, insert.ifPartitionNotExists))
-      case _ =>
-        None
-    }
-  }

   override def unapplyCreateTableLikeCommand(plan: LogicalPlan): Option[(TableIdentifier, TableIdentifier, CatalogStorageFormat, Option[String], Map[String, String], Boolean)] = {
     plan match {
```
```diff
@@ -84,6 +76,8 @@ trait HoodieSpark3CatalystPlanUtils extends HoodieCatalystPlansUtils {
   override def produceSameOutput(a: LogicalPlan, b: LogicalPlan): Boolean = {
     a.sameOutput(b)
   }
+
+  override def createProjectForByNameQuery(lr: LogicalRelation, plan: LogicalPlan): Option[LogicalPlan] = None
 }

 object HoodieSpark3CatalystPlanUtils extends SparkAdapterSupport {
```
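The removed trait-level impl gives way to per-version overrides, as the author notes. Only the Spark 3.1 variant appears later in this excerpt; for Spark versions whose InsertIntoStatement carries userSpecifiedCols (3.2 and later), the override presumably forwards it. A hedged sketch of that shape (an assumption, not shown in this diff):

```scala
// Assumed shape of the Spark 3.2+ override; InsertIntoStatement.userSpecifiedCols
// exists on those versions, but the PR's exact code is not in this excerpt.
override def unapplyInsertIntoStatement(
    plan: LogicalPlan): Option[(LogicalPlan, Seq[String], Map[String, Option[String]],
      LogicalPlan, Boolean, Boolean)] = {
  plan match {
    case insert: InsertIntoStatement =>
      Some((insert.table, insert.userSpecifiedCols, insert.partitionSpec,
        insert.query, insert.overwrite, insert.ifPartitionNotExists))
    case _ => None
  }
}
```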
HoodieSpark31CatalystPlanUtils.scala:

```diff
@@ -18,12 +18,11 @@
 package org.apache.spark.sql

 import org.apache.hudi.SparkHoodieTableFileIndex
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.ResolvedTable
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Expression, ProjectionOverSchema}
 import org.apache.spark.sql.catalyst.planning.ScanOperation
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, MergeIntoTable, Project}
+import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan, MergeIntoTable}
 import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog}
 import org.apache.spark.sql.execution.command.AlterTableRecoverPartitionsCommand
 import org.apache.spark.sql.execution.datasources.parquet.{HoodieFormatTrait, ParquetFileFormat}
```
```diff
@@ -83,4 +82,13 @@ object HoodieSpark31CatalystPlanUtils extends HoodieSpark3CatalystPlanUtils {
   override def unapplyShowIndexes(plan: LogicalPlan): Option[(LogicalPlan, Seq[Attribute])] = None

   override def unapplyRefreshIndex(plan: LogicalPlan): Option[(LogicalPlan, String)] = None
+
+  override def unapplyInsertIntoStatement(plan: LogicalPlan): Option[(LogicalPlan, Seq[String], Map[String, Option[String]], LogicalPlan, Boolean, Boolean)] = {
+    plan match {
+      case insert: InsertIntoStatement =>
+        Some((insert.table, Seq.empty, insert.partitionSpec, insert.query, insert.overwrite, insert.ifPartitionNotExists))
```
> Contributor: Should we log some msg here?
```diff
+      case _ =>
+        None
+    }
+  }
 }
```
> Contributor: Can we add a doc for the new param?

> Author: Done.
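The doc the author added is not included in this excerpt; a hypothetical example of what a scaladoc for the widened extractor could say (wording assumed, grounded only in the signatures shown in this diff):

```scala
/**
 * Destructures an INSERT INTO / INSERT OVERWRITE plan.
 *
 * @param plan the logical plan to match against
 * @return Some((table, userSpecifiedCols, partitionSpec, query, overwrite,
 *         ifPartitionNotExists)) for an insert plan, where `userSpecifiedCols`
 *         is the explicit column list of `INSERT INTO t (c1, c2, ...)`;
 *         it is Seq.empty on versions where this adapter pads it (Spark 2 and
 *         Spark 3.1, per this diff) or when no list was written.
 *         None for any other plan.
 */
```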