See the project plan at delta-io#1105. This PR adds CDF to the UPDATE command, generating both preimage and postimage CDF data. It also adds UpdateCDCSuite, which provides basic tests for these CDF changes. As a high-level overview of how this CDF-update operation works: when we find a row that satisfies the update condition, we `explode` an array containing the pre-image, post-image, and main-table updated rows. The pre-image and post-image rows are tagged with the corresponding CDF_TYPE, and the main-table updated row has CDF_TYPE `null`. Thus, the first two rows are written to the CDF parquet file, while the latter is written to the standard main-table data parquet file. Closes delta-io#1146

GitOrigin-RevId: 47413c5345bb97c0e1303a7f4d4d06b89c35ab7a
1 parent d56d774, commit 38df69f
Showing 3 changed files with 243 additions and 26 deletions.
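The commit message above leans on Spark's explode to fan each matched row out into three output rows. Below is a minimal, self-contained sketch of that technique, not the actual Delta implementation: the _change_type column name and the hard-coded SET value = -1 are illustrative assumptions.

// Sketch only: illustrates the explode technique from the commit message.
// The "_change_type" column name and the literal update are assumptions.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object ExplodeCdfSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("cdf-sketch").getOrCreate()
  import spark.implicits._

  // Rows matched by an UPDATE such as: UPDATE t SET value = -1 WHERE key = 1
  val matched = Seq((1, 1)).toDF("key", "value")

  // Each matched row explodes into three rows:
  //   old values tagged "update_preimage"  -> destined for the CDF file
  //   new values tagged "update_postimage" -> destined for the CDF file
  //   new values with a null tag           -> destined for the main data file
  val exploded = matched
    .withColumn("row", explode(array(
      struct($"key", $"value", lit("update_preimage").as("_change_type")),
      struct($"key", lit(-1).as("value"), lit("update_postimage").as("_change_type")),
      struct($"key", lit(-1).as("value"), lit(null).cast("string").as("_change_type")))))
    .select($"row.*")

  exploded.show(truncate = false)
  spark.stop()
}

For the single matched key this yields one preimage row, one postimage row, and one untagged row; in the real command the null-tagged row is split off into the main data file while the tagged rows feed the CDF output.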
core/src/test/scala/org/apache/spark/sql/delta/cdc/UpdateCDCSuite.scala (122 additions, 0 deletions)
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.cdc

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.commands.cdc.CDCReader

import org.apache.spark.SparkConf
import org.apache.spark.sql.Row

class UpdateCDCSuite extends UpdateSQLSuite with DeltaColumnMappingTestUtils {
  import testImplicits._

  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.CHANGE_DATA_FEED.defaultTablePropertyKey, "true")

  test("CDC for unconditional update") {
    append(Seq((1, 1), (2, 2), (3, 3), (4, 4)).toDF("key", "value"))

    checkUpdate(
      condition = None,
      setClauses = "value = -1",
      expectedResults = Row(1, -1) :: Row(2, -1) :: Row(3, -1) :: Row(4, -1) :: Nil)

    checkAnswer(
      CDCReader.changesToBatchDF(DeltaLog.forTable(spark, tempPath), 1, 1, spark)
        .drop(CDCReader.CDC_COMMIT_TIMESTAMP),
      Row(1, 1, "update_preimage", 1) :: Row(1, -1, "update_postimage", 1) ::
        Row(2, 2, "update_preimage", 1) :: Row(2, -1, "update_postimage", 1) ::
        Row(3, 3, "update_preimage", 1) :: Row(3, -1, "update_postimage", 1) ::
        Row(4, 4, "update_preimage", 1) :: Row(4, -1, "update_postimage", 1) :: Nil)
  }

  test("CDC for conditional update on all rows") {
    append(Seq((1, 1), (2, 2), (3, 3), (4, 4)).toDF("key", "value"))

    checkUpdate(
      condition = Some("key < 10"),
      setClauses = "value = -1",
      expectedResults = Row(1, -1) :: Row(2, -1) :: Row(3, -1) :: Row(4, -1) :: Nil)

    checkAnswer(
      CDCReader.changesToBatchDF(DeltaLog.forTable(spark, tempPath), 1, 1, spark)
        .drop(CDCReader.CDC_COMMIT_TIMESTAMP),
      Row(1, 1, "update_preimage", 1) :: Row(1, -1, "update_postimage", 1) ::
        Row(2, 2, "update_preimage", 1) :: Row(2, -1, "update_postimage", 1) ::
        Row(3, 3, "update_preimage", 1) :: Row(3, -1, "update_postimage", 1) ::
        Row(4, 4, "update_preimage", 1) :: Row(4, -1, "update_postimage", 1) :: Nil)
  }

  test("CDC for point update") {
    append(Seq((1, 1), (2, 2), (3, 3), (4, 4)).toDF("key", "value"))

    checkUpdate(
      condition = Some("key = 1"),
      setClauses = "value = -1",
      expectedResults = Row(1, -1) :: Row(2, 2) :: Row(3, 3) :: Row(4, 4) :: Nil)

    checkAnswer(
      CDCReader.changesToBatchDF(DeltaLog.forTable(spark, tempPath), 1, 1, spark)
        .drop(CDCReader.CDC_COMMIT_TIMESTAMP),
      Row(1, 1, "update_preimage", 1) :: Row(1, -1, "update_postimage", 1) :: Nil)
  }

  test("CDC for partition-optimized update") {
    append(
      Seq((1, 1, 1), (2, 2, 0), (3, 3, 1), (4, 4, 0)).toDF("key", "value", "part"),
      partitionBy = Seq("part"))

    checkUpdate(
      condition = Some("part = 1"),
      setClauses = "value = -1",
      expectedResults = Row(1, -1) :: Row(2, 2) :: Row(3, -1) :: Row(4, 4) :: Nil)

    checkAnswer(
      CDCReader.changesToBatchDF(DeltaLog.forTable(spark, tempPath), 1, 1, spark)
        .drop(CDCReader.CDC_COMMIT_TIMESTAMP),
      Row(1, 1, 1, "update_preimage", 1) :: Row(1, -1, 1, "update_postimage", 1) ::
        Row(3, 3, 1, "update_preimage", 1) :: Row(3, -1, 1, "update_postimage", 1) :: Nil)
  }

  test("update a partitioned CDC enabled table to set the partition column to null") {
    val tableName = "part_table_test"
    withTable(tableName) {
      Seq((0, 0, 0), (1, 1, 1), (2, 2, 2))
        .toDF("key", "partition_column", "value")
        .write
        .partitionBy("partition_column")
        .format("delta")
        .saveAsTable(tableName)
      sql(s"INSERT INTO $tableName VALUES (4, 4, 4)")
      sql(s"UPDATE $tableName SET partition_column = null WHERE partition_column = 4")
      checkAnswer(
        CDCReader.changesToBatchDF(
          DeltaLog.forTable(
            spark,
            spark.sessionState.sqlParser.parseTableIdentifier(tableName)
          ), 1, 3, spark)
          .drop(CDCReader.CDC_COMMIT_TIMESTAMP),
        Row(4, 4, 4, "insert", 1) ::
          Row(4, 4, 4, "update_preimage", 2) ::
          Row(4, null, 4, "update_postimage", 2) :: Nil)
    }
  }
}
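A side note on the tests above: they call the internal CDCReader.changesToBatchDF helper directly. End users would read the same change feed through Delta's public batch reader options. A brief usage sketch, assuming an active SparkSession `spark` and a hypothetical Delta table named `events` with delta.enableChangeDataFeed set to true:

// Read the changes committed in version 1 via Delta's public CDF reader options.
// "events" is a hypothetical table name; the result includes the _change_type,
// _commit_version, and _commit_timestamp columns alongside the table's own columns.
spark.read.format("delta")
  .option("readChangeFeed", "true")
  .option("startingVersion", 1)
  .option("endingVersion", 1)
  .table("events")
  .show()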