
Commit bba0e94
[Spark] Skip collecting commit stats to prevent computing Snapshot State (#2718)
## Description

Before this PR, Delta computes a [SnapshotState](https://github.com/delta-io/delta/blob/v3.1.0/spark/src/main/scala/org/apache/spark/sql/delta/SnapshotState.scala#L46-L58) during every commit. Computing a SnapshotState is fairly slow and expensive because it involves reading the entire checkpoint, its sidecars, and the log segment. For many types of commit, computing the SnapshotState is unnecessary.

After this PR, a transaction can avoid computing the SnapshotState of a newly created snapshot: the commit statistics that require it are collected only when the commit also needs a checkpoint, or when collection is forced via the Spark configuration option `spark.databricks.delta.commitStats.force=true` (default `false`; the option is defined in `DeltaSQLConf` below).

This change can have a large performance impact when writing into a Delta table, especially when the table comprises a large number of underlying data files.

## How was this patch tested?

- Locally built delta-spark
- Ran a small Spark job to insert rows into a Delta table
- Inspected log4j output to see whether the snapshot state was computed
- Repeated the run with the new configuration toggled

Simple demo job that triggers computing SnapshotState before this PR:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder
  .appName("myapp")
  .master("local[*]")
  .config("spark.sql.warehouse.dir", "./warehouse")
  .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
  .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
  .getOrCreate

spark.sql("""CREATE TABLE test_delta(id string) USING DELTA""")
spark.sql("""INSERT INTO test_delta (id) VALUES (42)""")

spark.close()
```

## Does this PR introduce _any_ user-facing changes?

Yes. By default, commits no longer compute the SnapshotState to report `numFilesTotal` and `sizeInBytesTotal` (both are logged as `-1`); setting the Spark config option `spark.databricks.delta.commitStats.force=true` restores the old behavior.
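A minimal usage sketch of the new flag, assuming the running `SparkSession` from the demo above; the behavior described matches the diff below:

```scala
// Default after this PR: the commit skips the expensive SnapshotState-based
// totals (numFilesTotal / sizeInBytesTotal are logged as -1) unless the
// commit also writes a checkpoint.
spark.sql("""INSERT INTO test_delta (id) VALUES (43)""")

// Opt back into full commit statistics for every commit. This is potentially
// slow, since each commit then reconstructs the snapshot state.
spark.conf.set("spark.databricks.delta.commitStats.force", "true")
spark.sql("""INSERT INTO test_delta (id) VALUES (44)""")
```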
Parent: 1b210c2

6 files changed: +51 −3

spark/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala

+9 −2

```diff
@@ -1874,6 +1874,13 @@ trait OptimisticTransactionImpl extends TransactionalWrite
     val info = currentTransactionInfo.commitInfo
       .map(_.copy(readVersion = None, isolationLevel = None)).orNull
     setNeedsCheckpoint(attemptVersion, postCommitSnapshot)
+    val doCollectCommitStats =
+      needsCheckpoint || spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_FORCE_ALL_COMMIT_STATS)
+
+    // Stats that force an expensive snapshot state reconstruction:
+    val numFilesTotal = if (doCollectCommitStats) postCommitSnapshot.numOfFiles else -1L
+    val sizeInBytesTotal = if (doCollectCommitStats) postCommitSnapshot.sizeInBytes else -1L
+
     val stats = CommitStats(
       startVersion = snapshot.version,
       commitVersion = attemptVersion,
@@ -1887,8 +1894,8 @@ trait OptimisticTransactionImpl extends TransactionalWrite
       numRemove = numRemove,
       numSetTransaction = numSetTransaction,
       bytesNew = bytesNew,
-      numFilesTotal = postCommitSnapshot.numOfFiles,
-      sizeInBytesTotal = postCommitSnapshot.sizeInBytes,
+      numFilesTotal = numFilesTotal,
+      sizeInBytesTotal = sizeInBytesTotal,
       numCdcFiles = numCdcFiles,
       cdcBytesNew = cdcBytesNew,
       protocol = postCommitSnapshot.protocol,
```
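The `-1L` sentinel keeps the `CommitStats` schema stable while marking the totals as "not collected". An illustrative, standalone sketch of the guard pattern (the names here are hypothetical, not Delta's API):

```scala
// Hypothetical sketch: compute an expensive metric only when a checkpoint
// already forces snapshot state computation, or when explicitly requested;
// otherwise return -1L, meaning "not collected".
def guardedMetric(needsCheckpoint: Boolean, force: Boolean)(compute: => Long): Long =
  if (needsCheckpoint || force) compute else -1L

// guardedMetric(needsCheckpoint = false, force = false)(expensiveCount()) == -1L
```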

spark/src/main/scala/org/apache/spark/sql/delta/RowId.scala

+5 −1

```diff
@@ -120,7 +120,11 @@ object RowId {
    * Extracts the high watermark of row IDs from a snapshot.
    */
   private[delta] def extractHighWatermark(snapshot: Snapshot): Option[Long] =
-    RowTrackingMetadataDomain.fromSnapshot(snapshot).map(_.rowIdHighWaterMark)
+    if (isSupported(snapshot.protocol)) {
+      RowTrackingMetadataDomain.fromSnapshot(snapshot).map(_.rowIdHighWaterMark)
+    } else {
+      None
+    }

   /** Base Row ID column name */
   val BASE_ROW_ID = "base_row_id"
```
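`RowTrackingMetadataDomain.fromSnapshot` reads domain metadata out of the snapshot, which triggers state reconstruction; checking `isSupported(snapshot.protocol)` first lets tables without row tracking skip that cost entirely.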

spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala

+11 −0

```diff
@@ -83,6 +83,17 @@ trait DeltaSQLConfBase {
     .stringConf
     .createOptional

+  val DELTA_FORCE_ALL_COMMIT_STATS =
+    buildConf("commitStats.force")
+      .internal()
+      .doc(
+        """When true, forces commit statistics to be collected for logging purposes.
+          | Enabling this feature requires the Snapshot State to be computed, which is
+          | potentially expensive.
+          """.stripMargin)
+      .booleanConf
+      .createWithDefault(false)
+
   val DELTA_CONVERT_USE_METADATA_LOG =
     buildConf("convert.useMetadataLog")
       .doc(
```
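Because `buildConf` in `DeltaSQLConfBase` prepends the `spark.databricks.delta.` namespace, the resolved key is `spark.databricks.delta.commitStats.force`. It can also be toggled per session, for example:

```scala
// Same flag as above, set through SQL instead of the conf API.
spark.sql("SET spark.databricks.delta.commitStats.force = true")
```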

spark/src/test/scala/org/apache/spark/sql/delta/DeleteSuiteBase.scala

+4 −0

```diff
@@ -326,6 +326,8 @@ abstract class DeleteSuiteBase extends QueryTest
   test("schema pruning on data condition") {
     val input = Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value")
     append(input, Nil)
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

     val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
       checkDelete(Some("key = 2"),
@@ -347,6 +349,8 @@ abstract class DeleteSuiteBase extends QueryTest
     val input = Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value")
       .select(struct("key", "value").alias("nested"))
     append(input, Nil)
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

     val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
       checkDelete(Some("nested.key = 2"),
```
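These schema-pruning tests (here and in `UpdateSuiteBase` below) capture the physical plans executed right after a commit. Since commits no longer materialize the snapshot state, the tests first touch `stateDF` so the captured plans start from a cached snapshot state instead of also including the state-reconstruction scan.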

spark/src/test/scala/org/apache/spark/sql/delta/OptimisticTransactionSuite.scala

+18 −0

```diff
@@ -890,4 +890,22 @@ class OptimisticTransactionSuite
       }
     }
   }
+
+  test("Append does not trigger snapshot state computation") {
+    withTempDir { tableDir =>
+      val df = Seq((1, 0), (2, 1)).toDF("key", "value")
+      df.write.format("delta").mode("append").save(tableDir.getCanonicalPath)
+
+      val deltaLog = DeltaLog.forTable(spark, tableDir)
+      val preCommitSnapshot = deltaLog.update()
+      assert(!preCommitSnapshot.stateReconstructionTriggered)
+
+      df.write.format("delta").mode("append").save(tableDir.getCanonicalPath)
+
+      val postCommitSnapshot = deltaLog.update()
+      assert(!preCommitSnapshot.stateReconstructionTriggered)
+      assert(!postCommitSnapshot.stateReconstructionTriggered)
+    }
+  }
+
 }
```

spark/src/test/scala/org/apache/spark/sql/delta/UpdateSuiteBase.scala

+4 −0

```diff
@@ -696,6 +696,8 @@ abstract class UpdateSuiteBase

   test("schema pruning on finding files to update") {
     append(Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value"))
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

     val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
       checkUpdate(condition = Some("key = 2"), setClauses = "key = 1, value = 3",
@@ -717,6 +719,8 @@ abstract class UpdateSuiteBase
   test("nested schema pruning on finding files to update") {
     append(Seq((2, 2), (1, 4), (1, 1), (0, 3)).toDF("key", "value")
       .select(struct("key", "value").alias("nested")))
+    // Start from a cached snapshot state
+    deltaLog.update().stateDF

     val executedPlans = DeltaTestUtils.withPhysicalPlansCaptured(spark) {
       checkUpdate(condition = Some("nested.key = 2"),
```
