Use ZSTD Parquet compression codec for Delta Lake by default
findinpath authored and raunaqmorarka committed Dec 24, 2024
1 parent 590bbb3 commit 3257342
Showing 6 changed files with 37 additions and 37 deletions.
docs/src/main/sphinx/connector/delta-lake.md (2 changes: 1 addition & 1 deletion)
@@ -108,7 +108,7 @@ values. Typical usage does not require you to configure them.
* `GZIP`

The equivalent catalog session property is `compression_codec`.
-  - `SNAPPY`
+  - `ZSTD`
* - `delta.max-partitions-per-writer`
- Maximum number of partitions per writer.
- `100`
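With this commit, the Delta Lake connector writes new Parquet data files with ZSTD compression unless told otherwise. As a hedged illustration (not part of this diff), the sketch below shows how a client could pin the previous SNAPPY behavior for a single session through the Trino JDBC driver; host, catalog, schema, and credentials are placeholders.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class CompressionCodecSessionOverride
{
    public static void main(String[] args)
            throws Exception
    {
        // Placeholder connection details; adjust for your cluster.
        try (Connection connection = DriverManager.getConnection(
                "jdbc:trino://localhost:8080/delta/default", "admin", null);
                Statement statement = connection.createStatement()) {
            // Pin the pre-commit codec for this session only.
            statement.execute("SET SESSION delta.compression_codec = 'SNAPPY'");

            // Inspect the property; after this commit its Default column reads ZSTD.
            try (ResultSet rs = statement.executeQuery(
                    "SHOW SESSION LIKE 'delta.compression_codec'")) {
                while (rs.next()) {
                    System.out.println(rs.getString("Name") + " = " + rs.getString("Value"));
                }
            }
        }
    }
}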
@@ -80,7 +80,7 @@ public class DeltaLakeConfig
private boolean tableStatisticsEnabled = true;
private boolean extendedStatisticsEnabled = true;
private boolean collectExtendedStatisticsOnWrite = true;
-    private HiveCompressionCodec compressionCodec = HiveCompressionCodec.SNAPPY;
+    private HiveCompressionCodec compressionCodec = HiveCompressionCodec.ZSTD;
private long perTransactionMetastoreCacheMaximumSize = 1000;
private boolean storeTableMetadataEnabled;
private int storeTableMetadataThreads = 5;
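For context, the field above backs the catalog configuration property `delta.compression-codec`, whose session-level equivalent `compression_codec` appears in the docs hunk earlier. The sketch below is a minimal rendering of the usual Airlift-style binding under those assumptions; the real DeltaLakeConfig declares many more properties.

import io.airlift.configuration.Config;
import io.trino.plugin.hive.HiveCompressionCodec;

public class CompressionCodecConfigSketch
{
    // Mirrors the changed field above: ZSTD is now the built-in default.
    private HiveCompressionCodec compressionCodec = HiveCompressionCodec.ZSTD;

    public HiveCompressionCodec getCompressionCodec()
    {
        return compressionCodec;
    }

    // Bound to the catalog property delta.compression-codec (assumed name).
    @Config("delta.compression-codec")
    public CompressionCodecConfigSketch setCompressionCodec(HiveCompressionCodec compressionCodec)
    {
        this.compressionCodec = compressionCodec;
        return this;
    }
}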
@@ -95,12 +95,12 @@ public void testCacheFileOperations()
.add(new CacheOperation("Alluxio.writeCache", "00000000000000000002.json", 0, 658))
.add(new CacheOperation("InputFile.length", "00000000000000000003.json"))
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 220))
.add(new CacheOperation("Input.readFully", "key=p1/", 0, 220))
.add(new CacheOperation("Input.readFully", "key=p2/", 0, 220))
.add(new CacheOperation("Alluxio.writeCache", "key=p1/", 0, 220))
.add(new CacheOperation("Alluxio.writeCache", "key=p2/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
.add(new CacheOperation("Input.readFully", "key=p1/", 0, 227))
.add(new CacheOperation("Input.readFully", "key=p2/", 0, 227))
.add(new CacheOperation("Alluxio.writeCache", "key=p1/", 0, 227))
.add(new CacheOperation("Alluxio.writeCache", "key=p2/", 0, 227))
.build());
assertFileSystemAccesses(
"SELECT * FROM test_cache_file_operations",
@@ -113,8 +113,8 @@ public void testCacheFileOperations()
.add(new CacheOperation("InputFile.length", "00000000000000000002.json"))
.add(new CacheOperation("InputFile.length", "00000000000000000003.json"))
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
.build());
assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p3', '3-xyz')", 1);
assertUpdate("INSERT INTO test_cache_file_operations VALUES ('p4', '4-xyz')", 1);
@@ -139,17 +139,17 @@ public void testCacheFileOperations()
.add(new CacheOperation("InputFile.length", "00000000000000000005.json"))
.add(new CacheOperation("InputFile.length", "00000000000000000006.json"))
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 220))
.add(new CacheOperation("Input.readFully", "key=p3/", 0, 220))
.add(new CacheOperation("Input.readFully", "key=p4/", 0, 220))
.add(new CacheOperation("Input.readFully", "key=p5/", 0, 220))
.add(new CacheOperation("Alluxio.writeCache", "key=p3/", 0, 220))
.add(new CacheOperation("Alluxio.writeCache", "key=p4/", 0, 220))
.add(new CacheOperation("Alluxio.writeCache", "key=p5/", 0, 220))
.add(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
.add(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
.add(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 227))
.add(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 227))
.add(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 227))
.add(new CacheOperation("Input.readFully", "key=p3/", 0, 227))
.add(new CacheOperation("Input.readFully", "key=p4/", 0, 227))
.add(new CacheOperation("Input.readFully", "key=p5/", 0, 227))
.add(new CacheOperation("Alluxio.writeCache", "key=p3/", 0, 227))
.add(new CacheOperation("Alluxio.writeCache", "key=p4/", 0, 227))
.add(new CacheOperation("Alluxio.writeCache", "key=p5/", 0, 227))
.build());
assertFileSystemAccesses(
"SELECT * FROM test_cache_file_operations",
@@ -168,11 +168,11 @@ public void testCacheFileOperations()
.add(new CacheOperation("InputFile.length", "00000000000000000005.json"))
.add(new CacheOperation("InputFile.length", "00000000000000000006.json"))
.add(new CacheOperation("InputFile.newStream", "_last_checkpoint"))
-        .addCopies(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 220), 1)
-        .addCopies(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 220), 1)
-        .addCopies(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 220), 1)
-        .addCopies(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 220), 1)
-        .addCopies(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 220), 1)
+        .addCopies(new CacheOperation("Alluxio.readCached", "key=p1/", 0, 227), 1)
+        .addCopies(new CacheOperation("Alluxio.readCached", "key=p2/", 0, 227), 1)
+        .addCopies(new CacheOperation("Alluxio.readCached", "key=p3/", 0, 227), 1)
+        .addCopies(new CacheOperation("Alluxio.readCached", "key=p4/", 0, 227), 1)
+        .addCopies(new CacheOperation("Alluxio.readCached", "key=p5/", 0, 227), 1)
.build());
}
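The expected byte counts in these cache operations move from 220 to 227 because the tiny Parquet files the test writes are now ZSTD-compressed and come out a few bytes larger; at this scale, container and frame overhead dominates, so ZSTD's typically better ratio on real data does not show. The standalone sketch below illustrates the effect on a small payload, assuming the zstd-jni and snappy-java libraries (neither is used in this diff) are on the classpath.

import com.github.luben.zstd.Zstd;
import org.xerial.snappy.Snappy;

import java.nio.charset.StandardCharsets;

public class SmallPayloadCompressionDemo
{
    public static void main(String[] args)
            throws Exception
    {
        // A payload roughly as small as the test's single-row data files.
        byte[] input = "1-abc,2-xyz,partition-key".getBytes(StandardCharsets.UTF_8);
        byte[] zstd = Zstd.compress(input);
        byte[] snappy = Snappy.compress(input);
        // At this size, frame and header overhead dominates, so the ZSTD
        // output is not guaranteed to be smaller than the SNAPPY output.
        System.out.printf("input=%d zstd=%d snappy=%d%n",
                input.length, zstd.length, snappy.length);
    }
}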

@@ -79,12 +79,12 @@ public void testTableDataCachedWhileTransactionLogNotCached()
.addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2)
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json"))
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint"))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p1/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p2/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p1/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p2/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p1/", 0, 227))
.add(new CacheFileSystemTraceUtils.CacheOperation("Input.readFully", "key=p2/", 0, 227))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p1/", 0, 227))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.writeCache", "key=p2/", 0, 227))
.build());
assertFileSystemAccesses(
"SELECT * FROM test_transaction_log_not_cached",
@@ -93,8 +93,8 @@ public void testTableDataCachedWhileTransactionLogNotCached()
.addCopies(new CacheFileSystemTraceUtils.CacheOperation("Input.readTail", "00000000000000000002.checkpoint.parquet"), 2)
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.length", "00000000000000000003.json"))
.add(new CacheFileSystemTraceUtils.CacheOperation("InputFile.newStream", "_last_checkpoint"))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 220))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p1/", 0, 227))
.add(new CacheFileSystemTraceUtils.CacheOperation("Alluxio.readCached", "key=p2/", 0, 227))
.build());
}

@@ -61,7 +61,7 @@ public void testDefaults()
.setTableStatisticsEnabled(true)
.setExtendedStatisticsEnabled(true)
.setCollectExtendedStatisticsOnWrite(true)
-                .setCompressionCodec(HiveCompressionCodec.SNAPPY)
+                .setCompressionCodec(HiveCompressionCodec.ZSTD)
.setDeleteSchemaLocationsFallback(false)
.setParquetTimeZone(TimeZone.getDefault().getID())
.setPerTransactionMetastoreCacheMaximumSize(1000)
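The defaults test above runs through Airlift's ConfigAssertions and covers every property of DeltaLakeConfig; reproducing it here would be noise. The reduced sketch below checks only the assertion this commit touches, assuming DeltaLakeConfig and HiveCompressionCodec are available on the classpath.

import io.trino.plugin.deltalake.DeltaLakeConfig;
import io.trino.plugin.hive.HiveCompressionCodec;

public class CompressionDefaultCheck
{
    public static void main(String[] args)
    {
        DeltaLakeConfig config = new DeltaLakeConfig();
        // After this commit, the built-in default should be ZSTD.
        if (config.getCompressionCodec() != HiveCompressionCodec.ZSTD) {
            throw new AssertionError("expected ZSTD, got " + config.getCompressionCodec());
        }
        System.out.println("default compression codec: " + config.getCompressionCodec());
    }
}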
@@ -435,8 +435,8 @@ public void verifyCompressionCodecsDataProvider()
assertThat(onTrino().executeQuery("SHOW SESSION LIKE 'delta.compression_codec'"))
.containsOnly(row(
"delta.compression_codec",
"SNAPPY",
"SNAPPY",
"ZSTD",
"ZSTD",
"varchar",
"Compression codec to use when writing new data files. Possible values: " +
Stream.of(compressionCodecs())