# Catalog and Core Configuration
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing

# Catalog and Schema Settings
spark.sql.defaultCatalog glue
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.catalog.glue.database dbt-spark-iceberg
spark.sql.catalog.glue.lock-impl org.apache.iceberg.aws.glue.DynamoLockManager
spark.sql.catalog.glue.lock.table myGlueLockTable
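# A minimal sketch of how tables resolve through the `glue` catalog above,
# assuming an active SparkSession `spark` and a hypothetical table `events`
# (the namespace is configured further below):
#   spark.sql("SELECT * FROM glue.default_snowplow_manifest.events LIMIT 10").show()
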
# Table capabilities and operation settings
spark.sql.catalog.glue.table-default.format-version 2
spark.sql.catalog.glue.table-default.write.update.mode merge-on-read
spark.sql.catalog.glue.table-default.write.delete.mode merge-on-read
spark.sql.catalog.glue.table-default.write.merge.mode merge-on-read
spark.sql.catalog.glue.table-default.write.distribution-mode hash
spark.sql.catalog.glue.table-default.write.data.path s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.table-default.write.metadata.path s3a://dbt-spark-iceberg/github-integration-testing/metadata
spark.sql.catalog.glue.table-default.write.metadata.previous-versions-max 10
spark.sql.catalog.glue.table-default.write.format.default parquet
spark.sql.catalog.glue.table-default.engine.hive.enabled true
spark.sql.table.is.transactional true

# Default Schema Configuration
spark.sql.catalog.glue.default-namespace default_snowplow_manifest
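# The table-default.* keys above only seed properties for tables created after
# this point; a sketch of inspecting and overriding them per existing table,
# assuming an active SparkSession `spark` and a hypothetical table `events`:
#   spark.sql("SHOW TBLPROPERTIES glue.default_snowplow_manifest.events").show()
#   spark.sql("ALTER TABLE glue.default_snowplow_manifest.events "
#             "SET TBLPROPERTIES ('write.update.mode' = 'merge-on-read')")
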
# Iceberg Specific Configurations
spark.sql.iceberg.check-nullability false
spark.sql.iceberg.vectorization.enabled true
spark.sql.iceberg.handle-timestamp-without-timezone true

# Session Extensions
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
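# The extensions class is what enables Iceberg DML (MERGE/UPDATE/DELETE) and
# stored procedures in Spark SQL; a sketch with table and column names assumed:
#   spark.sql("MERGE INTO glue.default_snowplow_manifest.events t "
#             "USING updates s ON t.event_id = s.event_id "
#             "WHEN MATCHED THEN UPDATE SET * "
#             "WHEN NOT MATCHED THEN INSERT *")
#   spark.sql("CALL glue.system.rewrite_data_files(table => 'default_snowplow_manifest.events')")
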
# Merge Operation Optimizations
spark.sql.iceberg.merge.mode merge-on-read
spark.sql.iceberg.merge.cardinality.check.enabled false
spark.sql.optimizer.dynamicPartitionPruning.enabled true
spark.sql.optimizer.decorrelate.subquery.enabled true
spark.sql.optimizer.join.reorder.enabled true
spark.sql.optimizer.runtime.bloomFilter.enabled true
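# One way to confirm these flags were actually picked up from spark-defaults at
# runtime (a sketch, assuming an active SparkSession `spark`):
#   assert spark.conf.get("spark.sql.optimizer.runtime.bloomFilter.enabled") == "true"
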
# Merge Performance Tuning
spark.sql.shuffle.partitions 200
spark.sql.adaptive.shuffle.targetPostShuffleInputSize 128M
spark.sql.adaptive.advisoryPartitionSizeInBytes 128M
spark.sql.adaptive.coalescePartitions.initialPartitionNum 200
spark.sql.adaptive.coalescePartitions.minPartitionNum 50
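# spark.sql.shuffle.partitions is a runtime-mutable SQL conf, so a single job
# can override the file-level value for its own session; a sketch:
#   spark.conf.set("spark.sql.shuffle.partitions", "400")  # hypothetical bump for one heavy MERGE
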
# Memory Configuration for Merge Operations
spark.memory.fraction 0.85
spark.memory.storageFraction 0.3
spark.sql.shuffle.spill.diskMerge.enabled true
spark.sql.shuffle.spill.numElementsForceSpillThreshold 5000
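# Unlike SQL confs, spark.memory.* settings are fixed at JVM startup and cannot
# be changed with spark.conf.set; they must be in place before the session
# exists, e.g. (a sketch):
#   from pyspark.sql import SparkSession
#   spark = (SparkSession.builder
#            .config("spark.memory.fraction", "0.85")
#            .getOrCreate())
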
# Additional Merge Optimizations
spark.sql.autoBroadcastJoinThreshold 100M
spark.sql.adaptive.localShuffleReader.enabled true
spark.sql.adaptive.skewJoin.enabled true
spark.sql.adaptive.nonEmptyPartitionRatioForBroadcastJoin 0.2
spark.sql.adaptive.coalescePartitions.enabled true
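# Whether AQE actually applied skew-join handling or a local shuffle reader is
# visible in the adaptive query plan; a sketch (the query is hypothetical):
#   spark.sql("SELECT s.*, e.* FROM updates s JOIN events e ON s.id = e.id") \
#        .explain(mode="formatted")
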
# General Spark Configuration
spark.master local[3]
spark.driver.memory 10g
spark.executor.memory 3g
spark.driver.maxResultSize 2g
spark.default.parallelism 6
spark.sql.adaptive.enabled true

# Iceberg Specific Configuration
spark.sql.catalog.spark_catalog.table-default.format-version 2
spark.sql.catalog.spark_catalog.type hive
spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
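# With SparkSessionCatalog, the built-in spark_catalog serves both existing
# Hive tables and new Iceberg tables; a sketch (database/table names assumed):
#   spark.sql("CREATE TABLE spark_catalog.default.events_ice (id BIGINT) USING iceberg")
#   spark.sql("SELECT * FROM spark_catalog.default.legacy_hive_table").show()
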
# Network Resilience
spark.network.timeout 800s
spark.executor.heartbeatInterval 100s
spark.storage.blockManagerSlaveTimeoutMs 300s
spark.rpc.io.maxRetries 10

# AWS Configuration
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.connection.ssl.enabled true
spark.hadoop.fs.s3a.committer.name directory
spark.hadoop.fs.s3a.committer.staging.tmp.path /tmp/spark_staging
spark.hadoop.fs.s3a.buffer.dir /tmp/spark_local_buf
spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled true
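# TemporaryAWSCredentialsProvider expects session credentials (access key,
# secret key, and session token); a sketch of supplying them at session build
# time, with placeholder values:
#   spark = (SparkSession.builder
#            .config("spark.hadoop.fs.s3a.access.key", "<ACCESS_KEY_ID>")
#            .config("spark.hadoop.fs.s3a.secret.key", "<SECRET_ACCESS_KEY>")
#            .config("spark.hadoop.fs.s3a.session.token", "<SESSION_TOKEN>")
#            .getOrCreate())
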
# S3/AWS Connection Tuning
spark.hadoop.fs.s3a.connection.timeout 300000
spark.hadoop.fs.s3a.connection.maximum 200
spark.hadoop.fs.s3a.attempts.maximum 20

# Write and Format Configuration
spark.sql.parquet.compression.codec zstd
# spark.sql.parquet.mergeSchema true
spark.sql.parquet.filterPushdown true
spark.sql.hive.metastorePartitionPruning true
spark.sql.streaming.schemaInference true
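# spark.sql.parquet.compression.codec covers plain Parquet writes; Iceberg
# tables take their codec from a table property instead, so to be explicit per
# table (a sketch, table name assumed):
#   spark.sql("ALTER TABLE glue.default_snowplow_manifest.events "
#             "SET TBLPROPERTIES ('write.parquet.compression-codec' = 'zstd')")
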
# Error Recovery and Resilience
spark.task.maxFailures 8
spark.speculation true
spark.speculation.multiplier 3

# Operation Settings
spark.sql.sources.partitionOverwriteMode dynamic
spark.sql.broadcastTimeout 300
# (spark.sql.shuffle.partitions and spark.network.timeout are set once in the
# Merge Performance Tuning and Network Resilience sections above)
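# With partitionOverwriteMode=dynamic, INSERT OVERWRITE replaces only the
# partitions present in the incoming data; a sketch (table names assumed):
#   spark.sql("INSERT OVERWRITE TABLE glue.default_snowplow_manifest.events "
#             "SELECT * FROM staged_events")
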
# Thrift Server Configuration
spark.sql.hive.thriftServer.async true
spark.sql.hive.thriftServer.maxWorkerThreads 6
spark.sql.hive.thriftServer.minWorkerThreads 4

# Transaction and Consistency
spark.sql.sources.default iceberg
# spark.sql.transaction.isolation.level SERIALIZABLE
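# With spark.sql.sources.default=iceberg, a CREATE TABLE without a USING clause
# should produce an Iceberg table; a sketch (table name assumed):
#   spark.sql("CREATE TABLE events_raw (event_id BIGINT, collector_tstamp TIMESTAMP)")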