Commit b93d485

Working

ilias1111 committed Nov 11, 2024
1 parent a45a483 commit b93d485
Showing 2 changed files with 56 additions and 69 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/spark_deployment/docker-compose.yml
@@ -30,6 +30,10 @@ services:
       - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
       - ./setup.sh:/setup.sh
     entrypoint: ["/bin/bash", "/setup.sh"]
-    command: ["/bin/bash", "-c", "/spark/sbin/start-thriftserver.sh && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
+    command: ["/bin/bash", "-c", "/spark/sbin/start-thriftserver.sh \
+      --master local[3] \
+      --driver-memory 10g \
+      --executor-memory 3g \
+      && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
     networks:
       - spark-network
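The rewritten command passes the same resources (--master local[3], --driver-memory 10g, --executor-memory 3g) that the new spark-defaults.conf sets below. As a quick sanity check that the Thrift server actually came up, a minimal probe such as the following could be used; it assumes PyHive is installed and the container's default port 10000 is reachable from the host, neither of which is shown in this diff:

    # Hypothetical smoke test for the Thrift server started by the command above.
    # Assumes PyHive is installed and the default port 10000 is exposed;
    # neither assumption is confirmed by this diff.
    from pyhive import hive

    conn = hive.connect(host="localhost", port=10000)
    cur = conn.cursor()
    cur.execute("SHOW DATABASES")  # should list the Glue-backed schemas
    print(cur.fetchall())
    cur.close()
    conn.close()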
119 changes: 51 additions & 68 deletions .github/workflows/spark_deployment/spark-defaults.conf
@@ -1,83 +1,66 @@
-# Catalog and Core Configuration
-spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
+# Catalog and Schema Settings
+spark.sql.defaultCatalog glue
 spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
 spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
 spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
-spark.sql.defaultCatalog glue
 spark.sql.catalog.glue.database dbt-spark-iceberg
-spark.sql.catalog.glue.lock-impl org.apache.iceberg.aws.glue.DynamoLockManager
-spark.sql.catalog.glue.lock.table myGlueLockTable

-# Table capabilities and operation settings
-spark.sql.catalog.glue.table-default.format-version 2
-spark.sql.catalog.glue.table-default.write.update.mode merge-on-read
-spark.sql.catalog.glue.table-default.write.delete.mode merge-on-read
-spark.sql.catalog.glue.table-default.write.merge.mode merge-on-read
-spark.sql.catalog.glue.table-default.write.distribution-mode hash
-spark.sql.catalog.glue.table-default.write.data.path s3a://dbt-spark-iceberg/github-integration-testing
-spark.sql.catalog.glue.table-default.write.metadata.path s3a://dbt-spark-iceberg/github-integration-testing/metadata
-spark.sql.catalog.glue.table-default.write.metadata.previous-versions-max 10
-spark.sql.catalog.glue.table-default.write.format.default iceberg
-spark.sql.catalog.glue.table-default.engine.hive.enabled true
-spark.sql.table.is.transactional true
+# Default Schema Configuration
+spark.sql.catalog.glue.default-namespace default_snowplow_manifest

-# Iceberg Specific Configurations
-spark.sql.iceberg.check-nullability false
-spark.sql.iceberg.vectorization.enabled true
-spark.sql.iceberg.handle-timestamp-without-timezone true
+# Session Extensions
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions

-# Merge Operation Optimizations
-spark.sql.iceberg.merge.mode merge-on-read
-spark.sql.iceberg.merge.cardinality.check.enabled false
-spark.sql.optimizer.dynamicPartitionPruning.enabled true
-spark.sql.optimizer.decorrelate.subquery.enabled true
-spark.sql.optimizer.join.reorder.enabled true
-spark.sql.optimizer.runtime.bloomFilter.enabled true

-# Merge Performance Tuning
-spark.sql.shuffle.partitions 200
-spark.sql.adaptive.shuffle.targetPostShuffleInputSize 128M
-spark.sql.adaptive.advisoryPartitionSizeInBytes 128M
-spark.sql.adaptive.coalescePartitions.initialPartitionNum 200
-spark.sql.adaptive.coalescePartitions.minPartitionNum 50

-# Memory Configuration for Merge Operations
-spark.memory.fraction 0.85
-spark.memory.storageFraction 0.3
-spark.sql.shuffle.spill.diskMerge.enabled true
-spark.sql.shuffle.spill.numElementsForceSpillThreshold 5000

-# Additional Merge Optimizations
-spark.sql.autoBroadcastJoinThreshold 100M
-spark.sql.adaptive.localShuffleReader.enabled true
-spark.sql.adaptive.skewJoin.enabled true
-spark.sql.adaptive.nonEmptyPartitionRatioForBroadcastJoin 0.2
-spark.sql.adaptive.coalescePartitions.enabled true

-# Maintain your existing performance, network, and S3 configurations...
+# General Spark Configuration
+spark.master local[3]
+spark.driver.memory 10g
+spark.executor.memory 3g
+spark.driver.maxResultSize 2g
+spark.default.parallelism 6
+spark.memory.fraction 0.85
+spark.sql.adaptive.enabled true

 # Iceberg Specific Configuration
 spark.wds.iceberg.format-version 2
 spark.sql.iceberg.handle-timestamp-without-timezone true
 spark.sql.catalog.spark_catalog.type hive
 spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog

-# Network Resilience (keep existing...)
-spark.network.timeout 800s
-spark.executor.heartbeatInterval 100s
-spark.storage.blockManagerSlaveTimeoutMs 300s
-spark.rpc.io.maxRetries 10
+# AWS Configuration
+spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
+spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
+spark.hadoop.fs.s3a.path.style.access true
+spark.hadoop.fs.s3a.connection.ssl.enabled true
+spark.hadoop.fs.s3a.committer.name directory
+spark.hadoop.fs.s3a.committer.staging.tmp.path /tmp/spark_staging
+spark.hadoop.fs.s3a.buffer.dir /tmp/spark_local_buf
+spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled true

-# S3/AWS Configuration (keep existing...)
-spark.hadoop.fs.s3a.connection.timeout 300000
-spark.hadoop.fs.s3a.connection.maximum 200
-spark.hadoop.fs.s3a.attempts.maximum 20
+# Write and Format Configuration
+spark.sql.parquet.compression.codec zstd
+# spark.sql.parquet.mergeSchema true
+spark.sql.parquet.filterPushdown true
+spark.sql.hive.metastorePartitionPruning true
+spark.sql.streaming.schemaInference true

-# Error Recovery and Resilience (keep existing...)
-spark.task.maxFailures 8
-spark.speculation true
-spark.speculation.multiplier 3
+# Operation Settings
+spark.sql.sources.partitionOverwriteMode dynamic
+spark.sql.shuffle.partitions 6
+spark.sql.broadcastTimeout 300
+spark.network.timeout 300s

-# Thrift Server Configuration (keep existing...)
-spark.sql.hive.thriftServer.async true
-spark.sql.hive.thriftServer.maxWorkerThreads 6
-spark.sql.hive.thriftServer.minWorkerThreads 4
+# Transaction and Consistency
+spark.sql.sources.default iceberg
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+# spark.sql.transaction.isolation.level SERIALIZABLE
+spark.sql.hive.thriftServer.async true
+spark.sql.hive.thriftServer.maxWorkerThreads 6
+spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
+spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
+spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
+spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
+spark.sql.defaultCatalog glue
+spark.sql.catalog.glue.database dbt-spark-iceberg
+spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+spark.sql.iceberg.handle-timestamp-without-timezone true
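For local debugging outside the container, the committed Glue/Iceberg catalog settings could be reproduced on a plain PySpark session. This is only a sketch copying values from the diff above; it assumes the iceberg-spark-runtime and AWS bundle jars are on the classpath and that AWS credentials are available in the environment, none of which this commit shows:

    # Sketch: mirroring the committed Glue/Iceberg catalog settings in code.
    # Assumes Iceberg and AWS Glue/S3 dependencies are on the classpath and
    # AWS credentials are in the environment; not confirmed by this diff.
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .appName("glue-iceberg-check")  # illustrative app name
        .config("spark.sql.defaultCatalog", "glue")
        .config("spark.sql.catalog.glue", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.glue.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
        .config("spark.sql.catalog.glue.warehouse", "s3a://dbt-spark-iceberg/github-integration-testing")
        .config("spark.sql.catalog.glue.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
        .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
        .getOrCreate()
    )

    # List the namespaces visible through the Glue catalog.
    spark.sql("SHOW NAMESPACES IN glue").show()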
