Skip to content

Commit

Permalink
Update spark-defaults.conf
Browse files — Browse the repository at this point in the history
  • Loading branch information
ilias1111 committed Nov 9, 2024
1 parent 1e6b931 commit 8692d86
Showing 1 changed file with 21 additions and 26 deletions.
47 changes: 21 additions & 26 deletions .github/workflows/spark_deployment/spark-defaults.conf
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
# Core Spark Configuration - Constrained for 4 CPU, 16GB RAM machine
# Core Spark Configuration
spark.master local[4]
spark.driver.memory 8g # Half of total RAM
spark.executor.memory 4g # Quarter of total RAM
spark.memory.fraction 0.7 # Fraction of heap space for execution/storage
spark.memory.storageFraction 0.3 # Fraction of memory fraction for storage
spark.driver.memory 8g
spark.executor.memory 4g
spark.memory.fraction 0.7
spark.memory.storageFraction 0.3
spark.memory.offHeap.enabled true
spark.memory.offHeap.size 2g # Small off-heap to avoid OOM
spark.memory.offHeap.size 2147483648

# Parallelism and Partitioning - Adjusted for 4 CPU
spark.sql.shuffle.partitions 8 # 2x number of cores
spark.default.parallelism 8 # 2x number of cores
spark.sql.files.maxPartitionBytes 134217728 # 128MB per partition
# Parallelism and Partitioning
spark.sql.shuffle.partitions 8
spark.default.parallelism 8
spark.sql.files.maxPartitionBytes 134217728

# Performance Optimization
spark.sql.adaptive.enabled true
spark.sql.adaptive.coalescePartitions.enabled true
spark.sql.adaptive.skewJoin.enabled true

# Serialization
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.kryoserializer.buffer.max 256m
spark.sql.inMemoryColumnarStorage.compressed true
spark.sql.inMemoryColumnarStorage.batchSize 10000
spark.kryoserializer.buffer.max 268435456

# S3A Configuration
# AWS S3 Configuration
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key ${AWS_ACCESS_KEY_ID}
spark.hadoop.fs.s3a.secret.key ${AWS_SECRET_ACCESS_KEY}
Expand All
@@ -31,13 +31,11 @@ spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.path.style.access false
spark.hadoop.fs.s3a.region eu-west-1

# S3A Connection Management - Conservative settings
spark.hadoop.fs.s3a.connection.maximum 20 # Reduced connection pool
spark.hadoop.fs.s3a.connection.timeout 300000 # 5 minutes
spark.hadoop.fs.s3a.threads.max 8 # 2x cores
# S3A Connection Management
spark.hadoop.fs.s3a.connection.maximum 20
spark.hadoop.fs.s3a.connection.timeout 300000
spark.hadoop.fs.s3a.threads.max 8
spark.hadoop.fs.s3a.connection.ssl.enabled true
spark.hadoop.fs.s3a.readahead.range 128K # Reduced readahead
spark.hadoop.fs.s3a.retry.limit 10 # Reasonable retry limit

# Iceberg Catalog Configuration
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
Expand All
@@ -47,9 +45,6 @@ spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-inte
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue

# Timeouts and Network
spark.network.timeout 300s # 5 minutes
spark.sql.broadcastTimeout 300s # 5 minutes

# Garbage Collection
spark.executor.extraJavaOptions -XX:+UseG1GC -XX:G1HeapRegionSize=16M -XX:+UseCompressedOops
# Network timeouts
spark.network.timeout 300000
spark.sql.broadcastTimeout 300000

0 comments on commit 8692d86

Please sign in to comment.