From 8692d863ffb069875ca3eef49f7124aaca89e7d8 Mon Sep 17 00:00:00 2001
From: Ilias Xenogiannis
Date: Sat, 9 Nov 2024 20:33:46 +0200
Subject: [PATCH] Update spark-defaults.conf

---
 .../spark_deployment/spark-defaults.conf | 47 +++++++++----------
 1 file changed, 21 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/spark_deployment/spark-defaults.conf b/.github/workflows/spark_deployment/spark-defaults.conf
index 702fdd5..f970f1d 100644
--- a/.github/workflows/spark_deployment/spark-defaults.conf
+++ b/.github/workflows/spark_deployment/spark-defaults.conf
@@ -1,27 +1,27 @@
-# Core Spark Configuration - Constrained for 4 CPU, 16GB RAM machine
+# Core Spark Configuration
 spark.master local[4]
-spark.driver.memory 8g # Half of total RAM
-spark.executor.memory 4g # Quarter of total RAM
-spark.memory.fraction 0.7 # Fraction of heap space for execution/storage
-spark.memory.storageFraction 0.3 # Fraction of memory fraction for storage
+spark.driver.memory 8g
+spark.executor.memory 4g
+spark.memory.fraction 0.7
+spark.memory.storageFraction 0.3
 spark.memory.offHeap.enabled true
-spark.memory.offHeap.size 2g # Small off-heap to avoid OOM
+spark.memory.offHeap.size 2147483648
 
-# Parallelism and Partitioning - Adjusted for 4 CPU
-spark.sql.shuffle.partitions 8 # 2x number of cores
-spark.default.parallelism 8 # 2x number of cores
-spark.sql.files.maxPartitionBytes 134217728 # 128MB per partition
+# Parallelism and Partitioning
+spark.sql.shuffle.partitions 8
+spark.default.parallelism 8
+spark.sql.files.maxPartitionBytes 134217728
 
 # Performance Optimization
 spark.sql.adaptive.enabled true
 spark.sql.adaptive.coalescePartitions.enabled true
 spark.sql.adaptive.skewJoin.enabled true
+
+# Serialization
 spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.kryoserializer.buffer.max 256m
-spark.sql.inMemoryColumnarStorage.compressed true
-spark.sql.inMemoryColumnarStorage.batchSize 10000
+spark.kryoserializer.buffer.max 268435456
 
-# S3A Configuration
+# AWS S3 Configuration
 spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
 spark.hadoop.fs.s3a.access.key ${AWS_ACCESS_KEY_ID}
 spark.hadoop.fs.s3a.secret.key ${AWS_SECRET_ACCESS_KEY}
@@ -31,13 +31,11 @@ spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
 spark.hadoop.fs.s3a.path.style.access false
 spark.hadoop.fs.s3a.region eu-west-1
 
-# S3A Connection Management - Conservative settings
-spark.hadoop.fs.s3a.connection.maximum 20 # Reduced connection pool
-spark.hadoop.fs.s3a.connection.timeout 300000 # 5 minutes
-spark.hadoop.fs.s3a.threads.max 8 # 2x cores
+# S3A Connection Management
+spark.hadoop.fs.s3a.connection.maximum 20
+spark.hadoop.fs.s3a.connection.timeout 300000
+spark.hadoop.fs.s3a.threads.max 8
 spark.hadoop.fs.s3a.connection.ssl.enabled true
-spark.hadoop.fs.s3a.readahead.range 128K # Reduced readahead
-spark.hadoop.fs.s3a.retry.limit 10 # Reasonable retry limit
 # Iceberg Catalog Configuration
 spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
 spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
@@ -47,9 +45,6 @@ spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-inte
 spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
 spark.sql.defaultCatalog glue
 
-# Timeouts and Network
-spark.network.timeout 300s # 5 minutes
-spark.sql.broadcastTimeout 300s # 5 minutes
-
-# Garbage Collection
-spark.executor.extraJavaOptions -XX:+UseG1GC -XX:G1HeapRegionSize=16M -XX:+UseCompressedOops
\ No newline at end of file
+# Network timeouts
+spark.network.timeout 300000
+spark.sql.broadcastTimeout 300000
\ No newline at end of file
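
Note (appended for review, not part of the patch): the byte values above are exact rewrites of the old shorthand units, since 2147483648 bytes = 2 GiB (the previous 2g) and 268435456 bytes = 256 MiB (the previous 256m); the bare 300000 timeouts are presumably intended as the previous 300s expressed in milliseconds. The sketch below shows one way to check that a local session actually picks these defaults up by reading a few properties back. It is a minimal sketch under stated assumptions: the script name and app name are made up, it assumes pyspark is installed, that this conf file is on SPARK_CONF_DIR, and that AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY are exported so the ${...} substitutions resolve.

    # conf_smoke_test.py -- hypothetical helper, not part of this PR
    import os
    from pyspark.sql import SparkSession

    # Start a local session mirroring spark-defaults.conf. When the conf file
    # is picked up from SPARK_CONF_DIR these .config() calls are redundant;
    # they only make the expected values explicit.
    spark = (
        SparkSession.builder
        .appName("conf-smoke-test")  # made-up name
        .master("local[4]")
        .config("spark.sql.shuffle.partitions", "8")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
        .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
        .getOrCreate()
    )

    # Read the effective values back; both should match the conf above.
    print(spark.conf.get("spark.sql.shuffle.partitions"))  # expect: 8
    print(spark.conf.get("spark.serializer"))              # expect: ...KryoSerializer
    spark.stop()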