feat: enable custom resource limit per network (ethpandaops#471)

bharath-123 · Jan 29, 2024 · 5db6611 · 5db6611
1 parent 26384ea
commit 5db6611
Show file tree

Hide file tree

Showing 17 changed files with 680 additions and 95 deletions.
diff --git a/.github/tests/tolerations.yaml b/.github/tests/tolerations.yaml
@@ -0,0 +1,36 @@
+participants:
+  - el_client_type: reth
+    cl_client_type: teku
+    cl_split_mode_enabled: true
+    cl_tolerations:
+      - key: "node-role.kubernetes.io/master1"
+        operator: "Exists"
+        effect: "NoSchedule"
+      - key: "node-role.kubernetes.io/master2"
+        operator: "Exists"
+        effect: "NoSchedule"
+    el_tolerations:
+      - key: "node-role.kubernetes.io/master3"
+        operator: "Exists"
+        effect: "NoSchedule"
+    validator_tolerations:
+      - key: "node-role.kubernetes.io/master4"
+        operator: "Exists"
+        effect: "NoSchedule"
+  - el_client_type: reth
+    cl_client_type: teku
+    cl_split_mode_enabled: true
+    tolerations:
+      - key: "node-role.kubernetes.io/master5"
+        operator: "Exists"
+        effect: "NoSchedule"
+  - el_client_type: reth
+    cl_client_type: teku
+    cl_split_mode_enabled: true
+additional_services:
+  - dora
+global_tolerations:
+  - key: "node-role.kubernetes.io/master6"
+    value: "true"
+    operator: "Equal"
+    effect: "NoSchedule"
diff --git a/README.md b/README.md
@@ -64,6 +64,26 @@ To mitigate these issues, you can use the `el_client_volume_size` and `cl_client
 
 For optimal performance, we recommend using a cloud provider that allows you to provision Kubernetes clusters with fast persistent storage or self hosting your own Kubernetes cluster with fast persistent storage.
 
+#### Taints and tolerations
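+It is possible to run the package on a Kubernetes cluster with taints and tolerations. This is done by adding tolerations to one of the toleration fields (`global_tolerations`, a participant's `tolerations`, or the per-container `el_tolerations`, `cl_tolerations` and `validator_tolerations`) in the `network_params.yaml` file. For example: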
+```yaml
+participants:
+  - el_client_type: reth
+    cl_client_type: teku
+global_tolerations:
+  - key: "node-role.kubernetes.io/master6"
+    value: "true"
+    operator: "Equal"
+    effect: "NoSchedule"
+```
+
+It is possible to define tolerations globally, per participant, or per container. The more specific setting overrides the more general one, so the order of precedence (highest first) is as follows:
+1. Container (`el_tolerations`, `cl_tolerations`, `validator_tolerations`)
+2. Participant (`tolerations`)
+3. Global (`global_tolerations`)
+
+This feature is only available for Kubernetes. To learn more about taints and tolerations, please visit the [Kubernetes documentation](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/).
+
 #### Tear down
 
 The testnet will reside in an [enclave][enclave] - an isolated, ephemeral environment. The enclave and its contents (e.g. running containers, files artifacts, etc) will persist until torn down. You can remove an enclave and its contents with:
@@ -147,6 +167,17 @@ participants:
   # Example; el_extra_labels: {"ethereum-package.partition": "1"}
   el_extra_labels: {}
 
+  # A list of tolerations that will be passed to the EL client container
+  # Only works with Kubernetes
+  # Example: el_tolerations:
+  # - key: "key"
+  #   operator: "Equal"
+  #   value: "value"
+  #   effect: "NoSchedule"
+  #   toleration_seconds: 3600
+  # Defaults to empty
+  el_tolerations: []
+
   # The type of CL client that should be started
   # Valid values are nimbus, lighthouse, lodestar, teku, and prysm
   cl_client_type: lighthouse
@@ -178,6 +209,40 @@ participants:
   # Default values can be found in /src/package_io/constants.star VOLUME_SIZE
   cl_client_volume_size: 0
 
+  # A list of tolerations that will be passed to the CL client container
+  # Only works with Kubernetes
+  # Example: cl_tolerations:
+  # - key: "key"
+  #   operator: "Equal"
+  #   value: "value"
+  #   effect: "NoSchedule"
+  #   toleration_seconds: 3600
+  # Defaults to empty
+  cl_tolerations: []
+
+  # A list of tolerations that will be passed to the validator container
+  # Only works with Kubernetes
+  # Example: validator_tolerations:
+  # - key: "key"
+  #   operator: "Equal"
+  #   value: "value"
+  #   effect: "NoSchedule"
+  #   toleration_seconds: 3600
+  # Defaults to empty
+  validator_tolerations: []
+
+  # A list of tolerations that will be passed to the EL/CL/validator containers
+  # This is to be used when you don't want to specify the tolerations for each container separately
+  # Only works with Kubernetes
+  # Example: tolerations:
+  # - key: "key"
+  #   operator: "Equal"
+  #   value: "value"
+  #   effect: "NoSchedule"
+  #   toleration_seconds: 3600
+  # Defaults to empty
+  tolerations: []
+
   # A list of optional extra params that will be passed to the CL client Beacon container for modifying its behaviour
   # If the client combines the Beacon & validator nodes (e.g. Teku, Nimbus), then this list will be passed to the combined Beacon-validator node
   beacon_extra_params: []
@@ -495,6 +560,17 @@ xatu_sentry_params:
   - voluntary_exit
   - contribution_and_proof
   - blob_sidecar
+
+# Global tolerations that will be passed to all containers (unless overridden by a more specific toleration)
+# Only works with Kubernetes
+# Example: global_tolerations:
+# - key: "key"
+#   operator: "Equal"
+#   value: "value"
+#   effect: "NoSchedule"
+#   toleration_seconds: 3600
+# Defaults to empty
+global_tolerations: []
 ```
 
 #### Example configurations

diff --git a/main.star b/main.star
@@ -62,6 +62,7 @@ def run(plan, args={}):
     parallel_keystore_generation = args_with_right_defaults.parallel_keystore_generation
     persistent = args_with_right_defaults.persistent
     xatu_sentry_params = args_with_right_defaults.xatu_sentry_params
+    global_tolerations = args_with_right_defaults.global_tolerations
 
     grafana_datasource_config_template = read_file(
         static_files.GRAFANA_DATASOURCE_CONFIG_TEMPLATE_FILEPATH
@@ -96,6 +97,7 @@ def run(plan, args={}):
         jwt_file,
         persistent,
         xatu_sentry_params,
+        global_tolerations,
         parallel_keystore_generation,
     )
 

diff --git a/src/cl/lighthouse/lighthouse_launcher.star b/src/cl/lighthouse/lighthouse_launcher.star
@@ -28,9 +28,7 @@ BEACON_METRICS_PORT_NUM = 5054
 
 # The min/max CPU/memory that the beacon node can use
 BEACON_MIN_CPU = 50
-BEACON_MAX_CPU = 1000
 BEACON_MIN_MEMORY = 256
-BEACON_MAX_MEMORY = 1024
 
 #  ---------------------------------- Validator client -------------------------------------
 VALIDATOR_KEYS_MOUNTPOINT_ON_CLIENTS = "/data/lighthouse/validator-keys"
@@ -84,7 +82,7 @@ VALIDATOR_USED_PORTS = {
     ),
 }
 
-LIGHTHOUSE_LOG_LEVELS = {
+VERBOSITY_LEVELS = {
     constants.GLOBAL_CLIENT_LOG_LEVEL.error: "error",
     constants.GLOBAL_CLIENT_LOG_LEVEL.warn: "warn",
     constants.GLOBAL_CLIENT_LOG_LEVEL.info: "info",
@@ -121,6 +119,10 @@ def launch(
     extra_validator_labels,
     persistent,
     cl_volume_size,
+    cl_tolerations,
+    validator_tolerations,
+    participant_tolerations,
+    global_tolerations,
     split_mode_enabled=False,
 ):
     beacon_service_name = "{0}".format(service_name)
@@ -129,19 +131,34 @@ def launch(
     )
 
     log_level = input_parser.get_client_log_level_or_default(
-        participant_log_level, global_log_level, LIGHTHOUSE_LOG_LEVELS
+        participant_log_level, global_log_level, VERBOSITY_LEVELS
+    )
+
+    tolerations = input_parser.get_client_tolerations(
+        cl_tolerations, participant_tolerations, global_tolerations
     )
 
-    bn_min_cpu = int(bn_min_cpu) if int(bn_min_cpu) > 0 else BEACON_MIN_CPU
-    bn_max_cpu = int(bn_max_cpu) if int(bn_max_cpu) > 0 else BEACON_MAX_CPU
-    bn_min_mem = int(bn_min_mem) if int(bn_min_mem) > 0 else BEACON_MIN_MEMORY
-    bn_max_mem = int(bn_max_mem) if int(bn_max_mem) > 0 else BEACON_MAX_MEMORY
     network_name = (
         "devnets"
         if launcher.network != "kurtosis"
+        and launcher.network != "ephemery"
         and launcher.network not in constants.PUBLIC_NETWORKS
         else launcher.network
     )
+
+    bn_min_cpu = int(bn_min_cpu) if int(bn_min_cpu) > 0 else BEACON_MIN_CPU
+    bn_max_cpu = (
+        int(bn_max_cpu)
+        if int(bn_max_cpu) > 0
+        else constants.RAM_CPU_OVERRIDES[network_name]["lighthouse_max_cpu"]
+    )
+    bn_min_mem = int(bn_min_mem) if int(bn_min_mem) > 0 else BEACON_MIN_MEMORY
+    bn_max_mem = (
+        int(bn_max_mem)
+        if int(bn_max_mem) > 0
+        else constants.RAM_CPU_OVERRIDES[network_name]["lighthouse_max_mem"]
+    )
+
     cl_volume_size = (
         int(cl_volume_size)
         if int(cl_volume_size) > 0
@@ -169,6 +186,7 @@ def launch(
         extra_beacon_labels,
         persistent,
         cl_volume_size,
+        tolerations,
     )
 
     beacon_service = plan.add_service(beacon_service_name, beacon_config)
@@ -203,7 +221,9 @@ def launch(
         v_max_cpu = int(v_max_cpu) if int(v_max_cpu) > 0 else VALIDATOR_MAX_CPU
         v_min_mem = int(v_min_mem) if int(v_min_mem) > 0 else VALIDATOR_MIN_MEMORY
         v_max_mem = int(v_max_mem) if int(v_max_mem) > 0 else VALIDATOR_MAX_MEMORY
-
+        tolerations = input_parser.get_client_tolerations(
+            validator_tolerations, participant_tolerations, global_tolerations
+        )
         validator_config = get_validator_config(
             launcher.el_cl_genesis_data,
             image,
@@ -219,6 +239,7 @@ def launch(
             extra_validator_params,
             extra_validator_labels,
             persistent,
+            tolerations,
         )
 
         validator_service = plan.add_service(validator_service_name, validator_config)
@@ -297,6 +318,7 @@ def get_beacon_config(
     extra_labels,
     persistent,
     cl_volume_size,
+    tolerations,
 ):
     # If snooper is enabled use the snooper engine context, otherwise use the execution client context
     if snooper_enabled:
@@ -445,6 +467,7 @@ def get_beacon_config(
             el_client_context.client_name,
             extra_labels,
         ),
+        tolerations=tolerations,
     )
 
 
@@ -463,6 +486,7 @@ def get_validator_config(
     extra_params,
     extra_labels,
     persistent,
+    tolerations,
 ):
     validator_keys_dirpath = shared_utils.path_join(
         VALIDATOR_KEYS_MOUNTPOINT_ON_CLIENTS,
@@ -528,6 +552,7 @@ def get_validator_config(
             el_client_context.client_name,
             extra_labels,
         ),
+        tolerations=tolerations,
     )