Enforce one pod per node

It figures out the resource capacities of the node's actual machine-type and uses them in the pod-config to set resources.{requests|limits}, forcing each pod/workload to run on its own dedicated node.
GoogleCloudPlatform · Sep 17, 2024 · bf0661a · bf0661a
1 parent ddcef98
commit bf0661a
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 0 deletions.
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
@@ -31,6 +31,7 @@
 # local imports from other directories
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'utils'))
 from run_tests_common import escape_commas_in_string, parse_args, run_command, add_iam_role_for_buckets
+from utils import UnknownMachineTypeError, resource_limits
 
 # local imports from same directory
 import dlio_workload
@@ -43,6 +44,16 @@ def createHelmInstallCommands(
 ) -> list:
  """Creates helm install commands for the given dlioWorkload objects."""
  helm_commands = []
+ try:
+ resourceLimits, resourceRequests = resource_limits(machineType)
+ except UnknownMachineTypeError:
+ print(
+ f'Found unknown machine-type: {machineType}, defaulting resource limits'
+ ' to cpu=0,memory=0'
+ )
+ resourceLimits = {'cpu': 0, 'memory': '0'}
+ resourceRequests = resourceLimits
+
  for dlioWorkload in dlioWorkloads:
  for batchSize in dlioWorkload.batchSizes:
  chartName, podName, outputDirPrefix = dlio_workload.DlioChartNamePodName(
@@ -63,6 +74,10 @@ def createHelmInstallCommands(
  f'--set nodeType={machineType}',
  f'--set podName={podName}',
  f'--set outputDirPrefix={outputDirPrefix}',
+ f"--set resourceLimits.cpu={resourceLimits['cpu']}",
+ f"--set resourceLimits.memory={resourceLimits['memory']}",
+ f"--set resourceRequests.cpu={resourceRequests['cpu']}",
+ f"--set resourceRequests.memory={resourceRequests['memory']}",
  ]
 
  helm_command = ' '.join(commands)

diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py
@@ -30,6 +30,7 @@
 # local imports from other directories
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'utils'))
 from run_tests_common import escape_commas_in_string, parse_args, run_command, add_iam_role_for_buckets
+from utils import UnknownMachineTypeError, resource_limits
 
 # local imports from same directory
 import fio_workload
@@ -42,6 +43,16 @@ def createHelmInstallCommands(
 ) -> list:
  """Creates helm install commands for the given fioWorkload objects."""
  helm_commands = []
+ try:
+ resourceLimits, resourceRequests = resource_limits(machineType)
+ except UnknownMachineTypeError:
+ print(
+ f'Found unknown machine-type: {machineType}, defaulting resource limits'
+ ' to cpu=0,memory=0'
+ )
+ resourceLimits = {'cpu': 0, 'memory': '0'}
+ resourceRequests = resourceLimits
+
  for fioWorkload in fioWorkloads:
  for readType in fioWorkload.readTypes:
  chartName, podName, outputDirPrefix = fio_workload.FioChartNamePodName(
@@ -64,6 +75,10 @@ def createHelmInstallCommands(
  f'--set nodeType={machineType}',
  f'--set podName={podName}',
  f'--set outputDirPrefix={outputDirPrefix}',
+ f"--set resourceLimits.cpu={resourceLimits['cpu']}",
+ f"--set resourceLimits.memory={resourceLimits['memory']}",
+ f"--set resourceRequests.cpu={resourceRequests['cpu']}",
+ f"--set resourceRequests.memory={resourceRequests['memory']}",
  ]
 
  helm_command = ' '.join(commands)

diff --git a/perfmetrics/scripts/testing_on_gke/examples/utils/utils.py b/perfmetrics/scripts/testing_on_gke/examples/utils/utils.py
@@ -155,6 +155,35 @@ def timestamp_to_epoch(timestamp: str) -> int:
  )
 
 
class UnknownMachineTypeError(Exception):
  """Raised when no resource figures are known for a machine-type.

  Attributes:
    machineType: The machine-type string that could not be resolved.
  """

  def __init__(self, message: str, machineType: str):
    """Initializes the error.

    Args:
      message: Human-readable description; becomes `str(error)`.
      machineType: The machine-type that was not recognized.
    """
    super().__init__(message)
    self.machineType = machineType

  def __reduce__(self):
    """Supports copy/pickle by replaying both constructor arguments.

    Only `message` is stored in `self.args`, so the default exception
    reconstruction (`cls(*args)`) would fail with a TypeError because
    `machineType` is a required positional argument.
    """
    message = self.args[0] if self.args else ''
    return (type(self), (message, self.machineType), dict(vars(self)))
+
+
def resource_limits(nodeType: str) -> Tuple[dict, dict]:
  """Returns the cpu/memory resource limits and requests for a machine-type.

  Args:
    nodeType: Machine-type name, e.g. "n2-standard-96". The match is exact
      (case-sensitive, no whitespace trimming).

  Returns:
    A `(limits, requests)` tuple. Each element is a dict with an integer
    "cpu" entry and a string "memory" entry (e.g. "384Gi"). Fresh dicts are
    returned on every call, so callers may mutate them freely.

  Raises:
    UnknownMachineTypeError: If `nodeType` is not one of the supported
      machine-types.
  """
  # The table is built per call so that returned dicts are never shared
  # between callers.
  c3_standard_176 = (
      {"cpu": 176, "memory": "704Gi"},
      {"cpu": 100, "memory": "400Gi"},
  )
  known_machine_types = {
      "n2-standard-96": (
          {"cpu": 96, "memory": "384Gi"},
          {"cpu": 90, "memory": "300Gi"},
      ),
      "n2-standard-48": (
          {"cpu": 48, "memory": "192Gi"},
          {"cpu": 45, "memory": "150Gi"},
      ),
      "n2-standard-32": (
          {"cpu": 32, "memory": "128Gi"},
          {"cpu": 30, "memory": "100Gi"},
      ),
      # The local-SSD variant uses the same figures as the base type.
      "c3-standard-176": c3_standard_176,
      "c3-standard-176-lssd": c3_standard_176,
  }

  try:
    return known_machine_types[nodeType]
  except (KeyError, TypeError):
    # TypeError covers unhashable inputs, which the previous equality chain
    # also reported as an unknown machine-type.
    raise UnknownMachineTypeError(
        f"Unknown machine-type: {nodeType}. Unable to decide the"
        " resource-limits for it.",
        nodeType,
    ) from None
+
+
 def isRelevantMonitoringResult(
  result,
  cluster_name: str,