Skip to content

Commit

Permalink
Enforce one pod per node
Browse files Browse the repository at this point in the history
It figures out the resource capacities of
the actual machine-type of the node,
and uses it in the pod-config to
set resources.{requests|limits}
to enforce each pod/workload to run
in its own dedicated node.
  • Loading branch information
gargnitingoogle committed Sep 17, 2024
1 parent ddcef98 commit bf0661a
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 0 deletions.
15 changes: 15 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
# local imports from other directories
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'utils'))
from run_tests_common import escape_commas_in_string, parse_args, run_command, add_iam_role_for_buckets
from utils import UnknownMachineTypeError, resource_limits

# local imports from same directory
import dlio_workload
Expand All @@ -43,6 +44,16 @@ def createHelmInstallCommands(
) -> list:
"""Creates helm install commands for the given dlioWorkload objects."""
helm_commands = []
try:
resourceLimits, resourceRequests = resource_limits(machineType)
except UnknownMachineTypeError:
print(
f'Found unknown machine-type: {machineType}, defaulting resource limits'
' to cpu=0,memory=0'
)
resourceLimits = {'cpu': 0, 'memory': '0'}
resourceRequests = resourceLimits

for dlioWorkload in dlioWorkloads:
for batchSize in dlioWorkload.batchSizes:
chartName, podName, outputDirPrefix = dlio_workload.DlioChartNamePodName(
Expand All @@ -63,6 +74,10 @@ def createHelmInstallCommands(
f'--set nodeType={machineType}',
f'--set podName={podName}',
f'--set outputDirPrefix={outputDirPrefix}',
f"--set resourceLimits.cpu={resourceLimits['cpu']}",
f"--set resourceLimits.memory={resourceLimits['memory']}",
f"--set resourceRequests.cpu={resourceRequests['cpu']}",
f"--set resourceRequests.memory={resourceRequests['memory']}",
]

helm_command = ' '.join(commands)
Expand Down
15 changes: 15 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
# local imports from other directories
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'utils'))
from run_tests_common import escape_commas_in_string, parse_args, run_command, add_iam_role_for_buckets
from utils import UnknownMachineTypeError, resource_limits

# local imports from same directory
import fio_workload
Expand All @@ -42,6 +43,16 @@ def createHelmInstallCommands(
) -> list:
"""Creates helm install commands for the given fioWorkload objects."""
helm_commands = []
try:
resourceLimits, resourceRequests = resource_limits(machineType)
except UnknownMachineTypeError:
print(
f'Found unknown machine-type: {machineType}, defaulting resource limits'
' to cpu=0,memory=0'
)
resourceLimits = {'cpu': 0, 'memory': '0'}
resourceRequests = resourceLimits

for fioWorkload in fioWorkloads:
for readType in fioWorkload.readTypes:
chartName, podName, outputDirPrefix = fio_workload.FioChartNamePodName(
Expand All @@ -64,6 +75,10 @@ def createHelmInstallCommands(
f'--set nodeType={machineType}',
f'--set podName={podName}',
f'--set outputDirPrefix={outputDirPrefix}',
f"--set resourceLimits.cpu={resourceLimits['cpu']}",
f"--set resourceLimits.memory={resourceLimits['memory']}",
f"--set resourceRequests.cpu={resourceRequests['cpu']}",
f"--set resourceRequests.memory={resourceRequests['memory']}",
]

helm_command = ' '.join(commands)
Expand Down
29 changes: 29 additions & 0 deletions perfmetrics/scripts/testing_on_gke/examples/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,35 @@ def timestamp_to_epoch(timestamp: str) -> int:
)


class UnknownMachineTypeError(Exception):
  """Error raised when a machine-type is not recognised.

  The offending machine-type is kept on the instance as `machineType` so
  that handlers can inspect it without parsing the message text.
  """

  def __init__(self, message, machineType: str):
    # Record the machine-type that triggered the error.
    self.machineType = machineType
    # Only the message is forwarded to Exception, so str(err) == message.
    super().__init__(message)


def resource_limits(nodeType: str) -> Tuple[dict, dict]:
  """Looks up the cpu/memory resource limits and requests for a machine-type.

  Args:
    nodeType: Name of the machine-type, e.g. "n2-standard-96".

  Returns:
    A (limits, requests) pair of dicts, each having a "cpu" (int) and a
    "memory" (str, e.g. "384Gi") entry. New dicts are built on every call.

  Raises:
    UnknownMachineTypeError: If nodeType is not one of the machine-types
      listed in this function.
  """
  # Each row: (machine-type names, cpu limit, memory limit, cpu request,
  # memory request).
  knownMachineTypes = (
      (("n2-standard-96",), 96, "384Gi", 90, "300Gi"),
      (("n2-standard-48",), 48, "192Gi", 45, "150Gi"),
      (("n2-standard-32",), 32, "128Gi", 30, "100Gi"),
      (("c3-standard-176", "c3-standard-176-lssd"), 176, "704Gi", 100, "400Gi"),
  )

  for names, cpuLimit, memoryLimit, cpuRequest, memoryRequest in knownMachineTypes:
    if nodeType in names:
      limits = {"cpu": cpuLimit, "memory": memoryLimit}
      requests = {"cpu": cpuRequest, "memory": memoryRequest}
      return limits, requests

  # No row matched: report the machine-type back to the caller.
  raise UnknownMachineTypeError(
      f"Unknown machine-type: {nodeType}. Unable to decide the"
      " resource-limits for it.",
      nodeType,
  )


def isRelevantMonitoringResult(
result,
cluster_name: str,
Expand Down

0 comments on commit bf0661a

Please sign in to comment.