Skip to content

Commit

Permalink
patch affinity to work with dws (#7)
Browse files Browse the repository at this point in the history
* patch affinity to work with dws

* lint
  • Loading branch information
asaiacai authored Oct 11, 2024
1 parent 3238be0 commit 28d48e5
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 28 deletions.
9 changes: 6 additions & 3 deletions examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@ name: torch-ddp-bench
num_nodes: 2

resources:
accelerators: A100:8 # Make sure you use 8 GPU instances
use_spot: True
cloud: gcp
accelerators: H100-MEGA-80GB:8 # Make sure you use instances with 8 GPUs each
cloud: kubernetes
labels:
kueue.x-k8s.io/queue-name: user-queue # the queue name is assigned by your cluster admin
kueue.x-k8s.io/priority-class: low-priority
max-run-duration-seconds: "3000"

file_mounts:
./torch_ddp_benchmark.py: ./examples/torch_ddp_benchmark/torch_ddp_benchmark.py
Expand Down
51 changes: 26 additions & 25 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,32 +518,32 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
continue
pod_spec['metadata']['name'] = pod_name
pod_spec['metadata']['labels']['component'] = pod_name
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
}
pod = kubernetes.core_api(context).create_namespaced_pod(
namespace, pod_spec)
created_pods[pod.metadata.name] = pod
Expand Down Expand Up @@ -659,6 +659,7 @@ def _terminate_node(namespace: str, context: Optional[str],
logger.warning('terminate_instances: Error occurred when analyzing '
f'SSH Jump pod: {e}')
try:

kubernetes.core_api(context).delete_namespaced_service(
pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
kubernetes.core_api(context).delete_namespaced_service(
Expand Down

0 comments on commit 28d48e5

Please sign in to comment.