K8s pod dns (#6)
* enumerate worker pods, bind pod_name to headless service

* patch worker numbering

* allow any traffic between sky pods

* add test for ssh vs hostname

* fix ip-worker mapping for k8s ssh

* lint
asaiacai committed Sep 27, 2024
1 parent dacf273 commit 4d602c5
Showing 4 changed files with 63 additions and 27 deletions.
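
A sketch of what the new naming scheme implies, under assumed names: the commit gives every pod a deterministic name ({{cluster_name_on_cloud}}-head, -worker1, -worker2, ...) and binds each name to a headless Service, so pods are reachable by stable DNS names instead of random uuid suffixes. The cluster name, namespace, and node count below are made-up placeholders, and the lookups only succeed from inside such a cluster:

# Illustrative only -- not part of this commit. Resolves the per-pod DNS
# names that the headless services in this change are intended to provide.
import socket

cluster_name = 'sky-abc-1234'  # hypothetical cluster_name_on_cloud
namespace = 'default'          # hypothetical namespace
num_nodes = 3

hostnames = [f'{cluster_name}-head'] + [
    f'{cluster_name}-worker{i}' for i in range(1, num_nodes)
]
for hostname in hostnames:
    fqdn = f'{hostname}.{namespace}.svc.cluster.local'
    try:
        print(f'{fqdn} -> {socket.gethostbyname(fqdn)}')
    except socket.gaierror:
        # Fails outside the cluster or before the pods are running.
        print(f'{fqdn} not resolvable from here')
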
11 changes: 8 additions & 3 deletions sky/backends/cloud_vm_ray_backend.py
@@ -2335,9 +2335,14 @@ def is_provided_ips_valid(
zip(cluster_internal_ips, cluster_feasible_ips))

# Ensure head node is the first element, then sort based on the
# external IPs for stableness
stable_internal_external_ips = [internal_external_ips[0]] + sorted(
internal_external_ips[1:], key=lambda x: x[1])
# external IPs for stableness. Skip for k8s nodes since pods
# worker ids are already mapped.
if (cluster_info is not None and
cluster_info.provider_name == 'kubernetes'):
stable_internal_external_ips = internal_external_ips
else:
stable_internal_external_ips = [internal_external_ips[0]] + sorted(
internal_external_ips[1:], key=lambda x: x[1])
self.stable_internal_external_ips = stable_internal_external_ips

@functools.lru_cache()
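
To make the branch above concrete, here is a standalone sketch (not from the repo) of the ordering rule with invented provider names and IPs: Kubernetes pods are already enumerated head, worker1, worker2, ... by the provisioner, and the worker index is baked into the pod and service names, so re-sorting by external IP would scramble the worker-id-to-hostname mapping; other clouds still sort workers by external IP for stability.

# Illustrative sketch; provider names and IPs are made up.
from typing import List, Optional, Tuple


def order_ips(internal_external_ips: List[Tuple[str, str]],
              provider_name: Optional[str]) -> List[Tuple[str, str]]:
    # Kubernetes pods already arrive ordered head, worker1, worker2, ...,
    # so the provisioner's order must be preserved.
    if provider_name == 'kubernetes':
        return internal_external_ips
    # Other clouds: keep the head node first, then sort workers by
    # external IP so the ordering is stable across queries.
    return [internal_external_ips[0]] + sorted(
        internal_external_ips[1:], key=lambda x: x[1])


ips = [('10.0.0.3', '35.0.0.9'), ('10.0.0.1', '35.0.0.2'),
       ('10.0.0.2', '35.0.0.5')]
print(order_ips(ips, 'kubernetes'))  # order preserved
print(order_ips(ips, 'gcp'))         # workers sorted by external IP
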
12 changes: 6 additions & 6 deletions sky/provision/kubernetes/instance.py
@@ -2,7 +2,6 @@
import copy
import time
from typing import Any, Dict, List, Optional
import uuid

from sky import exceptions
from sky import sky_logging
@@ -504,17 +503,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
created_pods = {}
logger.debug(f'run_instances: calling create_namespaced_pod '
f'(count={to_start_count}).')
for _ in range(to_start_count):
for pod_id in range(config.count):
if head_pod_name is None:
pod_spec['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
head_selector = head_service_selector(cluster_name_on_cloud)
pod_spec['metadata']['labels'].update(head_selector)
pod_spec['metadata']['name'] = f'{cluster_name_on_cloud}-head'
else:
pod_spec['metadata']['labels'].update(constants.WORKER_NODE_TAGS)
pod_uuid = str(uuid.uuid4())[:4]
pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
pod_spec['metadata']['name'] = f'{pod_name}-worker'
pod_name = f'{cluster_name_on_cloud}-worker{pod_id}'
if pod_id == 0 or pod_name in running_pods:
continue
pod_spec['metadata']['name'] = pod_name
pod_spec['metadata']['labels']['component'] = pod_name
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
@@ -541,7 +542,6 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
}]
}
}

pod = kubernetes.core_api(context).create_namespaced_pod(
namespace, pod_spec)
created_pods[pod.metadata.name] = pod
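
A simplified, hypothetical sketch of the idea behind the loop above (it does not mirror the real _create_pods signature): deterministic, index-based names replace the old uuid suffixes, so re-running creation can simply skip pods that already exist.

# Illustrative only; running_pods is faked here, while the real code
# queries it from the Kubernetes API.
from typing import List, Set


def pods_to_create(cluster_name: str, count: int,
                   running_pods: Set[str]) -> List[str]:
    names = []
    for pod_id in range(count):
        # Pod 0 is the head pod; the rest are numbered workers.
        name = (f'{cluster_name}-head' if pod_id == 0 else
                f'{cluster_name}-worker{pod_id}')
        # Deterministic names make creation idempotent: existing pods are
        # skipped instead of getting a fresh random suffix.
        if name in running_pods:
            continue
        names.append(name)
    return names


print(pods_to_create('sky-abc', 4, {'sky-abc-head', 'sky-abc-worker2'}))
# -> ['sky-abc-worker1', 'sky-abc-worker3']
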
43 changes: 25 additions & 18 deletions sky/templates/kubernetes-ray.yml.j2
@@ -226,27 +226,34 @@ provider:
- apiVersion: v1
kind: Service
metadata:
labels:
parent: skypilot
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: {{cluster_name_on_cloud}}-head
labels:
parent: skypilot
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
# NOTE: If you're running multiple Ray clusters with services
# on one Kubernetes cluster, they must have unique service
# names.
name: {{cluster_name_on_cloud}}-head
spec:
# This selector must match the head node pod's selector below.
selector:
component: {{cluster_name_on_cloud}}-head
ports:
- name: client
protocol: TCP
port: 10001
targetPort: 10001
- name: dashboard
protocol: TCP
port: 8265
targetPort: 8265
clusterIP: None
# Service maps to rest of the worker nodes
{% for worker_id in range(1, num_nodes) %}
- apiVersion: v1
kind: Service
metadata:
labels:
parent: skypilot
skypilot-cluster: {{cluster_name_on_cloud}}
skypilot-user: {{ user }}
name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
spec:
selector:
component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
clusterIP: None
{% endfor %}

# Specify the pod type for the ray head node (as configured below).
head_node_type: ray_head_default
@@ -259,7 +266,7 @@ available_node_types:
metadata:
# name will be filled in the provisioner
# head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
# service is required.
# service is required. Remaining nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
labels:
parent: skypilot
# component will be set for the head node pod to be the same as the head node service selector above if a
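
A hedged way to check the Services this template generates: each worker Service is headless (clusterIP: None) and selects a single pod through the component label, so its DNS name resolves directly to that pod's IP. The sketch below uses the official kubernetes Python client; the cluster name, namespace, and node count are placeholders, not values from this commit.

# Illustrative check, assuming kubeconfig access to the cluster.
from kubernetes import client, config

config.load_kube_config()
v1 = client.CoreV1Api()

cluster_name = 'sky-abc-1234'  # hypothetical {{cluster_name_on_cloud}}
namespace = 'default'          # hypothetical namespace
num_nodes = 3

for worker_id in range(1, num_nodes):
    selector = f'component={cluster_name}-worker{worker_id}'
    pods = v1.list_namespaced_pod(namespace, label_selector=selector).items
    # The matching headless Service publishes these pod IPs under
    # <service-name>.<namespace>.svc.cluster.local.
    print(selector, [p.status.pod_ip for p in pods])
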
24 changes: 24 additions & 0 deletions tests/test_smoke.py
@@ -3364,6 +3364,30 @@ def test_kubernetes_custom_image(image_id):
run_one_test(test)


@pytest.mark.kubernetes
def test_kubernetes_ssh_hostname():
name = _get_cluster_name()
test = Test(
'test-kubernetes-ssh-hostname',
[
f'sky launch -c {name} -y --num-nodes 10 --cpus 1+',
f'ssh {name} -t "hostname" | grep head',
f'ssh {name}-worker1 -t "hostname" | grep worker1',
f'ssh {name}-worker2 -t "hostname" | grep worker2',
f'ssh {name}-worker3 -t "hostname" | grep worker3',
f'ssh {name}-worker4 -t "hostname" | grep worker4',
f'ssh {name}-worker5 -t "hostname" | grep worker5',
f'ssh {name}-worker6 -t "hostname" | grep worker6',
f'ssh {name}-worker7 -t "hostname" | grep worker7',
f'ssh {name}-worker8 -t "hostname" | grep worker8',
f'ssh {name}-worker9 -t "hostname" | grep worker9',
],
f'sky down -y {name}',
timeout=10 * 60,
)
run_one_test(test)


@pytest.mark.azure
def test_azure_start_stop_two_nodes():
name = _get_cluster_name()
