From 93f8a11d3927eeb56a5f4e2ff8bb07dd2241213e Mon Sep 17 00:00:00 2001 From: Valay Dave Date: Mon, 6 May 2024 16:21:32 -0700 Subject: [PATCH] [@parallel on Kubernetes] support for Jobsets This commit adds support for @parallel when flows are run `--with kubernetes` Support for Argo workflows will follow in a separate commit. A user can run a flow with the following: @step def start(self): self.next(self.parallel_step, num_parallel=3) @kubernetes(cpu=1, memory=512) @parallel @step def parallel_step(self): ... --- metaflow/plugins/argo/argo_workflows.py | 5 + metaflow/plugins/kubernetes/kubernetes.py | 81 +- metaflow/plugins/kubernetes/kubernetes_cli.py | 12 + .../plugins/kubernetes/kubernetes_client.py | 5 +- .../kubernetes/kubernetes_decorator.py | 42 +- metaflow/plugins/kubernetes/kubernetes_job.py | 392 +++++---- .../plugins/kubernetes/kubernetes_jobsets.py | 745 ++++++++++++++++++ 7 files changed, 1065 insertions(+), 217 deletions(-) create mode 100644 metaflow/plugins/kubernetes/kubernetes_jobsets.py diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py index 572ca205af0..5cd0313fdda 100644 --- a/metaflow/plugins/argo/argo_workflows.py +++ b/metaflow/plugins/argo/argo_workflows.py @@ -838,6 +838,11 @@ def _dag_templates(self): def _visit( node, exit_node=None, templates=None, dag_tasks=None, parent_foreach=None ): + if node.parallel_foreach: + raise ArgoWorkflowsException( + "Deploying flows with @parallel decorator(s) " + "as Argo Workflows is not supported currently." + ) # Every for-each node results in a separate subDAG and an equivalent # DAGTemplate rooted at the child of the for-each node. Each DAGTemplate # has a unique name - the top-level DAGTemplate is named as the name of diff --git a/metaflow/plugins/kubernetes/kubernetes.py b/metaflow/plugins/kubernetes/kubernetes.py index c87d3c221de..c6bfe38e9ca 100644 --- a/metaflow/plugins/kubernetes/kubernetes.py +++ b/metaflow/plugins/kubernetes/kubernetes.py @@ -3,9 +3,9 @@ import os import re import shlex +import copy import time from typing import Dict, List, Optional -import uuid from uuid import uuid4 from metaflow import current, util @@ -66,6 +66,12 @@ class KubernetesKilledException(MetaflowException): headline = "Kubernetes Batch job killed" +def _extract_labels_and_annotations_from_job_spec(job_spec): + annotations = job_spec.template.metadata.annotations + labels = job_spec.template.metadata.labels + return copy.copy(annotations), copy.copy(labels) + + class Kubernetes(object): def __init__( self, @@ -140,9 +146,64 @@ def _command( return shlex.split('bash -c "%s"' % cmd_str) def launch_job(self, **kwargs): - self._job = self.create_job(**kwargs).execute() + if ( + "num_parallel" in kwargs + and kwargs["num_parallel"] + and int(kwargs["num_parallel"]) > 0 + ): + job = self.create_job_object(**kwargs) + spec = job.create_job_spec() + # `kwargs["step_cli"]` is setting `ubf_context` as control to ALL pods. 
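+            # (i.e. the command for every replica initially carries UBF_CONTROL).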
+ # This will be modified by the KubernetesJobSet object + annotations, labels = _extract_labels_and_annotations_from_job_spec(spec) + self._job = self.create_jobset( + job_spec=spec, + run_id=kwargs["run_id"], + step_name=kwargs["step_name"], + task_id=kwargs["task_id"], + namespace=kwargs["namespace"], + env=kwargs["env"], + num_parallel=kwargs["num_parallel"], + port=kwargs["port"], + annotations=annotations, + labels=labels, + ).execute() + else: + kwargs["name_pattern"] = "t-{uid}-".format(uid=str(uuid4())[:8]) + self._job = self.create_job_object(**kwargs).k8screate().execute() + + def create_jobset( + self, + job_spec=None, + run_id=None, + step_name=None, + task_id=None, + namespace=None, + env=None, + num_parallel=None, + port=None, + annotations=None, + labels=None, + ): + if env is None: + env = {} - def create_job( + _prefix = str(uuid4())[:6] + js = KubernetesClient().jobset( + name="js-%s" % _prefix, + run_id=run_id, + task_id=task_id, + step_name=step_name, + namespace=namespace, + labels=self._get_labels(labels), + annotations=annotations, + num_parallel=num_parallel, + job_spec=job_spec, + port=port, + ) + return js + + def create_job_object( self, flow_name, run_id, @@ -176,14 +237,15 @@ def create_job( labels=None, shared_memory=None, port=None, + name_pattern=None, + num_parallel=None, ): if env is None: env = {} - job = ( KubernetesClient() .job( - generate_name="t-{uid}-".format(uid=str(uuid4())[:8]), + generate_name=name_pattern, namespace=namespace, service_account=service_account, secrets=secrets, @@ -217,6 +279,7 @@ def create_job( persistent_volume_claims=persistent_volume_claims, shared_memory=shared_memory, port=port, + num_parallel=num_parallel, ) .environment_variable("METAFLOW_CODE_SHA", code_package_sha) .environment_variable("METAFLOW_CODE_URL", code_package_url) @@ -332,6 +395,9 @@ def create_job( .label("app.kubernetes.io/part-of", "metaflow") ) + return job + + def create_k8sjob(self, job): return job.create() def wait(self, stdout_location, stderr_location, echo=None): @@ -366,7 +432,7 @@ def wait_for_launch(job): t = time.time() time.sleep(update_delay(time.time() - start_time)) - prefix = b"[%s] " % util.to_bytes(self._job.id) + _make_prefix = lambda: b"[%s] " % util.to_bytes(self._job.id) stdout_tail = get_log_tailer(stdout_location, self._datastore.TYPE) stderr_tail = get_log_tailer(stderr_location, self._datastore.TYPE) @@ -376,7 +442,7 @@ def wait_for_launch(job): # 2) Tail logs until the job has finished tail_logs( - prefix=prefix, + prefix=_make_prefix(), stdout_tail=stdout_tail, stderr_tail=stderr_tail, echo=echo, @@ -392,7 +458,6 @@ def wait_for_launch(job): # exists prior to calling S3Tail and note the user about # truncated logs if it doesn't. # TODO : For hard crashes, we can fetch logs from the pod. 
- if self._job.has_failed: exit_code, reason = self._job.reason msg = next( diff --git a/metaflow/plugins/kubernetes/kubernetes_cli.py b/metaflow/plugins/kubernetes/kubernetes_cli.py index 9d4750f45f6..3c32d4c4dd0 100644 --- a/metaflow/plugins/kubernetes/kubernetes_cli.py +++ b/metaflow/plugins/kubernetes/kubernetes_cli.py @@ -7,6 +7,7 @@ from metaflow._vendor import click from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, CommandException from metaflow.metadata.util import sync_local_metadata_from_datastore +from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, KUBERNETES_LABELS from metaflow.mflog import TASK_LOG_SOURCE import metaflow.tracing as tracing @@ -109,6 +110,15 @@ def kubernetes(): ) @click.option("--shared-memory", default=None, help="Size of shared memory in MiB") @click.option("--port", default=None, help="Port number to expose from the container") +@click.option( + "--ubf-context", default=None, type=click.Choice([None, UBF_CONTROL, UBF_TASK]) +) +@click.option( + "--num-parallel", + default=None, + type=int, + help="Number of parallel nodes to run as a multi-node job.", +) @click.pass_context def step( ctx, @@ -136,6 +146,7 @@ def step( tolerations=None, shared_memory=None, port=None, + num_parallel=None, **kwargs ): def echo(msg, stream="stderr", job_id=None, **kwargs): @@ -251,6 +262,7 @@ def _sync_metadata(): tolerations=tolerations, shared_memory=shared_memory, port=port, + num_parallel=num_parallel, ) except Exception as e: traceback.print_exc(chain=False) diff --git a/metaflow/plugins/kubernetes/kubernetes_client.py b/metaflow/plugins/kubernetes/kubernetes_client.py index 33023f36c11..631d2ecdf13 100644 --- a/metaflow/plugins/kubernetes/kubernetes_client.py +++ b/metaflow/plugins/kubernetes/kubernetes_client.py @@ -4,7 +4,7 @@ from metaflow.exception import MetaflowException -from .kubernetes_job import KubernetesJob +from .kubernetes_job import KubernetesJob, KubernetesJobSet CLIENT_REFRESH_INTERVAL_SECONDS = 300 @@ -61,5 +61,8 @@ def get(self): return self._client + def jobset(self, **kwargs): + return KubernetesJobSet(self, **kwargs) + def job(self, **kwargs): return KubernetesJob(self, **kwargs) diff --git a/metaflow/plugins/kubernetes/kubernetes_decorator.py b/metaflow/plugins/kubernetes/kubernetes_decorator.py index b6253cb5841..bdabeda5bad 100644 --- a/metaflow/plugins/kubernetes/kubernetes_decorator.py +++ b/metaflow/plugins/kubernetes/kubernetes_decorator.py @@ -32,6 +32,8 @@ from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata from .kubernetes import KubernetesException, parse_kube_keyvalue_list +from metaflow.unbounded_foreach import UBF_CONTROL +from .kubernetes_jobsets import TaskIdConstructor try: unicode @@ -239,12 +241,6 @@ def step_init(self, flow, graph, step, decos, environment, flow_datastore, logge "Kubernetes. Please use one or the other.".format(step=step) ) - for deco in decos: - if getattr(deco, "IS_PARALLEL", False): - raise KubernetesException( - "@kubernetes does not support parallel execution currently." - ) - # Set run time limit for the Kubernetes job. 
self.run_time_limit = get_run_time_limit_for_task(decos) if self.run_time_limit < 60: @@ -453,6 +449,24 @@ def task_pre_step( self._save_logs_sidecar = Sidecar("save_logs_periodically") self._save_logs_sidecar.start() + num_parallel = None + if hasattr(flow, "_parallel_ubf_iter"): + num_parallel = flow._parallel_ubf_iter.num_parallel + + if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL: + control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids( + num_parallel + ) + mapper_task_ids = [control_task_id] + worker_task_ids + flow._control_mapper_tasks = [ + "%s/%s/%s" % (run_id, step_name, mapper_task_id) + for mapper_task_id in mapper_task_ids + ] + flow._control_task_is_mapper_zero = True + + if num_parallel and num_parallel > 1: + _setup_multinode_environment() + def task_finished( self, step_name, flow, graph, is_task_ok, retry_count, max_retries ): @@ -486,3 +500,19 @@ def _save_package_once(cls, flow_datastore, package): cls.package_url, cls.package_sha = flow_datastore.save_data( [package.blob], len_hint=1 )[0] + + +def _setup_multinode_environment(): + import socket + + os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"]) + os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"] + # TODO [FINAL-REFACTOR-FOR-ARGO-BASE]: Remove the need for `RANK` in this code here. + if os.environ.get("CONTROL_INDEX") is not None: + os.environ["MF_PARALLEL_NODE_INDEX"] = str(0) + elif os.environ.get("WORKER_REPLICA_INDEX") is not None: + os.environ["MF_PARALLEL_NODE_INDEX"] = str( + int(os.environ["WORKER_REPLICA_INDEX"]) + 1 + ) + else: + os.environ["MF_PARALLEL_NODE_INDEX"] = os.environ["RANK"] diff --git a/metaflow/plugins/kubernetes/kubernetes_job.py b/metaflow/plugins/kubernetes/kubernetes_job.py index adb9446e5f9..6d7c005acdb 100644 --- a/metaflow/plugins/kubernetes/kubernetes_job.py +++ b/metaflow/plugins/kubernetes/kubernetes_job.py @@ -2,14 +2,17 @@ import math import random import time - +import copy +import sys from metaflow.tracing import inject_tracing_vars - - from metaflow.exception import MetaflowException from metaflow.metaflow_config import KUBERNETES_SECRETS +from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK CLIENT_REFRESH_INTERVAL_SECONDS = 300 +from .kubernetes_jobsets import ( + KubernetesJobSet, # We need this import for Kubernetes Client. 
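+    # (Re-exported from this module so kubernetes_client.py can keep importing
+    # both KubernetesJob and KubernetesJobSet from a single place.)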
+) class KubernetesJobException(MetaflowException): @@ -58,7 +61,183 @@ def __init__(self, client, **kwargs): self._client = client self._kwargs = kwargs - def create(self): + def create_job_spec(self): + client = self._client.get() + + # tmpfs variables + use_tmpfs = self._kwargs["use_tmpfs"] + tmpfs_size = self._kwargs["tmpfs_size"] + tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs) + shared_memory = ( + int(self._kwargs["shared_memory"]) + if self._kwargs["shared_memory"] + else None + ) + volume_mounts = [] + if tmpfs_enabled: + volume_mounts.append( + client.V1VolumeMount( + mount_path=self._kwargs.get("tmpfs_path"), + name="tmpfs-ephemeral-volume", + ) + ) + if shared_memory: + volume_mounts.append( + client.V1VolumeMount( + mount_path=self._kwargs.get("shared_memory_path"), + name="dhsm", + ) + ) + if self._kwargs.get("persistent_volume_claims") is not None: + volume_mounts += [ + client.V1VolumeMount(mount_path=path, name=claim) + for claim, path in self._kwargs["persistent_volume_claims"].items() + ] + + volumes = [] + if tmpfs_enabled: + volumes.append( + client.V1Volume( + name="tmpfs-ephemeral-volume", + empty_dir=client.V1EmptyDirVolumeSource( + medium="Memory", + # Add default unit as ours differs from Kubernetes default. + size_limit="{}Mi".format(tmpfs_size), + ), + ) + ) + if self._kwargs.get("persistent_volume_claims") is not None: + volumes += [ + client.V1Volume( + name=claim, + persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( + claim_name=claim + ), + ) + for claim in self._kwargs["persistent_volume_claims"].keys() + ] + + return client.V1JobSpec( + # Retries are handled by Metaflow when it is responsible for + # executing the flow. The responsibility is moved to Kubernetes + # when Argo Workflows is responsible for the execution. + backoff_limit=self._kwargs.get("retries", 0), + completions=self._kwargs.get("completions", 1), + ttl_seconds_after_finished=7 + * 60 + * 60 # Remove job after a week. TODO: Make this configurable + * 24, + template=client.V1PodTemplateSpec( + metadata=client.V1ObjectMeta( + annotations=self._kwargs.get("annotations", {}), + labels=self._kwargs.get("labels", {}), + namespace=self._kwargs["namespace"], + ), + spec=client.V1PodSpec( + # Timeout is set on the pod and not the job (important!) + active_deadline_seconds=self._kwargs["timeout_in_seconds"], + # TODO (savin): Enable affinities for GPU scheduling. + # affinity=?, + containers=[ + client.V1Container( + command=self._kwargs["command"], + ports=[] + if self._kwargs["port"] is None + else [ + client.V1ContainerPort( + container_port=int(self._kwargs["port"]) + ) + ], + env=[ + client.V1EnvVar(name=k, value=str(v)) + for k, v in self._kwargs.get( + "environment_variables", {} + ).items() + ] + # And some downward API magic. Add (key, value) + # pairs below to make pod metadata available + # within Kubernetes container. 
+ + [ + client.V1EnvVar( + name=k, + value_from=client.V1EnvVarSource( + field_ref=client.V1ObjectFieldSelector( + field_path=str(v) + ) + ), + ) + for k, v in { + "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace", + "METAFLOW_KUBERNETES_POD_NAME": "metadata.name", + "METAFLOW_KUBERNETES_POD_ID": "metadata.uid", + "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName", + "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP", + }.items() + ] + + [ + client.V1EnvVar(name=k, value=str(v)) + for k, v in inject_tracing_vars({}).items() + ], + env_from=[ + client.V1EnvFromSource( + secret_ref=client.V1SecretEnvSource( + name=str(k), + # optional=True + ) + ) + for k in list(self._kwargs.get("secrets", [])) + + KUBERNETES_SECRETS.split(",") + if k + ], + image=self._kwargs["image"], + image_pull_policy=self._kwargs["image_pull_policy"], + name=self._kwargs["step_name"].replace("_", "-"), + resources=client.V1ResourceRequirements( + requests={ + "cpu": str(self._kwargs["cpu"]), + "memory": "%sM" % str(self._kwargs["memory"]), + "ephemeral-storage": "%sM" + % str(self._kwargs["disk"]), + }, + limits={ + "%s.com/gpu".lower() + % self._kwargs["gpu_vendor"]: str( + self._kwargs["gpu"] + ) + for k in [0] + # Don't set GPU limits if gpu isn't specified. + if self._kwargs["gpu"] is not None + }, + ), + volume_mounts=volume_mounts, + ) + ], + node_selector=self._kwargs.get("node_selector"), + # TODO (savin): Support image_pull_secrets + # image_pull_secrets=?, + # TODO (savin): Support preemption policies + # preemption_policy=?, + # + # A Container in a Pod may fail for a number of + # reasons, such as because the process in it exited + # with a non-zero exit code, or the Container was + # killed due to OOM etc. If this happens, fail the pod + # and let Metaflow handle the retries. + restart_policy="Never", + service_account_name=self._kwargs["service_account"], + # Terminate the container immediately on SIGTERM + termination_grace_period_seconds=0, + tolerations=[ + client.V1Toleration(**toleration) + for toleration in self._kwargs.get("tolerations") or [] + ], + volumes=volumes, + # TODO (savin): Set termination_message_policy + ), + ), + ) + + def k8screate(self): # A discerning eye would notice and question the choice of using the # V1Job construct over the V1Pod construct given that we don't rely much # on any of the V1Job semantics. The major reasons at the moment are - @@ -77,11 +256,6 @@ def create(self): use_tmpfs = self._kwargs["use_tmpfs"] tmpfs_size = self._kwargs["tmpfs_size"] tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs) - shared_memory = ( - int(self._kwargs["shared_memory"]) - if self._kwargs["shared_memory"] - else None - ) self._job = client.V1Job( api_version="batch/v1", @@ -94,197 +268,7 @@ def create(self): generate_name=self._kwargs["generate_name"], namespace=self._kwargs["namespace"], # Defaults to `default` ), - spec=client.V1JobSpec( - # Retries are handled by Metaflow when it is responsible for - # executing the flow. The responsibility is moved to Kubernetes - # when Argo Workflows is responsible for the execution. - backoff_limit=self._kwargs.get("retries", 0), - completions=1, # A single non-indexed pod job - ttl_seconds_after_finished=7 - * 60 - * 60 # Remove job after a week. 
TODO: Make this configurable - * 24, - template=client.V1PodTemplateSpec( - metadata=client.V1ObjectMeta( - annotations=self._kwargs.get("annotations", {}), - labels=self._kwargs.get("labels", {}), - namespace=self._kwargs["namespace"], - ), - spec=client.V1PodSpec( - # Timeout is set on the pod and not the job (important!) - active_deadline_seconds=self._kwargs["timeout_in_seconds"], - # TODO (savin): Enable affinities for GPU scheduling. - # affinity=?, - containers=[ - client.V1Container( - command=self._kwargs["command"], - ports=[ - client.V1ContainerPort( - container_port=int(self._kwargs["port"]) - ) - ] - if "port" in self._kwargs and self._kwargs["port"] - else None, - env=[ - client.V1EnvVar(name=k, value=str(v)) - for k, v in self._kwargs.get( - "environment_variables", {} - ).items() - ] - # And some downward API magic. Add (key, value) - # pairs below to make pod metadata available - # within Kubernetes container. - + [ - client.V1EnvVar( - name=k, - value_from=client.V1EnvVarSource( - field_ref=client.V1ObjectFieldSelector( - field_path=str(v) - ) - ), - ) - for k, v in { - "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace", - "METAFLOW_KUBERNETES_POD_NAME": "metadata.name", - "METAFLOW_KUBERNETES_POD_ID": "metadata.uid", - "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName", - "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP", - }.items() - ] - + [ - client.V1EnvVar(name=k, value=str(v)) - for k, v in inject_tracing_vars({}).items() - ], - env_from=[ - client.V1EnvFromSource( - secret_ref=client.V1SecretEnvSource( - name=str(k), - # optional=True - ) - ) - for k in list(self._kwargs.get("secrets", [])) - + KUBERNETES_SECRETS.split(",") - if k - ], - image=self._kwargs["image"], - image_pull_policy=self._kwargs["image_pull_policy"], - name=self._kwargs["step_name"].replace("_", "-"), - resources=client.V1ResourceRequirements( - requests={ - "cpu": str(self._kwargs["cpu"]), - "memory": "%sM" % str(self._kwargs["memory"]), - "ephemeral-storage": "%sM" - % str(self._kwargs["disk"]), - }, - limits={ - "%s.com/gpu".lower() - % self._kwargs["gpu_vendor"]: str( - self._kwargs["gpu"] - ) - for k in [0] - # Don't set GPU limits if gpu isn't specified. - if self._kwargs["gpu"] is not None - }, - ), - volume_mounts=( - [ - client.V1VolumeMount( - mount_path=self._kwargs.get("tmpfs_path"), - name="tmpfs-ephemeral-volume", - ) - ] - if tmpfs_enabled - else [] - ) - + ( - [ - client.V1VolumeMount( - mount_path="/dev/shm", name="dhsm" - ) - ] - if shared_memory - else [] - ) - + ( - [ - client.V1VolumeMount( - mount_path=path, name=claim - ) - for claim, path in self._kwargs[ - "persistent_volume_claims" - ].items() - ] - if self._kwargs["persistent_volume_claims"] - is not None - else [] - ), - ) - ], - node_selector=self._kwargs.get("node_selector"), - # TODO (savin): Support image_pull_secrets - # image_pull_secrets=?, - # TODO (savin): Support preemption policies - # preemption_policy=?, - # - # A Container in a Pod may fail for a number of - # reasons, such as because the process in it exited - # with a non-zero exit code, or the Container was - # killed due to OOM etc. If this happens, fail the pod - # and let Metaflow handle the retries. 
- restart_policy="Never", - service_account_name=self._kwargs["service_account"], - # Terminate the container immediately on SIGTERM - termination_grace_period_seconds=0, - tolerations=[ - client.V1Toleration(**toleration) - for toleration in self._kwargs.get("tolerations") or [] - ], - volumes=( - [ - client.V1Volume( - name="tmpfs-ephemeral-volume", - empty_dir=client.V1EmptyDirVolumeSource( - medium="Memory", - # Add default unit as ours differs from Kubernetes default. - size_limit="{}Mi".format(tmpfs_size), - ), - ) - ] - if tmpfs_enabled - else [] - ) - + ( - [ - client.V1Volume( - name="dhsm", - empty_dir=client.V1EmptyDirVolumeSource( - medium="Memory", - size_limit="{}Mi".format(shared_memory), - ), - ) - ] - if shared_memory - else [] - ) - + ( - [ - client.V1Volume( - name=claim, - persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource( - claim_name=claim - ), - ) - for claim in self._kwargs[ - "persistent_volume_claims" - ].keys() - ] - if self._kwargs["persistent_volume_claims"] is not None - else [] - ), - # TODO (savin): Set termination_message_policy - ), - ), - ), + spec=self.create_job_spec(), ) return self @@ -415,10 +399,11 @@ def __init__(self, client, name, uid, namespace): import atexit + # TODO: If the running object is a jobset, set the jobset parallelism to 0. def best_effort_kill(): try: self.kill() - except: + except Exception as ex: pass atexit.register(best_effort_kill) @@ -482,9 +467,12 @@ def kill(self): # 3. If the pod object hasn't shown up yet, we set the parallelism to 0 # to preempt it. client = self._client.get() + + # If the job has a label called "jobset.sigs.k8s.io/jobset-name" + # it is managed by the jobset controller. Set the parallelism + # to 0 if not self.is_done: if self.is_running: - # Case 1. 
from kubernetes.stream import stream diff --git a/metaflow/plugins/kubernetes/kubernetes_jobsets.py b/metaflow/plugins/kubernetes/kubernetes_jobsets.py new file mode 100644 index 00000000000..bdb327a82b4 --- /dev/null +++ b/metaflow/plugins/kubernetes/kubernetes_jobsets.py @@ -0,0 +1,745 @@ +import copy +import math +import random +import time +from metaflow.metaflow_current import current +from metaflow.exception import MetaflowException +from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK +import json +from collections import namedtuple + + +class KubernetesJobsetException(MetaflowException): + headline = "Kubernetes jobset error" + + +# TODO [DUPLICATE CODE]: Refactor this method to a separate file so that +# It can be used by both KubernetesJob and KubernetesJobset +def k8s_retry(deadline_seconds=60, max_backoff=32): + def decorator(function): + from functools import wraps + + @wraps(function) + def wrapper(*args, **kwargs): + from kubernetes import client + + deadline = time.time() + deadline_seconds + retry_number = 0 + + while True: + try: + result = function(*args, **kwargs) + return result + except client.rest.ApiException as e: + if e.status == 500: + current_t = time.time() + backoff_delay = min( + math.pow(2, retry_number) + random.random(), max_backoff + ) + if current_t + backoff_delay < deadline: + time.sleep(backoff_delay) + retry_number += 1 + continue # retry again + else: + raise + else: + raise + + return wrapper + + return decorator + + +JobsetStatus = namedtuple( + "JobsetStatus", + [ + "control_pod_failed", # boolean + "control_exit_code", + "control_pod_status", # string like ():() [used for user-messaging] + "control_started", + "control_completed", + "worker_pods_failed", + "workers_are_suspended", + "workers_have_started", + "all_jobs_are_suspended", + "jobset_finished", + "jobset_failed", + "status_unknown", + "jobset_was_terminated", + ], +) + + +def _basic_validation_for_js(jobset): + if not jobset.get("status") or not jobset.get("status").get("replicatedJobsStatus"): + return False + worker_jobs = [ + w for w in jobset.get("spec").get("replicatedJobs") if w["name"] == "worker" + ] + if len(worker_jobs) == 0: + raise KubernetesJobsetException("No worker jobs found in the jobset manifest") + control_job = [ + w for w in jobset.get("spec").get("replicatedJobs") if w["name"] == "control" + ] + if len(control_job) == 0: + raise KubernetesJobsetException("No control job found in the jobset manifest") + return True + + +def _derive_pod_status_and_status_code(control_pod): + overall_status = None + control_exit_code = None + control_pod_failed = False + if control_pod: + container_status = None + pod_status = control_pod.get("status", {}).get("phase") + container_statuses = control_pod.get("status", {}).get("containerStatuses") + if container_statuses is None: + container_status = ": ".join( + filter( + None, + [ + control_pod.get("status", {}).get("reason"), + control_pod.get("status", {}).get("message"), + ], + ) + ) + else: + for k, v in container_statuses[0].get("state", {}).items(): + if v is not None: + control_exit_code = v.get("exit_code") + container_status = ": ".join( + filter( + None, + [v.get("reason"), v.get("message")], + ) + ) + if container_status is None: + overall_status = ( + f"pod status: {pod_status} | container status: {container_status}" + ) + else: + overall_status = f"pod status: {pod_status}" + if pod_status == "Failed": + control_pod_failed = True + return overall_status, control_exit_code, control_pod_failed + + +def 
_construct_jobset_logical_status(jobset, control_pod=None): + if not _basic_validation_for_js(jobset): + return JobsetStatus( + control_started=False, + control_completed=False, + workers_are_suspended=False, + workers_have_started=False, + all_jobs_are_suspended=False, + jobset_finished=False, + jobset_failed=False, + status_unknown=True, + jobset_was_terminated=False, + control_exit_code=None, + control_pod_status=None, + worker_pods_failed=False, + control_pod_failed=False, + ) + + js_status = jobset.get("status") + + control_started = False + control_completed = False + workers_are_suspended = False + workers_have_started = False + all_jobs_are_suspended = jobset.get("spec", {}).get("suspend", False) + jobset_finished = False + jobset_failed = False + status_unknown = False + jobset_was_terminated = False + worker_pods_failed = False + + total_worker_jobs = [ + w["replicas"] + for w in jobset.get("spec").get("replicatedJobs", []) + if w["name"] == "worker" + ][0] + total_control_jobs = [ + w["replicas"] + for w in jobset.get("spec").get("replicatedJobs", []) + if w["name"] == "control" + ][0] + + if total_worker_jobs == 0 and total_control_jobs == 0: + jobset_was_terminated = True + + replicated_job_statuses = js_status.get("replicatedJobsStatus") + for job_status in replicated_job_statuses: + if job_status["name"] == "control": + control_started = job_status["active"] > 0 + control_completed = job_status["succeeded"] > 0 + if job_status["failed"] > 0: + jobset_failed = True + + if job_status["name"] == "worker": + workers_have_started = job_status["active"] == total_worker_jobs + workers_are_suspended = job_status["suspended"] > 0 + if job_status["failed"] > 0: + worker_pods_failed = True + jobset_failed = True + + if js_status.get("conditions"): + for condition in js_status["conditions"]: + if condition["type"] == "Completed": + jobset_finished = True + if condition["type"] == "Failed": + jobset_failed = True + + ( + overall_status, + control_exit_code, + control_pod_failed, + ) = _derive_pod_status_and_status_code(control_pod) + + return JobsetStatus( + control_started=control_started, + control_completed=control_completed, + workers_are_suspended=workers_are_suspended, + workers_have_started=workers_have_started, + all_jobs_are_suspended=all_jobs_are_suspended, + jobset_finished=jobset_finished, + jobset_failed=jobset_failed, + status_unknown=status_unknown, + jobset_was_terminated=jobset_was_terminated, + control_exit_code=control_exit_code, + control_pod_status=overall_status, + worker_pods_failed=worker_pods_failed, + control_pod_failed=control_pod_failed, + ) + + +class RunningJobSet(object): + def __init__(self, client, name, namespace, group, version): + self._client = client + self._name = name + self._pod_name = None + self._namespace = namespace + self._group = group + self._version = version + self._pod = self._fetch_pod() + self._jobset = self._fetch_jobset() + + import atexit + + def best_effort_kill(): + try: + self.kill() + except Exception as ex: + pass + + atexit.register(best_effort_kill) + + def __repr__(self): + return "{}('{}/{}')".format( + self.__class__.__name__, self._namespace, self._name + ) + + @k8s_retry() + def _fetch_jobset( + self, + ): + # name : name of jobset. 
+ # namespace : namespace of the jobset + # Query the jobset and return the object's status field as a JSON object + client = self._client.get() + with client.ApiClient() as api_client: + api_instance = client.CustomObjectsApi(api_client) + try: + jobset = api_instance.get_namespaced_custom_object( + group=self._group, + version=self._version, + namespace=self._namespace, + plural="jobsets", + name=self._name, + ) + return jobset + except client.rest.ApiException as e: + if e.status == 404: + raise KubernetesJobsetException( + "Unable to locate Kubernetes jobset %s" % self._name + ) + raise + + @k8s_retry() + def _fetch_pod(self): + # Fetch pod metadata. + client = self._client.get() + pods = ( + client.CoreV1Api() + .list_namespaced_pod( + namespace=self._namespace, + label_selector="jobset.sigs.k8s.io/jobset-name={}".format(self._name), + ) + .to_dict()["items"] + ) + if pods: + for pod in pods: + # check the labels of the pod to see if + # the `jobset.sigs.k8s.io/replicatedjob-name` is set to `control` + if ( + pod["metadata"]["labels"].get( + "jobset.sigs.k8s.io/replicatedjob-name" + ) + == "control" + ): + return pod + return {} + + def kill(self): + plural = "jobsets" + client = self._client.get() + # Get the jobset + with client.ApiClient() as api_client: + api_instance = client.CustomObjectsApi(api_client) + try: + jobset = api_instance.get_namespaced_custom_object( + group=self._group, + version=self._version, + namespace=self._namespace, + plural="jobsets", + name=self._name, + ) + + # Suspend the jobset and set the replica's to Zero. + # + jobset["spec"]["suspend"] = True + for replicated_job in jobset["spec"]["replicatedJobs"]: + replicated_job["replicas"] = 0 + + api_instance.replace_namespaced_custom_object( + group=self._group, + version=self._version, + namespace=self._namespace, + plural=plural, + name=jobset["metadata"]["name"], + body=jobset, + ) + except Exception as e: + raise KubernetesJobsetException( + "Exception when suspending existing jobset: %s\n" % e + ) + + @property + def id(self): + if self._pod_name: + return "pod %s" % self._pod_name + if self._pod: + self._pod_name = self._pod["metadata"]["name"] + return self.id + return "jobset %s" % self._name + + @property + def is_done(self): + def done(): + return ( + self._jobset_is_completed + or self._jobset_has_failed + or self._jobset_was_terminated + ) + + if not done(): + # If not done, fetch newer status + self._jobset = self._fetch_jobset() + self._pod = self._fetch_pod() + return done() + + @property + def status(self): + if self.is_done: + return "Jobset is done" + + status = _construct_jobset_logical_status(self._jobset, control_pod=self._pod) + if status.status_unknown: + return "Jobset status is unknown" + if status.control_started: + if status.control_pod_status: + return f"Jobset is running: {status.control_pod_status}" + return "Jobset is running" + if status.all_jobs_are_suspended: + return "Jobset is waiting to be unsuspended" + + return "Jobset waiting for jobs to start" + + @property + def has_succeeded(self): + return self.is_done and self._jobset_is_completed + + @property + def has_failed(self): + return self.is_done and self._jobset_has_failed + + @property + def is_running(self): + if self.is_done: + return False + status = _construct_jobset_logical_status(self._jobset, control_pod=self._pod) + if status.control_started: + return True + return False + + @property + def _jobset_was_terminated(self): + return _construct_jobset_logical_status( + self._jobset, control_pod=self._pod + 
).jobset_was_terminated + + @property + def is_waiting(self): + return not self.is_done and not self.is_running + + @property + def reason(self): + # return exit code and reason + if self.is_done and not self.has_succeeded: + self._pod = self._fetch_pod() + elif self.has_succeeded: + return 0, None + status = _construct_jobset_logical_status(self._jobset, control_pod=self._pod) + if status.control_pod_failed: + return ( + status.control_exit_code, + "control-pod failed [%s]" % status.control_pod_status, + ) + elif status.worker_pods_failed: + return None, "Worker pods failed" + return None, None + + @property + def _jobset_is_completed(self): + return _construct_jobset_logical_status( + self._jobset, control_pod=self._pod + ).jobset_finished + + @property + def _jobset_has_failed(self): + return _construct_jobset_logical_status( + self._jobset, control_pod=self._pod + ).jobset_failed + + +class TaskIdConstructor: + @classmethod + def jobset_worker_id(cls, control_task_id: str): + return "".join( + [control_task_id.replace("control", "worker"), "-", "$WORKER_REPLICA_INDEX"] + ) + + @classmethod + def join_step_task_ids(cls, num_parallel): + """ + Called within the step decorator to set the `flow._control_mapper_tasks`. + Setting these allows the flow to know which tasks are needed in the join step. + We set this in the `task_pre_step` method of the decorator. + """ + control_task_id = current.task_id + worker_task_id_base = control_task_id.replace("control", "worker") + mapper = lambda idx: worker_task_id_base + f"-{idx}" + return control_task_id, [mapper(idx) for idx in range(0, num_parallel - 1)] + + @classmethod + def argo(cls): + pass + + +def _jobset_specific_env_vars(client, jobset_main_addr, master_port, num_parallel): + return [ + client.V1EnvVar( + name="MASTER_ADDR", + value=jobset_main_addr, + ), + client.V1EnvVar( + name="MASTER_PORT", + value=str(master_port), + ), + client.V1EnvVar( + name="WORLD_SIZE", + value=str(num_parallel), + ), + ] + [ + client.V1EnvVar( + name="JOBSET_RESTART_ATTEMPT", + value_from=client.V1EnvVarSource( + field_ref=client.V1ObjectFieldSelector( + field_path="metadata.annotations['jobset.sigs.k8s.io/restart-attempt']" + ) + ), + ), + client.V1EnvVar( + name="WORKER_REPLICA_INDEX", + value_from=client.V1EnvVarSource( + field_ref=client.V1ObjectFieldSelector( + field_path="metadata.annotations['jobset.sigs.k8s.io/job-index']" + ) + ), + ), + ] + + +def get_control_job( + client, + job_spec, + jobset_main_addr, + subdomain, + port=None, + num_parallel=None, + namespace=None, +) -> dict: + master_port = port + + job_spec = copy.deepcopy(job_spec) + job_spec.parallelism = 1 + job_spec.completions = 1 + job_spec.template.spec.set_hostname_as_fqdn = True + job_spec.template.spec.subdomain = subdomain + for idx in range(len(job_spec.template.spec.containers[0].command)): + # CHECK FOR THE ubf_context in the command. + # Replace the UBF context to the one appropriately matching control/worker. + # Since we are passing the `step_cli` one time from the top level to one + # KuberentesJobSet, we need to ensure that UBF context is replaced properly + # in all the worker jobs. 
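+        # The control job keeps UBF_CONTROL and is pinned to `--split-index 0`;
+        # get_worker_job() below swaps UBF_CONTROL for UBF_TASK with a split-index
+        # derived from WORKER_REPLICA_INDEX.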
+ if UBF_CONTROL in job_spec.template.spec.containers[0].command[idx]: + job_spec.template.spec.containers[0].command[idx] = ( + job_spec.template.spec.containers[0] + .command[idx] + .replace(UBF_CONTROL, UBF_CONTROL + " " + "--split-index 0") + ) + + job_spec.template.spec.containers[0].env = ( + job_spec.template.spec.containers[0].env + + _jobset_specific_env_vars(client, jobset_main_addr, master_port, num_parallel) + + [ + client.V1EnvVar( + name="CONTROL_INDEX", + value=str(0), + ) + ] + ) + + # Based on https://github.com/kubernetes-sigs/jobset/blob/v0.5.0/api/jobset/v1alpha2/jobset_types.go#L178 + return dict( + name="control", + template=client.api_client.ApiClient().sanitize_for_serialization( + client.V1JobTemplateSpec( + metadata=client.V1ObjectMeta( + namespace=namespace, + # We don't set any annotations here + # since they have been either set in the JobSpec + # or on the JobSet level + ), + spec=job_spec, + ) + ), + replicas=1, # The control job will always have 1 replica. + ) + + +def get_worker_job( + client, + job_spec, + job_name, + jobset_main_addr, + subdomain, + control_task_id=None, + worker_task_id=None, + replicas=1, + port=None, + num_parallel=None, + namespace=None, +) -> dict: + master_port = port + + job_spec = copy.deepcopy(job_spec) + job_spec.parallelism = 1 + job_spec.completions = 1 + job_spec.template.spec.set_hostname_as_fqdn = True + job_spec.template.spec.subdomain = subdomain + + for idx in range(len(job_spec.template.spec.containers[0].command)): + if control_task_id in job_spec.template.spec.containers[0].command[idx]: + job_spec.template.spec.containers[0].command[idx] = ( + job_spec.template.spec.containers[0] + .command[idx] + .replace(control_task_id, worker_task_id) + ) + # CHECK FOR THE ubf_context in the command. + # Replace the UBF context to the one appropriately matching control/worker. + # Since we are passing the `step_cli` one time from the top level to one + # KuberentesJobSet, we need to ensure that UBF context is replaced properly + # in all the worker jobs. + if UBF_CONTROL in job_spec.template.spec.containers[0].command[idx]: + # Since all command will have a UBF_CONTROL, we need to replace the UBF_CONTROL + # with the actual UBF Context and also ensure that we are setting the correct + # split-index for the worker jobs. 
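+                # Worker split indices start at 1: index 0 is reserved for the
+                # control task (see CONTROL_INDEX / `--split-index 0` in
+                # get_control_job), hence the `+ 1` below.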
+ split_index_str = "--split-index `expr $WORKER_REPLICA_INDEX + 1`" # This set in the environment variables below + job_spec.template.spec.containers[0].command[idx] = ( + job_spec.template.spec.containers[0] + .command[idx] + .replace(UBF_CONTROL, UBF_TASK + " " + split_index_str) + ) + + job_spec.template.spec.containers[0].env = job_spec.template.spec.containers[ + 0 + ].env + _jobset_specific_env_vars( + client, jobset_main_addr, master_port, num_parallel + ) + + # Based on https://github.com/kubernetes-sigs/jobset/blob/v0.5.0/api/jobset/v1alpha2/jobset_types.go#L178 + return dict( + name=job_name, + template=client.api_client.ApiClient().sanitize_for_serialization( + client.V1JobTemplateSpec( + metadata=client.V1ObjectMeta( + namespace=namespace, + # We don't set any annotations here + # since they have been either set in the JobSpec + # or on the JobSet level + ), + spec=job_spec, + ) + ), + replicas=replicas, + ) + + +def _make_domain_name( + jobset_name, main_job_name, main_job_index, main_pod_index, namespace +): + return "%s-%s-%s-%s.%s.%s.svc.cluster.local" % ( + jobset_name, + main_job_name, + main_job_index, + main_pod_index, + jobset_name, + namespace, + ) + + +class KubernetesJobSet(object): + def __init__( + self, + client, + name=None, + job_spec=None, + namespace=None, + num_parallel=None, + annotations=None, + labels=None, + port=None, + task_id=None, + **kwargs + ): + self._client = client + self._kwargs = kwargs + self._group = "jobset.x-k8s.io" + self._version = "v1alpha2" + self.name = name + + main_job_name = "control" + main_job_index = 0 + main_pod_index = 0 + subdomain = self.name + num_parallel = int(1 if not num_parallel else num_parallel) + self._namespace = namespace + jobset_main_addr = _make_domain_name( + self.name, + main_job_name, + main_job_index, + main_pod_index, + self._namespace, + ) + + annotations = {} if not annotations else annotations + labels = {} if not labels else labels + + if "metaflow/task_id" in annotations: + del annotations["metaflow/task_id"] + + control_job = get_control_job( + client=self._client.get(), + job_spec=job_spec, + jobset_main_addr=jobset_main_addr, + subdomain=subdomain, + port=port, + num_parallel=num_parallel, + namespace=namespace, + ) + worker_task_id = TaskIdConstructor.jobset_worker_id(task_id) + worker_job = get_worker_job( + client=self._client.get(), + job_spec=job_spec, + job_name="worker", + jobset_main_addr=jobset_main_addr, + subdomain=subdomain, + control_task_id=task_id, + worker_task_id=worker_task_id, + replicas=num_parallel - 1, + port=port, + num_parallel=num_parallel, + namespace=namespace, + ) + worker_jobs = [worker_job] + # Based on https://github.com/kubernetes-sigs/jobset/blob/v0.5.0/api/jobset/v1alpha2/jobset_types.go#L163 + _kclient = client.get() + self._jobset = dict( + apiVersion=self._group + "/" + self._version, + kind="JobSet", + metadata=_kclient.api_client.ApiClient().sanitize_for_serialization( + _kclient.V1ObjectMeta( + name=self.name, labels=labels, annotations=annotations + ) + ), + spec=dict( + replicatedJobs=[control_job] + worker_jobs, + suspend=False, + startupPolicy=None, + successPolicy=None, + # The Failure Policy helps setting the number of retries for the jobset. + # It cannot accept a value of 0 for maxRestarts. + # So the attempt needs to be smartly set. + # If there is no retry decorator then we not set maxRestarts and instead we will + # set the attempt statically to 0. 
Otherwise we will make the job pickup the attempt + # from the `V1EnvVarSource.value_from.V1ObjectFieldSelector.field_path` = "metadata.annotations['jobset.sigs.k8s.io/restart-attempt']" + # failurePolicy={ + # "maxRestarts" : 1 + # }, + # The can be set for ArgoWorkflows + failurePolicy=None, + network=None, + ), + status=None, + ) + + def execute(self): + client = self._client.get() + api_instance = client.CoreV1Api() + + with client.ApiClient() as api_client: + api_instance = client.CustomObjectsApi(api_client) + try: + jobset_obj = api_instance.create_namespaced_custom_object( + group=self._group, + version=self._version, + namespace=self._namespace, + plural="jobsets", + body=self._jobset, + ) + except Exception as e: + raise KubernetesJobsetException( + "Exception when calling CustomObjectsApi->create_namespaced_custom_object: %s\n" + % e + ) + + return RunningJobSet( + client=self._client, + name=jobset_obj["metadata"]["name"], + namespace=jobset_obj["metadata"]["namespace"], + group=self._group, + version=self._version, + )
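Note (illustrative only): a minimal sketch of how control and worker task ids relate
under this scheme, assuming a hypothetical control task id. The real ids come from
TaskIdConstructor above; the $WORKER_REPLICA_INDEX placeholder is resolved per pod
from the jobset.sigs.k8s.io/job-index annotation exposed via _jobset_specific_env_vars.

    # Hypothetical values -- mirrors TaskIdConstructor.jobset_worker_id / join_step_task_ids
    control_task_id = "control-a1b2c3"
    worker_id_template = control_task_id.replace("control", "worker") + "-$WORKER_REPLICA_INDEX"
    # -> "worker-a1b2c3-$WORKER_REPLICA_INDEX"

    # join_step_task_ids(num_parallel=3) -> control id plus one id per worker replica:
    #   ("control-a1b2c3", ["worker-a1b2c3-0", "worker-a1b2c3-1"])
    # These ids populate flow._control_mapper_tasks in task_pre_step so the join step
    # knows which tasks to wait for.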