-
Notifications
You must be signed in to change notification settings - Fork 781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[@parallel on Kubernetes] support for Jobsets #1804
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
|
||
from metaflow.exception import MetaflowException | ||
|
||
from .kubernetes_job import KubernetesJob | ||
from .kubernetes_job import KubernetesJob, KubernetesJobSet | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This import is needed for Kubernetes clients that may be in use via extensions. |
||
|
||
|
||
CLIENT_REFRESH_INTERVAL_SECONDS = 300 | ||
|
@@ -61,5 +61,8 @@ def get(self): | |
|
||
return self._client | ||
|
||
def jobset(self, **kwargs): | ||
return KubernetesJobSet(self, **kwargs) | ||
|
||
def job(self, **kwargs): | ||
return KubernetesJob(self, **kwargs) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,8 @@ | |
|
||
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata | ||
from .kubernetes import KubernetesException, parse_kube_keyvalue_list | ||
from metaflow.unbounded_foreach import UBF_CONTROL | ||
from .kubernetes_jobsets import TaskIdConstructor | ||
|
||
try: | ||
unicode | ||
|
@@ -239,11 +241,15 @@ def step_init(self, flow, graph, step, decos, environment, flow_datastore, logge | |
"Kubernetes. Please use one or the other.".format(step=step) | ||
) | ||
|
||
for deco in decos: | ||
if getattr(deco, "IS_PARALLEL", False): | ||
raise KubernetesException( | ||
"@kubernetes does not support parallel execution currently." | ||
if any([deco.name == "parallel" for deco in decos]) and any( | ||
[deco.name == "catch" for deco in decos] | ||
): | ||
raise MetaflowException( | ||
"Step *{step}* contains a @parallel decorator " | ||
"with the @catch decorator. @catch is not supported with @parallel on Kubernetes.".format( | ||
step=step | ||
) | ||
) | ||
|
||
# Set run time limit for the Kubernetes job. | ||
self.run_time_limit = get_run_time_limit_for_task(decos) | ||
|
@@ -421,6 +427,10 @@ def task_pre_step( | |
"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME" | ||
] | ||
meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"] | ||
if os.environ.get("METAFLOW_KUBERNETES_JOBSET_NAME"): | ||
meta["kubernetes-jobset-name"] = os.environ[ | ||
"METAFLOW_KUBERNETES_JOBSET_NAME" | ||
] | ||
|
||
# TODO (savin): Introduce equivalent support for Microsoft Azure and | ||
# Google Cloud Platform | ||
|
@@ -453,6 +463,24 @@ def task_pre_step( | |
self._save_logs_sidecar = Sidecar("save_logs_periodically") | ||
self._save_logs_sidecar.start() | ||
|
||
num_parallel = None | ||
if hasattr(flow, "_parallel_ubf_iter"): | ||
num_parallel = flow._parallel_ubf_iter.num_parallel | ||
|
||
if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL: | ||
control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids( | ||
num_parallel | ||
) | ||
mapper_task_ids = [control_task_id] + worker_task_ids | ||
flow._control_mapper_tasks = [ | ||
"%s/%s/%s" % (run_id, step_name, mapper_task_id) | ||
for mapper_task_id in mapper_task_ids | ||
] | ||
flow._control_task_is_mapper_zero = True | ||
|
||
if num_parallel and num_parallel > 1: | ||
_setup_multinode_environment() | ||
Comment on lines
+466
to
+482
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Needed so that the join step has all the relevant task IDs. |
||
|
||
def task_finished( | ||
self, step_name, flow, graph, is_task_ok, retry_count, max_retries | ||
): | ||
|
@@ -486,3 +514,20 @@ def _save_package_once(cls, flow_datastore, package): | |
cls.package_url, cls.package_sha = flow_datastore.save_data( | ||
[package.blob], len_hint=1 | ||
)[0] | ||
|
||
|
||
def _setup_multinode_environment(): | ||
import socket | ||
|
||
os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"]) | ||
os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"] | ||
if os.environ.get("CONTROL_INDEX") is not None: | ||
os.environ["MF_PARALLEL_NODE_INDEX"] = str(0) | ||
elif os.environ.get("WORKER_REPLICA_INDEX") is not None: | ||
os.environ["MF_PARALLEL_NODE_INDEX"] = str( | ||
int(os.environ["WORKER_REPLICA_INDEX"]) + 1 | ||
) | ||
else: | ||
raise MetaflowException( | ||
"Jobset related ENV vars called $CONTROL_INDEX or $WORKER_REPLICA_INDEX not found" | ||
) |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Constraint added because jobset doesn't play nicely with `replicas = 0`. Once kubernetes-sigs/jobset allows this, we can lift this constraint and add version logic to verify whether the jobset can be submitted. Currently not supported with Jobset CRD version `jobset.x-k8s.io/v1alpha2`.