Skip to content

Commit

Permalink
GPU support added + container naming same as argo
Browse files Browse the repository at this point in the history
  • Loading branch information
valayDave committed Jul 29, 2022
1 parent 9745d6f commit 85d3391
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 10 deletions.
31 changes: 23 additions & 8 deletions metaflow/plugins/airflow/airflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,29 @@ def _to_job(self, node):
if k8s_deco.attributes["namespace"] is not None
else "default"
)

# Resource spec handed to the Kubernetes pod operator. CPU, memory (suffixed
# with "M") and ephemeral storage are passed as requests, taken from the
# step's kubernetes decorator attributes.
resources = dict(
    requests={
        "cpu": k8s_deco.attributes["cpu"],
        "memory": "%sM" % str(k8s_deco.attributes["memory"]),
        "ephemeral-storage": str(k8s_deco.attributes["disk"]),
    }
)
if k8s_deco.attributes["gpu"] is not None:
    # GPUs are declared under `limits`, keyed by "<vendor>.com/gpu".
    # Lower-case the *formatted* key: the previous code called .lower() on the
    # "%s.com/gpu" template before interpolation, which is a no-op and left
    # the vendor's casing untouched (e.g. "NVIDIA.com/gpu").
    gpu_resource_name = (
        "%s.com/gpu" % k8s_deco.attributes["gpu_vendor"]
    ).lower()
    resources["limits"] = {gpu_resource_name: str(k8s_deco.attributes["gpu"])}

k8s_operator_args = dict(
# like argo workflows we use step_name as name of container
name=node.name,
namespace=k8s_namespace,
service_account_name=service_account,
node_selector=k8s_deco.attributes["node_selector"],
Expand All @@ -394,14 +416,7 @@ def _to_job(self, node):
),
),
image=k8s_deco.attributes["image"],
# TODO : (savin-comments) add gpu support with limits
resources=dict(
requests={
"cpu": k8s_deco.attributes["cpu"],
"memory": "%sM" % str(k8s_deco.attributes["memory"]),
"ephemeral-storage": str(k8s_deco.attributes["disk"]),
}
),
resources=resources,
execution_timeout=dict(seconds=runtime_limit),
retries=user_code_retries,
env_vars=[dict(name=k, value=v) for k, v in env.items()],
Expand Down
2 changes: 0 additions & 2 deletions metaflow/plugins/airflow/airflow_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,6 @@ def _kubernetes_pod_operator_args(flow_name, step_name, operator_args):
args = operator_args
args.update(
{
# TODO : (savin-comments) : we should be able to have a cleaner name - take a look at the argo implementation
"name": generate_rfc1123_name(flow_name, step_name),
"secrets": secrets,
# Question for (savin):
# Default timeout in Airflow is 120. I can remove `startup_timeout_seconds` for now. How should we expose it to the user?
Expand Down

0 comments on commit 85d3391

Please sign in to comment.