Skip to content

Commit

Permalink
Support configuration of GPU vendor (#3029)
Browse files Browse the repository at this point in the history
Signed-off-by: typhoonzero <typhoonzero1986@gmail.com>
Signed-off-by: Patrick Titzler <ptitzler@us.ibm.com>
Co-authored-by: Patrick Titzler <ptitzler@us.ibm.com>
  • Loading branch information
typhoonzero and ptitzler authored Nov 30, 2022
1 parent c7d9632 commit 8eca87d
Show file tree
Hide file tree
Showing 9 changed files with 73 additions and 15 deletions.
4 changes: 3 additions & 1 deletion docs/source/user_guide/pipelines.md
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,9 @@ The following alphabetically sorted list identifies the node properties that are

##### Resources: CPU, GPU, and RAM
- Resources that the notebook or script requires. RAM takes units of gigabytes (10<sup>9</sup> bytes).
- The values are ignored when the pipeline is executed locally.
- Specify a custom Kubernetes GPU vendor, if desired. The default vendor is `nvidia.com/gpu`. See [this topic in the Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/) for more information.
- The values are ignored when the pipeline is executed locally.
- Example: `amd.com/gpu`

##### Runtime image

Expand Down
8 changes: 4 additions & 4 deletions elyra/pipeline/kfp/processor_kfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -784,10 +784,10 @@ def _generate_workflow_tasks(
"size": operation.memory,
"units": "G",
}
workflow_task["task_modifiers"]["gpu_limit"] = {
"size": operation.gpu,
"vendor": workflow_task["task_modifiers"]["env_variables"].get("GPU_VENDOR", "nvidia"),
}
gpu_vendor = "nvidia.com/gpu"
if operation.gpu_vendor:
gpu_vendor = operation.gpu_vendor
workflow_task["task_modifiers"]["gpu_limit"] = {"size": operation.gpu, "vendor": gpu_vendor}

if is_crio_runtime:
# Attach empty dir volume
Expand Down
8 changes: 7 additions & 1 deletion elyra/pipeline/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def __init__(
cpu: number of cpus requested to run the operation
memory: amount of memory requested to run the operation (in Gi)
gpu: number of gpus requested to run the operation
gpu_vendor: gpu resource type, eg. nvidia.com/gpu, amd.com/gpu etc.
Entries for other (non-built-in) component types are a function of the respective component.
:param elyra_params: dictionary of parameter key:value pairs that are owned by Elyra
Expand All @@ -270,8 +271,9 @@ def __init__(
self._component_params["dependencies"] = Operation._scrub_list(component_params.get("dependencies", []))
self._component_params["include_subdirectories"] = component_params.get("include_subdirectories", False)
self._component_params["cpu"] = component_params.get("cpu")
self._component_params["gpu"] = component_params.get("gpu")
self._component_params["memory"] = component_params.get("memory")
self._component_params["gpu"] = component_params.get("gpu")
self._component_params["gpu_vendor"] = component_params.get("gpu_vendor")

if not elyra_params:
elyra_params = {}
Expand Down Expand Up @@ -319,6 +321,10 @@ def memory(self) -> Optional[str]:
def gpu(self) -> Optional[str]:
return self._component_params.get("gpu")

@property
def gpu_vendor(self) -> Optional[str]:
    """GPU resource type configured for this operation, or None if not set."""
    params = self._component_params
    return params.get("gpu_vendor")

def __eq__(self, other: GenericOperation) -> bool:
if isinstance(self, other.__class__):
return super().__eq__(other)
Expand Down
15 changes: 15 additions & 0 deletions elyra/pipeline/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from elyra.pipeline.pipeline_definition import PipelineDefinition
from elyra.pipeline.processor import PipelineProcessorManager
from elyra.pipeline.runtime_type import RuntimeProcessorType
from elyra.util.kubernetes import is_valid_kubernetes_device_plugin_name
from elyra.util.path import get_expanded_path


Expand Down Expand Up @@ -428,6 +429,20 @@ async def _validate_generic_node_properties(self, node: Node, response: Validati
resource_value=resource_value,
response=response,
)
for resource_vendor in ["gpu_vendor"]:
vendor = node.get_component_parameter(resource_vendor)
if vendor and not is_valid_kubernetes_device_plugin_name(vendor):
response.add_message(
severity=ValidationSeverity.Error,
message_type="invalidNodeProperty",
message="Property is not a valid resource vendor name.",
data={
"nodeID": node.id,
"nodeName": node_label,
"propertyName": resource_vendor,
"value": vendor,
},
)

for param in node.elyra_owned_properties:
required = self._is_required_property(component_props, param)
Expand Down
18 changes: 13 additions & 5 deletions elyra/templates/components/generic_properties_template.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,25 @@
"description": "For CPU-intensive workloads, you can choose more than 1 CPU (e.g. 1.5).",
"minimum": 0
},
"memory": {
"type": "integer",
"title": "RAM(GB)",
"description": "The total amount of RAM specified.",
"minimum": 0
},
"gpu": {
"type": "integer",
"title": "GPU",
"description": "For GPU-intensive workloads, you can choose more than 1 GPU. Must be an integer.",
"minimum": 0
},
"memory": {
"type": "integer",
"title": "RAM(GB)",
"description": "The total amount of RAM specified.",
"minimum": 0
"gpu_vendor": {
"type": "string",
"title": "GPU Vendor",
"description": "GPU Vendor, or K8s GPU resource type, default 'nvidia.com/gpu'.",
"uihints": {
"ui:placeholder": "nvidia.com/gpu"
}
},
"dependencies": {
"title": "File Dependencies",
Expand Down
2 changes: 1 addition & 1 deletion elyra/templates/kubeflow/v1/python_dsl_template.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def generated_pipeline(
{{ task_name }}.container.set_memory_request(memory="{{ workflow_task.task_modifiers.mem_request.size }}{{ workflow_task.task_modifiers.mem_request.units }}")
{% endif %}
{% if workflow_task.task_modifiers.gpu_limit and workflow_task.task_modifiers.gpu_limit.size %}
{{ task_name }}.container.set_gpu_limit(gpu="{{ workflow_task.task_modifiers.gpu_limit.size }}", vendor="{{ workflow_task.task_modifiers.gpu_limit.vendor }}")
{{ task_name }}.container.add_resource_limit(resource_name="{{ workflow_task.task_modifiers.gpu_limit.vendor }}", value="{{ workflow_task.task_modifiers.gpu_limit.size }}")
{% endif %}
{% if workflow_task.task_modifiers.env_variables %}
{% for env_var_name, env_var_value in workflow_task.task_modifiers.env_variables.items() %}
Expand Down
7 changes: 5 additions & 2 deletions elyra/tests/pipeline/test_pipeline_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,9 @@ def test_validate_resource_values():
component_parameters = {
"filename": "elyra/pipeline/tests/resources/archive/test.ipynb",
"cpu": "4",
"gpu": "6",
"memory": "10",
"gpu": "6",
"gpu_vendor": "example.com/gpu",
"runtime_image": "tensorflow/tensorflow:latest",
}
test_operation = GenericOperation(
Expand All @@ -367,6 +368,7 @@ def test_validate_resource_values():

assert test_operation.cpu == "4"
assert test_operation.gpu == "6"
assert test_operation.gpu_vendor == "example.com/gpu"
assert test_operation.memory == "10"


Expand All @@ -385,8 +387,9 @@ def test_validate_resource_values_as_none():
)

assert test_operation.cpu is None
assert test_operation.gpu is None
assert test_operation.memory is None
assert test_operation.gpu is None
assert test_operation.gpu_vendor is None


def test_validate_gpu_accepts_zero_as_value():
Expand Down
10 changes: 10 additions & 0 deletions elyra/util/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,16 @@ def is_valid_kubernetes_key(name: str) -> bool:
return re.match(r"^[\w\-_.]+$", name) is not None


def is_valid_kubernetes_device_plugin_name(key: str) -> bool:
    """
    Check whether key is acceptable as the name of a kubernetes device plugin
    custom schedulable resource (e.g. a GPU resource type), as outlined in the link below.
    https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/#using-device-plugins

    The check is delegated as-is to is_valid_annotation_key; its result is returned unchanged.
    """
    key_is_valid = is_valid_annotation_key(key)
    return key_is_valid


def is_valid_annotation_key(key: str) -> bool:
"""
Returns a truthy value indicating whether name meets the kubernetes
Expand Down
16 changes: 15 additions & 1 deletion packages/pipeline-editor/style/index.css
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ span.bx--list-box__label {
}

.elyra-PipelineEditor .form-group.field.field-integer {
width: 30%;
width: 100%;
}

.elyra-PipelineEditor .array-pipelineDefaults.form-control {
Expand Down Expand Up @@ -836,6 +836,20 @@ input.elyra-Dialog-checkbox.jp-mod-styled {
padding-bottom: 4px;
}

/* Resource property fields (cpu, memory, gpu, gpu_vendor): each div with one
   of these ids is limited to half of the available width. */
div#root_component_parameters_cpu,
div#root_component_parameters_memory,
div#root_component_parameters_gpu,
div#root_component_parameters_gpu_vendor {
width: 50%;
}

/* Input elements carrying the same resource property ids span the full width
   of their containing block. */
input#root_component_parameters_cpu,
input#root_component_parameters_memory,
input#root_component_parameters_gpu,
input#root_component_parameters_gpu_vendor {
width: 100%;
}

#cpu,
#gpu,
#memory {
Expand Down

0 comments on commit 8eca87d

Please sign in to comment.