Commit 04268b9

Update images used in the pipeline GPU tests to use RHOAI 2.15 workbench runtimes

Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com>
jgarciao committed Oct 30, 2024
1 parent e1bd42e commit 04268b9
Showing 5 changed files with 72 additions and 6 deletions.
@@ -3,7 +3,7 @@
 
 # Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
 common_base_image = (
-    "quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf"
+    "quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10"
 )
 
 
@@ -53,7 +53,7 @@ deploymentSpec:
           \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
           \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
           \ availability test: PASS\")\n\n"
-        image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
+        image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
     exec-verify-gpu-availability-2:
       container:
         args:
@@ -91,7 +91,7 @@ deploymentSpec:
           \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
           \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
           \ availability test: PASS\")\n\n"
-        image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
+        image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
         resources:
           accelerator:
             count: '1'
@@ -3,7 +3,7 @@
 
 # Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
 common_base_image = (
-    "quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa"
+    "quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a"
 )
 
 
@@ -53,7 +53,7 @@ deploymentSpec:
           \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
           \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
           \ availability test: PASS\")\n\n"
-        image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
+        image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
     exec-verify-gpu-availability-2:
       container:
         args:
@@ -91,7 +91,7 @@ deploymentSpec:
           \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
           \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
           \ availability test: PASS\")\n\n"
-        image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
+        image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
         resources:
           accelerator:
             count: '1'
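The count: '1' accelerator resource in the hunks above comes from set_accelerator_limit in the test code. The toleration added by kubernetes.add_toleration (see the new pipeline file below) lands in a separate, platform-specific section of the compiled spec. A minimal sketch of how that section typically looks in kfp-kubernetes output (the exact nesting may vary by kfp version; this is an illustration, not part of this commit):

platforms:
  kubernetes:
    deploymentSpec:
      executors:
        exec-verify-gpu-availability-2:
          tolerations:
          - key: nvidia.com/gpu
            operator: Exists
            effect: NoSchedule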
@@ -0,0 +1,66 @@
+from kfp import compiler, dsl, kubernetes
+from kfp.dsl import PipelineTask
+
+# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
+common_base_image = (
+    "quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa"
+)
+
+
+# Plain Python image
+# common_base_image = (
+#     "registry.redhat.io/ubi8/python-39@sha256:3523b184212e1f2243e76d8094ab52b01ea3015471471290d011625e1763af61"
+# )
+
+
+def add_pip_index_configuration(task: PipelineTask):
+    kubernetes.use_config_map_as_env(
+        task,
+        config_map_name="ds-pipeline-custom-env-vars",
+        config_map_key_to_env={"pip_index_url": "PIP_INDEX_URL", "pip_trusted_host": "PIP_TRUSTED_HOST"},
+    )
+
+
+def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
+    print("Adding GPU tolerations")
+    task.set_accelerator_type(accelerator=accelerator_type)
+    task.set_accelerator_limit(accelerator_limit)
+    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")
+
+
+@dsl.component(base_image=common_base_image, packages_to_install=["torch"], pip_index_urls=["$PIP_INDEX_URL"])
+def verify_gpu_availability(gpu_toleration_added: bool):
+    import torch  # noqa: PLC0415
+
+    cuda_available = torch.cuda.is_available()
+    device_count = torch.cuda.device_count()
+    print("------------------------------")
+    print("GPU availability")
+    print("------------------------------")
+    print("gpu_toleration_added:" + str(gpu_toleration_added))
+    print("torch.cuda.is_available():" + str(cuda_available))
+    print("torch.cuda.device_count():" + str(device_count))
+    if gpu_toleration_added and not cuda_available:
+        print("GPU availability test: FAIL")
+        raise ValueError("GPU toleration was added but there is no GPU available for this task")
+    if not gpu_toleration_added and cuda_available:
+        print("GPU availability test: FAIL")
+        raise ValueError("GPU toleration was not added but there is a GPU available for this task")
+    print("GPU availability test: PASS")
+
+
+@dsl.pipeline(
+    name="pytorch-quickstart-tutorial",
+    description="Verifies pipeline tasks run on GPU nodes only when tolerations are added",
+)
+def pytorch_quickstart_tutorial():
+    task_without_toleration = verify_gpu_availability(gpu_toleration_added=False).set_caching_options(False)
+    add_pip_index_configuration(task_without_toleration)
+
+    task_with_toleration = verify_gpu_availability(gpu_toleration_added=True).set_caching_options(False)
+    add_pip_index_configuration(task_with_toleration)
+    add_gpu_toleration(task_with_toleration, "nvidia.com/gpu", 1)
+
+
+if __name__ == "__main__":
+    compiler.Compiler().compile(pytorch_quickstart_tutorial, package_path=__file__.replace(".py", "_compiled.yaml"))
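For the PIP_INDEX_URL and PIP_TRUSTED_HOST variables to resolve at run time, a ConfigMap named ds-pipeline-custom-env-vars must exist in the namespace where the pipeline runs. A minimal sketch of such a ConfigMap (the namespace and data values here are placeholders, not part of this commit):

apiVersion: v1
kind: ConfigMap
metadata:
  name: ds-pipeline-custom-env-vars
  namespace: my-data-science-project  # placeholder: use the pipeline's namespace
data:
  pip_index_url: https://pypi.org/simple  # placeholder index URL
  pip_trusted_host: pypi.org  # placeholder trusted host

Running the file directly compiles the pipeline to a sibling file ending in _compiled.yaml (via __file__.replace(".py", "_compiled.yaml")), which can then be imported as a pipeline definition.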
