[patch] Address several issues w/ Nvidia playbook (#1558)
dclain authored Nov 16, 2024
1 parent 1792e98 commit e81d3a3
Showing 6 changed files with 222 additions and 169 deletions.
21 changes: 6 additions & 15 deletions ibm/mas_devops/roles/nvidia_gpu/README.md
@@ -29,25 +29,17 @@ The namespace where the NVIDIA GPU operator will be deployed. For version 1.8.x,
The channel to subscribe to for the GPU operator installation and updates. Available channels can be found in the package manifest of the gpu-operator-certified operator in OpenShift; see the sketch after the defaults below.

- Environment Variable: `GPU_CHANNEL`
- Default Value: `v23.3`
- Default Value: `v24.9`
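
A minimal sketch of how those channels can be listed, mirroring the package-manifest lookup the role itself performs in `tasks/main.yml`; it assumes the manifest is published in the usual `openshift-marketplace` catalog namespace:

```yaml
# Sketch: list the channels that GPU_CHANNEL may be set to.
# Assumes the gpu-operator-certified package manifest lives in the
# openshift-marketplace catalog namespace (the usual default).
- name: "Get gpu-operator-certified package manifest"
  kubernetes.core.k8s_info:
    api_version: packages.operators.coreos.com/v1
    kind: PackageManifest
    name: gpu-operator-certified
    namespace: openshift-marketplace
  register: gpu_manifest

- name: "Show available channels"
  debug:
    msg: "{{ gpu_manifest.resources[0].status.channels | map(attribute='name') | list }}"
```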

### gpu_driver_version
The GPU driver image version to pull from the GPU driver repository. It is recommended to use the GPU driver version that matches the OS version in use. The default versions are shown below. See the attached links for more information and to decide which driver version to use.
By default, the role pulls the latest version and the environment variable is not needed.
If a specific version is needed (due to OS version compatibility), specify the following environment variable.

- Environment Variable: `GPU_DRIVER_VERSION`
- Default Value: `470.182.03` for OCP version 4.10+ and `450.80.02` otherwise
The deciding factor for which image version to use is the RHEL or RHCOS version:
RHEL 8.6 ---> 470.182.03
RHEL 7.9 ---> 450.80.02
The RHEL version used by each OCP version is documented at https://cloud.ibm.com/docs/openshift?topic=openshift-rhel_migrate
- Default Value: N/A

OCP 4.10+ ---> RHEL 8
OCP 4.9 ---> either RHEL 8 or RHEL 7, with RHEL 7 as the default
OCP 4.6, 4.7, & 4.8 ---> RHEL 7

RHCOS 4.9 & 4.10 ---> 470.182.03
RHCOS 4.4 - 4.6 ---> 450.80.02
For other RHCOS versions, find an appropriate driver version at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/driver/tags
See the link below for more information and to decide which driver version to use; a pinning sketch follows it:
https://catalog.ngc.nvidia.com/orgs/nvidia/containers/driver/tags
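
To pin a specific build rather than the default latest tag, a minimal playbook sketch follows; the role name matches this repository's path, and the driver tag shown is illustrative only — pick a real tag from the NGC link above:

```yaml
# Minimal sketch: pin the GPU driver image tag instead of pulling latest.
# The tag below is illustrative; substitute one from the NGC catalog.
- hosts: localhost
  any_errors_fatal: true
  vars:
    gpu_driver_version: "535.104.05"  # illustrative tag, not a recommendation
  roles:
    - ibm.mas_devops.nvidia_gpu
```

Setting `GPU_DRIVER_VERSION` in the environment achieves the same thing, since the role default reads it via an `env` lookup.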

### gpu_driver_repository_path
The GPU driver repository. If you are using a different repository, set this value accordingly. Only public repositories are supported at the moment.
@@ -58,7 +50,6 @@ The gpu driver repository. If using a different repository, you can set the value
For more information on the NVIDIA GPU and NFD operators, visit https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/openshift/install-gpu-ocp.html



Example Playbook
----------------

7 changes: 2 additions & 5 deletions ibm/mas_devops/roles/nvidia_gpu/defaults/main.yml
@@ -3,15 +3,12 @@
gpu_namespace: "{{ lookup('env', 'GPU_NAMESPACE') | default('nvidia-gpu-operator', true) }}"

# Set channel for install and updates
gpu_channel: "{{ lookup('env', 'GPU_CHANNEL') | default('v23.3', true) }}"
gpu_channel: "{{ lookup('env', 'GPU_CHANNEL') | default('v24.9', true) }}"

# GPU driver settings
_gpu_driver_version: "{{ lookup('env', 'GPU_DRIVER_VERSION') }}"
rhel8_driver_version: 525.105.17 # default for RHEL 8 and corresponding RHCOS
rhel7_driver_version: 450.80.02 # default for RHEL 7 and corresponding RHCOS
gpu_driver_version: "{{ lookup('env', 'GPU_DRIVER_VERSION') }}"
gpu_driver_repository_path: "{{ lookup('env', 'GPU_DRIVER_REPOSITORY_PATH') | default('nvcr.io/nvidia', true) }}"


# Variables for Node Feature Discovery Operator
# NFD namespace
nfd_namespace: "{{ lookup('env', 'NFD_NAMESPACE') | default('openshift-nfd', true) }}"
149 changes: 77 additions & 72 deletions ibm/mas_devops/roles/nvidia_gpu/tasks/main.yml
@@ -18,17 +18,6 @@
set_fact:
ocp_version_num: "{{ ocp_version_lookup.resources[0].status.desired.version }}"

- name: "Set gpu driver version default for ocp 4.10+"
when: ocp_version_num is version('4.10.0', '>')
set_fact:
gpu_driver_version: "{{ (_gpu_driver_version is defined and _gpu_driver_version != '') | ternary(_gpu_driver_version, rhel8_driver_version) }}"

- name: "Set gpu driver version for other ocp versions"
when: ocp_version_num is version('4.10.0', '<')
set_fact:
gpu_driver_version: "{{ (_gpu_driver_version is defined and _gpu_driver_version != '') | ternary(_gpu_driver_version, rhel7_driver_version) }}"


# 2. Lookup the packagemanifest for gpu-operator-certified
# -----------------------------------------------------------------------------
- name: Get gpu-operator-certified package manifest
@@ -62,70 +51,86 @@
- "OCP Release Version ................ {{ ocp_version_num }}"
- "GPU Namespace ...................... {{ gpu_namespace }}"
- "GPU Channel ...................... {{ gpu_channel }}"
- "GPU Driver Version ................. {{ gpu_driver_version }}"
- "GPU Driver Repo Path ............... {{ gpu_driver_repository_path }}"


# 4. Create NVIDIA GPU project
# 3. Check if GPU operator is already installed
# -----------------------------------------------------------------------------
- name: "Create gpu namespace"
kubernetes.core.k8s:
api_version: v1
kind: Namespace
name: '{{ gpu_namespace }}'


# 5. Create NVIDIA GPU operator group and subscription
# -----------------------------------------------------------------------------
- name: "Create gpu operator group"
kubernetes.core.k8s:
definition: "{{ lookup('template', 'templates/gpu-operatorgroup.yml.j2') }}"
wait: yes
wait_timeout: 60 # subsequent tasks will fail if the CRD isn't fully created

- name: "Create gpu subscription"
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/gpu-subscription.yml.j2') }}"
wait: yes
wait_timeout: 300
wait_condition:
type: 'CatalogSourcesUnhealthy'
status: "False"

- name: "Wait until ClusterPolicy CRD is available"
include_tasks: "{{ role_path }}/../../common_tasks/wait_for_crd.yml"
vars:
crd_name: clusterpolicies.nvidia.com


# 6. Create GPU Cluster Policy
# -----------------------------------------------------------------------------
- name: "Create Cluster Policy instance for ocp 4.10+"
when: ocp_version_num is version('4.10', '>')
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/clusterpolicy-v2.yml.j2') }}"

- name: "Create Cluster Policy instance for other ocp versions"
when: ocp_version_num is version('4.10', '<')
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/clusterpolicy.yml.j2') }}"


# 7. Wait for Cluster Policy to be ready
# -----------------------------------------------------------------------------
- name: "Wait for Cluster Policy instance to be ready (60s delay)"
- name: Check if GPU operator is already installed
kubernetes.core.k8s_info:
api_version: nvidia.com/v1
name: "gpu-cluster-policy"
kind: ClusterPolicy
register: gpu_cr_result
until:
- gpu_cr_result.resources is defined and gpu_cr_result.resources | length == 1
- gpu_cr_result.resources[0].status is defined
- gpu_cr_result.resources[0].status.state is defined
- gpu_cr_result.resources[0].status.state == 'ready'
retries: 30 # approx 30 minutes before we give up
delay: 60 # 1 minute
register: gpu_clusterpolicy_result

- name: "Debug information"
debug:
msg:
- "gpu_clusterpolicy_result ...................... {{ gpu_clusterpolicy_result }}"

# 4. Install GPU operator if not already installed
# -----------------------------------------------------------------------------
- name: Install GPU operator if not already installed
block:
# 4.1 Create NVIDIA GPU project
# -----------------------------------------------------------------------------
- name: "Create gpu namespace"
kubernetes.core.k8s:
api_version: v1
kind: Namespace
name: '{{ gpu_namespace }}'

# 4.2 Create NVIDIA GPU operator group and subscription
# -----------------------------------------------------------------------------
- name: "Create gpu operator group"
kubernetes.core.k8s:
definition: "{{ lookup('template', 'templates/gpu-operatorgroup.yml.j2') }}"
wait: yes
wait_timeout: 60 # subsequent tasks will fail if the CRD isn't fully created

- name: "Create gpu subscription"
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/gpu-subscription.yml.j2') }}"
wait: yes
wait_timeout: 300
wait_condition:
type: 'CatalogSourcesUnhealthy'
status: "False"

- name: "Wait until ClusterPolicy CRD is available"
include_tasks: "{{ role_path }}/../../common_tasks/wait_for_crd.yml"
vars:
crd_name: clusterpolicies.nvidia.com

# 4.3 Create GPU Cluster Policy
# -----------------------------------------------------------------------------
- name: "Create Cluster Policy instance using latest driver"
when: gpu_driver_version is not defined or gpu_driver_version == ''
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/clusterpolicy-v2.yml.j2') }}"

- name: "Create Cluster Policy instance using custom driver"
when: gpu_driver_version is defined and gpu_driver_version != ''
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/clusterpolicy-customversion.yml.j2') }}"

# 4.4. Wait for Cluster Policy to be ready
# -----------------------------------------------------------------------------
- name: "Wait for Cluster Policy instance to be ready (60s delay)"
kubernetes.core.k8s_info:
api_version: nvidia.com/v1
name: "gpu-cluster-policy"
kind: ClusterPolicy
register: gpu_cr_result
until:
- gpu_cr_result.resources is defined and gpu_cr_result.resources | length == 1
- gpu_cr_result.resources[0].status is defined
- gpu_cr_result.resources[0].status.state is defined
- gpu_cr_result.resources[0].status.state == 'ready'
retries: 30 # approx 30 minutes before we give up
delay: 60 # 1 minute
when:
- gpu_clusterpolicy_result.resources | length == 0

128 changes: 73 additions & 55 deletions ibm/mas_devops/roles/nvidia_gpu/tasks/nfd_setup.yml
@@ -34,64 +34,82 @@
- "NFD Channel ...................... {{ nfd_channel }}"


# 3. Create NFD project
# 3. Check if NFD operator is already installed
# -----------------------------------------------------------------------------
- name: "Create nfd namespace"
kubernetes.core.k8s:
api_version: v1
kind: Namespace
name: '{{ nfd_namespace }}'


# 4. Create NFD operator group and subscription
# -----------------------------------------------------------------------------
- name: "Create nfd operator group"
kubernetes.core.k8s:
definition: "{{ lookup('template', 'templates/nfd-operatorgroup.yml.j2') }}"
wait: yes
wait_timeout: 60 # subsequent tasks will fail if the CRD isn't fully created

- name: "Create nfd subscription"
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/nfd-subscription.yml.j2') }}"
wait: yes
wait_timeout: 300
wait_condition:
type: 'CatalogSourcesUnhealthy'
status: "False"

- name: "Wait until NodeFeatureDiscoveries CRD is available"
include_tasks: "{{ role_path }}/../../common_tasks/wait_for_crd.yml"
vars:
crd_name: nodefeaturediscoveries.nfd.openshift.io


# 5. Create NFD instance
# -----------------------------------------------------------------------------
- name: "Create NodeFeatureDiscovery instance"
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/nfd-instance.yml.j2') }}"


# 6. Make sure NFD daemonsets have been created and all pods are ready
# -----------------------------------------------------------------------------
# Depending on the version of NFD there may also be a nfd-master DaemonSet, but because
# newer versions use a combined worker-master model we will only wait for the nfd-worker
# DaemonSet so that this will work regardless of the version of OCP/NFD that is being used.

- name: "Wait for 'nfd-worker' DaemonSet pods to be ready"
- name: Check if NFD operator is already installed
kubernetes.core.k8s_info:
api_version: apps/v1
name: nfd-worker
namespace: "{{nfd_namespace}}"
kind: DaemonSet
register: nfd_worker_daemonset
until:
- nfd_worker_daemonset.resources is defined
- nfd_worker_daemonset.resources | length > 0
- nfd_worker_daemonset.resources[0].status.numberReady > 0
- nfd_worker_daemonset.resources[0].status.numberReady == nfd_worker_daemonset.resources[0].status.desiredNumberScheduled
retries: 30 # approx 30 minutes before we give up
delay: 60 # 1 minute
register: nfd_worker_daemonset_result

- name: "Debug information"
debug:
msg:
- "nfd_worker_daemonset_result ...................... {{ nfd_worker_daemonset_result }}"

# 4. Install NFD operator if not already installed
# -----------------------------------------------------------------------------
- name: Install NFD operator if not already installed
block:
# 4.1 Create NFD project
# -----------------------------------------------------------------------------
- name: "Create nfd namespace"
kubernetes.core.k8s:
api_version: v1
kind: Namespace
name: '{{ nfd_namespace }}'

# 4.2 Create NFD operator group and subscription
# -----------------------------------------------------------------------------
- name: "Create nfd operator group"
kubernetes.core.k8s:
definition: "{{ lookup('template', 'templates/nfd-operatorgroup.yml.j2') }}"
wait: yes
wait_timeout: 60 # subsequent tasks will fail if the CRD isn't fully created

- name: "Create nfd subscription"
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/nfd-subscription.yml.j2') }}"
wait: yes
wait_timeout: 300
wait_condition:
type: 'CatalogSourcesUnhealthy'
status: "False"

- name: "Wait until NodeFeatureDiscoveries CRD is available"
include_tasks: "{{ role_path }}/../../common_tasks/wait_for_crd.yml"
vars:
crd_name: nodefeaturediscoveries.nfd.openshift.io

# 4.3 Create NFD instance
# -----------------------------------------------------------------------------
- name: "Create NodeFeatureDiscovery instance"
kubernetes.core.k8s:
apply: yes
definition: "{{ lookup('template', 'templates/nfd-instance.yml.j2') }}"

# 4.4. Make sure NFD daemonsets have been created and all pods are ready
# -----------------------------------------------------------------------------
# Depending on the version of NFD there may also be a nfd-master DaemonSet, but because
# newer versions use a combined worker-master model we will only wait for the nfd-worker
# DaemonSet so that this will work regardless of the version of OCP/NFD that is being used.

- name: "Wait for 'nfd-worker' DaemonSet pods to be ready"
kubernetes.core.k8s_info:
api_version: apps/v1
name: nfd-worker
namespace: "{{nfd_namespace}}"
kind: DaemonSet
register: nfd_worker_daemonset
until:
- nfd_worker_daemonset.resources is defined
- nfd_worker_daemonset.resources | length > 0
- nfd_worker_daemonset.resources[0].status.numberReady > 0
- nfd_worker_daemonset.resources[0].status.numberReady == nfd_worker_daemonset.resources[0].status.desiredNumberScheduled
retries: 30 # approx 30 minutes before we give up
delay: 60 # 1 minute
when:
- nfd_worker_daemonset_result.resources | length == 0
