Skip to content

Commit

Permalink
Merge pull request #1059 from iamadrigal/gpu-operator-nvaie-vgpu
Browse files Browse the repository at this point in the history
GPU Operator automation with NVIDIA AI Enterprise
  • Loading branch information
ajdecon authored Dec 3, 2021
2 parents b5c83c2 + 196140f commit bee2a9e
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 50 deletions.
20 changes: 17 additions & 3 deletions roles/nvidia-gpu-operator/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
# Helm repo / chart coordinates for the standard (non-NVAIE) GPU Operator.
gpu_operator_helm_repo: "https://nvidia.github.io/gpu-operator"
gpu_operator_chart_name: "nvidia/gpu-operator"
gpu_operator_release_name: "nvidia-gpu-operator"
# NOTE: the NVAIE helm repo/chart variables are declared once, in the
# "Variables used for vGPU" section below — declaring them here as well
# would create duplicate YAML keys (silent last-wins in most parsers).

# NOTE: the NVAIE GPU Operator may require a different chart version;
# check the NGC enterprise collection before overriding.
gpu_operator_chart_version: "1.8.2"

# Passed to the chart as mig.strategy (see tasks/k8s.yml and tasks/nvaie.yml).
k8s_gpu_mig_strategy: "mixed"

# Configuration customization
Expand All @@ -22,14 +27,23 @@ gpu_operator_default_runtime: "docker"
# Registry and version for the containerized NVIDIA driver image
# (passed to the chart as driver.repository / driver.version).
gpu_operator_driver_registry: "nvcr.io/nvidia"
gpu_operator_driver_version: "470.57.02"

# Variables used for vGPU
# Helm repo / chart coordinates for the NVIDIA AI Enterprise (NVAIE)
# build of the GPU Operator, hosted on NGC (authenticated repo).
gpu_operator_nvaie_helm_repo: "https://helm.ngc.nvidia.com/nvaie"
gpu_operator_nvaie_chart_name: "nvaie/gpu-operator"
# When true, tasks/nvaie.yml is used instead of tasks/k8s.yml.
gpu_operator_nvaie_enable: false
## When using NVAIE, this enables/disables NLS licensing.
## When true, NLS is used. When false, traditional license server is used.
## Passed to the chart as driver.licensingConfig.nlsEnabled.
gpu_operator_nvaie_nls_enabled: true

# vGPU Licensing Info
## Contents of the NLS client configuration token, rendered into
## client_configuration_token.tok (see templates/ and tasks/nvaie.yml).
gpu_operator_nvaie_nls_token: ""
## This is the IP of the license server used for vGPU; must be set to use vGPU.
## NOTE(review): name lacks the gpu_operator_ prefix used elsewhere in this
## role — kept as-is since playbooks/inventories may reference it.
vgpu_grid_license_server: ""

# NGC authentication information (Required for NVAIE)
## This should remain as $oauthtoken if using an NGC API key.
gpu_operator_registry_username: "$oauthtoken"
## This is most likely an NGC API key. Do not commit a real value to VCS;
## supply it via inventory/vault.
gpu_operator_registry_password: ""
## This is most likely an NGC account email.
gpu_operator_registry_email: ""

15 changes: 15 additions & 0 deletions roles/nvidia-gpu-operator/tasks/k8s.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Standard (non-NVAIE) GPU Operator install via helm CLI.
#
# While we would prefer to use the Ansible helm module, it's broken! :-(
# See https://github.com/ansible/ansible/pull/57897
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released.
# So for now we will run /usr/local/bin/helm commands directly...
#
# NOTE(review): these command tasks always report "changed" (no changed_when)
# and assume helm is installed at /usr/local/bin/helm — confirm both are
# acceptable for this role's callers.
- name: install gpu-operator helm repo
  # "helm repo add" is effectively idempotent: re-adding the same repo URL succeeds.
  command: /usr/local/bin/helm repo add nvidia "{{ gpu_operator_helm_repo }}"

- name: update helm repos
  command: /usr/local/bin/helm repo update

# XXX: This currently installs into the default namespace, as per the GPU Operator docs
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
- name: install nvidia gpu operator
  # "upgrade --install" makes the task safe to re-run; --wait blocks until
  # the release's resources are ready.
  command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_chart_name }}" --version "{{ gpu_operator_chart_version }}" --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.version="{{ gpu_operator_driver_version }}" --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait
51 changes: 4 additions & 47 deletions roles/nvidia-gpu-operator/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -1,50 +1,7 @@
---
# vGPU support
- block:
- name: Create namespace for GPU Operator resources
shell: kubectl create namespace {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f -
- name: create local directory for gpu operator configuration
file:
path: "{{ gpu_operator_grid_config_dir }}"
state: directory
owner: "root"
group: "root"
mode: "0700"
- name: add gridd.conf config file from template
template:
src: "gridd.conf"
dest: "{{ gpu_operator_grid_config_dir }}/gridd.conf"
owner: "root"
group: "root"
mode: "0600"
- name: Create a docker gridd.conf license configmap
shell: kubectl create configmap licensing-config -n "{{ gpu_operator_namespace }}" --from-file="{{gpu_operator_grid_config_dir }}/gridd.conf" -o yaml --dry-run=client | kubectl apply -f -
- name: Create a docker secret for private GPU Operator containers
shell: kubectl create secret docker-registry registry-secret --docker-server='{{ gpu_operator_driver_registry }}' --docker-username='{{ gpu_operator_registry_username }}' --docker-password={{ gpu_operator_registry_password }} --docker-email={{ gpu_operator_registry_email }} -n {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f -
- name: Set secret var for helm command line if specified
set_fact:
gpu_operator_registry_secret: --set driver.imagePullSecrets[0]="registry-secret"
- name: Set license config var for helm command line if specified
set_fact:
gpu_operator_license_var: --set driver.licensingConfig.configMapName=licensing-config
- name: Use NVAIE helm charts
set_fact:
gpu_operator_chart_name: "{{ gpu_operator_nvaie_chart_name }}"
- name: install NVAIE gpu-operator helm repo
command: /usr/local/bin/helm repo add nvaie "{{ gpu_operator_nvaie_helm_repo }}" --username='{{ gpu_operator_registry_username }}' --password="{{ gpu_operator_registry_password }}"
when: vgpu_grid_license_server != ""
- include: k8s.yml
when: not gpu_operator_nvaie_enable

# While we would prefer to use the Ansible helm module, it's broken! :-(
# See https://github.com/ansible/ansible/pull/57897
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released.
# So for now we will run /usr/local/bin/helm commands directly...
- name: install gpu-operator helm repo
command: /usr/local/bin/helm repo add nvidia "{{ gpu_operator_helm_repo }}"
- include: nvaie.yml
when: gpu_operator_nvaie_enable

- name: update helm repos
command: /usr/local/bin/helm repo update

# XXX: This currently installs into the default namespace, as per the GPU Operator docs
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
- name: install nvidia gpu operator
command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_chart_name }}" --version "{{ gpu_operator_chart_version }}" --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.repository="{{ gpu_operator_driver_registry }}" --set driver.version="{{ gpu_operator_driver_version }}" {{ gpu_operator_registry_secret | default("") }} {{ gpu_operator_license_var | default("") }} --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait
50 changes: 50 additions & 0 deletions roles/nvidia-gpu-operator/tasks/nvaie.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# vGPU support: deploy the GPU Operator from the NVIDIA AI Enterprise (NVAIE)
# helm repo on NGC, with vGPU licensing configuration (gridd.conf + NLS token).
#
- name: Create namespace for GPU Operator resources
  # create | apply makes namespace creation idempotent; Jinja expansion is
  # quoted so the shell sees it as a single word.
  shell: kubectl create namespace "{{ gpu_operator_namespace }}" -o yaml --dry-run=client | kubectl apply -f -

- name: create local directory for gpu operator configuration
  file:
    path: "{{ gpu_operator_grid_config_dir }}"
    state: directory
    owner: "root"
    group: "root"
    mode: "0700"

- name: add gridd.conf config file from template
  template:
    src: "gridd.conf"
    dest: "{{ gpu_operator_grid_config_dir }}/gridd.conf"
    owner: "root"
    group: "root"
    mode: "0600"

- name: add client_configuration_token.tok token file from template
  template:
    src: "client_configuration_token.tok"
    dest: "{{ gpu_operator_grid_config_dir }}/client_configuration_token.tok"
    owner: "root"
    group: "root"
    mode: "0600"

- name: Create a docker gridd.conf license configmap
  shell: kubectl create configmap licensing-config -n "{{ gpu_operator_namespace }}" --from-file="{{ gpu_operator_grid_config_dir }}/gridd.conf" --from-file="{{ gpu_operator_grid_config_dir }}/client_configuration_token.tok" -o yaml --dry-run=client | kubectl apply -f -

- name: Create a docker secret for private GPU Operator containers
  # Password and email are now quoted: an NGC API key containing shell
  # metacharacters would otherwise break the command (or worse, be
  # interpreted by the shell). no_log keeps the key out of Ansible output.
  shell: kubectl create secret docker-registry registry-secret --docker-server='{{ gpu_operator_driver_registry }}' --docker-username='{{ gpu_operator_registry_username }}' --docker-password='{{ gpu_operator_registry_password }}' --docker-email='{{ gpu_operator_registry_email }}' -n "{{ gpu_operator_namespace }}" -o yaml --dry-run=client | kubectl apply -f -
  no_log: true

# While we would prefer to use the Ansible helm module, it's broken! :-(
# See https://github.com/ansible/ansible/pull/57897
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released.
# So for now we will run /usr/local/bin/helm commands directly...
- name: install NVAIE gpu-operator helm repo
  # NGC's NVAIE helm repo requires authentication; no_log hides the API key.
  command: /usr/local/bin/helm repo add nvaie "{{ gpu_operator_nvaie_helm_repo }}" --username='{{ gpu_operator_registry_username }}' --password='{{ gpu_operator_registry_password }}'
  no_log: true

- name: update helm repos
  command: /usr/local/bin/helm repo update

# XXX: This currently installs into the default namespace, as per the GPU Operator docs
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
# NOTE(review): --create-namespace is redundant with the explicit kubectl
# create above, but harmless; kept for safety if the first task is skipped.
- name: install nvidia gpu operator
  command: >-
    /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}"
    "{{ gpu_operator_nvaie_chart_name }}"
    --version "{{ gpu_operator_chart_version }}"
    --namespace "{{ gpu_operator_namespace }}" --create-namespace
    --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}"
    --set driver.licensingConfig.configMapName=licensing-config
    --set driver.licensingConfig.nlsEnabled="{{ gpu_operator_nvaie_nls_enabled }}"
    --set driver.imagePullSecrets[0]="registry-secret"
    --set operator.imagePullSecrets[0]="registry-secret"
    --set driver.repository="{{ gpu_operator_driver_registry }}"
    --set driver.version="{{ gpu_operator_driver_version }}"
    --set mig.strategy="{{ k8s_gpu_mig_strategy }}"
    --wait
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{{ gpu_operator_nvaie_nls_token }}"

0 comments on commit bee2a9e

Please sign in to comment.