Skip to content

Commit

Permalink
Merge pull request #1059 from iamadrigal/gpu-operator-nvaie-vgpu
Browse files Browse the repository at this point in the history
GPU Operator automation with NVIDIA AI Enterprise
  • Loading branch information
ajdecon authored Dec 3, 2021
2 parents b5c83c2 + 196140f commit bee2a9e
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 50 deletions.
20 changes: 17 additions & 3 deletions roles/nvidia-gpu-operator/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
# Helm repo / chart coordinates for the standard (non-NVAIE) GPU Operator.
gpu_operator_helm_repo: "https://nvidia.github.io/gpu-operator"
gpu_operator_chart_name: "nvidia/gpu-operator"
gpu_operator_release_name: "nvidia-gpu-operator"
# NOTE: the NVAIE helm repo/chart variables are declared once, in the
# "Variables used for vGPU" section below — declaring them here as well
# would create duplicate YAML keys (silent last-wins in most parsers).

# NOTE: the NVAIE GPU Operator may require a different chart version;
# check the NGC enterprise collection before overriding.
gpu_operator_chart_version: "1.8.2"

# Passed to the chart as mig.strategy (see tasks/k8s.yml and tasks/nvaie.yml).
k8s_gpu_mig_strategy: "mixed"

# Configuration customization
Expand All @@ -22,14 +27,23 @@ gpu_operator_default_runtime: "docker"
# Registry and version for the containerized NVIDIA driver image
# (passed to the chart as driver.repository / driver.version).
gpu_operator_driver_registry: "nvcr.io/nvidia"
gpu_operator_driver_version: "470.57.02"

# Variables used for vGPU
# Helm repo / chart coordinates for the NVIDIA AI Enterprise (NVAIE)
# build of the GPU Operator, hosted on NGC (authenticated repo).
gpu_operator_nvaie_helm_repo: "https://helm.ngc.nvidia.com/nvaie"
gpu_operator_nvaie_chart_name: "nvaie/gpu-operator"
# When true, tasks/nvaie.yml is used instead of tasks/k8s.yml.
gpu_operator_nvaie_enable: false
## When using NVAIE, this enables/disables NLS licensing.
## When true, NLS is used. When false, traditional license server is used.
## Passed to the chart as driver.licensingConfig.nlsEnabled.
gpu_operator_nvaie_nls_enabled: true

# vGPU Licensing Info
## Contents of the NLS client configuration token, rendered into
## client_configuration_token.tok (see templates/ and tasks/nvaie.yml).
gpu_operator_nvaie_nls_token: ""
## This is the IP of the license server used for vGPU; must be set to use vGPU.
## NOTE(review): name lacks the gpu_operator_ prefix used elsewhere in this
## role — kept as-is since playbooks/inventories may reference it.
vgpu_grid_license_server: ""

# NGC authentication information (Required for NVAIE)
## This should remain as $oauthtoken if using an NGC API key.
gpu_operator_registry_username: "$oauthtoken"
## This is most likely an NGC API key. Do not commit a real value to VCS;
## supply it via inventory/vault.
gpu_operator_registry_password: ""
## This is most likely an NGC account email.
gpu_operator_registry_email: ""

15 changes: 15 additions & 0 deletions roles/nvidia-gpu-operator/tasks/k8s.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Standard (non-NVAIE) GPU Operator install via helm CLI.
#
# While we would prefer to use the Ansible helm module, it's broken! :-(
# See https://github.com/ansible/ansible/pull/57897
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released.
# So for now we will run /usr/local/bin/helm commands directly...
#
# NOTE(review): these command tasks always report "changed" (no changed_when)
# and assume helm is installed at /usr/local/bin/helm — confirm both are
# acceptable for this role's callers.
- name: install gpu-operator helm repo
  # "helm repo add" is effectively idempotent: re-adding the same repo URL succeeds.
  command: /usr/local/bin/helm repo add nvidia "{{ gpu_operator_helm_repo }}"

- name: update helm repos
  command: /usr/local/bin/helm repo update

# XXX: This currently installs into the default namespace, as per the GPU Operator docs
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
- name: install nvidia gpu operator
  # "upgrade --install" makes the task safe to re-run; --wait blocks until
  # the release's resources are ready.
  command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_chart_name }}" --version "{{ gpu_operator_chart_version }}" --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.version="{{ gpu_operator_driver_version }}" --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait
51 changes: 4 additions & 47 deletions roles/nvidia-gpu-operator/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -1,50 +1,7 @@
---
# vGPU support
- block:
- name: Create namespace for GPU Operator resources
shell: kubectl create namespace {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f -
- name: create local directory for gpu operator configuration
file:
path: "{{ gpu_operator_grid_config_dir }}"
state: directory
owner: "root"
group: "root"
mode: "0700"
- name: add gridd.conf config file from template
template:
src: "gridd.conf"
dest: "{{ gpu_operator_grid_config_dir }}/gridd.conf"
owner: "root"
group: "root"
mode: "0600"
- name: Create a docker gridd.conf license configmap
shell: kubectl create configmap licensing-config -n "{{ gpu_operator_namespace }}" --from-file="{{gpu_operator_grid_config_dir }}/gridd.conf" -o yaml --dry-run=client | kubectl apply -f -
- name: Create a docker secret for private GPU Operator containers
shell: kubectl create secret docker-registry registry-secret --docker-server='{{ gpu_operator_driver_registry }}' --docker-username='{{ gpu_operator_registry_username }}' --docker-password={{ gpu_operator_registry_password }} --docker-email={{ gpu_operator_registry_email }} -n {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f -
- name: Set secret var for helm command line if specified
set_fact:
gpu_operator_registry_secret: --set driver.imagePullSecrets[0]="registry-secret"
- name: Set license config var for helm command line if specified
set_fact:
gpu_operator_license_var: --set driver.licensingConfig.configMapName=licensing-config
- name: Use NVAIE helm charts
set_fact:
gpu_operator_chart_name: "{{ gpu_operator_nvaie_chart_name }}"
- name: install NVAIE gpu-operator helm repo
command: /usr/local/bin/helm repo add nvaie "{{ gpu_operator_nvaie_helm_repo }}" --username='{{ gpu_operator_registry_username }}' --password="{{ gpu_operator_registry_password }}"
when: vgpu_grid_license_server != ""
- include: k8s.yml
when: not gpu_operator_nvaie_enable

# While we would prefer to use the Ansible helm module, it's broken! :-(
# See https://github.com/ansible/ansible/pull/57897
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released.
# So for now we will run /usr/local/bin/helm commands directly...
- name: install gpu-operator helm repo
command: /usr/local/bin/helm repo add nvidia "{{ gpu_operator_helm_repo }}"
- include: nvaie.yml
when: gpu_operator_nvaie_enable

- name: update helm repos
command: /usr/local/bin/helm repo update

# XXX: This currently installs into the default namespace, as per the GPU Operator docs
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
- name: install nvidia gpu operator
command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_chart_name }}" --version "{{ gpu_operator_chart_version }}" --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.repository="{{ gpu_operator_driver_registry }}" --set driver.version="{{ gpu_operator_driver_version }}" {{ gpu_operator_registry_secret | default("") }} {{ gpu_operator_license_var | default("") }} --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait
50 changes: 50 additions & 0 deletions roles/nvidia-gpu-operator/tasks/nvaie.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# vGPU support: deploy the GPU Operator from the NVIDIA AI Enterprise (NVAIE)
# helm repo on NGC, with vGPU licensing configuration (gridd.conf + NLS token).
#
- name: Create namespace for GPU Operator resources
  # create | apply makes namespace creation idempotent; Jinja expansion is
  # quoted so the shell sees it as a single word.
  shell: kubectl create namespace "{{ gpu_operator_namespace }}" -o yaml --dry-run=client | kubectl apply -f -

- name: create local directory for gpu operator configuration
  file:
    path: "{{ gpu_operator_grid_config_dir }}"
    state: directory
    owner: "root"
    group: "root"
    mode: "0700"

- name: add gridd.conf config file from template
  template:
    src: "gridd.conf"
    dest: "{{ gpu_operator_grid_config_dir }}/gridd.conf"
    owner: "root"
    group: "root"
    mode: "0600"

- name: add client_configuration_token.tok token file from template
  template:
    src: "client_configuration_token.tok"
    dest: "{{ gpu_operator_grid_config_dir }}/client_configuration_token.tok"
    owner: "root"
    group: "root"
    mode: "0600"

- name: Create a docker gridd.conf license configmap
  shell: kubectl create configmap licensing-config -n "{{ gpu_operator_namespace }}" --from-file="{{ gpu_operator_grid_config_dir }}/gridd.conf" --from-file="{{ gpu_operator_grid_config_dir }}/client_configuration_token.tok" -o yaml --dry-run=client | kubectl apply -f -

- name: Create a docker secret for private GPU Operator containers
  # Password and email are now quoted: an NGC API key containing shell
  # metacharacters would otherwise break the command (or worse, be
  # interpreted by the shell). no_log keeps the key out of Ansible output.
  shell: kubectl create secret docker-registry registry-secret --docker-server='{{ gpu_operator_driver_registry }}' --docker-username='{{ gpu_operator_registry_username }}' --docker-password='{{ gpu_operator_registry_password }}' --docker-email='{{ gpu_operator_registry_email }}' -n "{{ gpu_operator_namespace }}" -o yaml --dry-run=client | kubectl apply -f -
  no_log: true

# While we would prefer to use the Ansible helm module, it's broken! :-(
# See https://github.com/ansible/ansible/pull/57897
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released.
# So for now we will run /usr/local/bin/helm commands directly...
- name: install NVAIE gpu-operator helm repo
  # NGC's NVAIE helm repo requires authentication; no_log hides the API key.
  command: /usr/local/bin/helm repo add nvaie "{{ gpu_operator_nvaie_helm_repo }}" --username='{{ gpu_operator_registry_username }}' --password='{{ gpu_operator_registry_password }}'
  no_log: true

- name: update helm repos
  command: /usr/local/bin/helm repo update

# XXX: This currently installs into the default namespace, as per the GPU Operator docs
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html
# NOTE(review): --create-namespace is redundant with the explicit kubectl
# create above, but harmless; kept for safety if the first task is skipped.
- name: install nvidia gpu operator
  command: >-
    /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}"
    "{{ gpu_operator_nvaie_chart_name }}"
    --version "{{ gpu_operator_chart_version }}"
    --namespace "{{ gpu_operator_namespace }}" --create-namespace
    --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}"
    --set driver.licensingConfig.configMapName=licensing-config
    --set driver.licensingConfig.nlsEnabled="{{ gpu_operator_nvaie_nls_enabled }}"
    --set driver.imagePullSecrets[0]="registry-secret"
    --set operator.imagePullSecrets[0]="registry-secret"
    --set driver.repository="{{ gpu_operator_driver_registry }}"
    --set driver.version="{{ gpu_operator_driver_version }}"
    --set mig.strategy="{{ k8s_gpu_mig_strategy }}"
    --wait
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"{{ gpu_operator_nvaie_nls_token }}"

0 comments on commit bee2a9e

Please sign in to comment.