-
Notifications
You must be signed in to change notification settings - Fork 332
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1059 from iamadrigal/gpu-operator-nvaie-vgpu
GPU Operator automation with NVIDIA AI Enterprise
- Loading branch information
Showing
5 changed files
with
87 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
--- | ||
# While we would prefer to use the Ansible helm module, it's broken! :-( | ||
# See https://github.com/ansible/ansible/pull/57897 | ||
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released. | ||
# So for now we will run /usr/local/bin/helm commands directly... | ||
- name: install gpu-operator helm repo | ||
command: /usr/local/bin/helm repo add nvidia "{{ gpu_operator_helm_repo }}" | ||
|
||
- name: update helm repos | ||
command: /usr/local/bin/helm repo update | ||
|
||
# XXX: This currently installs into the default namespace, as per the GPU Operator docs | ||
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html | ||
- name: install nvidia gpu operator | ||
command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_chart_name }}" --version "{{ gpu_operator_chart_version }}" --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.version="{{ gpu_operator_driver_version }}" --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,50 +1,7 @@ | ||
--- | ||
# vGPU support | ||
- block: | ||
- name: Create namespace for GPU Operator resources | ||
shell: kubectl create namespace {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f - | ||
- name: create local directory for gpu operator configuration | ||
file: | ||
path: "{{ gpu_operator_grid_config_dir }}" | ||
state: directory | ||
owner: "root" | ||
group: "root" | ||
mode: "0700" | ||
- name: add gridd.conf config file from template | ||
template: | ||
src: "gridd.conf" | ||
dest: "{{ gpu_operator_grid_config_dir }}/gridd.conf" | ||
owner: "root" | ||
group: "root" | ||
mode: "0600" | ||
- name: Create a docker gridd.conf license configmap | ||
shell: kubectl create configmap licensing-config -n "{{ gpu_operator_namespace }}" --from-file="{{gpu_operator_grid_config_dir }}/gridd.conf" -o yaml --dry-run=client | kubectl apply -f - | ||
- name: Create a docker secret for private GPU Operator containers | ||
shell: kubectl create secret docker-registry registry-secret --docker-server='{{ gpu_operator_driver_registry }}' --docker-username='{{ gpu_operator_registry_username }}' --docker-password={{ gpu_operator_registry_password }} --docker-email={{ gpu_operator_registry_email }} -n {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f - | ||
- name: Set secret var for helm command line if specified | ||
set_fact: | ||
gpu_operator_registry_secret: --set driver.imagePullSecrets[0]="registry-secret" | ||
- name: Set license config var for helm command line if specified | ||
set_fact: | ||
gpu_operator_license_var: --set driver.licensingConfig.configMapName=licensing-config | ||
- name: Use NVAIE helm charts | ||
set_fact: | ||
gpu_operator_chart_name: "{{ gpu_operator_nvaie_chart_name }}" | ||
- name: install NVAIE gpu-operator helm repo | ||
command: /usr/local/bin/helm repo add nvaie "{{ gpu_operator_nvaie_helm_repo }}" --username='{{ gpu_operator_registry_username }}' --password="{{ gpu_operator_registry_password }}" | ||
when: vgpu_grid_license_server != "" | ||
- include: k8s.yml | ||
when: not gpu_operator_nvaie_enable | ||
|
||
# While we would prefer to use the Ansible helm module, it's broken! :-( | ||
# See https://github.com/ansible/ansible/pull/57897 | ||
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released. | ||
# So for now we will run /usr/local/bin/helm commands directly... | ||
- name: install gpu-operator helm repo | ||
command: /usr/local/bin/helm repo add nvidia "{{ gpu_operator_helm_repo }}" | ||
- include: nvaie.yml | ||
when: gpu_operator_nvaie_enable | ||
|
||
- name: update helm repos | ||
command: /usr/local/bin/helm repo update | ||
|
||
# XXX: This currently installs into the default namespace, as per the GPU Operator docs | ||
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html | ||
- name: install nvidia gpu operator | ||
command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_chart_name }}" --version "{{ gpu_operator_chart_version }}" --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.repository="{{ gpu_operator_driver_registry }}" --set driver.version="{{ gpu_operator_driver_version }}" {{ gpu_operator_registry_secret | default("") }} {{ gpu_operator_license_var | default("") }} --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
--- | ||
# vGPU support | ||
# | ||
- name: Create namespace for GPU Operator resources | ||
shell: kubectl create namespace {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f - | ||
|
||
- name: create local directory for gpu operator configuration | ||
file: | ||
path: "{{ gpu_operator_grid_config_dir }}" | ||
state: directory | ||
owner: "root" | ||
group: "root" | ||
mode: "0700" | ||
|
||
- name: add gridd.conf config file from template | ||
template: | ||
src: "gridd.conf" | ||
dest: "{{ gpu_operator_grid_config_dir }}/gridd.conf" | ||
owner: "root" | ||
group: "root" | ||
mode: "0600" | ||
|
||
- name: add client_configuration_token.tok token file from template | ||
template: | ||
src: "client_configuration_token.tok" | ||
dest: "{{ gpu_operator_grid_config_dir }}/client_configuration_token.tok" | ||
owner: "root" | ||
group: "root" | ||
mode: "0600" | ||
|
||
- name: Create a docker gridd.conf license configmap | ||
shell: kubectl create configmap licensing-config -n "{{ gpu_operator_namespace }}" --from-file="{{ gpu_operator_grid_config_dir }}/gridd.conf" --from-file="{{ gpu_operator_grid_config_dir }}/client_configuration_token.tok" -o yaml --dry-run=client | kubectl apply -f - | ||
|
||
- name: Create a docker secret for private GPU Operator containers | ||
shell: kubectl create secret docker-registry registry-secret --docker-server='{{ gpu_operator_driver_registry }}' --docker-username='{{ gpu_operator_registry_username }}' --docker-password={{ gpu_operator_registry_password }} --docker-email={{ gpu_operator_registry_email }} -n {{ gpu_operator_namespace }} -o yaml --dry-run=client | kubectl apply -f - | ||
|
||
# While we would prefer to use the Ansible helm module, it's broken! :-( | ||
# See https://github.com/ansible/ansible/pull/57897 | ||
# Unfortunately this will not be fixed until Ansible 2.10 which is not yet released. | ||
# So for now we will run /usr/local/bin/helm commands directly... | ||
- name: install NVAIE gpu-operator helm repo | ||
command: /usr/local/bin/helm repo add nvaie "{{ gpu_operator_nvaie_helm_repo }}" --username='{{ gpu_operator_registry_username }}' --password="{{ gpu_operator_registry_password }}" | ||
|
||
- name: update helm repos | ||
command: /usr/local/bin/helm repo update | ||
|
||
# XXX: This currently installs into the default namespace, as per the GPU Operator docs | ||
# https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/getting-started.html | ||
- name: install nvidia gpu operator | ||
command: /usr/local/bin/helm upgrade --install "{{ gpu_operator_release_name }}" "{{ gpu_operator_nvaie_chart_name }}" --version "{{ gpu_operator_chart_version }}" --namespace "{{ gpu_operator_namespace }}" --create-namespace --set operator.defaultRuntime="{{ gpu_operator_default_runtime }}" --set driver.licensingConfig.configMapName=licensing-config --set driver.licensingConfig.nlsEnabled="{{ gpu_operator_nvaie_nls_enabled }}" --set driver.imagePullSecrets[0]="registry-secret" --set operator.imagePullSecrets[0]="registry-secret" --set driver.repository="{{ gpu_operator_driver_registry }}" --set driver.version="{{ gpu_operator_driver_version }}" --set mig.strategy="{{ k8s_gpu_mig_strategy }}" --wait |
1 change: 1 addition & 0 deletions
1
roles/nvidia-gpu-operator/templates/client_configuration_token.tok
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"{{ gpu_operator_nvaie_nls_token }}" |