Kubespray v2.18.0 and containerd runtime #1043

Merged: 19 commits, Mar 24, 2022
8 changes: 5 additions & 3 deletions config.example/group_vars/k8s-cluster.yml
@@ -10,6 +10,10 @@ kubeconfig_localhost: true
helm_enabled: true
tiller_node_selectors: "node-role.kubernetes.io/master=''"

## Container runtime
## docker for docker, crio for cri-o and containerd for containerd.
container_manager: containerd

artifacts_dir: "{{ inventory_dir }}/artifacts"

# Reset Flex Volume path to the default. Kubespray changes the path, which breaks Rook
@@ -36,9 +40,6 @@ dashboard_image_repo: "kubernetesui/dashboard"
dashboard_metrics_scraper_tag: "v1.0.4"
dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper"

# Override the Helm version installed by Kubespray
helm_version: "v3.5.4"

# Ensure hosts file generation only runs across k8s cluster
hosts_add_ansible_managed_hosts_groups: ["k8s-cluster"]

@@ -62,6 +63,7 @@ nfs_exports:
################################################################################
kube_enable_container_registry: true
docker_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}"
crio_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}"
docker_registry_mirrors: "{{ groups['kube-master'] | map('regex_replace', '^(.*)$', 'http://\\1:5000') | list }}"

################################################################################
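With `container_manager: containerd` set here, the runtime each kubelet reports can be confirmed after deployment. A minimal sketch, assuming only stock kubectl (not part of this change):

# CONTAINER-RUNTIME column should read containerd://<version> rather than docker://<version>
kubectl get nodes -o wide

# Or query just the runtime field per node
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.containerRuntimeVersion}{"\n"}{end}'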
24 changes: 23 additions & 1 deletion playbooks/k8s-cluster.yml
@@ -1,6 +1,26 @@
---
# Kubernetes Cluster Playbook

# Set facts depending on container runtime
# Use GPU operator when container runtime is not docker
# etcd_deployment_type must be `host` when container_manager is not `docker`
- hosts: all
tasks:
- name: Set facts when not using docker container runtime (default)
set_fact:
deepops_gpu_operator_enabled: true
etcd_deployment_type: host
when:
- container_manager is defined
- container_manager != "docker"
- name: Set facts when using Docker container runtime
set_fact:
etcd_deployment_type: docker
gpu_operator_default_runtime: "docker"
when:
- container_manager is defined
- container_manager == "docker"

# Install python required for Ansible
- include: bootstrap/bootstrap-python.yml
tags:
@@ -133,7 +153,9 @@
- include: container/nvidia-docker.yml
tags:
- nvidia
when: deepops_gpu_operator_enabled | default('false') | bool == false
when:
- deepops_gpu_operator_enabled | default('false') | bool == false
- container_manager is defined and container_manager == "docker"

# Install k8s GPU feature discovery
- include: k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml
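Because `etcd_deployment_type` is forced to `host` for non-Docker runtimes, etcd runs under systemd on the etcd nodes rather than in a container. A quick post-deploy check, assuming the standard Kubespray host deployment that installs an `etcd` systemd unit:

# On an etcd node: the host deployment manages etcd as a systemd service
sudo systemctl status etcd --no-pager

# With containerd there should be no etcd container; crictl confirms this
sudo crictl ps --name etcd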
2 changes: 1 addition & 1 deletion roles/nvidia-gpu-operator/defaults/main.yml
@@ -18,7 +18,7 @@ gpu_operator_namespace: "gpu-operator-resources"
gpu_operator_grid_config_dir: "{{ deepops_dir }}/gpu_operator"

# Defaults from https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml
gpu_operator_default_runtime: "docker"
gpu_operator_default_runtime: "containerd"
gpu_operator_driver_registry: "nvcr.io/nvidia"
gpu_operator_driver_version: "470.57.02"

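Switching the default runtime to containerd here shows up in the ClusterPolicy the operator creates. A hedged check of the deployed value, assuming the gpu-operator chart of this era exposes it as `spec.operator.defaultRuntime`:

# Should print "containerd" once the operator is deployed with these defaults
kubectl get clusterpolicies.nvidia.com -o jsonpath='{.items[0].spec.operator.defaultRuntime}{"\n"}'

# Clusters that still run Docker can override this back in the DeepOps config:
#   gpu_operator_default_runtime: "docker"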
2 changes: 1 addition & 1 deletion submodules/kubespray
5 changes: 5 additions & 0 deletions workloads/jenkins/scripts/run-gpu-job.sh
@@ -11,6 +11,11 @@ chmod 755 "$K8S_CONFIG_DIR/artifacts/kubectl"
kubectl get nodes
kubectl describe nodes

# Wait for GPU operator to finish
if kubectl get pods -n gpu-operator-resources | grep nvidia-operator-validator ; then
kubectl wait --for=condition=ready --timeout=600s pod -n gpu-operator-resources -l app=nvidia-operator-validator
fi

# Verify GPU Feature Discovery was installed and one or more nodes were labeled, run queries and remove new lines/white space/non-gpu node output
strategy=$(kubectl get node -o=custom-columns=:.metadata.labels.nvidia\\.com/mig\\.strategy | grep -v none | tr -d '\040\011\012\015')
product=$(kubectl get node -o=custom-columns=:.metadata.labels.nvidia\\.com/gpu\\.product | grep -v none | tr -d '\040\011\012\015')
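If the `kubectl wait` above times out, the validator's init containers usually show which validation step failed. A debugging sketch (the `driver-validation` container name is an assumption based on typical gpu-operator validator pods):

# Inspect why the validator pod is not Ready
kubectl -n gpu-operator-resources describe pod -l app=nvidia-operator-validator

# Init container logs; the container name may differ between operator versions
kubectl -n gpu-operator-resources logs -l app=nvidia-operator-validator -c driver-validation --tail=50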