Kubespray v2.18.0 and containerd runtime #1043

Merged: 19 commits, Mar 24, 2022
8 changes: 5 additions & 3 deletions config.example/group_vars/k8s-cluster.yml
@@ -10,6 +10,10 @@ kubeconfig_localhost: true
helm_enabled: true
tiller_node_selectors: "node-role.kubernetes.io/master=''"

## Container runtime
## docker for docker, crio for cri-o and containerd for containerd.
container_manager: containerd

artifacts_dir: "{{ inventory_dir }}/artifacts"

# Reset Flex Volume path to the default. Kubespray changes the path, which breaks Rook
@@ -36,9 +40,6 @@ dashboard_image_repo: "kubernetesui/dashboard"
dashboard_metrics_scraper_tag: "v1.0.4"
dashboard_metrics_scraper_repo: "kubernetesui/metrics-scraper"

# Override the Helm version installed by Kubespray
helm_version: "v3.5.4"

# Ensure hosts file generation only runs across k8s cluster
hosts_add_ansible_managed_hosts_groups: ["k8s-cluster"]

@@ -62,6 +63,7 @@ nfs_exports:
################################################################################
kube_enable_container_registry: true
docker_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}"
crio_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}"
docker_registry_mirrors: "{{ groups['kube-master'] | map('regex_replace', '^(.*)$', 'http://\\1:5000') | list }}"

################################################################################
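With `container_manager: containerd` set here, the runtime each kubelet reports can be confirmed after deployment. A minimal sketch, assuming only stock kubectl (not part of this change):

# CONTAINER-RUNTIME column should read containerd://<version> rather than docker://<version>
kubectl get nodes -o wide

# Or query just the runtime field per node
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.nodeInfo.containerRuntimeVersion}{"\n"}{end}'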
24 changes: 23 additions & 1 deletion playbooks/k8s-cluster.yml
@@ -1,6 +1,26 @@
---
# Kubernetes Cluster Playbook

# Set facts depending on container runtime
# Use GPU operator when container runtime is not docker
# etcd_deployment_type must be `host` when container_manager is not `docker`
- hosts: all
tasks:
- name: Set facts when not using docker container runtime (default)
set_fact:
deepops_gpu_operator_enabled: true
etcd_deployment_type: host
when:
- container_manager is defined
- container_manager != "docker"
- name: Set facts when using Docker container runtime
set_fact:
etcd_deployment_type: docker
gpu_operator_default_runtime: "docker"
when:
- container_manager is defined
- container_manager == "docker"

# Install python required for Ansible
- include: bootstrap/bootstrap-python.yml
tags:
@@ -133,7 +153,9 @@
- include: container/nvidia-docker.yml
tags:
- nvidia
when: deepops_gpu_operator_enabled | default('false') | bool == false
when:
- deepops_gpu_operator_enabled | default('false') | bool == false
- container_manager is defined and container_manager == "docker"

# Install k8s GPU feature discovery
- include: k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml
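Because `etcd_deployment_type` is forced to `host` for non-Docker runtimes, etcd runs under systemd on the etcd nodes rather than in a container. A quick post-deploy check, assuming the standard Kubespray host deployment that installs an `etcd` systemd unit:

# On an etcd node: the host deployment manages etcd as a systemd service
sudo systemctl status etcd --no-pager

# With containerd there should be no etcd container; crictl confirms this
sudo crictl ps --name etcd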
2 changes: 1 addition & 1 deletion roles/nvidia-gpu-operator/defaults/main.yml
@@ -18,7 +18,7 @@ gpu_operator_namespace: "gpu-operator-resources"
gpu_operator_grid_config_dir: "{{ deepops_dir }}/gpu_operator"

# Defaults from https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml
gpu_operator_default_runtime: "docker"
gpu_operator_default_runtime: "containerd"
gpu_operator_driver_registry: "nvcr.io/nvidia"
gpu_operator_driver_version: "470.57.02"

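Switching the default runtime to containerd here shows up in the ClusterPolicy the operator creates. A hedged check of the deployed value, assuming the gpu-operator chart of this era exposes it as `spec.operator.defaultRuntime`:

# Should print "containerd" once the operator is deployed with these defaults
kubectl get clusterpolicies.nvidia.com -o jsonpath='{.items[0].spec.operator.defaultRuntime}{"\n"}'

# Clusters that still run Docker can override this back in the DeepOps config:
#   gpu_operator_default_runtime: "docker"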
2 changes: 1 addition & 1 deletion submodules/kubespray
5 changes: 5 additions & 0 deletions workloads/jenkins/scripts/run-gpu-job.sh
@@ -11,6 +11,11 @@ chmod 755 "$K8S_CONFIG_DIR/artifacts/kubectl"
kubectl get nodes
kubectl describe nodes

# Wait for GPU operator to finish
if kubectl get pods -n gpu-operator-resources | grep nvidia-operator-validator ; then
kubectl wait --for=condition=ready --timeout=600s pod -n gpu-operator-resources -l app=nvidia-operator-validator
fi

# Verify GPU Feature Discovery was installed and one or more nodes were labeled, run queries and remove new lines/white space/non-gpu node output
strategy=$(kubectl get node -o=custom-columns=:.metadata.labels.nvidia\\.com/mig\\.strategy | grep -v none | tr -d '\040\011\012\015')
product=$(kubectl get node -o=custom-columns=:.metadata.labels.nvidia\\.com/gpu\\.product | grep -v none | tr -d '\040\011\012\015')
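If the `kubectl wait` above times out, the validator's init containers usually show which validation step failed. A debugging sketch (the `driver-validation` container name is an assumption based on typical gpu-operator validator pods):

# Inspect why the validator pod is not Ready
kubectl -n gpu-operator-resources describe pod -l app=nvidia-operator-validator

# Init container logs; the container name may differ between operator versions
kubectl -n gpu-operator-resources logs -l app=nvidia-operator-validator -c driver-validation --tail=50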