Kubespray v2.18.0 and containerd runtime #1043

Merged: 19 commits, Mar 24, 2022
config.example/group_vars/k8s-cluster.yml (7 changes: 6 additions & 1 deletion)
@@ -10,6 +10,10 @@ kubeconfig_localhost: true
helm_enabled: true
tiller_node_selectors: "node-role.kubernetes.io/master=''"

## Container runtime
## docker for docker, crio for cri-o and containerd for containerd.
container_manager: containerd

artifacts_dir: "{{ inventory_dir }}/artifacts"

# Reset Flex Volume path to the default. Kubespray changes the path, which breaks Rook
@@ -18,7 +22,7 @@ kubelet_flexvolumes_plugins_dir: /usr/libexec/kubernetes/kubelet-plugins/volume/

# Provide option to use GPU Operator instead of setting up NVIDIA driver and
# Docker configuration.
-deepops_gpu_operator_enabled: false
+deepops_gpu_operator_enabled: true

# Install NVIDIA Driver and nvidia-docker on node (true), not as part of GPU Operator (driver container, nvidia-toolkit) (false)
gpu_operator_preinstalled_nvidia_software: true
@@ -62,6 +66,7 @@ nfs_exports:
################################################################################
kube_enable_container_registry: true
docker_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}"
crio_insecure_registries: "{{ groups['kube-master']|map('regex_replace', '^(.*)$', '\\1:5000')|list + ['registry.local:31500']}}"
docker_registry_mirrors: "{{ groups['kube-master'] | map('regex_replace', '^(.*)$', 'http://\\1:5000') | list }}"

################################################################################
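The defaults above switch the example config to containerd and enable the GPU Operator. As a minimal sketch (assuming the usual DeepOps workflow of copying config.example/ to config/ and editing the copy), a site that wants to stay on the Docker runtime and skip the GPU Operator could override the same keys; the override file below is hypothetical and not part of this PR:

# config/group_vars/k8s-cluster.yml -- hypothetical site override
container_manager: docker                        # docker | crio | containerd
deepops_gpu_operator_enabled: false              # install NVIDIA driver / nvidia-docker directly
gpu_operator_preinstalled_nvidia_software: true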
playbooks/k8s-cluster.yml (54 changes: 49 additions & 5 deletions)
@@ -1,6 +1,26 @@
---
# Kubernetes Cluster Playbook

# Set facts depending on container runtime
# Use GPU operator when container runtime is not docker
# etcd_deployment_type must be `host` when container_manager is not `docker`
- hosts: all
  tasks:
    - name: Set facts when not using docker container runtime (default)
      set_fact:
        deepops_gpu_operator_enabled: true
        etcd_deployment_type: host
      when:
        - container_manager is defined
        - container_manager != "docker"
    - name: Set facts when using Docker container runtime
      set_fact:
        etcd_deployment_type: docker
        gpu_operator_default_runtime: "docker"
      when:
        - container_manager is defined
        - container_manager == "docker"

# Install python required for Ansible
- include: bootstrap/bootstrap-python.yml
  tags:
@@ -67,6 +87,26 @@
  tags:
    - bootstrap

# Un-hold container runtime packages on Ubuntu
# In some cases, container runtime packages can be in the 'held' state, preventing
# them from being removed and causing the install of the containerd runtime to fail
- hosts: all
  gather_facts: true
  tasks:
    - name: un-hold container runtime packages on Ubuntu
      dpkg_selections:
        name: "{{ item }}"
        selection: purge
      with_items:
        - docker-ce
        - docker-ce-cli
        - docker-ce-rootless-extras
        - containerd.io
      when:
        - container_manager is defined and container_manager != "docker"
        - ansible_distribution == "Ubuntu"
  environment: "{{proxy_env if proxy_env is defined else {}}}"

# Install Kubernetes
# for configuration, see: config/group_vars/k8s-cluster.yml
- include: ../submodules/kubespray/cluster.yml
@@ -129,33 +169,37 @@
    hostlist: "k8s-cluster"
  tags:
    - nvidia
-  when: deepops_gpu_operator_enabled | default('false') | bool == false or gpu_operator_preinstalled_nvidia_software
+  when: deepops_gpu_operator_enabled|default(true) | bool == false or
+        gpu_operator_preinstalled_nvidia_software|default(true)

# Install NVIDIA container runtime on GPU servers
- include: container/nvidia-docker.yml
  vars:
    hostlist: "k8s-cluster"
  tags:
    - nvidia
-  when: deepops_gpu_operator_enabled | default('false') | bool == false or gpu_operator_preinstalled_nvidia_software
+  when:
+    - deepops_gpu_operator_enabled|default(true) | bool == false or
+      gpu_operator_preinstalled_nvidia_software|default(true)
+    - container_manager is defined and container_manager == "docker"

# Install k8s GPU feature discovery
- include: k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml
  tags:
    - nvidia
-  when: deepops_gpu_operator_enabled | default('false') | bool == false
+  when: deepops_gpu_operator_enabled|default(true) | bool == false

# Install k8s GPU device plugin
- include: k8s-cluster/nvidia-k8s-gpu-device-plugin.yml
  tags:
    - nvidia
-  when: deepops_gpu_operator_enabled | default('false') | bool == false
+  when: deepops_gpu_operator_enabled|default(true) | bool == false

# Install NVIDIA GPU Operator
- include: k8s-cluster/nvidia-gpu-operator.yml
  tags:
    - nvidia
-  when: deepops_gpu_operator_enabled | default('false') | bool == true
+  when: deepops_gpu_operator_enabled|default(true) | bool == true

# Manage kubectl binary
- hosts: kube-master
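Because the fact-setting play at the top of this file only fires when container_manager is defined, it can be useful to confirm what each node will actually use before the GPU plays run. A hedged sketch of such a check follows; the helper play and the defaults it prints are assumptions for illustration, not part of this PR:

# verify-runtime.yml -- hypothetical helper play
- hosts: all
  gather_facts: false
  tasks:
    - name: Show effective runtime-related settings
      debug:
        msg: >-
          container_manager={{ container_manager | default('undefined') }},
          etcd_deployment_type={{ etcd_deployment_type | default('undefined') }},
          deepops_gpu_operator_enabled={{ deepops_gpu_operator_enabled | default(true) }}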
roles/nvidia-gpu-operator/defaults/main.yml (2 changes: 1 addition & 1 deletion)
@@ -33,6 +33,7 @@ gpu_operator_namespace: "gpu-operator-resources"
gpu_operator_grid_config_dir: "{{ deepops_dir }}/gpu_operator"

# Defaults from https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml
gpu_operator_default_runtime: "containerd"
gpu_operator_driver_registry: "nvcr.io/nvidia"
gpu_operator_driver_version: "470.82.01"

@@ -55,4 +56,3 @@ gpu_operator_registry_username: "$oauthtoken"
gpu_operator_registry_password: ""
## This is most likely an NGC email
gpu_operator_registry_email: ""

submodules/kubespray (2 changes: 1 addition & 1 deletion)