diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index 5d92a730d..350bb8d29 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -282,6 +282,11 @@ openmpi_version: 4.0.3 # Disable cloud-init deepops_disable_cloud_init: true +# Default profile when using NVIDIA MIG Manager: https://github.com/NVIDIA/mig-parted +mig_manager_profile: all-disabled +mig_manager_config: /etc/nvidia-mig-manager/config.yml +mig_manager_hooks: /etc/nvidia-mig-manager/hooks.yaml + ################################################################################ # Container registry # ################################################################################ diff --git a/config.example/nvidia-mig-config.yml b/config.example/nvidia-mig-config.yml new file mode 100644 index 000000000..56e3c1083 --- /dev/null +++ b/config.example/nvidia-mig-config.yml @@ -0,0 +1,239 @@ +version: v1 +mig-configs: + all-disabled: + - devices: all + mig-enabled: false + + all-enabled: + - devices: all + mig-enabled: true + mig-devices: {} + + all-1g.5gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.5gb": 7 + + all-1g.10gb: + - devices: all + mig-enabled: true + mig-devices: + "1g.10gb": 7 + + all-2g.10gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.10gb": 3 + + all-2g.20gb: + - devices: all + mig-enabled: true + mig-devices: + "2g.20gb": 3 + + all-3g.20gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.20gb": 2 + + all-3g.40gb: + - devices: all + mig-enabled: true + mig-devices: + "3g.40gb": 2 + + all-7g.40gb: + - devices: all + mig-enabled: true + mig-devices: + "7g.40gb": 1 + + all-7g.80gb: + - devices: all + mig-enabled: true + mig-devices: + "7g.80gb": 1 + + all-balanced-a100-40: + - devices: all + mig-enabled: true + mig-devices: + "1g.5gb": 2 + "2g.10gb": 1 + "3g.20gb": 1 + + all-balanced-a100-80: + - devices: all + mig-enabled: true + mig-devices: + "1g.10gb": 2 + "2g.20gb": 1 + "3g.40gb": 1 + + 
custom-config-a100-40: + - devices: [0,1,2,3] + mig-enabled: false + - devices: [4] + mig-enabled: true + mig-devices: + "1g.5gb": 7 + - devices: [5] + mig-enabled: true + mig-devices: + "2g.10gb": 3 + - devices: [6] + mig-enabled: true + mig-devices: + "3g.20gb": 2 + - devices: [7] + mig-enabled: true + mig-devices: + "1g.5gb": 2 + "2g.10gb": 1 + "3g.20gb": 1 + + custom-config-a100-80: + - devices: [0,1,2,3] + mig-enabled: false + - devices: [4] + mig-enabled: true + mig-devices: + "1g.10gb": 7 + - devices: [5] + mig-enabled: true + mig-devices: + "2g.20gb": 3 + - devices: [6] + mig-enabled: true + mig-devices: + "3g.40gb": 2 + - devices: [7] + mig-enabled: true + mig-devices: + "1g.10gb": 2 + "2g.20gb": 1 + "3g.40gb": 1 + + all-disabled-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: false + mig-devices: {} + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + all-enabled-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: {} + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + all-1g.10gb-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: + "1g.10gb": 7 + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + all-2g.20gb-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: + "2g.20gb": 3 + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + all-3g.40gb-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: + "3g.40gb": 2 + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + all-4g.40gb-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: + "4g.40gb": 1 + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} 
+ + all-7g.80gb-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: + "7g.80gb": 1 + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + all-balanced-dgx-station-80: + - device-filter: "0x20B210DE" + devices: all + mig-enabled: true + mig-devices: + "1g.10gb": 2 + "2g.20gb": 1 + "3g.40gb": 1 + - device-filter: "0x1FB010DE" + devices: all + mig-enabled: false + mig-devices: {} + + balanced-k8s-dgx-station-80: + # Training + - device-filter: "0x20B210DE" + devices: [0] + mig-enabled: false + mig-devices: {} + + # Training & Fine-tuning + - device-filter: "0x20B210DE" + devices: [1] + mig-enabled: true + mig-devices: + "3g.40gb": 2 + + # Development & Fine-tuning + - device-filter: "0x20B210DE" + devices: [2] + mig-enabled: true + mig-devices: + "2g.20gb": 3 + "1g.10gb": 1 + + # Inference & Development + - device-filter: "0x20B210DE" + devices: [4] + mig-enabled: true + mig-devices: + "1g.10gb": 7 + - device-filter: "0x1FB010DE" + devices: [3] + mig-enabled: false + mig-devices: {} diff --git a/docs/k8s-cluster/nvidia-mig.md b/docs/k8s-cluster/nvidia-mig.md index a6fa19e77..8d274fc39 100644 --- a/docs/k8s-cluster/nvidia-mig.md +++ b/docs/k8s-cluster/nvidia-mig.md @@ -2,58 +2,57 @@ Multi-Instance GPU or MIG is a feature introduced in the NVIDIA A100 GPUs that allow a single GPU to be partitioned into several smaller GPUs. For more information see the [NVIDIA MIG page](https://www.nvidia.com/en-us/technologies/multi-instance-gpu/). +There are two methods that can be used to administer MIG. This guide details the K8s native method that relies on the NVIDIA MIG Manager service included with the [NVIDIA GPU Operator](https://github.com/NVIDIA/gpu-operator) and installed by default if `deepops_gpu_operator_enabled` is set to `true`. 
The alternative method is a bare-metal solution using the mig-parted systemd service which can be installed using the [nvidia-mig.yml](../../playbooks/nvidia-software/nvidia-mig.yml) playbook and configured following the [official documentation](https://github.com/NVIDIA/mig-parted). + Supporting MIG requires several administrative steps and open source projects. -*Projects:* +*Projects, included in GPU Operator v1.9.0+:* * [GPU Device Plugin](https://github.com/NVIDIA/k8s-device-plugin) * [GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery) +* [NVIDIA K8s MIG Manager](https://github.com/NVIDIA/mig-parted/tree/master/deployments/gpu-operator) *Admin Steps:* * Enable MIG -* Configure MIG (kubernetes) -* Configure MIG (bare-metal) +* Configure MIG mode ('single' or 'mixed') +* Configure MIG (Kubernetes configmap) * Update Application/YAML to support MIG ## Enabling MIG -MIG can be enabled on a node by running the `playbooks/nvidia-software/nvidia-mig.yml` playbook. +The K8s MIG Manager will handle enabling and disabling MIG on all devices, as necessary. There are some caveats depending on the state of your cluster and a node reboot may be necessary. -## Installing MIG in Kubernetes +## Configuring MIG Mode in Kubernetes By default, MIG support for Kubernetes is enabled in DeepOps. The default MIG strategy used is set to `mixed`. This can be controlled by the `k8s_gpu_mig_strategy`variable in `config/group_vars/k8s-cluster.yml. The "mixed" strategy is recommended for new deployments. For more information about strategies see the GPU Device Plugin [README](https://github.com/NVIDIA/k8s-device-plugin#deployment-via-helm). 
If DeepOps is being used to manage a Kubernetes cluster that was deployed using another method, MIG can be enabled by running: ```sh -ansible-playbook playbooks/k8s-cluster/nvidia-k8s-gpu-device-plugin.yml playbooks/k8s-cluster/nvidia-k8s-gpu-feature-discovery.yml +ansible-playbook playbooks/k8s-cluster/nvidia-gpu-operator.yml ``` > Note, the same command can be used to re-configure a new strategy -## Configuring MIG +## Configuring MIG Devices -MIG devices must be configured after enabling MIG and after **every** node reboot. When in production, it is recommended to do a rolling upgrade node-by-node following the below steps on each GPU node. +MIG devices are configured on a per-node or cluster-wide basis depending on the MIG configmap and the node labels applied to each node. When in production, it is recommended to do a rolling upgrade node-by-node following the below steps on each GPU node. Configuration and reconfiguration require that you: 1. Taint your node 2. Evacuate all GPU pods 3. Configure MIG -4. Restart the GPU Device Plugin Pod -5. Wait for GPU Feature Discovery to re-label the node -6. Remove the taint. +4. 
Remove the taint ```sh kubectl taint node gpu01 mig=maintenance:NoSchedule kubectl taint node gpu01 mig=maintenance:NoExecute # Optionally, Deep Learning jobs and Notebooks could be allowed to "time out" - - - -kubectl exec -- kill -SIGTERM 1 +kubectl label node gpu01 nvidia.com/mig.config=all-1g.5gb sleep 60 # 60 seconds is the default polling period of GPU Feature Discovery +kubectl describe node gpu01 # Manual verification of MIG resources kubectl taint node gpu01 mig=maintenance:NoSchedule- kubectl taint node gpu01 mig=maintenance:NoExecute- ``` diff --git a/playbooks/nvidia-software/nvidia-mig.yml b/playbooks/nvidia-software/nvidia-mig.yml index 7c6655d21..02bff055b 100644 --- a/playbooks/nvidia-software/nvidia-mig.yml +++ b/playbooks/nvidia-software/nvidia-mig.yml @@ -1,120 +1,54 @@ --- -# Enable/disable MIG mode -# run with tags --enable or --disable -- hosts: all - become: yes +# Install the NVIDIA MIG Manager tooling on all MIG-capable nodes +# Copy over the custom MIG config to all nodes and apply them +# Cluster-wide config is set in config/group_vars/all.yml by mig_manager_profile +# Per-node config can be configured in the inventory file +# Check if MIG capabilities and software on nodes +- hosts: all + become: yes vars: - deepops_mig_devices: "all" - nv_services: - - nvsm - - nvidia-persistenced - - nvidia-fabricmanager - - nv_peer_mem - - dcgm - - docker - nv_modules: - - nv_peer_mem - - nvidia_uvm - - nvidia_drm - - nvidia_modeset - - nvidia - + mig_manager_reboot_timeout: 900 tasks: - # Check for MIG-capable devices - name: check for MIG capable devices - command: nvidia-smi --query-gpu=pci.bus_id,mig.mode.current --format=csv,noheader + shell: nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep -v 'N/A' register: has_mig + failed_when: false - # Pre-tasks - - name: stop system services - systemd: - state: stopped - enabled: no - name: "{{ item }}" - with_items: "{{ nv_services }}" - tags: enable, disable, never - - name: unload drivers - modprobe:
- state: absent - name: "{{ item }}" - with_items: "{{ nv_modules }}" - tags: enable, disable, never - - # Manage MIG - - name: enable MIG mode (all devices) - command: nvidia-smi -mig 1 - tags: enable, never - when: deepops_mig_devices | default("") == "all" - - name: enable MIG mode (per device) - command: nvidia-smi -mig 1 -i "{{ deepops_mig_devices }}" - tags: enable, never - when: deepops_mig_devices | default("") != "" and deepops_mig_devices | default("") != "all" + - name: Install MIG Manager + include_role: + name: nvidia-mig-manager + when: + - has_mig.rc == 0 + + # TODO: Consider storing a custom copy of the hooks.yaml configuration alongside the config.yaml + - name: copy cluster-wide mig config file + copy: + src: "../../config/nvidia-mig-config.yml" + dest: "{{ mig_manager_config }}" + when: has_mig.rc == 0 - - name: disable MIG mode (all devices) - command: nvidia-smi -mig 0 - tags: disable, never - when: deepops_mig_devices | default("") == "all" - - name: disable MIG mode (per device) - command: nvidia-smi -mig 0 -i "{{ deepops_mig_devices }}" - tags: disable, never - when: deepops_mig_devices | default("") != "" and deepops_mig_devices | default("") != "all" + - name: Apply MIG configuration + command: nvidia-mig-parted apply -f {{ mig_manager_config }} -c {{ mig_manager_profile }} -k {{ mig_manager_hooks }} + when: has_mig.rc == 0 - # Post-tasks - - name: wait for MIG stuff to settle down and nvidia-persistenced to start again - pause: - seconds: 20 - tags: enable, disable, never - - name: stop system services - systemd: - state: stopped - enabled: no - name: "{{ item }}" - with_items: "{{ nv_services }}" - tags: enable, disable, never - - name: unload drivers - modprobe: - state: absent - name: "{{ item }}" - with_items: "{{ nv_modules }}" - tags: enable, disable, never - - name: start fabric manager - systemd: - state: started - name: nvidia-fabricmanager - tags: enable, disable, never - - name: stop nvidia-persistenced again - systemd: - 
state: stopped - name: nvidia-persistenced - tags: enable, disable, never - - name: reset GPUs - command: nvidia-smi --gpu-reset - tags: enable, disable, never - - name: load drivers - modprobe: - state: present - name: "{{ item }}" - with_items: "{{ nv_modules }}" - ignore_errors: true - tags: enable, disable, never - - name: start system services - systemd: - state: started - enabled: yes - name: "{{ item }}" - with_items: "{{ nv_services }}" - ignore_errors: true - tags: enable, disable, never + # Reboot nodes if necessary and poll for them to come up + - name: Reboot if necessary + shell: sleep 2 && /sbin/shutdown -r now "Reboot required" + async: 1 + poll: 0 + when: + - reboot_required is defined + - reboot_required | default(false) + - name: Wait for server to reboot (if required) + wait_for_connection: + delay: 15 + timeout: "{{ mig_manager_reboot_timeout }}" + when: + - reboot_required is defined + - reboot_required | default(false) + - has_mig.rc == 0 - # Permissions - - name: grant user permissions to manage MIG instances - file: - path: "{{ item }}" - owner: root - group: root - mode: '0444' - with_items: - - /proc/driver/nvidia/capabilities/mig/config - - /proc/driver/nvidia/capabilities/mig/monitor - tags: enable, never + - name: Assert MIG configuration was applied + command: nvidia-mig-parted assert -f {{ mig_manager_config }} -c {{ mig_manager_profile }} + when: has_mig.rc == 0 diff --git a/roles/nvidia-mig-manager/defaults/main.yml b/roles/nvidia-mig-manager/defaults/main.yml new file mode 100644 index 000000000..ed08230f4 --- /dev/null +++ b/roles/nvidia-mig-manager/defaults/main.yml @@ -0,0 +1,3 @@ +--- +mig_manager_url_deb: https://github.com/NVIDIA/mig-parted/releases/download/v0.2.0/nvidia-mig-manager_0.2.0-1_amd64.deb +mig_manager_url_rpm: https://github.com/NVIDIA/mig-parted/releases/download/v0.2.0/nvidia-mig-manager-0.2.0-1.x86_64.rpm diff --git a/roles/nvidia-mig-manager/tasks/main.yml b/roles/nvidia-mig-manager/tasks/main.yml new file mode 100644 index 
000000000..9344fa287 --- /dev/null +++ b/roles/nvidia-mig-manager/tasks/main.yml @@ -0,0 +1,29 @@ +--- +# Install the NVIDIA MIG Manager tooling on all MIG-capable nodes + +# Check node state +- name: check for MIG capable devices + shell: nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep -v 'N/A' + register: has_mig + failed_when: false +- name: check for NVIDIA MIG parted + shell: which nvidia-mig-parted + register: has_mig_parted + failed_when: false + +# Install NVIDIA MIG Manager Systemd service +- name: Install MIG Manager (apt) + apt: + deb: "{{ mig_manager_url_deb }}" + when: + - has_mig.rc == 0 + - has_mig_parted.rc != 0 + - ansible_os_family == "Debian" +- name: Install MIG Manager (yum) + yum: + name: "{{ mig_manager_url_rpm }}" + state: present + when: + - has_mig.rc == 0 + - has_mig_parted.rc != 0 + - ansible_os_family == "RedHat"