File tree Expand file tree Collapse file tree 13 files changed +171
-1
lines changed
environments/common/inventory Expand file tree Collapse file tree 13 files changed +171
-1
lines changed Original file line number Diff line number Diff line change @@ -90,3 +90,6 @@ roles/*
9090! roles /gateway /**
9191! roles /alertmanager /
9292! roles /alertmanager /**
93+ ! roles /slurm_recompile /**
95+
Original file line number Diff line number Diff line change 4848 name : cuda
4949 tasks_from : " {{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
5050
51+ - name : Setup vGPU
52+ hosts : vgpu
53+ become : yes
54+ gather_facts : yes
55+ tags : vgpu
56+ tasks :
57+ - include_role :
58+ name : stackhpc.linux.vgpu
59+ tasks_from : " {{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
60+ handlers :
61+ - name : reboot
62+ fail :
63+ msg : Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
64+
5165- name : Persist hostkeys across rebuilds
5266 # Must be after filesystems.yml (for storage)
5367 # and before portal.yml (where OOD login node hostkeys are scanned)
Original file line number Diff line number Diff line change 250250 name : cloudalchemy.grafana
251251 tasks_from : install.yml
252252
253+ - name : Add support for NVIDIA GPU auto detection to Slurm
254+ hosts : cuda
255+ become : yes
256+ tasks :
257+ - name : Recompile slurm
258+ import_role :
259+ name : slurm_recompile
260+ vars :
261+ slurm_recompile_nvml : " {{ groups.cuda | length > 0 }}"
262+
253263- name : Run post.yml hook
254264 vars :
255265 appliances_environment_root : " {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Original file line number Diff line number Diff line change @@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
7575| extras.yml | basic_users | All functionality [ 6] | No |
7676| extras.yml | eessi | All functionality [ 7] | No |
7777| extras.yml | cuda | None required - use image build | Yes [ 8] |
78+ | extras.yml | vgpu | All functionality | Yes |
7879| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
7980| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
8081| extras.yml | k9s (install) | Not relevant during boot | n/a |
Original file line number Diff line number Diff line change 1919 enable_basic_users : " {{ os_metadata.meta.basic_users | default(false) | bool }}"
2020 enable_eessi : " {{ os_metadata.meta.eessi | default(false) | bool }}"
2121 enable_chrony : " {{ os_metadata.meta.chrony | default(false) | bool }}"
22+ enable_vgpu : " {{ os_metadata.meta.vpgu | default(false) | bool }}"
23+
2224
2325 # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2426 resolv_conf_nameservers : []
295297 cmd : " cvmfs_config setup"
296298 when : enable_eessi
297299
300+ - name : Configure VGPUs
301+ include_role :
302+ name : stackhpc.linux.vgpu
303+ tasks_from : ' configure.yml'
304+ when : enable_vgpu
305+
298306 # NB: don't need conditional block on enable_compute as have already exited
299307 # if not the case
300308 - name : Write Munge key
Original file line number Diff line number Diff line change 1+ ---
2+
3+ - name : Set cuda_facts_version_short
4+ set_fact :
5+ cuda_facts_version_short : " {{ cuda_version_short }}"
Original file line number Diff line number Diff line change 1+ ---
2+ slurm_recompile_nvml : false
Original file line number Diff line number Diff line change 1+ ---
2+ - name : Get facts about CUDA installation
3+ import_role :
4+ name : cuda
5+ tasks_from : facts.yml
6+
7+ - name : Gather the package facts
8+ ansible.builtin.package_facts :
9+ manager : auto
10+
11+ - name : Set fact containing slurm package facts
12+ set_fact :
13+ slurm_package : " {{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"
14+
15+ - name : Recompile and install slurm packages
16+ shell : |
17+ #!/bin/bash
18+ source /etc/profile
19+ set -eux
20+ dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
21+ rpm -i slurm-ohpc-*.src.rpm
22+ cd /root/rpmbuild/SPECS
23+ dnf builddep -y slurm.spec
24+ rpmbuild -bb{% if slurm_recompile_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
25+ dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
26+ become : true
27+
28+ - name : Workaround missing symlink
29+ # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
30+ command : ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
31+ args :
32+ creates : /lib64/libnvidia-ml.so
33+ when : slurm_recompile_nvml | bool
34+
35+ - name : Cleanup Dependencies
36+ shell : |
37+ #!/bin/bash
38+ set -eux
39+ set -o pipefail
40+ dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
41+ become : true
Original file line number Diff line number Diff line change 8383 - import_role :
8484 name : lustre
8585 tasks_from : validate.yml
86+
87+ - name : Validate vGPU configuration
88+ hosts : vgpu
89+ become : yes
90+ gather_facts : yes
91+ tags : vgpu
92+ tasks :
93+ - include_role :
94+ name : stackhpc.linux.vgpu
95+ tasks_from : validate.yml
Original file line number Diff line number Diff line change 1+ # vGPU/MIG configuration
2+
3+ This page details how to configure Multi Instance GPU (MIG) in Slurm.
4+
5+ ## Pre-requisites
6+
7+ - Image built with cuda support. This should automatically recompile slurm against NVML.
8+
9+ ## Inventory
10+
Add relevant hosts to the `` vgpu `` group, for example in `` environments/$ENV/inventory/groups `` :
12+
13+ ```
14+ [vgpu:children]
15+ cuda
16+ ```
17+
18+ ## Configuration
19+
20+ Use variables from the [ stackhpc.linux.vgpu] ( https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu ) role.
21+
22+ For example in: ` environments/<environment>/inventory/group_vars/all/vgpu ` :
23+
24+ ```
25+ ---
26+ vgpu_definitions:
27+ - pci_address: "0000:17:00.0"
28+ mig_devices:
29+ "1g.10gb": 4
30+ "4g.40gb": 1
31+ - pci_address: "0000:81:00.0"
32+ mig_devices:
33+ "1g.10gb": 4
34+ "4g.40gb": 1
35+ ```
36+
37+ The appliance will use the driver installed via the `` cuda `` role. Use `` lspci `` to determine the PCI
38+ addresses.
39+
40+ ## compute_init
41+
42+ Use the `` vgpu `` metadata option to enable creation of mig devices on rebuild.
43+
44+ ## gres configuration
45+
46+ Enable gres autodetection. This can be set as a host or group var.
47+
48+ ```
49+ openhpc_gres_autodetect: nvml
50+ ```
51+
52+ You should stop terraform templating out partitions.yml and specify ` openhpc_slurm_partitions ` manually.
53+ An example of specifying gres resources is shown below
54+ (` environments/<environment>/inventory/group_vars/all/partitions-manual.yml ` ):
55+
56+ ```
57+ openhpc_slurm_partitions:
58+ - name: cpu
59+ - name: gpu
60+ gres:
61+ # Two cards not partitioned with MIG
62+ - conf: "gpu:nvidia_h100_80gb_hbm3:2"
63+ - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
64+ - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"
65+ ```
You can’t perform that action at this time.
0 commit comments