File tree Expand file tree Collapse file tree 13 files changed +171
-1
lines changed
environments/common/inventory Expand file tree Collapse file tree 13 files changed +171
-1
lines changed Original file line number Diff line number Diff line change @@ -90,3 +90,6 @@ roles/*
9090! roles /gateway /**
9191! roles /alertmanager /
9292! roles /alertmanager /**
93+ ! roles /slurm_recompile /**
95+
Original file line number Diff line number Diff line change 4848 name : cuda
4949 tasks_from : " {{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
5050
51+ - name : Setup vGPU
52+ hosts : vgpu
53+ become : yes
54+ gather_facts : yes
55+ tags : vgpu
56+ tasks :
57+ - include_role :
58+ name : stackhpc.linux.vgpu
59+ tasks_from : " {{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
60+ handlers :
61+ - name : reboot
62+ fail :
63+ msg : Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
64+
5165- name : Persist hostkeys across rebuilds
5266 # Must be after filesystems.yml (for storage)
5367 # and before portal.yml (where OOD login node hostkeys are scanned)
Original file line number Diff line number Diff line change 250250 name : cloudalchemy.grafana
251251 tasks_from : install.yml
252252
253+ - name : Add support for NVIDIA GPU auto detection to Slurm
254+ hosts : cuda
255+ become : yes
256+ tasks :
257+ - name : Recompile slurm
258+ import_role :
259+ name : slurm_recompile
260+ vars :
261+ slurm_recompile_nvml : " {{ groups.cuda | length > 0 }}"
262+
253263- name : Run post.yml hook
254264 vars :
255265 appliances_environment_root : " {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
Original file line number Diff line number Diff line change @@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
7575| extras.yml | basic_users | All functionality [ 6] | No |
7676| extras.yml | eessi | All functionality [ 7] | No |
7777| extras.yml | cuda | None required - use image build | Yes [ 8] |
78+ | extras.yml | vgpu | All functionality | Yes |
7879| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a |
7980| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a |
8081| extras.yml | k9s (install) | Not relevant during boot | n/a |
Original file line number Diff line number Diff line change 1919 enable_basic_users : " {{ os_metadata.meta.basic_users | default(false) | bool }}"
2020 enable_eessi : " {{ os_metadata.meta.eessi | default(false) | bool }}"
2121 enable_chrony : " {{ os_metadata.meta.chrony | default(false) | bool }}"
22+ enable_vgpu : " {{ os_metadata.meta.vpgu | default(false) | bool }}"
23+
2224
2325 # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2426 resolv_conf_nameservers : []
295297 cmd : " cvmfs_config setup"
296298 when : enable_eessi
297299
300+ - name : Configure VGPUs
301+ include_role :
302+ name : stackhpc.linux.vgpu
303+ tasks_from : ' configure.yml'
304+ when : enable_vgpu
305+
298306 # NB: don't need conditional block on enable_compute as have already exited
299307 # if not the case
300308 - name : Write Munge key
Original file line number Diff line number Diff line change 1+ ---
2+
3+ - name : Set cuda_facts_version_short
4+ set_fact :
5+ cuda_facts_version_short : " {{ cuda_version_short }}"
Original file line number Diff line number Diff line change 1+ ---
2+ slurm_recompile_nvml : false
Original file line number Diff line number Diff line change 1+ ---
2+ - name : Get facts about CUDA installation
3+ import_role :
4+ name : cuda
5+ tasks_from : facts.yml
6+
7+ - name : Gather the package facts
8+ ansible.builtin.package_facts :
9+ manager : auto
10+
11+ - name : Set fact containing slurm package facts
12+ set_fact :
13+ slurm_package : " {{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"
14+
15+ - name : Recompile and install slurm packages
16+ shell : |
17+ #!/bin/bash
18+ source /etc/profile
19+ set -eux
20+ dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
21+ rpm -i slurm-ohpc-*.src.rpm
22+ cd /root/rpmbuild/SPECS
23+ dnf builddep -y slurm.spec
24+ rpmbuild -bb{% if slurm_recompile_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
25+ dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
26+ become : true
27+
28+ - name : Workaround missing symlink
29+ # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
30+ command : ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
31+ args :
32+ creates : /lib64/libnvidia-ml.so
33+ when : slurm_recompile_nvml | bool
34+
35+ - name : Cleanup Dependencies
36+ shell : |
37+ #!/bin/bash
38+ set -eux
39+ set -o pipefail
40+ dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
41+ become : true
Original file line number Diff line number Diff line change 8383 - import_role :
8484 name : lustre
8585 tasks_from : validate.yml
86+
87+ - name : Validate vGPU configuration
88+ hosts : vgpu
89+ become : yes
90+ gather_facts : yes
91+ tags : vgpu
92+ tasks :
93+ - include_role :
94+ name : stackhpc.linux.vgpu
95+ tasks_from : validate.yml
Original file line number Diff line number Diff line change 1+ # vGPU/MIG configuration
2+
3+ This page details how to configure Multi Instance GPU (MIG) in Slurm.
4+
5+ ## Pre-requisites
6+
7+ - Image built with cuda support. This should automatically recompile slurm against NVML.
8+
9+ ## Inventory
10+
Add relevant hosts to the `` vgpu `` group, for example in `` environments/$ENV/inventory/groups `` :
12+
13+ ```
14+ [vgpu:children]
15+ cuda
16+ ```
17+
18+ ## Configuration
19+
20+ Use variables from the [ stackhpc.linux.vgpu] ( https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu ) role.
21+
22+ For example in: ` environments/<environment>/inventory/group_vars/all/vgpu ` :
23+
24+ ```
25+ ---
26+ vgpu_definitions:
27+ - pci_address: "0000:17:00.0"
28+ mig_devices:
29+ "1g.10gb": 4
30+ "4g.40gb": 1
31+ - pci_address: "0000:81:00.0"
32+ mig_devices:
33+ "1g.10gb": 4
34+ "4g.40gb": 1
35+ ```
36+
37+ The appliance will use the driver installed via the `` cuda `` role. Use `` lspci `` to determine the PCI
38+ addresses.
39+
40+ ## compute_init
41+
42+ Use the `` vgpu `` metadata option to enable creation of mig devices on rebuild.
43+
44+ ## gres configuration
45+
46+ Enable gres autodetection. This can be set as a host or group var.
47+
48+ ```
49+ openhpc_gres_autodetect: nvml
50+ ```
51+
52+ You should stop terraform templating out partitions.yml and specify ` openhpc_slurm_partitions ` manually.
53+ An example of specifying gres resources is shown below
54+ (` environments/<environment>/inventory/group_vars/all/partitions-manual.yml ` ):
55+
56+ ```
57+ openhpc_slurm_partitions:
58+ - name: cpu
59+ - name: gpu
60+ gres:
61+ # Two cards not partitioned with MIG
62+ - conf: "gpu:nvidia_h100_80gb_hbm3:2"
63+ - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
64+ - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"
65+ ```
You can’t perform that action at this time.
0 commit comments