File tree Expand file tree Collapse file tree 14 files changed +360
-1
lines changed 
environments/common/inventory Expand file tree Collapse file tree 14 files changed +360
-1
lines changed Original file line number Diff line number Diff line change @@ -90,3 +90,5 @@ roles/*
9090! roles /gateway /** 
9191! roles /alertmanager /
9292! roles /alertmanager /** 
93+ ! roles /slurm_recompile /
94+ ! roles /slurm_recompile /** 
Original file line number Diff line number Diff line change 4848        name : cuda 
4949        tasks_from : " {{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" 
5050
51+ - name : Setup vGPU 
52+   hosts : vgpu 
53+   become : yes 
54+   gather_facts : yes 
55+   tags : vgpu 
56+   tasks :
57+     - include_role :
58+         name : stackhpc.linux.vgpu 
59+         tasks_from : " {{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}" 
60+   handlers :
61+     - name : reboot 
62+       fail :
63+         msg : Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable. 
64+ 
5165- name : Persist hostkeys across rebuilds 
5266  #  Must be after filesystems.yml (for storage)
5367  #  and before portal.yml (where OOD login node hostkeys are scanned)
Original file line number Diff line number Diff line change 250250        name : cloudalchemy.grafana 
251251        tasks_from : install.yml 
252252
253+ - name : Add support for NVIDIA GPU auto detection to Slurm 
254+   hosts : cuda 
255+   become : yes 
256+   tasks :
257+     - name : Recompile slurm 
258+       import_role :
259+         name : slurm_recompile 
260+       vars :
261+         slurm_recompile_with_nvml : " {{ groups.cuda | length > 0 }}" 
262+ 
253263- name : Run post.yml hook 
254264  vars :
255265    appliances_environment_root : " {{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" 
Original file line number Diff line number Diff line change @@ -75,6 +75,7 @@ it also requires an image build with the role name added to the
7575|  extras.yml               |  basic_users             |  All functionality [ 6]            |  No                  | 
7676|  extras.yml               |  eessi                   |  All functionality [ 7]            |  No                  | 
7777|  extras.yml               |  cuda                    |  None required - use image build |  Yes [ 8]              | 
78+ |  extras.yml               |  vgpu                    |  All functionality               |  Yes                 | 
7879|  extras.yml               |  persist_hostkeys        |  Not relevant for compute nodes  |  n/a                 | 
7980|  extras.yml               |  compute_init (export)   |  Not relevant for compute nodes  |  n/a                 | 
8081|  extras.yml               |  k9s (install)           |  Not relevant during boot        |  n/a                 | 
Original file line number Diff line number Diff line change 1919    enable_basic_users : " {{ os_metadata.meta.basic_users | default(false) | bool }}" 
2020    enable_eessi : " {{ os_metadata.meta.eessi | default(false) | bool }}" 
2121    enable_chrony : " {{ os_metadata.meta.chrony | default(false) | bool }}" 
22+     enable_vgpu : " {{ os_metadata.meta.vgpu | default(false) | bool }}" 
2223
2324    #  TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects
2425    resolv_conf_nameservers : [] 
295296            cmd : " cvmfs_config setup" 
296297      when : enable_eessi 
297298
299+     - name : Configure VGPUs 
300+       include_role :
301+         name : stackhpc.linux.vgpu 
302+         tasks_from : ' configure.yml' 
303+       when : enable_vgpu 
304+ 
298305    #  NB: don't need conditional block on enable_compute as have already exited
299306    #  if not the case
300307    - name : Write Munge key 
Original file line number Diff line number Diff line change 1+ ---
2+ - name : Set cuda_facts_version_short 
3+   set_fact :
4+     cuda_facts_version_short : " {{ cuda_version_short }}" 
Original file line number Diff line number Diff line change 1+ # slurm_recompile  
2+ ================= 
3+ 
4+ Recompiles slurm from source RPMs and installs the packages that were built.
5+ 
6+ Requirements
7+ ------------ 
8+ 
9+ Role Variables
10+ -------------- 
11+ 
12+ See ` defaults/main.yml ` .
13+ 
14+ Dependencies
15+ ------------ 
16+ 
17+ Example Playbook
18+ ---------------- 
19+ 
20+     - hosts: compute 
21+       tasks: 
22+         - import_role: 
23+             name: slurm_recompile 
24+ 
25+ License
26+ ------- 
27+ 
28+ Apache-2.0
Original file line number Diff line number Diff line change 1+ ---
2+ #  Whether to link slurm against the NVIDIA management library
3+ slurm_recompile_with_nvml : false 
4+ 
Original file line number Diff line number Diff line change 1+ ---
2+ - name : Get facts about CUDA installation 
3+   import_role :
4+     name : cuda 
5+     tasks_from : facts.yml 
6+ 
7+ - name : Gather the package facts 
8+   ansible.builtin.package_facts :
9+     manager : auto 
10+ 
11+ - name : Set fact containing slurm package facts 
12+   set_fact :
13+     slurm_package : " {{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}" 
14+ 
15+ - name : Recompile and install slurm packages 
16+   shell : | 
17+     #!/bin/bash 
18+     source /etc/profile 
19+     set -eux 
20+     dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }} 
21+     rpm -i slurm-ohpc-*.src.rpm 
22+     cd /root/rpmbuild/SPECS 
23+     dnf builddep -y slurm.spec 
24+     rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec 
25+     dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm 
26+    become : true 
27+ 
28+ - name : Workaround missing symlink 
29+   #  Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
30+   command : ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so 
31+   args :
32+     creates : /lib64/libnvidia-ml.so 
33+   when : slurm_recompile_with_nvml | bool 
34+ 
35+ - name : Cleanup Dependencies 
      #  Undo the dnf transaction recorded for `dnf builddep -y slurm.spec`.
      #  `xargs -r` skips the undo when the grep chain yields no transaction ID;
      #  without it xargs would still invoke `dnf history -y undo` with no ID.
36+   shell : | 
37+     #!/bin/bash 
38+     set -eux 
39+     set -o pipefail 
40+     dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs -r dnf history -y undo 
41+   become : true 
Original file line number Diff line number Diff line change 8888    - import_role :
8989        name : lustre 
9090        tasks_from : validate.yml 
91+ 
92+ - name : Validate vGPU configuration 
93+   hosts : vgpu 
94+   become : yes 
95+   gather_facts : yes 
96+   tags : vgpu 
97+   tasks :
98+     - include_role :
99+         name : stackhpc.linux.vgpu 
100+         tasks_from : validate.yml 
    
 
   
 
     
   
   
          
     
  
    
     
 
    
      
     
 
     
    You can’t perform that action at this time.
  
 
    
  
     
    
      
        
     
 
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments