From 504109236dd1295eb40ca723e4024ebe04553125 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 15:08:24 +0100
Subject: [PATCH 01/13] template mpi.conf

---
 tasks/runtime.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index 5edd5fc0..0147cb24 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -103,6 +103,20 @@
   register: ohpc_cgroup_conf
   # NB uses restart rather than reload as this is needed in some cases
 
+- name: Template mpi.conf
+  template:
+    src: "{{ openhpc_mpi_template }}"
+    dest: "{{ openhpc_slurm_conf_path | dirname }}/mpi.conf"
+    owner: root
+    group: root
+    mode: "0644"
+  when:
+    - openhpc_enable.control | default(false)
+    - openhpc_mpi_conf | length > 0
+  notify:
+    - Restart slurmctld service
+  register: ohpc_mpi_conf
+
 # Workaround for https://bugs.rockylinux.org/view.php?id=10165
 - name: Fix permissions on /etc for Munge service
   ansible.builtin.file:

From 4996f54857b8651b1ec301c4a6f652fcf0a8d7b1 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 15:13:29 +0100
Subject: [PATCH 02/13] Update main.yml

---
 defaults/main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/defaults/main.yml b/defaults/main.yml
index 8f597ec3..5bbb1ab6 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -55,6 +55,7 @@ openhpc_state_save_location: /var/spool/slurm
 openhpc_slurmd_spool_dir: /var/spool/slurm
 openhpc_slurm_conf_path: /etc/slurm/slurm.conf
 openhpc_slurm_conf_template: slurm.conf.j2
+openhpc_mpi_conf_template: mpi.conf.j2
 
 # Accounting
 openhpc_slurm_accounting_storage_host: "{{ openhpc_slurmdbd_host }}"

From 154683810b88c376d376b17b5fc90179ae18d74c Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 15:14:59 +0100
Subject: [PATCH 03/13] Update main.yml

---
 defaults/main.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/defaults/main.yml b/defaults/main.yml
index 5bbb1ab6..4a88d3df 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -50,12 +50,12 @@ openhpc_config: {}
 openhpc_cgroup_config: {}
 openhpc_gres_template: gres.conf.j2
 openhpc_cgroup_template: cgroup.conf.j2
+openhpc_mpi_template: mpi.conf.j2
 
 openhpc_state_save_location: /var/spool/slurm
 openhpc_slurmd_spool_dir: /var/spool/slurm
 openhpc_slurm_conf_path: /etc/slurm/slurm.conf
 openhpc_slurm_conf_template: slurm.conf.j2
-openhpc_mpi_conf_template: mpi.conf.j2
 
 # Accounting
 openhpc_slurm_accounting_storage_host: "{{ openhpc_slurmdbd_host }}"

From 7572bb2fbad173f58992ea618c4bcfa6d4a1233a Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 15:16:17 +0100
Subject: [PATCH 04/13] Create mpi.conf.j2

---
 templates/mpi.conf.j2 | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 templates/mpi.conf.j2

diff --git a/templates/mpi.conf.j2 b/templates/mpi.conf.j2
new file mode 100644
index 00000000..d212c759
--- /dev/null
+++ b/templates/mpi.conf.j2
@@ -0,0 +1,4 @@
+# {{ ansible_managed }}
+{% for line in openhpc_mpi_conf %}
+{{ line }}
+{% endfor %}

From b2367a7864be5837fa2791a6694f3506b7fab90a Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 15:19:17 +0100
Subject: [PATCH 05/13] empty default mpi conf

---
 defaults/main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/defaults/main.yml b/defaults/main.yml
index 4a88d3df..5ba72e16 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -51,6 +51,7 @@ openhpc_cgroup_config: {}
 openhpc_gres_template: gres.conf.j2
 openhpc_cgroup_template: cgroup.conf.j2
 openhpc_mpi_template: mpi.conf.j2
+openhpc_mpi_conf: []
 
 openhpc_state_save_location: /var/spool/slurm
 openhpc_slurmd_spool_dir: /var/spool/slurm

From 72cc226d209e8879b90af2d1f04b8789941e668f Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 16:53:30 +0100
Subject: [PATCH 06/13] notify slurmd restart handler for mpi.conf changes

---
 tasks/runtime.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index 0147cb24..b4607482 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -148,6 +148,7 @@
   when:
     - openhpc_slurm_control_host in ansible_play_hosts
     - hostvars[openhpc_slurm_control_host].ohpc_slurm_conf.changed or
+      hostvars[openhpc_slurm_control_host].ohpc_mpi_conf.changed or
       hostvars[openhpc_slurm_control_host].ohpc_cgroup_conf.changed or
       hostvars[openhpc_slurm_control_host].ohpc_gres_conf.changed # noqa no-handler
   notify:

From 355dd0a9d1ccd6fb7930c6dc149d6c5fcc48a20e Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 17:22:07 +0100
Subject: [PATCH 07/13] Update runtime.yml

---
 tasks/runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index b4607482..3c9d4005 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -112,7 +112,7 @@
     mode: "0644"
   when:
     - openhpc_enable.control | default(false)
-    - openhpc_mpi_conf | length > 0
+    - (openhpc_mpi_default_config | combine(openhpc_mpi_config)) | length > 0
   notify:
     - Restart slurmctld service
   register: ohpc_mpi_conf

From 278f55a550bfd24aa82b502c7e5cff8b7cd90587 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 17:23:54 +0100
Subject: [PATCH 08/13] Update main.yml

---
 defaults/main.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/defaults/main.yml b/defaults/main.yml
index 5ba72e16..96be7008 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -50,8 +50,10 @@ openhpc_config: {}
 openhpc_cgroup_config: {}
 openhpc_gres_template: gres.conf.j2
 openhpc_cgroup_template: cgroup.conf.j2
+
 openhpc_mpi_template: mpi.conf.j2
-openhpc_mpi_conf: []
+openhpc_mpi_default_config: {}
+openhpc_mpi_config: {}
 
 openhpc_state_save_location: /var/spool/slurm
 openhpc_slurmd_spool_dir: /var/spool/slurm

From 5c9fa0111b91774afbce4625941f003ce0858437 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 17:24:32 +0100
Subject: [PATCH 09/13] Update mpi.conf.j2

---
 templates/mpi.conf.j2 | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/templates/mpi.conf.j2 b/templates/mpi.conf.j2
index d212c759..4e9b329b 100644
--- a/templates/mpi.conf.j2
+++ b/templates/mpi.conf.j2
@@ -1,4 +1,6 @@
 # {{ ansible_managed }}
-{% for line in openhpc_mpi_conf %}
-{{ line }}
+{% for k, v in openhpc_mpi_default_config | combine(openhpc_mpi_config) | items %}
+{% if v != "omit" %}{# allow removing items using setting key: omit #}
+{{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }}
+{% endif %}
 {% endfor %}

From a53771101649be74eb7913b467e97fa79315964d Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Wed, 17 Sep 2025 17:30:11 +0100
Subject: [PATCH 10/13] Update README.md

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 18c5853f..f421e5e7 100644
--- a/README.md
+++ b/README.md
@@ -126,6 +126,13 @@ used to supplement or override the template defaults. Templated parameters can
 also be removed by setting the value to the literal string `'omit'` - note
 that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit).
 
+`openhpc_mpi_config`: Optional. Mapping of additional parameters and values for
+[mpi.conf](https://slurm.schedmd.com/mpi.conf.html). Keys are mpi.conf
+parameter names and values are lists or strings as appropriate. This can be
+used to supplement or override the template defaults. Templated parameters can
+also be removed by setting the value to the literal string `'omit'` - note
+that this is *not the same* as the Ansible `omit` [special variable](https://docs.ansible.com/ansible/latest/reference_appendices/special_variables.html#term-omit).
+
 `openhpc_ram_multiplier`: Optional, default `0.95`. Multiplier used in the calculation: `total_memory * openhpc_ram_multiplier` when setting `RealMemory` for the partition in slurm.conf. Can be overriden on a per partition basis using `openhpc_slurm_partitions.ram_multiplier`. Has no effect if `openhpc_slurm_partitions.ram_mb` is set.
 
 `openhpc_state_save_location`: Optional. Absolute path for Slurm controller state (`slurm.conf` parameter [StateSaveLocation](https://slurm.schedmd.com/slurm.conf.html#OPT_StateSaveLocation))

From cfe864f7c09e7bf08398909d7ad0e96c32297144 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Fri, 3 Oct 2025 15:56:33 +0100
Subject: [PATCH 11/13] Update main.yml

---
 defaults/main.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/defaults/main.yml b/defaults/main.yml
index 96be7008..29e33adf 100644
--- a/defaults/main.yml
+++ b/defaults/main.yml
@@ -52,7 +52,6 @@ openhpc_gres_template: gres.conf.j2
 openhpc_cgroup_template: cgroup.conf.j2
 
 openhpc_mpi_template: mpi.conf.j2
-openhpc_mpi_default_config: {}
 openhpc_mpi_config: {}
 
 openhpc_state_save_location: /var/spool/slurm

From f9ebaf59714445f9e269182f563af6a24c858863 Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Fri, 3 Oct 2025 15:58:31 +0100
Subject: [PATCH 12/13] Update mpi.conf.j2

---
Review note: the loop iterates `openhpc_mpi_config.items()`. The call
parentheses are required because Jinja2 does not call methods implicitly;
without them the template tries to iterate the bound method and fails to
render. The post-image blob id on the index line below predates this
amendment.

 templates/mpi.conf.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/mpi.conf.j2 b/templates/mpi.conf.j2
index 4e9b329b..986b62cf 100644
--- a/templates/mpi.conf.j2
+++ b/templates/mpi.conf.j2
@@ -1,5 +1,5 @@
 # {{ ansible_managed }}
-{% for k, v in openhpc_mpi_default_config | combine(openhpc_mpi_config) | items %}
+{% for k, v in openhpc_mpi_config.items() %}
 {% if v != "omit" %}{# allow removing items using setting key: omit #}
 {{ k }}={{ v | join(',') if (v is sequence and v is not string) else v }}
 {% endif %}

From 3b502087214dd33b5de6a88b652fa36afc87f58a Mon Sep 17 00:00:00 2001
From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com>
Date: Fri, 3 Oct 2025 16:04:41 +0100
Subject: [PATCH 13/13] Update runtime.yml

---
 tasks/runtime.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tasks/runtime.yml b/tasks/runtime.yml
index 3c9d4005..c2e30d45 100644
--- a/tasks/runtime.yml
+++ b/tasks/runtime.yml
@@ -112,7 +112,7 @@
     mode: "0644"
   when:
     - openhpc_enable.control | default(false)
-    - (openhpc_mpi_default_config | combine(openhpc_mpi_config)) | length > 0
+    - openhpc_mpi_config | length > 0
   notify:
     - Restart slurmctld service
   register: ohpc_mpi_conf