Skip to content

Commit acb69ce

Browse files
committed
Merge branch 'upstream' from v1.148 (incl. image update)
2 parents 210a266 + 7ee7eee commit acb69ce

File tree

32 files changed

+582
-240
lines changed

32 files changed

+582
-240
lines changed

.github/workflows/fatimage.yml

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,26 @@ name: Build fat image
33
'on':
44
workflow_dispatch:
55
inputs:
6-
use_RL9:
6+
use_RL8:
77
required: true
8-
description: Include RL9 image build
8+
description: Include RL8 image build
99
type: boolean
1010
default: false
11+
concurrency:
12+
group: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
13+
cancel-in-progress: true
1114
jobs:
1215
openstack:
1316
name: openstack-imagebuild
1417
runs-on: ubuntu-20.04
15-
concurrency: ${{ github.ref }}-{{ matrix.os_version }} # to branch/PR + OS
1618
strategy:
1719
matrix:
1820
os_version: [RL8, RL9]
19-
rl9_selected:
20-
- ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch
21+
rl8_selected:
22+
- ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch
2123
exclude:
22-
- os_version: RL9
23-
rl9_selected: false
24+
- os_version: RL8
25+
rl8_selected: false
2426
env:
2527
ANSIBLE_FORCE_COLOR: True
2628
OS_CLOUD: openstack
@@ -61,18 +63,19 @@ jobs:
6163
. environments/.stackhpc/activate
6264
cd packer/
6365
packer init .
64-
PACKER_LOG=1 packer build -only openstack.openhpc -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
66+
PACKER_LOG=1 packer build -on-error=${{ vars.PACKER_ON_ERROR }} -var-file=$PKR_VAR_environment_root/${{ vars.CI_CLOUD }}.pkrvars.hcl openstack.pkr.hcl
6567
env:
6668
PKR_VAR_os_version: ${{ matrix.os_version }}
6769

68-
- name: Get created image name from manifest
70+
- name: Get created image names from manifest
6971
id: manifest
7072
run: |
7173
. venv/bin/activate
72-
IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
73-
while ! openstack image show -f value -c name $IMAGE_ID; do
74-
sleep 30
74+
for IMAGE_ID in $(jq --raw-output '.builds[].artifact_id' packer/packer-manifest.json)
75+
do
76+
while ! openstack image show -f value -c name $IMAGE_ID; do
77+
sleep 5
78+
done
79+
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
80+
echo $IMAGE_NAME
7581
done
76-
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
77-
echo "IMAGE_ID=${IMAGE_ID}" >> "$GITHUB_OUTPUT"
78-
echo "IMAGE_NAME=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"

.github/workflows/stackhpc.yml

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ name: Test deployment and reimage on OpenStack
33
on:
44
workflow_dispatch:
55
inputs:
6-
use_RL9:
6+
use_RL8:
77
required: true
8-
description: Include RL9 tests
8+
description: Include RL8 tests
99
type: boolean
1010
default: false
1111
# push:
@@ -20,14 +20,17 @@ jobs:
2020
strategy:
2121
matrix:
2222
os_version: [RL8, RL9]
23-
rl9_selected:
24-
- ${{ inputs.use_RL9 == true }} # only potentially true for workflow_dispatch
25-
rl9_branch:
26-
- ${{ startsWith(github.head_ref, 'rl9') == true }} # only potentially for pull_request, always false on merge
23+
rl8_selected:
24+
- ${{ inputs.use_RL8 == true }} # only potentially true for workflow_dispatch
25+
rl8_branch:
26+
- ${{ startsWith(github.head_ref, 'rl8') == true }} # only potentially for pull_request, always false on merge
27+
rl8_label:
28+
- ${{ contains(github.event.pull_request.labels.*.name, 'RL8') }} # NB: needs a new commit if added after PR created
2729
exclude:
28-
- os_version: RL9
29-
rl9_selected: false
30-
rl9_branch: false
30+
- os_version: RL8
31+
rl8_selected: false
32+
rl8_branch: false
33+
rl8_label: false
3134
env:
3235
ANSIBLE_FORCE_COLOR: True
3336
OS_CLOUD: openstack

ansible/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,5 @@ roles/*
5252
!roles/image_build/**
5353
!roles/persist_hostkeys/
5454
!roles/persist_hostkeys/**
55-
!roles/requirements.yml
55+
!roles/ofed/
56+
!roles/ofed/**

ansible/bootstrap.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,3 +196,11 @@
196196
- name: update facts
197197
setup:
198198
when: (sestatus.changed | default(false)) or (sestatus.reboot_required | default(false))
199+
200+
- hosts: ofed
201+
gather_facts: no
202+
become: yes
203+
tags: ofed
204+
tasks:
205+
- include_role:
206+
name: ofed

ansible/roles/cluster_infra/templates/resources.tf.j2

Lines changed: 94 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,13 @@ data "openstack_networking_network_v2" "cluster_external_network" {
116116
name = "{{ cluster_external_network }}"
117117
}
118118

119+
# Storage network
120+
{% if cluster_storage_network is defined %}
121+
data "openstack_networking_network_v2" "cluster_storage" {
122+
name = "{{ cluster_storage_network }}"
123+
}
124+
{% endif %}
125+
119126
data "openstack_networking_subnet_ids_v2" "cluster_external_subnets" {
120127
network_id = "${data.openstack_networking_network_v2.cluster_external_network.id}"
121128
}
@@ -177,6 +184,11 @@ data "openstack_networking_subnet_v2" "cluster_subnet" {
177184
##### Cluster ports
178185
#####
179186

187+
###
188+
# Login node
189+
###
190+
191+
# Primary network
180192
resource "openstack_networking_port_v2" "login" {
181193
name = "{{ cluster_name }}-login-0"
182194
network_id = "${data.openstack_networking_network_v2.cluster_network.id}"
@@ -193,14 +205,31 @@ resource "openstack_networking_port_v2" "login" {
193205

194206
binding {
195207
vnic_type = "{{ cluster_vnic_type | default('normal') }}"
196-
{% if cluster_vnic_profile is defined %}
197-
profile = <<EOF
198-
{{ cluster_vnic_profile | to_json }}
199-
EOF
200-
{% endif %}
201208
}
202209
}
203210

211+
# Storage network
212+
{% if cluster_storage_network is defined %}
213+
resource "openstack_networking_port_v2" "login_storage" {
214+
name = "{{ cluster_name }}-login-storage-0"
215+
network_id = data.openstack_networking_network_v2.cluster_storage.id
216+
admin_state_up = "true"
217+
218+
security_group_ids = [
219+
"${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}",
220+
]
221+
222+
binding {
223+
vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}"
224+
}
225+
}
226+
{% endif %}
227+
228+
###
229+
# Control node
230+
###
231+
232+
# Primary network
204233
resource "openstack_networking_port_v2" "control" {
205234
name = "{{ cluster_name }}-control-0"
206235
network_id = "${data.openstack_networking_network_v2.cluster_network.id}"
@@ -216,15 +245,32 @@ resource "openstack_networking_port_v2" "control" {
216245

217246
binding {
218247
vnic_type = "{{ cluster_vnic_type | default('normal') }}"
219-
{% if cluster_vnic_profile is defined %}
220-
profile = <<EOF
221-
{{ cluster_vnic_profile | to_json }}
222-
EOF
223-
{% endif %}
248+
249+
}
250+
}
251+
252+
# Storage network
253+
{% if cluster_storage_network is defined %}
254+
resource "openstack_networking_port_v2" "control_storage" {
255+
name = "{{ cluster_name }}-control-storage-0"
256+
network_id = data.openstack_networking_network_v2.cluster_storage.id
257+
admin_state_up = "true"
258+
259+
security_group_ids = [
260+
"${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}"
261+
]
262+
263+
binding {
264+
vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}"
224265
}
225266
}
267+
{% endif %}
226268

269+
###
270+
# Workers
271+
###
227272
{% for partition in openhpc_slurm_partitions %}
273+
# Primary network
228274
resource "openstack_networking_port_v2" "{{ partition.name }}" {
229275
count = {{ partition.count }}
230276
name = "{{ cluster_name }}-compute-{{ partition.name }}-${count.index}"
@@ -241,14 +287,27 @@ resource "openstack_networking_port_v2" "{{ partition.name }}" {
241287

242288
binding {
243289
vnic_type = "{{ cluster_vnic_type | default('normal') }}"
244-
{% if cluster_vnic_profile is defined %}
245-
profile = <<EOF
246-
{{ cluster_vnic_profile | to_json }}
247-
EOF
248-
{% endif %}
249290
}
250291
}
251292

293+
# Storage network
294+
{% if cluster_storage_network is defined %}
295+
resource "openstack_networking_port_v2" "{{ partition.name }}_storage" {
296+
count = {{ partition.count }}
297+
name = "{{ cluster_name }}-compute-{{ partition.name }}-storage-${count.index}"
298+
network_id = data.openstack_networking_network_v2.cluster_storage.id
299+
admin_state_up = "true"
300+
301+
security_group_ids = [
302+
"${openstack_networking_secgroup_v2.secgroup_slurm_cluster.id}"
303+
]
304+
305+
binding {
306+
vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}"
307+
}
308+
}
309+
{% endif %}
310+
252311
{% endfor %}
253312

254313
#####
@@ -274,9 +333,15 @@ resource "openstack_compute_instance_v2" "login" {
274333
{% endif %}
275334

276335
network {
277-
port = "${openstack_networking_port_v2.login.id}"
336+
port = openstack_networking_port_v2.login.id
278337
}
279338

339+
{% if cluster_storage_network is defined %}
340+
network {
341+
port = openstack_networking_port_v2.login_storage.id
342+
}
343+
{% endif %}
344+
280345
# root device:
281346
block_device {
282347
uuid = "{{ cluster_image }}"
@@ -317,9 +382,15 @@ resource "openstack_compute_instance_v2" "control" {
317382
{% endif %}
318383

319384
network {
320-
port = "${openstack_networking_port_v2.control.id}"
385+
port = openstack_networking_port_v2.control.id
321386
}
322387

388+
{% if cluster_storage_network is defined %}
389+
network {
390+
port = openstack_networking_port_v2.control_storage.id
391+
}
392+
{% endif %}
393+
323394
# root device:
324395
block_device {
325396
uuid = "{{ cluster_image }}"
@@ -393,6 +464,12 @@ resource "openstack_compute_instance_v2" "{{ partition.name }}" {
393464
port = openstack_networking_port_v2.{{ partition.name }}[count.index].id
394465
}
395466

467+
{% if cluster_storage_network is defined %}
468+
network {
469+
port = openstack_networking_port_v2.{{ partition.name }}_storage[count.index].id
470+
}
471+
{% endif %}
472+
396473
# root device:
397474
block_device {
398475
uuid = "{{ cluster_image }}"

ansible/roles/freeipa/tasks/addhost.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
community.general.ipa_host:
55
name: "{{ node_fqdn }}"
66
ip_address: "{{ freeipa_client_ip }}"
7-
ipa_host: "{{ groups['freeipa_server'].0 }}"
87
ipa_pass: "{{ vault_freeipa_admin_password }}"
98
ipa_user: admin
109
state: present
@@ -19,12 +18,12 @@
1918
community.general.ipa_host:
2019
name: "{{ node_fqdn }}"
2120
ip_address: "{{ freeipa_client_ip }}"
22-
ipa_host: "{{ groups['freeipa_server'].0 }}"
2321
ipa_pass: "{{ vault_freeipa_admin_password }}"
2422
ipa_user: admin
2523
random_password: true
2624
state: present
2725
validate_certs: false
26+
ipa_timeout: 30
2827
delegate_to: "{{ groups['freeipa_server'].0 }}"
2928
when: "'sshpubkeyfp' not in _ipa_host_check.host"
3029
register: _ipa_host_add

ansible/roles/ofed/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# ofed
2+
3+
This role installs Mellanox OFED:
4+
- It checks that the running kernel is the latest installed one, and errors if not.
5+
- Installation uses the `mlnxofedinstall` command, with support for the running kernel
6+
and (by default) without firmware updates.
7+
8+
As OFED installation takes a long time generally this should only be used during image build,
9+
for example by setting:
10+
11+
```
12+
environments/groups/<environment>/groups:
13+
[ofed:children]
14+
builder
15+
```
16+
17+
# Role variables
18+
19+
See `defaults/main.yml`
20+
21+
Note that Ansible facts are required unless `ofed_distro_version` and `ofed_arch` are set explicitly.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
ofed_version: '24.04-0.6.6.0' # LTS version 23.10-2.1.3.1 does not support RL9.4
2+
ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
3+
ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
4+
ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
5+
ofed_arch: "{{ ansible_architecture }}"
6+
ofed_tmp_dir: /tmp
7+
ofed_update_firmware: false
8+
ofed_build_packages: # may require additional packages depending on ofed_package_selection
9+
- autoconf
10+
- automake
11+
- gcc
12+
- gcc-gfortran
13+
- kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}
14+
- kernel-rpm-macros
15+
- libtool
16+
- lsof
17+
- patch
18+
- pciutils
19+
- perl
20+
- rpm-build
21+
- tcl
22+
- tk
23+
ofed_build_rl8_packages:
24+
- gdb-headless
25+
- python36
26+
ofed_package_selection: # list of package selection flags for mlnxofedinstall script
27+
- hpc
28+
- with-nfsrdma

0 commit comments

Comments
 (0)