#!/bin/sh
#
test_description='Test flub bootstrap method'
. `dirname $0`/sharness.sh
test_under_flux 8 full
export FLUX_URI_RESOLVE_LOCAL=t
# usage: get_job_uri id
get_job_uri() {
	flux job wait-event -t10 $1 memo >/dev/null && flux uri $1
}
# usage: wait_for_service uri name
wait_for_service() {
	flux proxy $1 bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\"
}
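#
# Flub bootstrap lets a newly started broker join an already-running
# instance by contacting the URI set in broker.boot-server and filling one
# of its unfilled ranks.  The helpers above resolve a batch job's URI and
# wait for a service to be loaded in it before using it, e.g.
# (illustrative only, placeholder URI):
#   wait_for_service local:///tmp/flux-XXXXXX/local job-ingest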
test_expect_success 'broker fails with bad broker.boot-server' '
	test_must_fail flux broker \
		-Sbroker.rc1_path= -Sbroker.rc3_path= \
		-Sbroker.boot-server=local://noexist/path \
		/bin/true 2>server.err &&
	grep "was not found" server.err
'
test_expect_success 'start a 1 node job with 0 extra ranks' '
	id=$(flux batch -N1 --wrap sleep inf) &&
	get_job_uri $id >test1.uri
'
test_expect_success 'job has size 1' '
	size=$(flux proxy $(cat test1.uri) flux getattr size) &&
	test $size -eq 1
'
test_expect_success 'flub bootstrap fails with no available ranks' '
	test_must_fail flux broker \
		-Sbroker.boot-server=$(cat test1.uri) 2>noranks.err &&
	grep "no available ranks" noranks.err
'
test_expect_success 'clean up' '
	flux cancel --all
'
#
# Start 2 node batch job with one extra slot.
# Submit 1 node broker job that fills the slot.
# Run a parallel job across all three nodes in the batch job.
# This test is constrained so that all flubbed nodes are leaf nodes,
# and the flubbed nodes connect to rank 0 only.
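# Roughly (illustrative; the actual commands are in the tests below):
#   flux batch -N2 --broker-opts=-Ssize=3 ...   # ranks 0-1 online, rank 2 offline
#   flux submit -N1 flux broker \
#       -Sbroker.boot-server=$(cat test2.uri)   # flubbed broker fills rank 2
#   flux run -N3 flux pmi barrier               # spans all three ranks
#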
test_expect_success 'create config with 3 fake nodes' '
	cat >fake3.toml <<-EOT
	[resource]
	noverify = true
	[[resource.config]]
	hosts = "a,b,c"
	cores = "0-3"
EOT
'
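# Note: resource.noverify = true allows the fake hostnames above to be
# configured without being checked against the test node's real hardware.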
test_expect_success 'start a 2 node job with 1 extra rank' '
	id=$(flux batch -N2 \
		--broker-opts=--config-path=fake3.toml \
		--broker-opts=-Ssize=3 \
		--broker-opts=-Sbroker.quorum=2 \
		--broker-opts=-Stbon.topo=kary:0 \
		--wrap sleep inf) &&
	get_job_uri $id >test2.uri
'
test_expect_success 'job has size 3' '
	size=$(flux proxy $(cat test2.uri) flux getattr size) &&
	test $size -eq 3
'
test_expect_success 'overlay status shows extra node offline' '
	flux proxy $(cat test2.uri) \
		flux overlay status --no-pretty >ov2.out &&
	grep "2 extra0: offline" ov2.out
'
test_expect_success 'run a 2 node job in the initial instance' '
	wait_for_service $(cat test2.uri) job-ingest &&
	run_timeout 30 flux proxy $(cat test2.uri) \
		flux run --label-io -N2 flux pmi barrier
'
test_expect_success 'submit a job that starts 1 extra broker' '
	id=$(flux submit -N1 flux broker \
		--config-path=fake3.toml \
		-Stbon.topo=kary:0 \
		-Sbroker.boot-server=$(cat test2.uri)) &&
	flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy $(cat test2.uri) \
		flux overlay status --summary --wait full --timeout 30s
'
test_expect_success 'run a 3 node job in the expanded instance' '
	run_timeout 30 flux proxy $(cat test2.uri) \
		flux run --label-io -N3 flux pmi barrier
'
test_expect_success 'clean up' '
	flux cancel --all
'
test_expect_success 'create config with 7 fake nodes' '
	cat >fake7.toml <<-EOT
	[resource]
	noverify = true
	[[resource.config]]
	hosts = "a,b,c,d,e,f,g"
	cores = "0-3"
EOT
'
#
# Start 1 node batch job with 6 extra slots (kary:2).
# Submit 6 node broker job that fills all the slots.
# Run a 7 node parallel job.
#
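# With size=7 and tbon.topo=kary:2 the overlay is a complete binary tree
# (rank 0 -> 1,2; rank 1 -> 3,4; rank 2 -> 5,6), so unlike the kary:0 case
# above, some of the flubbed brokers are interior nodes with children of
# their own.
#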
test_expect_success 'start a 1 node job with 6 extra ranks' '
	id=$(flux batch -N1 \
		--broker-opts=--config-path=fake7.toml \
		--broker-opts=-Ssize=7 \
		--broker-opts=-Sbroker.quorum=1 \
		--broker-opts=-Stbon.topo=kary:2 \
		--wrap sleep inf) &&
	get_job_uri $id >test5.uri
'
test_expect_success 'run a 1 node job in the initial instance' '
	wait_for_service $(cat test5.uri) job-ingest &&
	run_timeout 30 flux proxy $(cat test5.uri) \
		flux run --label-io -N1 flux pmi barrier
'
test_expect_success 'job has size 7' '
	size=$(flux proxy $(cat test5.uri) flux getattr size) &&
	test $size -eq 7
'
# N.B. include exit-timeout=none so we can safely disconnect one node later
test_expect_success 'submit a job that starts 6 extra brokers' '
	id=$(flux submit -N6 -o exit-timeout=none \
		flux broker \
		--config-path=fake7.toml \
		-Stbon.topo=kary:2 \
		-Sbroker.boot-server=$(cat test5.uri)) &&
	flux job wait-event -p guest.exec.eventlog $id shell.start &&
	echo $id >xtra_id
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy $(cat test5.uri) \
		flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
	run_timeout 30 flux proxy $(cat test5.uri) \
		flux run --label-io -N7 flux pmi barrier
'
#
# Show that a node can be replaced
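# Sequence: force-disconnect rank 6, confirm it is unreachable and that the
# overlay degrades, then flub in a replacement broker and wait for the
# overlay to report full again.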
test_expect_success 'disconnect rank 6' '
	flux proxy $(cat test5.uri) \
		flux overlay disconnect 6
'
test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' '
	test_must_fail flux proxy $(cat test5.uri) \
		flux ping -c1 6
'
test_expect_success 'wait for overlay status to be degraded' '
	flux proxy $(cat test5.uri) \
		flux overlay status --summary --wait degraded --timeout 10s
'
test_expect_success 'submit a job that starts 1 broker' '
	id=$(flux submit -N1 flux broker \
		--config-path=fake7.toml \
		-Stbon.topo=kary:2 \
		-Sbroker.boot-server=$(cat test5.uri)) &&
	flux job wait-event -p guest.exec.eventlog $id shell.start
'
test_expect_success 'wait for overlay status to be full' '
	flux proxy $(cat test5.uri) \
		flux overlay status --summary --wait full --timeout 10s
'
test_expect_success 'run a 7 node job in the expanded instance' '
	run_timeout 30 flux proxy $(cat test5.uri) \
		flux run --label-io -N7 flux pmi barrier
'
test_expect_success 'clean up' '
	flux cancel --all
'
test_done