forked from awslabs/libfabric-ci-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
multinode_runfabtests.sh
executable file
·164 lines (143 loc) · 5.35 KB
/
multinode_runfabtests.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/bin/bash
# This script relies on bash-only syntax ([[ ]], +=, =~), so the interpreter is
# named explicitly instead of depending on whatever shell launches the file.
#
# Pull in the user's login environment before anything else runs.
# NOTE(review): LD_LIBRARY_PATH, which is forwarded to runfabtests.sh further
# down, is presumably exported from this profile - confirm.
. ~/.bash_profile
#######################################
# Run one server/client test pair over ssh and compare the outcome with the
# expected result.
# Globals:
#   PROGRAM_TO_RUN (read, optional) - name printed in the Passed/Failed line.
#     When unset or empty, the name is derived from the server command
#     (first word that is not a VAR=value assignment, directory stripped).
# Arguments:
#   $1 - server host; also appended as the last argument of the client command
#   $2 - client host
#   $3 - command line to run on the server
#   $4 - command line to run on the client
#   $5 - "FAIL" when at least one side is expected to exit non-zero; any other
#        value means both sides must exit 0
# Outputs:
#   Writes server.out and client.out in the current directory, then prints the
#   verdict followed by both captured outputs to stdout.
# Returns:
#   0 if the outcome matched the expectation, 1 otherwise.
#######################################
run_test_with_expected_ret()
{
    local server_ip=$1
    local client_ip=$2
    local server_cmd=$3
    local client_cmd=$4
    local expect_result=$5
    local -a ssh_opts=(-o ConnectTimeout=30 -o StrictHostKeyChecking=no)
    local -a cmd_words=()
    local server_pid client_pid word
    local server_ret=0 client_ret=0 ret=0
    local test_name=${PROGRAM_TO_RUN:-}

    # Fall back to the program name found in the server command so the verdict
    # line never reads "Test  Passed!" with an empty name.
    if [[ -z "${test_name}" ]]; then
        read -r -a cmd_words <<< "${server_cmd}"
        for word in "${cmd_words[@]}"; do
            # Skip leading environment assignments such as FOO=1.
            if [[ "${word}" =~ ^[A-Za-z_][A-Za-z0-9_]*= ]]; then
                continue
            fi
            test_name=${word##*/}
            break
        done
    fi

    # ssh joins all trailing arguments with spaces before handing them to the
    # remote shell, so passing each command as a single quoted word is
    # equivalent to letting the local shell split it first.
    ssh "${ssh_opts[@]}" "${server_ip}" "${server_cmd}" > server.out 2>&1 &
    server_pid=$!

    # Start the client one second after the server.
    sleep 1

    ssh "${ssh_opts[@]}" "${client_ip}" "${client_cmd}" "${server_ip}" > client.out 2>&1 &
    client_pid=$!

    # "|| var=$?" records the status without tripping errexit in the caller.
    wait "${client_pid}" || client_ret=$?
    if (( client_ret != 0 )); then
        # Presumably so that a server still waiting for the failed client cannot
        # block the wait below. The server may already be gone; that is fine.
        kill -9 "${server_pid}" || true
    fi
    wait "${server_pid}" || server_ret=$?

    if [[ "${expect_result}" == "FAIL" ]]; then
        # A failure on either side satisfies an expected failure.
        if (( server_ret != 0 || client_ret != 0 )); then
            echo "Test ${test_name} Passed!"
        else
            echo "Test ${test_name} Failed!"
            ret=1
        fi
    else
        if (( server_ret == 0 && client_ret == 0 )); then
            echo "Test ${test_name} Passed!"
        else
            echo "Test ${test_name} Failed!"
            ret=1
        fi
    fi

    echo "server output:"
    cat server.out
    echo "client output:"
    cat client.out

    return "${ret}"
}
# Trace every command and abort on the first unhandled failure.
set -xe

# Positional arguments:
#   $1 - libfabric provider to test
#   $2 - server host
#   $3 - client host
#   $4 - client GID (only read on the legacy efa path below)
#   $5 - compared with 1 further down to enable the device-memory test
PROVIDER=$1
SERVER_IP=$2
CLIENT_IP=$3
BUILD_GDR=$5

runfabtests_script="${HOME}/libfabric/fabtests/install/bin/runfabtests.sh"

# Each individual test has a "-b" option and "-E" option. Both will
# use out-of-band address exchange.
# The difference is "-b" will use out-of-band synchronization, -E
# does not.
#
# runfabtests.sh's "-b" option actually uses the -E option of each individual
# test (for historical reasons).
#
# "grep -e" keeps the leading dash from being parsed as a grep option without
# needing a backslash escape (newer GNU grep warns about a stray "\-").
b_option_available="$("${runfabtests_script}" -h 2>&1 | grep -e '-b' || true)"
# Check if '-P' option (Run provider specific fabtests) is available
P_option_available="$("${runfabtests_script}" -h 2>&1 | grep -e '-P' || true)"

# Runs all the tests in the fabtests suite while only expanding failed cases.
# The options are collected in an array so that every value reaches
# runfabtests.sh as exactly one argument, without a second round of shell
# parsing.
FABTESTS_OPTS=(-E "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" -vvv)

# Pass the provider's exclude list only when one is installed.
exclude_file="${HOME}/libfabric/fabtests/install/share/fabtests/test_configs/${PROVIDER}/${PROVIDER}.exclude"
if [[ -f "${exclude_file}" ]]; then
    FABTESTS_OPTS+=(-R -f "${exclude_file}")
fi

FABTESTS_OPTS+=(-p "${HOME}/libfabric/fabtests/install/bin/")

if [[ "${PROVIDER}" == "efa" ]]; then
    if [[ -n "${P_option_available}" ]]; then
        FABTESTS_OPTS+=(-P)
    fi
    if [[ -n "${b_option_available}" ]]; then
        FABTESTS_OPTS+=(-b -t all)
    else
        # Legacy path for a runfabtests.sh without "-b": pass the GIDs explicitly.
        gid_c=$4
        # Only the first GID line is used; ibv_devinfo may print several, and a
        # multi-line value cannot be passed as one "-s" argument.
        # NOTE(review): assumes the first listed device is the one under test.
        gid_s=$(ibv_devinfo -v | awk '/GID/ { print $3; exit }')
        FABTESTS_OPTS+=(-C "-P 0" -s "${gid_s}" -c "${gid_c}" -t all)
    fi
fi

# ${VAR:+"..."} drops an empty positional instead of passing "", which matches
# what the previous unquoted expansion did.
"${runfabtests_script}" "${FABTESTS_OPTS[@]}" \
    ${PROVIDER:+"${PROVIDER}"} ${SERVER_IP:+"${SERVER_IP}"} ${CLIENT_IP:+"${CLIENT_IP}"}
# Extra efa-only runs that are driven through run_test_with_expected_ret.
if [[ "${PROVIDER}" == "efa" ]]; then
    fabtests_bin="${HOME}/libfabric/fabtests/install/bin"

    # The runs below report their result through $?, so errexit is switched off
    # for this section; remember whether it was on so it can be put back.
    case $- in
        *e*)
            errexit_was_on=1
            set +e
            ;;
        *)
            errexit_was_on=0
            ;;
    esac

    exit_code=0

    # dgram_pingpong test has been excluded during installation
    # (in install-fabtests.sh), because it does not work with "-E" option.
    # So here we run it separately using "-b" option.
    #
    # Run fi_dgram_pingpong on x86 only as it currently does not work on c6gn
    # instances. This change will be reverted once the issue is fixed.
    if [[ "$(uname -m)" == "x86_64" ]]; then
        echo "Run fi_dgram_pingpong with out-of-band synchronization"
        SERVER_CMD="${fabtests_bin}/fi_dgram_pingpong -k -p efa -b"
        CLIENT_CMD="${SERVER_CMD}"
        run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
            || exit_code=1
    fi

    # Run fi_rdm_tagged_bw with fork when different environment variables are set.
    # The fork runs only happen when the binary's help text mentions "-K".
    fork_flag_help=$(${fabtests_bin}/fi_rdm_tagged_bw -h 2>&1 | grep '\-K' || true)
    if [[ -n "${fork_flag_help}" ]]; then
        # Without a fork-safe setting the run is expected to fail.
        echo "Run fi_rdm_tagged_bw with fork"
        SERVER_CMD="${fabtests_bin}/fi_rdm_tagged_bw -p efa -K -E"
        CLIENT_CMD="${SERVER_CMD}"
        run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "FAIL" \
            || exit_code=1

        # With either fork-safe variable set to 1 the same run must pass.
        for fork_env in RDMAV_FORK_SAFE FI_EFA_FORK_SAFE; do
            echo "Run fi_rdm_tagged_bw with fork and ${fork_env} set"
            SERVER_CMD="${fork_env}=1 ${fabtests_bin}/fi_rdm_tagged_bw -v -p efa -K -E"
            CLIENT_CMD="${SERVER_CMD}"
            run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
                || exit_code=1
        done
    fi

    if [[ ${BUILD_GDR} -eq 1 ]]; then
        echo "Run fi_rdm_tagged_bw with server using device (GPU) memory and client using host memory"
        # The server command is the client command plus "-D cuda".
        CLIENT_CMD="FI_EFA_USE_DEVICE_RDMA=1 ${fabtests_bin}/fi_rdm_tagged_bw -p efa -E"
        SERVER_CMD="${CLIENT_CMD} -D cuda"
        run_test_with_expected_ret ${SERVER_IP} ${CLIENT_IP} "${SERVER_CMD}" "${CLIENT_CMD}" "PASS" \
            || exit_code=1
    fi

    if (( errexit_was_on == 1 )); then
        set -e
    fi

    # Non-zero when any of the runs above did not match its expectation.
    exit $exit_code
fi