-
Notifications
You must be signed in to change notification settings - Fork 22
/
azure_ib_write_bw_gdr.nhc
103 lines (84 loc) · 2.99 KB
/
azure_ib_write_bw_gdr.nhc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Load the shared Azure NHC script before anything below is defined.
. /etc/nhc/scripts/azure_common.nhc

# Full path of the ib_write_bw executable under the AZ_NHC_ROOT tree.
IB_WRITE_BW_EXE_PATH="${AZ_NHC_ROOT}/bin/ib_write_bw"

# Seconds to pause after starting the server side, before the client runs.
SLEEP_TIME='5'

# Local host name; the client side of the test connects to this host.
HOSTNAME="$(hostname)"
#######################################
# Run ib_write_bw (with --use_cuda) as a server/client pair on the local host
# for a single IB device and compare the measured bandwidth to a threshold.
# Globals:
#   IB_WRITE_BW_EXE_PATH (read) - path of the ib_write_bw executable
#   SLEEP_TIME           (read) - seconds to wait for the server to come up
#   HOSTNAME             (read) - host the client connects to
# Arguments:
#   $1 - expected minimum bandwidth in Gbps
#   $2 - port passed to ib_write_bw via -p
#   $3 - IB device name (used for both the server and the client side)
#   $4 - device index passed to --use_cuda
# Returns:
#   0 after reporting a pass; 1 after reporting a failure through die.
#######################################
function run_ib_bw_gdr(){
    local exp_ib_bw=$1
    local port=$2
    local device=$3
    local dev_idx=$4
    # Server and client use the same device, so the "peer" is the device itself.
    local device_peer=$device
    local message_size=8388608
    local numeric_re='^[0-9]+([.][0-9]+)?$'

    # Resolve the tool name first so every failure message (including the
    # NUMA one below) can mention it.
    local ib_write_bw=${IB_WRITE_BW_EXE_PATH##*/}

    #NUMA mapping
    local numa_node
    if ! numa_node=$(get_ib_numa_node "$device"); then
        die 1 "check_ib_bw_gdr: $ib_write_bw, IB=$device, could not determine NUMA node. FaultCode: NHC2005"
        return 1
    fi

    # Built as an array so no argument depends on word splitting.
    local -a bw_cmd=(
        numactl -N "$numa_node" -m "$numa_node" "$IB_WRITE_BW_EXE_PATH"
        -p "$port" -s "$message_size" -x 0 -F --report_gbits --perform_warm_up
        "--use_cuda=$dev_idx" -d "$device"
    )
    local netstat_out

    # Server side: started in the background, its output is not needed.
    "${bw_cmd[@]}" > /dev/null &
    local server_pid=$!

    sleep "$SLEEP_TIME"

    # The status of a backgrounded launch is always 0, so instead verify that
    # the server is still running once the settle delay has elapsed.
    if ! kill -0 "$server_pid" 2>/dev/null; then
        local server_rc
        wait "$server_pid"
        server_rc=$?
        netstat_out=$(netstat -lnp | grep -- "$port")
        log "Running: netstat -lnp | grep $port:"
        log "$netstat_out"
        die 1 "check_ib_bw_gdr: $ib_write_bw returned error code $server_rc. FaultCode: NHCNA"
        return 1
    fi

    # Client side: same command line plus the host to connect to.
    local client_out client_rc
    client_out=$("${bw_cmd[@]}" "$HOSTNAME")
    client_rc=$?
    if [[ $client_rc != 0 ]]; then
        netstat_out=$(netstat -lnp | grep -- "$port")
        log "Running: netstat -lnp | grep $port:"
        log "$netstat_out"
        die 1 "check_ib_bw_gdr: $ib_write_bw returned error code $client_rc. FaultCode: NHCNA"
        return 1
    fi

    # The result row is the first output line that contains the message size;
    # its fourth whitespace-separated field is taken as the bandwidth
    # (presumably the "BW average" column - confirm against ib_write_bw output).
    local ib_bandwidth="" line
    local -a fields
    while IFS= read -r line; do
        [[ "$line" == *"$message_size"* ]] || continue
        IFS=$' \t\n' read -r -a fields <<< "$line"
        ib_bandwidth=${fields[3]}
        dbg "IB devices=${device}, ${device_peer}: numa domains=$numa_node,$numa_node, Measured IB BW $ib_bandwidth Gbps"
        break
    done <<< "$client_out"

    dbg "ib_write_lb_${device}: $ib_bandwidth Gbps"

    # Without a numeric measurement the comparison below would evaluate to
    # "not below threshold" and the check would pass without any data.
    if [[ ! "$ib_bandwidth" =~ $numeric_re ]]; then
        log "$client_out"
        die 1 "check_ib_bw_gdr: $ib_write_bw, IB=$device, $device_peer, could not parse IB BW from test output. FaultCode: NHCNA"
        return 1
    fi

    # bc is used because the bandwidth is a decimal value.
    if (( $(echo "$ib_bandwidth < $exp_ib_bw" | bc -l) )); then
        log "$client_out"
        die 1 "check_ib_bw_gdr: $ib_write_bw, IB=$device, $device_peer, IB BW (expected > $exp_ib_bw Gbps, but measured $ib_bandwidth Gbps. FaultCode: NHC2003"
        return 1
    fi

    pass 0 "check_ib_bw_gdr: IB write bandwidth test IB_WRITE_BW passed for IB=$device, IB BW=$ib_bandwidth Gbps"
    return 0
}
#######################################
# NHC check entry point: run run_ib_bw_gdr concurrently for every IB device
# reported by determine_IB_devices, then wait for all of them to finish.
# Globals:
#   GPU_FREQ  (written) - output of boost_gpu_clock
#   EXP_IB_BW (written) - expected bandwidth handed to each worker
#   devices   (written) - device list from determine_IB_devices
#   pid_array (written) - PIDs of the background workers
#   (these stay global as in the original; whether helpers outside this file
#    read them is not visible here - TODO confirm before making them local)
# Arguments:
#   $1 - expected minimum bandwidth in Gbps
# Returns:
#   Always 0; failures are reported by die inside the workers.
#######################################
function check_ib_bw_gdr(){
    # Run the shared cleanup helper whenever this function returns.
    trap "background_cleanup" RETURN

    if ! GPU_FREQ=$(boost_gpu_clock); then
        die 1 -e "$FUNCNAME: Failed to boost GPU clocks. FaultCode: NHCNA"
        return 0
    fi

    EXP_IB_BW=$1
    # Device N uses port base_port + N. This yields 18510..18517 for the first
    # eight devices and keeps working when more than eight are present.
    local base_port=18510
    pid_array=()

    # log pkeys
    print_pkeys "$FUNCNAME"

    #get devices (word splitting of the helper output is intentional)
    # shellcheck disable=SC2207
    devices=($(determine_IB_devices))

    local dev_idx=0
    local device
    for device in "${devices[@]}"; do
        # Every argument is quoted so an empty value cannot shift the others.
        run_ib_bw_gdr "$EXP_IB_BW" "$(( base_port + dev_idx ))" "$device" "$dev_idx" &
        pid_array+=($!)
        dev_idx=$(( dev_idx + 1 ))
    done

    wait "${pid_array[@]}"
    remove_clock_boost
    return 0
}