-
Notifications
You must be signed in to change notification settings - Fork 23
/
csc_nvidia_smi.nhc
75 lines (63 loc) · 2.79 KB
/
csc_nvidia_smi.nhc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
# Node health check that runs nvidia-smi and classifies its exit status.
# NOTE(review): this header used to read "Check for GPU ECC errors", but no
# ECC counter is inspected in the code visible here -- confirm the intended scope.
#
# Shared helper file; presumably the source of the dbg/log/pass/die helpers
# called further down -- TODO confirm.
source /etc/nhc/scripts/azure_common.nhc
# Command run by nhc_nvsmi_gather_data; a value already present in the
# environment wins, otherwise plain "nvidia-smi" is used.
NVIDIA_SMI_HEALTHMON="${NVIDIA_SMI_HEALTHMON:-nvidia-smi}"
# Extra arguments for that command; keeps an existing value, else empty.
NVIDIA_SMI_HEALTHMON_ARGS="${NVIDIA_SMI_HEALTHMON_ARGS}"
# Load-time probe: only when lspci lists an NVIDIA device, capture the default
# nvidia-smi report once. check_nvsmi_healthmon hands this text to dbg.
# Note that this calls "nvidia-smi" literally, not $NVIDIA_SMI_HEALTHMON, and
# that NVSMI_OUTPUT_VAR is never assigned here when no NVIDIA device is listed.
if lspci | grep -iq NVIDIA ; then
NVSMI_OUTPUT_VAR="$(nvidia-smi)"
fi
# Result cache filled in by nhc_nvsmi_gather_data:
#   NVSMI_HEALTHMON_LINES  - command output split into lines
#   NVSMI_HEALTHMON_OUTPUT - raw stdout of the command
#   NVSMI_HEALTHMON_RC     - exit status; empty means "not gathered yet"
NVSMI_HEALTHMON_LINES=( )
NVSMI_HEALTHMON_OUTPUT=""
NVSMI_HEALTHMON_RC=""
# Bash does not pass array values to child processes, so listing
# NVSMI_HEALTHMON_LINES here has no effect for children; the scalars are exported.
export NVSMI_HEALTHMON_LINES NVSMI_HEALTHMON_OUTPUT NVSMI_HEALTHMON_RC NVSMI_OUTPUT_VAR
# Run the configured nvidia-smi command once and cache what it produced.
#
# Globals read:
#   NVIDIA_SMI_HEALTHMON       command to execute (word-split on purpose)
#   NVIDIA_SMI_HEALTHMON_ARGS  arguments for it (word-split on purpose)
# Globals written:
#   NVSMI_HEALTHMON_OUTPUT     stdout of the command (stderr is discarded)
#   NVSMI_HEALTHMON_RC         exit status of the command
#   NVSMI_HEALTHMON_LINES      the non-empty lines of the output, one per element
# Arguments: none
function nhc_nvsmi_gather_data() {
    # Use the default separators for the command line below, independent of
    # whatever IFS the caller currently has in effect.
    local IFS=$' \t\n'
    local line

    # Deliberately unquoted so that command and arguments are split into words.
    # shellcheck disable=SC2086
    NVSMI_HEALTHMON_OUTPUT=$($NVIDIA_SMI_HEALTHMON $NVIDIA_SMI_HEALTHMON_ARGS 2>/dev/null)
    NVSMI_HEALTHMON_RC=$?

    # Split into lines with read instead of an unquoted expansion: an unquoted
    # expansion would also apply pathname expansion, so an output line such as
    # "*" or "[N/A]" could be replaced by file names from the current directory.
    # Empty lines are skipped, as newline-based word splitting used to do.
    NVSMI_HEALTHMON_LINES=( )
    while IFS= read -r line; do
        if [[ -n "$line" ]]; then
            NVSMI_HEALTHMON_LINES+=( "$line" )
        fi
    done <<< "$NVSMI_HEALTHMON_OUTPUT"
}
# Run the nvidia-smi utility and verify that all GPUs
# are functioning properly.
#
# The exit status of $NVIDIA_SMI_HEALTHMON is taken from NVSMI_HEALTHMON_RC; if
# that is still empty, nhc_nvsmi_gather_data is called first to fill it in.
#
# Globals read: NVSMI_OUTPUT_VAR, NVSMI_HEALTHMON_RC, NVSMI_HEALTHMON_OUTPUT,
#               NVIDIA_SMI_HEALTHMON
# Arguments:    none
# Returns:      0 after reporting success through pass; 1 after reporting a
#               failure through die.
# NOTE(review): dbg, log, pass and die are defined outside this file. The
# "return 1" after die covers the case where die returns instead of exiting --
# confirm against the framework.
function check_nvsmi_healthmon() {
    local rc fault

    dbg "$NVSMI_OUTPUT_VAR"

    # An empty status means nothing has been gathered yet.
    if [[ -z "$NVSMI_HEALTHMON_RC" ]]; then
        nhc_nvsmi_gather_data
    fi
    # Evaluate arithmetically, the same way the former -eq comparisons did.
    rc=$(( NVSMI_HEALTHMON_RC ))

    if (( rc == 0 )); then
        dbg "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
        pass 0 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON completed successfully"
        return 0
    fi

    # Map the exit status to its failure description and fault code.
    case "$rc" in
        2)  fault="Invalid argument or flag. FaultCode: NHCNA" ;;
        4)  fault="Permission denied. FaultCode: NHCNA" ;;
        8)  fault="Power cables not attached. FaultCode: NHC2001" ;;
        9)  fault="NVIDIA driver not loaded. FaultCode: NHCNA" ;;
        10) fault="Interrupt issue with a GPU. FaultCode: NHC2001" ;;
        12) fault="NVML shared library could not be found. FaultCode: NHCNA" ;;
        14) fault="InfoROM is corrupted. FaultCode: NHC2008" ;;
        15) fault="The GPU has fallen off the bus or has otherwise become inaccessible. FaultCode: NHC2010" ;;
        *)
            # The shell reports death by signal N as 128+N. 255 is excluded:
            # nvidia-smi documents it as "other error or internal driver
            # error", so it must not be reported as "fatal signal 127".
            if (( rc > 127 && rc != 255 )); then
                fault="Caught fatal signal $((NVSMI_HEALTHMON_RC&0x7f)). FaultCode: NHC2015"
            else
                # Unrecognized status: keep the command output for diagnosis.
                log "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: \"$NVSMI_HEALTHMON_OUTPUT\""
                fault="Returned failure code $NVSMI_HEALTHMON_RC. FaultCode: NHC2001"
            fi
            ;;
    esac

    die 1 "$FUNCNAME: $NVIDIA_SMI_HEALTHMON: $fault"
    return 1
}