-
Notifications
You must be signed in to change notification settings - Fork 0
/
pestat
executable file
·281 lines (263 loc) · 7.71 KB
/
pestat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#!/bin/bash
# Prints a Slurm cluster status with 1 line per node/partition and job info
# Author: Ole.H.Nielsen@fysik.dtu.dk
# URL: ftp://ftp.fysik.dtu.dk/pub/Slurm/pestat or http://ftp.fysik.dtu.dk/pub/Slurm/pestat
# Version banner printed by the -V option.
my_version='pestat version 0.50. Date: 22 June 2017'

# CONFIGURE the command locations below.
# Directory holding the Slurm commands. It is exported because the embedded
# awk program reads it from its environment to build its own command lines.
prefix=/usr/bin
export prefix
# The awk interpreter, taken from the same directory.
my_awk="${prefix}/awk"
# Command usage:
#######################################
# Print the command synopsis and the option summary.
# Arguments: none
# Outputs:   the help text on stdout
# Returns:   the status of cat; never exits, the caller picks the exit code
#######################################
usage() {
# Quoted delimiter: the help text is literal, nothing in it is expanded.
cat <<'EOF'
Usage: pestat [-p partition(s)] [-u username] [-q qoslist] [-s statelist]
[-f | -m free_mem | -M free_mem ] [-V] [-h]
where:
-p partition: Select only partition <partition>
-u username: Print only user <username>
-q qoslist: Print only QOS in the qoslist <qoslist>
-s statelist: Print only nodes with state in <statelist>
-f: Print only nodes that are flagged by * (unexpected load etc.)
-m free_mem: Print only nodes with free memory LESS than free_mem MB
-M free_mem: Print only nodes with free memory GREATER than free_mem MB (under-utilized)
-h: Print this help information
-V: Version information
EOF
}
# Default settings; the option parser overrides them.

# Becomes 1 when -f is given (print only flagged nodes).
flagging=-1
# Free memory thresholds in MB for -m and -M; -1 means "not requested".
export free_mem_less=-1
export free_mem_more=-1
# Colored output is on by default (it can still be switched off later, e.g.
# through the PESTAT_COLOR environment variable).
export colors=1
# 1 = list every node; the -f, -m and -M options set this to 0.
export printallnodes=1
# Extra sinfo arguments built by -p and -s; empty means no restriction.
export partition=""
statelist=""
# Job filters that the awk program reads from its environment. They are
# cleared explicitly so that variables with the same names inherited from the
# calling environment cannot silently filter the report.
export username=""
export qoslist=""
# Abort unless $2, the value given to option -$1, is a non-negative integer.
# Anything else would break the numeric tests below and the numeric
# comparisons in the awk program.
check_mem_value() {
  if [[ ! "$2" =~ ^[0-9]+$ ]]; then
    echo "ERROR: Option -$1 requires a non-negative integer number of MB, got: $2"
    exit 255
  fi
}

# Abort because -f, -m and -M were combined.
exclusive_flags_error() {
  echo "ERROR: The -f -m -M flags are mutually exclusive"
  exit 255
}

#######################################
# Parse the pestat command line options with getopts.
# Globals read:    prefix, my_version, flagging, free_mem_less, free_mem_more
# Globals written: partition, username, qoslist, free_mem_less,
#                  free_mem_more, printallnodes (all exported),
#                  statelist, flagging, and OPTIND, which is deliberately left
#                  global so the code after the call can detect extra arguments.
# Arguments: the command line of the script ("$@")
# Outputs:   one informational line per accepted option on stdout
# Exits:     255 on a rejected option value or on conflicting -f/-m/-M,
#            1 after printing the version (-V) or the usage text
#######################################
parse_options() {
  while getopts "p:u:q:s:m:M:hVf" options; do
    case "$options" in
      p)
        # Stored as a single "-p <list>" string that is later word-split
        # on the sinfo command line.
        export partition="-p $OPTARG"
        echo "Print only nodes in partition $OPTARG"
        ;;
      u)
        export username="$OPTARG"
        # An empty answer from sacctmgr means the user has no association.
        if [[ -n "$("$prefix/sacctmgr" -p -n show assoc where users="$username")" ]]; then
          echo "Select only user $username"
        else
          echo "Error selecting Slurm username $username"
          exit 255
        fi
        ;;
      q)
        export qoslist="$OPTARG"
        # An empty answer from sacctmgr means no such QOS exists.
        if [[ -n "$("$prefix/sacctmgr" -n -p show qos "$qoslist")" ]]; then
          echo "Select only QOS=$qoslist"
        else
          echo "Error selecting QOS $qoslist"
          echo "Print all available QOSes by: sacctmgr show qos"
          exit 255
        fi
        ;;
      s)
        # Stored as a single "--states <list>" string, word-split later.
        statelist="--states $OPTARG"
        echo "Select only nodes with state=$OPTARG"
        ;;
      f)
        flagging=1
        # "[ ... -ge ]" compares in decimal, so a value like 08 is accepted.
        if [ "$free_mem_less" -ge 0 ] || [ "$free_mem_more" -ge 0 ]; then
          exclusive_flags_error
        fi
        export printallnodes=0
        echo "Print only nodes that are flagged by *"
        ;;
      m)
        check_mem_value m "$OPTARG"
        export free_mem_less="$OPTARG"
        if [ "$flagging" -gt 0 ] || [ "$free_mem_more" -ge 0 ]; then
          exclusive_flags_error
        fi
        export printallnodes=0
        echo "Select only nodes with free memory LESS than $free_mem_less MB"
        ;;
      M)
        check_mem_value M "$OPTARG"
        export free_mem_more="$OPTARG"
        if [ "$flagging" -gt 0 ] || [ "$free_mem_less" -ge 0 ]; then
          exclusive_flags_error
        fi
        export printallnodes=0
        echo "Select only nodes with free memory GREATER than $free_mem_more MB"
        ;;
      V)
        echo "$my_version"
        exit 1
        ;;
      h | \?)
        # getopts reports an unknown option or a missing argument as "?".
        usage
        exit 1
        ;;
      *)
        usage
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
# Reject anything left on the command line after the options.
# getopts leaves OPTIND at the index of the first unparsed argument, so extra
# arguments exist exactly when the argument count reaches OPTIND.
if (( $# >= OPTIND )); then
  # Quoted so the arguments are echoed as typed, without glob expansion.
  echo "ERROR: Too many command line arguments: $*"
  usage
  exit 1
fi
# Decide whether colored output stays enabled. Colors are switched off when
# the user sets PESTAT_COLOR=0 in the environment, or when standard output is
# not a terminal (for example when it is piped or redirected).
FD=1 # File descriptor 1 is stdout
if [[ "${PESTAT_COLOR}" == "0" || ! -t "${FD}" ]]; then
  export colors=0
fi
# Produce the report: one sinfo line per node/partition is piped into awk.
# sinfo columns: NODELIST PARTITION CPU CPU_LOAD MEMORY FREE_MEM STATE
# The awk BEGIN section first collects the running jobs from squeue, indexed
# by node; the main section then filters, flags and prints each sinfo line.
# All settings reach awk through exported environment variables.
# $partition and $statelist each hold an "option value" pair (or nothing) and
# must be word-split, hence they stay unquoted.
# shellcheck disable=SC2086
"$prefix/sinfo" -h -N $partition $statelist -o "%N %P %C %O %m %e %t" | "$my_awk" '
BEGIN {
    # Prologue section
    # Read the required environment variables:
    prefix=ENVIRON["prefix"]
    username=ENVIRON["username"]
    qoslist=ENVIRON["qoslist"]
    free_mem_less=ENVIRON["free_mem_less"]
    free_mem_more=ENVIRON["free_mem_more"]
    printallnodes=ENVIRON["printallnodes"]
    colors=ENVIRON["colors"]

    # Define terminal colors for the output if requested.
    # When colors are off the variables stay empty and print as nothing.
    if (colors != 0) {
        # See http://en.wikipedia.org/wiki/ANSI_escape_code#Colors
        RED="\033[1;4;31m"
        GREEN="\033[1;32m"
        MAGENTA="\033[1;35m"
        NORMAL="\033[0m"
    }

    # A QOS selection is handed to squeue; a non-empty "selection" also makes
    # the main section skip nodes without matching jobs.
    if (qoslist != "") selection = selection " -q " qoslist

    # The "scontrol show hostnames" command is used to expand NodeList expressions
    HOSTLIST=prefix "/scontrol show hostnames "

    # Gather the list of running jobs
    # Running jobs info: JobState JobId User NodeList
    # (the -t RUNNING flag already restricts the list, so $1 is not examined)
    JOBLIST = prefix "/squeue -t RUNNING -h -o \"%T %A %u %N\" " selection
    while ((JOBLIST | getline) > 0) {
        JobId=$2
        User=$3
        NodeList=$4

        # Create the list of nodes for this job. A plain single hostname is
        # used as is; a bracket range or a comma separated list is expanded.
        if (index(NodeList,"[") == 0 && index(NodeList,",") == 0) {
            jobnodes[1] = NodeList
        } else {
            # Put hostname lines into an array jobnodes[].
            # The expression is double-quoted because the command runs through
            # the shell, where an unquoted [..] would be a filename pattern.
            cmd = HOSTLIST "\"" NodeList "\""
            i=0
            while ((cmd | getline) > 0) jobnodes[++i] = $1
            close (cmd)
        }

        # Populate the node arrays with "JobId User" (multiple jobs may exist)
        for (i in jobnodes) {
            n = jobnodes[i]
            jobs[n] = jobs[n] JobId " " User " "
            numjobs[n]++
            # If username has been selected and node "n" runs job belonging to username:
            if (User == username) selecteduser[n] = User
        }
        delete jobnodes
    }
    close (JOBLIST)

    # Print a header line
    printf("%8s %15s %8s %7s %8s %8s %8s %s\n", "Hostname", "Partition", "Node", "Num_CPU", "CPUload", "Memsize", "Freemem", "Joblist")
    printf("%8s %15s %8s %7s %8s %8s %8s %s\n", "", "", "State", "Use/Tot", "", "(MB)", "(MB)", "JobId User ...")
}
{
    # Main section: one sinfo line per node/partition
    node=$1

    # Selection of subset of nodes
    if (selection != "" && jobs[node] == "") next
    if (username != "" && selecteduser[node] == "") next

    partition=$2
    # sinfo -o %C gives number of CPUs by state in the format "allocated/idle/other/total"
    split($3,cpulist,"/")
    cpuload=$4
    memory=$5
    freemem=$6
    state=$7

    # Select only subset of nodes with certain values/states.
    # listnode starts at 1 when all nodes are printed, at 0 when filtering.
    listnode = printallnodes
    # A threshold of -1 means "not requested"; 0 is a valid threshold, which
    # matches how the shell part of the script treats the -m and -M options.
    if (free_mem_less >= 0) {
        # Free memory on the node LESS than free_mem_less
        if (freemem < free_mem_less) listnode++
    } else if (free_mem_more >= 0) {
        # Free memory on the node GREATER than free_mem_more
        if (freemem > free_mem_more) listnode++
    } else {
        # Flagging mode: each check below sets a "*" flag plus colors and
        # marks the node for listing.
        if (state == "drain" || state == "drng" || state == "resv" || state == "down" || state == "error") {
            # Flag nodes with status down, drain etc.
            stateflag="*"
            statecolor=RED
            listnode++
        } else {
            stateflag=" "
            statecolor=NORMAL
        }

        # Flag unexpected CPU load average.
        # The 0.001 avoids a division by zero when no CPUs are allocated.
        loadratio = cpuload/(cpulist[1] + 0.001)
        if (loadratio > 4 || loadratio < 0.25) {
            loadflag="*"
            loadcolor=RED
            cpucolor=GREEN
            listnode++
        } else if (loadratio > 2 || loadratio < 0.5) {
            loadflag="*"
            loadcolor=MAGENTA
            cpucolor=GREEN
            listnode++
        } else {
            loadflag=" "
            loadcolor=NORMAL
            cpucolor=NORMAL
        }

        # Flag unexpected number of jobs
        if (numjobs[node] > cpulist[1]) { # Should be at least 1 task per job
            jobflag="*"
            jobcolor=RED
            listnode++
        } else {
            jobflag=" "
            jobcolor=NORMAL
        }

        # Free memory on the node
        if (freemem < memory/10) { # Very high memory usage (<10% free)
            memflag="*"
            freememcolor=RED
            memcolor=GREEN
            listnode++
        } else if (freemem < memory/5) { # High memory usage (<20% free)
            memflag="*"
            freememcolor=MAGENTA
            memcolor=GREEN
            listnode++
        } else {
            memflag=" "
            freememcolor=NORMAL
            memcolor=NORMAL
        }
    }

    # In the -m/-M modes the flag and color variables are never assigned,
    # so they print as empty strings.
    if (listnode > 0) {
        printf("%8s %15s ", node, partition)
        printf("%s%8s%1s%s", statecolor, state, stateflag, NORMAL)
        printf("%s%3d%s %3d ", cpucolor, cpulist[1], NORMAL, cpulist[4])
        printf("%s%7.2f%1s%s ", loadcolor, cpuload, loadflag, NORMAL)
        printf("%s%8d %s%8d%1s%s ", memcolor, memory, freememcolor, freemem, memflag, NORMAL)
        printf("%s%s%1s%s", jobcolor, jobs[node], jobflag, NORMAL)
        printf("\n")
    }
    delete cpulist
}'