-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbeegfs-ondemand-stoplocal
383 lines (349 loc) · 12 KB
/
beegfs-ondemand-stoplocal
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#!/bin/bash
# beegfs-ondemand-stoplocal
# This file contains helper functions to stop BeeOND services locally on one node.
# This is meant to be sourced from another script (i.e. beeond)
# This file has to be placed into /opt/beegfs/lib/
# Inspects the exit status of the command that ran immediately before this function was called.
# A non-zero status is reported with an "ERROR:" line on stdout and recorded in the ERROR flag.
# Parameter:
#    $1  A hint describing what was being done when the failure could have happened; it is
#        embedded in the error message.
# Modifies:
#    ERROR: Is set to "true" when the preceding command failed; left untouched otherwise.
sl_checkerror() {
  # The caller's status has to be captured by the very first command of the function. "$?" is
  # expanded before "local" itself runs, so the value is not lost.
  local previous_status=$?

  [ "${previous_status}" != 0 ] || return 0

  echo "ERROR: There was a problem ${1} on host $(hostname)"
  ERROR="true"
}
# Writes an informational message to stdout unless quiet mode is active.
# Parameter:
#    $1  The message text. It is printed with an "INFO: " prefix.
# Checks:
#    QUIET: If "true", the message is dropped and nothing is printed.
sl_print_info() {
  if [ "${QUIET}" = "true" ]; then
    return 0
  fi
  echo "INFO: ${1}"
}
# Unmounts the tmpfs mounts listed in the status file.
# Every status file line containing ",tmpfs," is processed; the third comma-separated field of
# such a line is used as the mount point.
# Checks:
#    STATUSFILE: Path of the status file to read.
#    CLEANUP:    If "true", error output of fuser/umount is discarded and failures are not
#                recorded.
# Modifies:
#    ERROR: Set to "true" (through sl_checkerror) when umount fails outside of a cleanup run.
sl_unmount_tmpfs() {
  local entry mountpoint

  # Lines are read whole (no word splitting or globbing) so that mount points containing
  # whitespace stay intact. A dedicated descriptor keeps the commands in the loop body from
  # consuming the remaining entries through stdin.
  while IFS= read -r -u 3 entry || [ -n "${entry}" ]; do
    IFS=',' read -r _ _ mountpoint _ <<< "${entry}"
    sl_print_info "Unmounting tmpfs at ${mountpoint}"
    if [ "${CLEANUP}" != "true" ]; then
      # Only the status of umount is checked; the status of fuser is ignored.
      sudo /usr/sbin/fuser -k "${mountpoint}"
      sudo umount -l "${mountpoint}"
      sl_checkerror "unmounting tmpfs"
    else
      sudo /usr/sbin/fuser -k "${mountpoint}" 2>/dev/null
      sudo umount -l "${mountpoint}" 2>/dev/null
      true # reset the status code before the next invocation of sl_checkerror
    fi
  done 3< <(grep ",tmpfs," "${STATUSFILE}")
}
# Unmounts all local client mounts listed in the status file.
# Every status file line containing ",${CLIENTSERVICE}," is processed; the third comma-separated
# field of such a line is used as the mount point.
# Checks:
#    STATUSFILE:    Path of the status file to read.
#    CLIENTSERVICE: Service name that identifies client entries in the status file.
#    CLEANUP:       If "true", error output of fuser/umount is discarded and failures are not
#                   recorded.
# Modifies:
#    ERROR: Set to "true" (through sl_checkerror) when umount fails outside of a cleanup run.
sl_unmount_local_mounts() {
  local entry mountpoint

  # Lines are read whole (no word splitting or globbing) so that mount points containing
  # whitespace stay intact. A dedicated descriptor keeps the commands in the loop body from
  # consuming the remaining entries through stdin.
  while IFS= read -r -u 3 entry || [ -n "${entry}" ]; do
    IFS=',' read -r _ _ mountpoint _ <<< "${entry}"
    sl_print_info "Unmounting ${mountpoint}"
    if [ "${CLEANUP}" != "true" ]; then
      # No sl_checkerror after fuser, because fuser also returns non-zero when there are no
      # processes accessing the file system.
      sudo /usr/sbin/fuser -k "${mountpoint}"
      sudo umount -l "${mountpoint}"
      sl_checkerror "unmounting the ondemand file system"
    else
      sudo /usr/sbin/fuser -k "${mountpoint}" 2>/dev/null
      sudo umount -l "${mountpoint}" 2>/dev/null
      true # reset the status code before the next invocation of sl_checkerror
    fi
  done 3< <(grep ",${CLIENTSERVICE}," "${STATUSFILE}")
}
# Sends a SIGTERM to a process, then waits until the process is gone or approximately 10 seconds
# (100 polls, 0.1 seconds apart) have passed.
# Parameter:
#    $1  The PID of the process.
# Returns:
#    0   if the process was stopped within the waiting period,
#    1   if it was still present afterwards,
#    255 if the initial kill returned an error.
sl_kill_check() {
  local pid=$1
  local attempt # local, so that a caller's own loop counter is never overwritten

  if ! kill "${pid}"; then
    return 255
  fi

  for ((attempt = 0; attempt < 100; attempt++)); do
    # "kill -0" delivers no signal; it only reports whether the process can still be signalled.
    if ! kill -0 "${pid}" 2>/dev/null; then
      return 0
    fi
    sleep 0.1
  done

  # The process may have exited during the final sleep, so look one last time.
  if ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi
  return 1
}
# Stops all services listed in the status file except for entries without a PID file, then
# unmounts the tmpfs mounts.
# Each non-empty status file line is split at commas; this function uses field 2 (service name),
# field 3 (data path) and field 5 (PID file). A PID file of "-" marks an entry that is not a
# process (beegfs-client, tmpfs); such entries are skipped.
# Checks:
#    STATUSFILE:            Path of the status file to read.
#    CLEANUP:               If "true", a missing PID file is not treated as an error.
#    DELETE_DATA:           If "true", the data path of every stopped service is removed.
#    PREFERRED_MDS_FILE,
#    PREFERRED_TARGET_FILE: Files that are removed for every processed service entry.
# Modifies:
#    ERROR: Set to "true" if a service does not stop in time, a PID file is missing (outside of
#           a cleanup run) or a deletion fails.
sl_stop_services() {
  local entry service datapath pidfile pid result stale_file

  # Lines are read whole (no word splitting or globbing) on a dedicated descriptor, so paths
  # containing whitespace stay intact and nothing in the loop body can consume the status file.
  while IFS= read -r -u 3 entry || [ -n "${entry}" ]; do
    [ -n "${entry}" ] || continue
    IFS=',' read -r _ service datapath _ pidfile _ <<< "${entry}"

    # pidfile is "-" for beegfs-client and tmpfs, because they are not processes
    [ "${pidfile}" != "-" ] || continue

    if [ -e "${pidfile}" ]; then
      # Take the first whitespace-delimited token of the PID file as the PID.
      pid=""
      read -r pid _ < "${pidfile}"
      sl_kill_check "${pid}"
      result=$?
      if [ "${result}" -eq 1 ]; then
        echo "ERROR: ${service} did not stop within 10 seconds (PID ${pid})."
        ERROR="true"
      elif [ "${result}" -eq 255 ]; then
        # This case is reported, but the ERROR flag is not set for it.
        echo "ERROR: ${service} does not seem to be running any more (PID ${pid})."
      fi
    elif [ "${CLEANUP}" != "true" ]; then
      echo "ERROR: PID file ${pidfile} does not exist on host $(hostname)"
      ERROR="true"
    fi

    # delete data; an empty path is never handed to "rm -rf"
    if [ "${DELETE_DATA}" = "true" ] && [ -n "${datapath}" ] && [ "${datapath}" != "-" ]; then
      sl_print_info "Deleting stored data; Data path: ${datapath}"
      rm -rf -- "${datapath}"
      sl_checkerror "deleting ${datapath}"
    fi

    # delete preferredMds and preferredTarget files
    for stale_file in "${PREFERRED_MDS_FILE}" "${PREFERRED_TARGET_FILE}"; do
      [ -n "${stale_file}" ] || continue
      rm -f -- "${stale_file}"
      sl_checkerror "deleting ${stale_file}"
    done
  done 3< "${STATUSFILE}"

  # unmount tmpfs if it was used
  sl_unmount_tmpfs
}
# Deletes the log files listed in the status file, provided ERROR is not "true".
# If the directory of the last deleted log file is empty afterwards, it is removed as well.
# Each non-empty status file line is split at commas; this function uses field 2 (service name)
# and field 4 (log file). Entries whose log file is empty or "-" have no log file and are
# skipped.
# Checks:
#    STATUSFILE:       Path of the status file to read.
#    ERROR:            If "true", nothing is deleted and an info message is printed instead.
#    ONLY_UNMOUNT:     If "true", only the log file of ${CLIENTSERVICE} entries is deleted.
#    ONLY_STOP_SERVER: If "true", the log file of ${CLIENTSERVICE} entries is kept.
#    CLIENTSERVICE:    Service name that identifies client entries.
# Modifies:
#    ERROR: Set to "true" (through sl_checkerror) if removing the log directory fails.
sl_delete_logfiles() {
  local entry service logfile log_dir
  local last_logfile="" # path of the last log file handled; its directory is examined below

  if [ "${ERROR}" = "true" ]; then
    sl_print_info "Not deleting log files because of a previous error."
    return 0
  fi

  # Lines are read whole on a dedicated descriptor (no word splitting or globbing of entries).
  while IFS= read -r -u 3 entry || [ -n "${entry}" ]; do
    [ -n "${entry}" ] || continue
    IFS=',' read -r _ service _ logfile _ <<< "${entry}"

    if [ "${ONLY_UNMOUNT}" = "true" ] && [ "${service}" != "${CLIENTSERVICE}" ]; then
      continue
    fi
    if [ "${ONLY_STOP_SERVER}" = "true" ] && [ "${service}" = "${CLIENTSERVICE}" ]; then
      continue
    fi
    if [ -z "${logfile}" ] || [ "${logfile}" = "-" ]; then
      continue
    fi

    last_logfile=${logfile}
    sl_print_info "Deleting log file ${logfile}"
    # beegfs-client does not (always) generate a logfile. In this case rm gives an error
    # message, but we don't want to see it - for the same reason no sl_checkerror here.
    rm -f -- "${logfile}" 2>/dev/null
  done 3< "${STATUSFILE}"

  # Nothing was handled, so there is no log directory to look at.
  [ -n "${last_logfile}" ] || return 0

  # delete log directory if it exists and is empty
  log_dir=$(dirname -- "${last_logfile}")
  if [ "${log_dir}" != "." ] && [ -d "${log_dir}" ] && [ -z "$(ls -A -- "${log_dir}")" ]; then
    sl_print_info "Deleting log directory ${log_dir}"
    rmdir -- "${log_dir}"
    sl_checkerror "deleting ${log_dir}"
  fi
}
# The "main" stoplocal function. From here, the functions to unmount the file system and stop the
# services are called. If there was no error and log deletion was requested, sl_delete_logfiles
# is called. The status file is removed after a full shutdown.
# Checks the following variables:
#    STATUSFILE        The location of the status file
#    ONLY_STOP_SERVER  If "true", the sl_unmount_local_mounts step is skipped, and the status
#                      file is not removed.
#    ONLY_UNMOUNT      If "true", the sl_stop_services step is skipped, and the status file is
#                      not removed.
#    DELETE_LOGS       If "true" (and ERROR is not "true"), the log files are deleted.
# Modifies:
#    ERROR             Is set to "true" (and an error message is printed) if an error is
#                      encountered in any step.
stoplocal() {
  sl_print_info "Using status file ${STATUSFILE}"

  # unmount the file system (skip this step if we only want to stop the server)
  if [ "${ONLY_STOP_SERVER}" != "true" ]; then
    sl_unmount_local_mounts
  fi

  # stop the services (skip this step if we only got asked to unmount the file system)
  if [ "${ONLY_UNMOUNT}" != "true" ]; then
    sl_stop_services
  fi

  # delete the logfiles
  if [ "${ERROR}" != "true" ] && [ "${DELETE_LOGS}" = "true" ]; then
    sl_delete_logfiles
  fi

  # delete the status file (only if a full shutdown was requested); the path is quoted so that
  # a status file name containing whitespace removes exactly that file
  if [ "${ONLY_UNMOUNT}" != "true" ] && [ "${ONLY_STOP_SERVER}" != "true" ]; then
    rm -f -- "${STATUSFILE}"
    sl_checkerror "deleting the status file"
  fi
}
# Calls print_usage_and_exit if the sourcing script defines such a function; does nothing
# otherwise.
sl_print_usage_if_available() {
  if declare -F print_usage_and_exit >/dev/null; then
    print_usage_and_exit
  fi
}

# The user interface / main entry point to stoplocal.
# Options:
#    -i FILENAME => Status information filename (DEFAULT: /tmp/beeond.tmp)
#    -d          => Delete BeeGFS data on disks
#    -L          => Delete log files after successful shutdown
#    -q          => Suppress "INFO" messages, only print "ERROR"s
#    -c          => "Cleanup": Remove remaining processes and directories of a potentially
#                   unsuccessful shutdown of an earlier beeond instance. This switch silences
#                   the error message when a status information file is not found or an unmount
#                   command fails; instead, a message is printed (if "INFO" messages are not
#                   suppressed) when a status file DOES exist, because this means there
#                   actually was an instance before that is now being cleaned up.
#    -u          => ONLY unmount the file systems(*)
#    -s          => ONLY stop non-client services(*)
#
#    (*) Options -u and -s are mutually exclusive.
#        If -u or -s are given, the status file is not deleted.
# Checks:
#    CLEANUP: A value already set by the caller is honoured as the default for this run; "-c"
#             only affects the current invocation and does not leak into the calling shell.
# Returns:
#    0 on success, or when the status file is missing during a cleanup run;
#    1 on invalid options, a missing status file, or when any shutdown step reported an error.
do_stoplocal() {
  # TODO get rid of variables already defined in the main script
  local DEFAULT_STATUSFILE=/tmp/beeond.tmp
  local CLIENTSERVICE=beegfs-client
  local DELETE_DATA="false"
  local DELETE_LOGS="false"
  local ONLY_UNMOUNT="false"
  local ONLY_STOP_SERVER="false"
  local PREFERRED_MDS_FILE=/tmp/preferredMds.fod
  local PREFERRED_TARGET_FILE=/tmp/preferredTarget.fod
  local QUIET="false"
  local ERROR="false"
  local STATUSFILE="${DEFAULT_STATUSFILE}"
  # The expansion on the right-hand side still sees the caller's value, so a pre-set CLEANUP is
  # kept, while the assignment made for "-c" stays confined to this call.
  local CLEANUP="${CLEANUP:-false}"
  local OPTIND=1
  local OPTARG=""
  local opt

  while getopts ":i:dLusqc" opt "$@"; do
    case "${opt}" in
      i)
        STATUSFILE=${OPTARG}
        ;;
      d)
        DELETE_DATA="true"
        ;;
      L)
        DELETE_LOGS="true"
        ;;
      u)
        if [ "${ONLY_STOP_SERVER}" = "true" ]; then
          # OPTARG is not set for flag options, so the option letter itself is printed.
          echo "ERROR: Options -s and -${opt} are mutually exclusive" >&2
          sl_print_usage_if_available
          return 1
        fi
        ONLY_UNMOUNT="true"
        ;;
      s)
        if [ "${ONLY_UNMOUNT}" = "true" ]; then
          echo "ERROR: Options -u and -${opt} are mutually exclusive" >&2
          sl_print_usage_if_available
          return 1
        fi
        ONLY_STOP_SERVER="true"
        ;;
      q)
        QUIET="true"
        ;;
      c)
        CLEANUP="true"
        ;;
      \?)
        echo "ERROR: invalid option -${OPTARG}" >&2
        sl_print_usage_if_available
        return 1
        ;;
      :)
        echo "ERROR: Option -${OPTARG} requires an argument" >&2
        sl_print_usage_if_available
        return 1
        ;;
    esac
  done

  # if statusfile can't be found, print a message and exit.
  if [ ! -f "${STATUSFILE}" ]; then
    if [ "${CLEANUP}" = "true" ]; then
      return 0 # return 0 if we're doing a cleanup so that pdsh doesn't complain
    fi
    echo "ERROR: Status file ${STATUSFILE} not found." >&2
    # If the user has specified a status file, just give a brief error message and exit.
    # If the user has not specified a status file, give the full usage info - maybe the user
    # didn't know how to specify a status file.
    if [ "${STATUSFILE}" = "${DEFAULT_STATUSFILE}" ]; then
      sl_print_usage_if_available
    fi
    return 1
  fi

  # if we're doing a cleanup run, inform the user that a status file was found.
  if [ "${CLEANUP}" = "true" ]; then
    sl_print_info "Status file found."
  fi

  stoplocal

  if [ "${ERROR}" = "true" ]; then
    return 1
  fi
  return 0
}