-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdracut-kdump.sh
executable file
·631 lines (537 loc) · 14.1 KB
/
dracut-kdump.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
#!/bin/sh
#
# The main kdump routine in capture kernel, bash may not be the
# default shell. Any code added must be POSIX compliant.
. /lib/dracut-lib.sh
. /lib/kdump-logger.sh
. /lib/kdump-lib-initramfs.sh
#initiate the kdump logger
if ! dlog_init; then
echo "failed to initiate the kdump logger."
exit 1
fi
KDUMP_PATH="/var/crash"
KDUMP_LOG_FILE="/run/initramfs/kexec-dmesg.log"
KDUMP_LOG_DEST=""
KDUMP_LOG_OP=""
CORE_COLLECTOR=""
DEFAULT_CORE_COLLECTOR="makedumpfile -l --message-level 7 -d 31"
DMESG_COLLECTOR="/sbin/vmcore-dmesg"
FAILURE_ACTION="systemctl reboot -f"
DATEDIR=$(date +%Y-%m-%d-%T)
HOST_IP='127.0.0.1'
DUMP_INSTRUCTION=""
SSH_KEY_LOCATION=$DEFAULT_SSHKEY
DD_BLKSIZE=512
FINAL_ACTION="systemctl reboot -f"
KDUMP_PRE=""
KDUMP_POST=""
NEWROOT="/sysroot"
OPALCORE="/sys/firmware/opal/mpipl/core"
KDUMP_CONF_PARSED="/tmp/kdump.conf.$$"
# POSIX doesn't have pipefail, only apply when using bash
# shellcheck disable=SC3040
[ -n "$BASH" ] && set -o pipefail
DUMP_RETVAL=0
kdump_read_conf > $KDUMP_CONF_PARSED
get_kdump_confs()
{
while read -r config_opt config_val; do
# remove inline comments after the end of a directive.
case "$config_opt" in
path)
KDUMP_PATH="$config_val"
;;
core_collector)
[ -n "$config_val" ] && CORE_COLLECTOR="$config_val"
;;
sshkey)
if [ -f "$config_val" ]; then
SSH_KEY_LOCATION=$config_val
fi
;;
kdump_pre)
KDUMP_PRE="$config_val"
;;
kdump_post)
KDUMP_POST="$config_val"
;;
fence_kdump_args)
FENCE_KDUMP_ARGS="$config_val"
;;
fence_kdump_nodes)
FENCE_KDUMP_NODES="$config_val"
;;
failure_action | default)
case $config_val in
shell)
FAILURE_ACTION="kdump_emergency_shell"
;;
reboot)
FAILURE_ACTION="systemctl reboot -f && exit"
;;
halt)
FAILURE_ACTION="halt && exit"
;;
poweroff)
FAILURE_ACTION="systemctl poweroff -f && exit"
;;
dump_to_rootfs)
FAILURE_ACTION="dump_to_rootfs"
;;
esac
;;
final_action)
case $config_val in
reboot)
FINAL_ACTION="systemctl reboot -f"
;;
halt)
FINAL_ACTION="halt"
;;
poweroff)
FINAL_ACTION="systemctl poweroff -f"
;;
esac
;;
esac
done < "$KDUMP_CONF_PARSED"
if [ -z "$CORE_COLLECTOR" ]; then
CORE_COLLECTOR="$DEFAULT_CORE_COLLECTOR"
if is_ssh_dump_target || is_raw_dump_target; then
CORE_COLLECTOR="$CORE_COLLECTOR -F"
fi
fi
}
# store the kexec kernel log to a file.
save_log()
{
# LOG_OP is empty when log can't be saved, eg. raw target
[ -n "$KDUMP_LOG_OP" ] || return
dmesg -T > $KDUMP_LOG_FILE
if command -v journalctl > /dev/null; then
journalctl -ab >> $KDUMP_LOG_FILE
fi
chmod 600 $KDUMP_LOG_FILE
dinfo "saving the $KDUMP_LOG_FILE to $KDUMP_LOG_DEST/"
eval "$KDUMP_LOG_OP"
}
# $1: dump path, must be a mount point
dump_fs()
{
ddebug "dump_fs _mp=$1"
if ! is_mounted "$1"; then
dinfo "dump path '$1' is not mounted, trying to mount..."
if ! mount --target "$1"; then
derror "failed to dump to '$1', it's not a mount point!"
return 1
fi
fi
# Remove -F in makedumpfile case. We don't want a flat format dump here.
case $CORE_COLLECTOR in
*makedumpfile*)
CORE_COLLECTOR=$(echo "$CORE_COLLECTOR" | sed -e "s/-F//g")
;;
esac
_dump_fs_path=$(echo "$1/$KDUMP_PATH/$HOST_IP-$DATEDIR/" | tr -s /)
dinfo "saving to $_dump_fs_path"
# Only remount to read-write mode if the dump target is mounted read-only.
_dump_mnt_op=$(get_mount_info OPTIONS target "$1" -f)
case $_dump_mnt_op in
ro*)
dinfo "Remounting the dump target in rw mode."
mount -o remount,rw "$1" || return 1
;;
esac
mkdir -p "$_dump_fs_path" || return 1
save_vmcore_dmesg_fs ${DMESG_COLLECTOR} "$_dump_fs_path"
save_opalcore_fs "$_dump_fs_path"
dinfo "saving vmcore"
KDUMP_LOG_DEST=$_dump_fs_path/
KDUMP_LOG_OP="mv '$KDUMP_LOG_FILE' '$KDUMP_LOG_DEST/'"
$CORE_COLLECTOR /proc/vmcore "$_dump_fs_path/vmcore-incomplete"
_dump_exitcode=$?
if [ $_dump_exitcode -eq 0 ]; then
sync -f "$_dump_fs_path/vmcore-incomplete"
_sync_exitcode=$?
if [ $_sync_exitcode -eq 0 ]; then
mv "$_dump_fs_path/vmcore-incomplete" "$_dump_fs_path/vmcore"
dinfo "saving vmcore complete"
else
derror "sync vmcore failed, exitcode:$_sync_exitcode"
return 1
fi
else
derror "saving vmcore failed, exitcode:$_dump_exitcode"
return 1
fi
# improper kernel cmdline can cause the failure of echo, we can ignore this kind of failure
return 0
}
# $1: dmesg collector
# $2: dump path
save_vmcore_dmesg_fs()
{
dinfo "saving vmcore-dmesg.txt to $2"
if $1 /proc/vmcore > "$2/vmcore-dmesg-incomplete.txt"; then
mv "$2/vmcore-dmesg-incomplete.txt" "$2/vmcore-dmesg.txt"
chmod 600 "$2/vmcore-dmesg.txt"
# Make sure file is on disk. There have been instances where later
# saving vmcore failed and system rebooted without sync and there
# was no vmcore-dmesg.txt available.
sync
dinfo "saving vmcore-dmesg.txt complete"
else
if [ -f "$2/vmcore-dmesg-incomplete.txt" ]; then
chmod 600 "$2/vmcore-dmesg-incomplete.txt"
fi
derror "saving vmcore-dmesg.txt failed"
fi
}
# $1: dump path
save_opalcore_fs()
{
if [ ! -f $OPALCORE ]; then
# Check if we are on an old kernel that uses a different path
if [ -f /sys/firmware/opal/core ]; then
OPALCORE="/sys/firmware/opal/core"
else
return 0
fi
fi
dinfo "saving opalcore:$OPALCORE to $1/opalcore"
if ! cp $OPALCORE "$1/opalcore"; then
derror "saving opalcore failed"
return 1
fi
sync
dinfo "saving opalcore complete"
return 0
}
dump_to_rootfs()
{
if [ "$(systemctl status dracut-initqueue | sed -n "s/^\s*Active: \(\S*\)\s.*$/\1/p")" = "inactive" ]; then
dinfo "Trying to bring up initqueue for rootfs mount"
systemctl start dracut-initqueue
fi
dinfo "Clean up dead systemd services"
systemctl cancel
dinfo "Waiting for rootfs mount, will timeout after 90 seconds"
systemctl start --no-block sysroot.mount
_loop=0
while [ $_loop -lt 90 ] && ! is_mounted /sysroot; do
sleep 1
_loop=$((_loop + 1))
done
if ! is_mounted /sysroot; then
derror "Failed to mount rootfs"
return
fi
ddebug "NEWROOT=$NEWROOT"
dump_fs $NEWROOT
}
kdump_emergency_shell()
{
ddebug "Switching to kdump emergency shell..."
[ -f /etc/profile ] && . /etc/profile
export PS1='kdump:${PWD}# '
. /lib/dracut-lib.sh
if [ -f /dracut-state.sh ]; then
. /dracut-state.sh 2> /dev/null
fi
source_conf /etc/conf.d
type plymouth > /dev/null 2>&1 && plymouth quit
source_hook "emergency"
while read -r _tty rest; do
(
echo
echo
echo 'Entering kdump emergency mode.'
echo 'Type "journalctl" to view system logs.'
echo 'Type "rdsosreport" to generate a sosreport, you can then'
echo 'save it elsewhere and attach it to a bug report.'
echo
echo
) > "/dev/$_tty"
done < /proc/consoles
sh -i -l
/bin/rm -f -- /.console_lock
}
do_failure_action()
{
dinfo "Executing failure action $FAILURE_ACTION"
eval $FAILURE_ACTION
}
do_final_action()
{
dinfo "Executing final action $FINAL_ACTION"
eval "$FINAL_ACTION"
}
do_dump()
{
eval "$DUMP_INSTRUCTION"
_ret=$?
if [ $_ret -ne 0 ]; then
derror "saving vmcore failed"
fi
return $_ret
}
do_kdump_pre()
{
if [ -n "$KDUMP_PRE" ]; then
"$KDUMP_PRE"
_ret=$?
if [ $_ret -ne 0 ]; then
derror "$KDUMP_PRE exited with $_ret status"
return $_ret
fi
fi
# if any script fails, it just raises warning and continues
if [ -d /etc/kdump/pre.d ]; then
for file in /etc/kdump/pre.d/*; do
"$file"
_ret=$?
if [ $_ret -ne 0 ]; then
derror "$file exited with $_ret status"
fi
done
fi
return 0
}
do_kdump_post()
{
if [ -d /etc/kdump/post.d ]; then
for file in /etc/kdump/post.d/*; do
"$file" "$1"
_ret=$?
if [ $_ret -ne 0 ]; then
derror "$file exited with $_ret status"
fi
done
fi
if [ -n "$KDUMP_POST" ]; then
"$KDUMP_POST" "$1"
_ret=$?
if [ $_ret -ne 0 ]; then
derror "$KDUMP_POST exited with $_ret status"
fi
fi
}
# $1: block target, eg. /dev/sda
dump_raw()
{
[ -b "$1" ] || return 1
dinfo "saving to raw disk $1"
if ! echo "$CORE_COLLECTOR" | grep -q makedumpfile; then
_src_size=$(stat --format %s /proc/vmcore)
_src_size_mb=$((_src_size / 1048576))
/kdumpscripts/monitor_dd_progress $_src_size_mb &
fi
dinfo "saving vmcore"
$CORE_COLLECTOR /proc/vmcore | dd of="$1" bs=$DD_BLKSIZE >> /tmp/dd_progress_file 2>&1 || return 1
sync
dinfo "saving vmcore complete"
return 0
}
# $1: ssh key file
# $2: ssh address in <user>@<host> format
dump_ssh()
{
_ret=0
_ssh_opts="-i $1 -o BatchMode=yes -o StrictHostKeyChecking=yes"
_ssh_dir="$KDUMP_PATH/$HOST_IP-$DATEDIR"
if is_ipv6_address "$2"; then
_scp_address=${2%@*}@"[${2#*@}]"
else
_scp_address=$2
fi
dinfo "saving to $2:$_ssh_dir"
cat /var/lib/random-seed > /dev/urandom
# shellcheck disable=SC2086 # need to word-split $_ssh_opts
ssh -q $_ssh_opts "$2" mkdir -p "$_ssh_dir" || return 1
save_vmcore_dmesg_ssh "$DMESG_COLLECTOR" "$_ssh_dir" "$_ssh_opts" "$2"
dinfo "saving vmcore"
KDUMP_LOG_DEST=$2:$_ssh_dir/
KDUMP_LOG_OP="scp -q $_ssh_opts '$KDUMP_LOG_FILE' '$_scp_address:$_ssh_dir/'"
save_opalcore_ssh "$_ssh_dir" "$_ssh_opts" "$2" "$_scp_address"
if [ "${CORE_COLLECTOR%%[[:blank:]]*}" = "scp" ]; then
# shellcheck disable=SC2086 # need to word-split $_ssh_opts
scp -q $_ssh_opts /proc/vmcore "$_scp_address:$_ssh_dir/vmcore-incomplete"
_ret=$?
_vmcore="vmcore"
else
# shellcheck disable=SC2086,SC2029 # need to word-split $_ssh_opts and expand $_ssh_dir
$CORE_COLLECTOR /proc/vmcore | ssh $_ssh_opts "$2" "umask 0077 && dd bs=512 of='$_ssh_dir/vmcore-incomplete'"
_ret=$?
_vmcore="vmcore.flat"
fi
if [ $_ret -eq 0 ]; then
# shellcheck disable=SC2086,SC2029 # need to word-split $_ssh_opts and expand $_ssh_dir
ssh $_ssh_opts "$2" "mv '$_ssh_dir/vmcore-incomplete' '$_ssh_dir/$_vmcore'"
_ret=$?
if [ $_ret -ne 0 ]; then
derror "moving vmcore failed, exitcode:$_ret"
else
dinfo "saving vmcore complete"
fi
else
derror "saving vmcore failed, exitcode:$_ret"
fi
return $_ret
}
# $1: dump path
# $2: ssh opts
# $3: ssh address in <user>@<host> format
# $4: scp address, similar with ssh address but IPv6 addresses are quoted
save_opalcore_ssh()
{
if [ ! -f $OPALCORE ]; then
# Check if we are on an old kernel that uses a different path
if [ -f /sys/firmware/opal/core ]; then
OPALCORE="/sys/firmware/opal/core"
else
return 0
fi
fi
dinfo "saving opalcore:$OPALCORE to $3:$1"
# shellcheck disable=SC2086 # need to word-split $2
if ! scp $2 $OPALCORE "$4:$1/opalcore-incomplete"; then
derror "saving opalcore failed"
return 1
fi
# shellcheck disable=SC2086,SC2029 # need to word-split $1 and expand $2
ssh $2 "$3" mv "$1/opalcore-incomplete" "$1/opalcore"
dinfo "saving opalcore complete"
return 0
}
# $1: dmesg collector
# $2: dump path
# $3: ssh opts
# $4: ssh address in <user>@<host> format
save_vmcore_dmesg_ssh()
{
dinfo "saving vmcore-dmesg.txt to $4:$2"
# shellcheck disable=SC2086,SC2029 # need to word-split $3 and expand $2
if $1 /proc/vmcore | ssh $3 "$4" "umask 0077 && dd of='$2/vmcore-dmesg-incomplete.txt'"; then
ssh -q $3 "$4" mv "$2/vmcore-dmesg-incomplete.txt" "$2/vmcore-dmesg.txt"
dinfo "saving vmcore-dmesg.txt complete"
else
derror "saving vmcore-dmesg.txt failed"
fi
}
wait_online_network()
{
# In some cases, network may still not be ready because nm-online is called
# with "-s" which means to wait for NetworkManager startup to complete, rather
# than waiting for network connectivity specifically. Wait 10mins more for the
# network to be truely ready in these cases.
_loop=0
while [ $_loop -lt 600 ]; do
sleep 1
_loop=$((_loop + 1))
if _route=$(kdump_get_ip_route "$1" 2> /dev/null); then
printf "%s" "$_route"
return
else
dwarn "Waiting for network to be ready (${_loop}s / 10min)"
fi
done
derror "Oops. The network still isn't ready after waiting 10mins."
exit 1
}
get_host_ip()
{
if ! is_nfs_dump_target && ! is_ssh_dump_target; then
return 0
fi
_kdump_remote_ip=$(getarg kdump_remote_ip=)
if [ -z "$_kdump_remote_ip" ]; then
derror "failed to get remote IP address!"
return 1
fi
if ! _route=$(wait_online_network "$_kdump_remote_ip"); then
return 1
fi
_netdev=$(kdump_get_ip_route_field "$_route" "dev")
if ! _kdumpip=$(ip addr show dev "$_netdev" | grep '[ ]*inet'); then
derror "Failed to get IP of $_netdev"
return 1
fi
_kdumpip=$(echo "$_kdumpip" | head -n 1 | awk '{print $2}')
_kdumpip="${_kdumpip%%/*}"
HOST_IP=$_kdumpip
}
read_kdump_confs()
{
if [ ! -f "$KDUMP_CONFIG_FILE" ]; then
derror "$KDUMP_CONFIG_FILE not found"
return
fi
get_kdump_confs
# rescan for add code for dump target
while read -r config_opt config_val; do
# remove inline comments after the end of a directive.
case "$config_opt" in
dracut_args)
config_val=$(get_dracut_args_target "$config_val")
if [ -n "$config_val" ]; then
config_val=$(get_mntpoint_from_target "$config_val")
DUMP_INSTRUCTION="dump_fs $config_val"
fi
;;
ext[234] | xfs | btrfs | minix | nfs | virtiofs)
config_val=$(get_mntpoint_from_target "$config_val")
DUMP_INSTRUCTION="dump_fs $config_val"
;;
raw)
DUMP_INSTRUCTION="dump_raw $config_val"
;;
ssh)
DUMP_INSTRUCTION="dump_ssh $SSH_KEY_LOCATION $config_val"
;;
esac
done < "$KDUMP_CONF_PARSED"
}
fence_kdump_notify()
{
if [ -n "$FENCE_KDUMP_NODES" ]; then
# shellcheck disable=SC2086
$FENCE_KDUMP_SEND $FENCE_KDUMP_ARGS $FENCE_KDUMP_NODES &
fi
}
if [ "$1" = "--error-handler" ]; then
get_kdump_confs
do_failure_action
do_final_action
exit $?
fi
# continue here only if we have to save dump.
if [ -f /etc/fadump.initramfs ] && [ ! -f /proc/device-tree/rtas/ibm,kernel-dump ] && [ ! -f /proc/device-tree/ibm,opal/dump/mpipl-boot ]; then
exit 0
fi
read_kdump_confs
fence_kdump_notify
if ! get_host_ip; then
derror "get_host_ip exited with non-zero status!"
exit 1
fi
if [ -z "$DUMP_INSTRUCTION" ]; then
DUMP_INSTRUCTION="dump_fs $NEWROOT"
fi
if ! do_kdump_pre; then
derror "kdump_pre script exited with non-zero status!"
do_final_action
# During systemd service to reboot the machine, stop this shell script running
exit 1
fi
make_trace_mem "kdump saving vmcore" '1:shortmem' '2+:mem' '3+:slab'
do_dump
DUMP_RETVAL=$?
if ! do_kdump_post $DUMP_RETVAL; then
derror "kdump_post script exited with non-zero status!"
fi
save_log
if [ $DUMP_RETVAL -ne 0 ]; then
exit 1
fi
do_final_action