Add 'zpool status -e' flag to see unhealthy vdevs #15769

Merged 1 commit on Feb 7, 2024
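As a quick illustration of the new flag (hypothetical pool and device names, output trimmed; not taken from this PR's tests):

# Show only vdevs that are not ONLINE or that have read, write, or checksum
# errors; subtrees whose leaves are all healthy are pruned from the listing.
$ zpool status -e tank
  NAME        STATE     READ WRITE CKSUM
  tank        DEGRADED     0     0     0
    raidz2-0  DEGRADED     0     0     0
      sdd     FAULTED      0     0     0  too many errors

# Add -s to also treat vdevs with slow I/Os as unhealthy.
$ zpool status -es tank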
58 changes: 54 additions & 4 deletions cmd/zpool/zpool_main.c
@@ -2161,6 +2161,7 @@ typedef struct status_cbdata {
boolean_t cb_explain;
boolean_t cb_first;
boolean_t cb_dedup_stats;
boolean_t cb_print_unhealthy;
boolean_t cb_print_status;
boolean_t cb_print_slow_ios;
boolean_t cb_print_vdev_init;
@@ -2357,6 +2358,35 @@ health_str_to_color(const char *health)
return (NULL);
}

/*
* Called for each leaf vdev. Returns 0 if the vdev is healthy.
* A vdev is unhealthy if any of the following are true:
* 1) there are read, write, or checksum errors,
* 2) its state is not ONLINE, or
* 3) slow IO reporting was requested (-s) and there are slow IOs.
*/
static int
vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data)
{
status_cbdata_t *cb = data;
vdev_stat_t *vs;
uint_t vsc;
(void) hdl_data;

if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
(uint64_t **)&vs, &vsc) != 0)
return (1);

if (vs->vs_checksum_errors || vs->vs_read_errors ||
vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY)
return (1);

if (cb->cb_print_slow_ios && vs->vs_slow_ios)
return (1);

return (0);
}

/*
* Print out configuration state as requested by status_callback.
*/
@@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
const char *state;
const char *type;
const char *path = NULL;
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL;
const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL,
*scolor = NULL;

if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0)
@@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
state = gettext("AVAIL");
}

/*
* If '-e' is specified then top-level vdevs and their children
* can be pruned if all of their leaves are healthy.
*/
if (cb->cb_print_unhealthy && depth > 0 &&
for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) {
return;
}

printf_color(health_str_to_color(state),
"\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth,
name, state);
@@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
if (vs->vs_checksum_errors)
ccolor = ANSI_RED;

if (vs->vs_slow_ios)
scolor = ANSI_BLUE;

if (cb->cb_literal) {
fputc(' ', stdout);
printf_color(rcolor, "%5llu",
@@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}

if (cb->cb_literal)
printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
printf_color(scolor, " %5llu",
(u_longlong_t)vs->vs_slow_ios);
else
printf(" %5s", rbuf);
printf_color(scolor, " %5s", rbuf);
}
if (cb->cb_print_power) {
if (children == 0) {
@@ -9106,9 +9150,11 @@ status_callback(zpool_handle_t *zhp, void *data)
(void) printf(gettext(
"errors: No known data errors\n"));
} else if (!cbp->cb_verbose) {
color_start(ANSI_RED);
(void) printf(gettext("errors: %llu data "
"errors, use '-v' for a list\n"),
(u_longlong_t)nerr);
color_end();
} else {
print_error_log(zhp);
}
@@ -9129,6 +9175,7 @@ status_callback(zpool_handle_t *zhp, void *data)
* [pool] [interval [count]]
*
* -c CMD For each vdev, run command CMD
* -e Display only unhealthy vdevs
* -i Display vdev initialization status.
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
@@ -9160,7 +9207,7 @@ zpool_do_status(int argc, char **argv)
};

/* check options */
while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options,
while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
NULL)) != -1) {
switch (c) {
case 'c':
@@ -9187,6 +9234,9 @@ }
}
cmd = optarg;
break;
case 'e':
cb.cb_print_unhealthy = B_TRUE;
break;
case 'i':
cb.cb_print_vdev_init = B_TRUE;
break;
4 changes: 3 additions & 1 deletion man/man8/zpool-status.8
@@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm status
.Op Fl DigLpPstvx
.Op Fl DeigLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns
.Oo Ar pool Oc Ns
@@ -69,6 +69,8 @@ See the
option of
.Nm zpool Cm iostat
for complete details.
.It Fl e
Only show unhealthy vdevs (not-ONLINE or with errors).
.It Fl i
Display vdev initialization status.
.It Fl g
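An illustrative aside on how -e composes with -s (not part of the diff): a vdev whose only symptom is slow I/Os is still pruned by -e alone, because vdev_health_check_cb above only counts slow I/Os when cb_print_slow_ios is set.

$ zpool status -e tank     # a slow but otherwise ONLINE vdev stays hidden
$ zpool status -es tank    # the same vdev is listed, with its slow I/O count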
3 changes: 2 additions & 1 deletion tests/runfiles/common.run
@@ -536,7 +536,8 @@ tags = ['functional', 'cli_root', 'zpool_split']
tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
'zpool_status_003_pos', 'zpool_status_004_pos',
'zpool_status_005_pos', 'zpool_status_006_pos',
'zpool_status_007_pos', 'zpool_status_features_001_pos']
'zpool_status_007_pos', 'zpool_status_008_pos',
'zpool_status_features_001_pos']
tags = ['functional', 'cli_root', 'zpool_status']

[tests/functional/cli_root/zpool_sync]
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/Makefile.am
@@ -1239,6 +1239,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
functional/cli_root/zpool_sync/cleanup.ksh \
functional/cli_root/zpool_sync/setup.ksh \
@@ -51,7 +51,7 @@ else
fi

set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \
"-vx $testpool"
"-vx $testpool" "-e $testpool" "-es $testpool"

log_assert "Executing 'zpool status' with correct options succeeds"

@@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do
(( i = i + 1 ))
done

cleanup

log_pass "'zpool status' with correct options succeeded"
@@ -37,6 +37,7 @@
# 3. Read the file
# 4. Take a snapshot and make a clone
# 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v'
# and 'zpool status -ev'

function cleanup
{
@@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2
log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'"
log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'"
log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'"
log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'"
log_mustnot eval "zpool status -v | grep '$TESTFS1'"

log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone"
104 changes: 104 additions & 0 deletions tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh
@@ -0,0 +1,104 @@
#!/bin/ksh -p

#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#

#
# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
#

. $STF_SUITE/include/libtest.shlib

#
# DESCRIPTION:
# Verify 'zpool status -e' only shows unhealthy devices.
#
# STRATEGY:
# 1. Create zpool
# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs
# 3. Verify vdevs are reported correctly with -e and -s
# 4. Verify parents are reported as DEGRADED
# 5. Verify healthy children are not reported
#

function cleanup
{
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
zinject -c all
poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
log_must rm -f $all_vdevs
}

log_assert "Verify 'zpool status -e'"

log_onexit cleanup

all_vdevs=$(echo $TESTDIR/vdev{1..6})
log_must mkdir -p $TESTDIR
log_must truncate -s $MINVDEVSIZE $all_vdevs

OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)

for raid_type in "draid2:3d:6c:1s" "raidz2"; do

log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs

# Check DEGRADED vdevs are shown.
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE"
log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED"

# Check FAULTED vdevs are shown.
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE"
log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED"

# Check no ONLINE vdevs are shown
log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE"

# Check no ONLINE slow vdevs are shown. Then mark IOs greater than
# 10ms slow, delay IOs 20ms to vdev6, check slow IOs.
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE"
log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE"

log_must set_tunable64 ZIO_SLOW_IO_MS 10
log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2
log_must mkfile 1048576 /$TESTPOOL2/testfile
sync_pool $TESTPOOL2
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO

# Check vdev6 slow IOs are only shown when requested with -s.
log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"
log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE"

# Pool level and top-vdev level status must be DEGRADED.
log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED"
log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED"

# Check that healthy vdevs[1-3] aren't shown with -e.
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE"
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE"
log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE"
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE"
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE"
log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE"

log_must zinject -c all
log_must zpool status -es $TESTPOOL2

zpool destroy $TESTPOOL2
done

log_pass "Verify zpool status -e shows only unhealthy vdevs"