From 59a7881e4738ced1f1fe3f82244f570cc101e413 Mon Sep 17 00:00:00 2001 From: Cameron Harr Date: Tue, 16 Jan 2024 17:59:29 -0800 Subject: [PATCH] Add 'zpool status -e' flag to see unhealthy vdevs When very large pools are present, it can be laborious to find the reason a pool is degraded and/or where an unhealthy vdev is. This option filters out vdevs that are ONLINE and have no errors to make it easier to see where the issues are. The root vdev and the parents of unhealthy vdevs are always printed. Testing: ZFS errors and drive failures for multiple vdevs were simulated with zinject. Sample vdev listings with '-e' option - All vdevs healthy NAME STATE READ WRITE CKSUM iron5 ONLINE 0 0 0 - ZFS errors NAME STATE READ WRITE CKSUM iron5 ONLINE 0 0 0 raidz2-5 ONLINE 1 0 0 L23 ONLINE 1 0 0 L24 ONLINE 1 0 0 L37 ONLINE 1 0 0 - Vdev faulted NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-6 DEGRADED 0 0 0 L67 FAULTED 0 0 0 too many errors - Vdev faults and data errors NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-1 DEGRADED 0 0 0 L2 FAULTED 0 0 0 too many errors raidz2-5 ONLINE 1 0 0 L23 ONLINE 1 0 0 L24 ONLINE 1 0 0 L37 ONLINE 1 0 0 raidz2-6 DEGRADED 0 0 0 L67 FAULTED 0 0 0 too many errors - Vdev missing NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-6 DEGRADED 0 0 0 L67 UNAVAIL 3 1 0 - Slow devices when -s provided with -e NAME STATE READ WRITE CKSUM SLOW iron5 DEGRADED 0 0 0 - raidz2-5 DEGRADED 0 0 0 - L10 FAULTED 0 0 0 0 external device fault L51 ONLINE 0 0 0 14 Signed-off-by: Cameron Harr --- cmd/zpool/zpool_main.c | 58 +++++++++++++- man/man8/zpool-status.8 | 4 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_status/zpool_status_002_pos.ksh | 4 +- .../zpool_status/zpool_status_003_pos.ksh | 2 + .../zpool_status/zpool_status_008_pos.ksh | 78 +++++++++++++++++++ 7 files changed, 143 insertions(+), 7 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh 
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 11486f3f185e..8753d7263914 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2161,6 +2161,7 @@ typedef struct status_cbdata { boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; + boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; @@ -2357,6 +2358,35 @@ health_str_to_color(const char *health) return (NULL); } +/* + * Called for each leaf vdev. Returns 0 if the vdev is healthy. + * A vdev is unhealthy if any of the following are true: + * 1) there are read, write, or checksum errors, + * 2) its state is not ONLINE, or + * 3) slow IO reporting was requested (-s) and there are slow IOs. + */ +static int +vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) +{ + status_cbdata_t *cb = data; + vdev_stat_t *vs; + uint_t vsc; + (void) hdl_data; + + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) != 0) + return (1); + + if (vs->vs_checksum_errors || vs->vs_read_errors || + vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) + return (1); + + if (cb->cb_print_slow_ios && vs->vs_slow_ios) + return (1); + + return (0); +} + /* * Print out configuration state as requested by status_callback. */ @@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, const char *state; const char *type; const char *path = NULL; - const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, + *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, state = gettext("AVAIL"); } + /* + * If '-e' is specified then top-level vdevs and their children + * can be pruned if all of their leaves are healthy. 
+ */ + if (cb->cb_print_unhealthy && depth > 0 && + for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { + return; + } + printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); @@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_checksum_errors) ccolor = ANSI_RED; + if (vs->vs_slow_ios) + scolor = ANSI_BLUE; + if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", @@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } if (cb->cb_literal) - printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + printf_color(scolor, " %5llu", + (u_longlong_t)vs->vs_slow_ios); else - printf(" %5s", rbuf); + printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { @@ -9106,9 +9150,11 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { + color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); + color_end(); } else { print_error_log(zhp); } @@ -9129,6 +9175,7 @@ status_callback(zpool_handle_t *zhp, void *data) * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. 
@@ -9160,7 +9207,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9187,6 +9234,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'e': + cb.cb_print_unhealthy = B_TRUE; + break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 56fa4aed057b..24ad6e643cae 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DigLpPstvx +.Op Fl DeigLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,6 +69,8 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl e +Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl i Display vdev initialization status. .It Fl g diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7e0990b5d9f9..951239f4111d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -536,7 +536,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', - 'zpool_status_007_pos', 'zpool_status_features_001_pos'] + 'zpool_status_007_pos', 'zpool_status_008_pos', + 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4040e60434a7..9092bc47a20e 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1239,6 +1239,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ 
functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh index 3bdd7db649f9..d6f32cdc7ac6 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh @@ -51,7 +51,7 @@ else fi set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \ - "-vx $testpool" + "-vx $testpool" "-e $testpool" "-es $testpool" log_assert "Executing 'zpool status' with correct options succeeds" @@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do (( i = i + 1 )) done +cleanup + log_pass "'zpool status' with correct options succeeded" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh index b501aac5ad6d..52b22dd833f0 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -37,6 +37,7 @@ # 3. Read the file # 4. Take a snapshot and make a clone # 5. 
Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' +# and 'zpool status -ev' function cleanup { @@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'" +log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'" log_mustnot eval "zpool status -v | grep '$TESTFS1'" log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh new file mode 100755 index 000000000000..65acb2bb592c --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool status -e' only shows unhealthy devices. +# +# STRATEGY: +# 1. Create zpool +# 2. Force DEGRADE vdev +# 3. 
Verify only the DEGRADED vdev and parents show +# + +function mkvdev +{ + for vdev in {1..$children};do + truncate -s $MINVDEVSIZE $TESTDIR/vdev$vdev + done +} + +function cleanup +{ + log_must zinject -c all + datasetexists $TESTPOOL2 && log_must zpool destroy $TESTPOOL2 + rm -f $TESTDIR/vdev_a +} + +log_assert "Verify 'zpool -e'" + +for raid_type in "draid" "raidz1"; do + + parity=1 + data=4 + spare=1 + children=6 + + if [[ "$raid_type" = "draid" ]]; then + raidfmt="draid${parity}:${data}d:${children}c:${spare}s" + else + raidfmt="raidz${parity}" + fi + + log_must mkdir -p $TESTDIR + log_must eval mkvdev + log_must eval "zpool create -f -m /$TESTPOOL2 $TESTPOOL2 $raidfmt " \ + "$TESTDIR/vdev1 $TESTDIR/vdev2 $TESTDIR/vdev3" \ + "$TESTDIR/vdev4 $TESTDIR/vdev5" "$TESTDIR/vdev6" + + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE" + log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2 + log_must eval "zpool status $TESTPOOL2" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "DEGRADED" + log_must eval "zpool status -e $TESTPOOL2" + log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" + cleanup +done + +log_pass "Verify zpool status -e shows only unhealthy vdevs"