From ae70d628ff1ff8b26ce2ba1debd334191d78bd7d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 9 Nov 2021 16:50:18 -0800 Subject: [PATCH] zed: Control NVMe fault LEDs The ZED code currently can only turn on the fault LED for a faulted disk in a JBOD enclosure. This extends support for faulted NVMe disks as well. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #12648 Closes #12695 --- cmd/zed/zed.d/statechange-led.sh | 73 +++++++++- cmd/zed/zed.d/zed.rc | 4 +- cmd/zpool/zpool.d/ses | 8 +- lib/libzutil/os/linux/zutil_device_path_os.c | 146 ++++++++++++++++++- lib/libzutil/zutil_nicenum.c | 9 ++ 5 files changed, 229 insertions(+), 11 deletions(-) diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh index 0f9da3204317..26e6064fa94a 100755 --- a/cmd/zed/zed.d/statechange-led.sh +++ b/cmd/zed/zed.d/statechange-led.sh @@ -29,7 +29,8 @@ [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" -if [ ! -d /sys/class/enclosure ] ; then +if [ ! -d /sys/class/enclosure ] && [ ! -d /sys/bus/pci/slots ] ; then + # No JBOD enclosure or NVMe slots exit 1 fi @@ -92,6 +93,29 @@ check_and_set_led() done } +# Fault LEDs for JBODs and NVMe drives are handled a little differently. +# +# On JBODs the fault LED is called 'fault' and on a path like this: +# +# /sys/class/enclosure/0:0:1:0/SLOT 10/fault +# +# On NVMe it's called 'attention' and on a path like this: +# +# /sys/bus/pci/slots/0/attention +# +# This function returns the full path to the fault LED file for a given +# enclosure/slot directory. 
+# +path_to_led() +{ + dir=$1 + if [ -f "$dir/fault" ] ; then + echo "$dir/fault" + elif [ -f "$dir/attention" ] ; then + echo "$dir/attention" + fi +} + state_to_val() { state="$1" @@ -105,6 +129,38 @@ state_to_val() esac } +# +# Given an NVMe device name like 'nvme0n1', pass back its slot directory +# like "/sys/bus/pci/slots/0". Prints nothing if no slot matches. +# +nvme_dev_to_slot() +{ + dev="$1" + + # Get the address "0000:01:00.0" (quietly empty if the file is missing) + address=$(cat "/sys/class/block/$dev/device/address" 2>/dev/null) + + # For each /sys/bus/pci/slots subdir that is an actual number + # (rather than weird directories like "1-3/"). + # shellcheck disable=SC2010 + for i in $(ls /sys/bus/pci/slots/ | grep -E "^[0-9]+$") ; do + this_address=$(cat "/sys/bus/pci/slots/$i/address" 2>/dev/null) + + # The format of address is a little different between + # /sys/class/block/$dev/device/address and + # /sys/bus/pci/slots/ + # + # address= "0000:01:00.0" + # this_address = "0000:01:00" + # An empty this_address would match any address, so skip it. + if [ -n "$this_address" ] && echo "$address" | grep -Eq ^"$this_address" ; then + echo "/sys/bus/pci/slots/$i" + break + fi + done +} + + # process_pool (pool) # # Iterate through a pool and set the vdevs' enclosure slot LEDs to @@ -134,6 +190,11 @@ process_pool() # Get dev name (like 'sda') dev=$(basename "$(echo "$therest" | awk '{print $(NF-1)}')") vdev_enc_sysfs_path=$(realpath "/sys/class/block/$dev/device/enclosure_device"*) + if [ ! -d "$vdev_enc_sysfs_path" ] ; then + # This is not a JBOD disk, but it could be a PCI NVMe drive + vdev_enc_sysfs_path=$(nvme_dev_to_slot "$dev") + fi + current_val=$(echo "$therest" | awk '{print $NF}') if [ "$current_val" != "0" ] ; then @@ -145,9 +206,10 @@ process_pool() continue fi - if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then + led_path=$(path_to_led "$vdev_enc_sysfs_path") + if [ ! -e "$led_path" ] ; then rc=3 - zed_log_msg "vdev $vdev '$file/fault' doesn't exist" + zed_log_msg "vdev $vdev: no fault/attention LED in '$vdev_enc_sysfs_path'" continue fi @@ -158,7 +220,7 @@ process_pool() continue fi - if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then + if ! 
check_and_set_led "$led_path" "$val"; then rc=3 fi done @@ -169,7 +231,8 @@ if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; the # Got a statechange for an individual vdev val=$(state_to_val "$ZEVENT_VDEV_STATE_STR") vdev=$(basename "$ZEVENT_VDEV_PATH") - check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val" + ledpath=$(path_to_led "$ZEVENT_VDEV_ENC_SYSFS_PATH") + check_and_set_led "$ledpath" "$val" else # Process the entire pool poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID") diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 55da4c1ab30c..3bbd701f33fa 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -106,8 +106,8 @@ ## # Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for -# device mapper and multipath devices as well. Your enclosure must be -# supported by the Linux SES driver for this to work. +# device mapper and multipath devices as well. This works with JBOD enclosures +# and NVMe PCI drives (assuming they're supported by Linux in sysfs). # ZED_USE_ENCLOSURE_LEDS=1 diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses index f6b7520dfb6c..b1836d676528 100755 --- a/cmd/zpool/zpool.d/ses +++ b/cmd/zpool/zpool.d/ses @@ -41,7 +41,13 @@ for i in $scripts ; do val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) ;; fault_led) - val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + # JBODs fault LED is called 'fault', NVMe fault LED is called + # 'attention'. 
+ if [ -f "$VDEV_ENC_SYSFS_PATH/fault" ] ; then + val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + elif [ -f "$VDEV_ENC_SYSFS_PATH/attention" ] ; then + val=$(cat "$VDEV_ENC_SYSFS_PATH/attention" 2>/dev/null) + fi ;; locate_led) val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null) diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c index 2a6f4ae2a222..13f8bd031612 100644 --- a/lib/libzutil/os/linux/zutil_device_path_os.c +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -154,18 +154,148 @@ zfs_strip_path(char *path) return (strrchr(path, '/') + 1); } +/* + * Read the first line of a sysfs file into an allocated buffer and remove the + * trailing newline. + * + * This is useful for reading sysfs files that return a single string. Return + * an allocated string pointer on success, NULL otherwise. Returned buffer + * must be freed by the user. + */ +static char * +zfs_read_sysfs_file(char *filepath) +{ + char buf[4096]; /* all sysfs files report 4k size */ + char *str = NULL; + + FILE *fp = fopen(filepath, "r"); + if (fp == NULL) { + return (NULL); + } + if (fgets(buf, sizeof (buf), fp) == buf) { + /* success */ + + /* Remove the trailing newline (if any); len may be 0 */ + size_t len = strlen(buf); + if (len > 0 && buf[len - 1] == '\n') { + buf[len - 1] = '\0'; + } + str = strdup(buf); + } + + fclose(fp); + + return (str); +} + +/* + * Given a dev name like "nvme0n1", return the full PCI slot sysfs path to + * the drive (in /sys/bus/pci/slots). + * + * For example: + * dev_name: "nvme0n1" + * returns: "/sys/bus/pci/slots/0" + * + * 'dev_name' must be an NVMe device. + * + * Returned string must be freed. Returns NULL on error or no sysfs path. 
+ */ +static char * +zfs_get_pci_slots_sys_path(const char *dev_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char *address1 = NULL; + char *address2 = NULL; + char *path = NULL; + char buf[MAXPATHLEN]; + char *tmp; + + /* If they preface 'dev_name' with a path (like "/dev") then strip it off */ + tmp = strrchr(dev_name, '/'); + if (tmp != NULL) + dev_name = tmp + 1; /* +1 since we want the chr after '/' */ + + if (strncmp("nvme", dev_name, 4) != 0) + return (NULL); + + (void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address", + dev_name); + + address1 = zfs_read_sysfs_file(buf); + if (!address1) + return (NULL); + + /* + * /sys/block/nvme0n1/device/address format will + * be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be + * "0000:01:00". Just NUL terminate at the '.' so they match. + */ + tmp = strrchr(address1, '.'); + if (tmp != NULL) + *tmp = '\0'; + + dp = opendir("/sys/bus/pci/slots/"); + if (dp == NULL) { + free(address1); + return (NULL); + } + + /* + * Look through all the /sys/bus/pci/slots/ subdirs + */ + while ((ep = readdir(dp))) { + /* + * We only care about directory names that are a single number. + * Sometimes there's other directories like + * "/sys/bus/pci/slots/0-3/" in there - skip those. + */ + if (!zfs_isnumber(ep->d_name)) + continue; + + (void) snprintf(buf, sizeof (buf), + "/sys/bus/pci/slots/%s/address", ep->d_name); + + address2 = zfs_read_sysfs_file(buf); + if (!address2) + continue; + + if (strcmp(address1, address2) == 0) { + /* Addresses match, we're all done */ + free(address2); + if (asprintf(&path, "/sys/bus/pci/slots/%s", + ep->d_name) == -1) { + path = NULL; /* asprintf() leaves it undefined */ + continue; + } + break; + } + free(address2); + } + + closedir(dp); + free(address1); + + return (path); +} + /* * Given a dev name like "sda", return the full enclosure sysfs path to * the disk. You can also pass in the name with "/dev" prepended - * to it (like /dev/sda). + * to it (like /dev/sda). This works for both JBODs and NVMe PCI devices. 
* * For example, disk "sda" in enclosure slot 1: - * dev: "sda" + * dev_name: "sda" * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" * + * Or: + * + * dev_name: "nvme0n1" + * returns: "/sys/bus/pci/slots/0" + * * 'dev' must be a non-devicemapper device. * - * Returned string must be freed. + * Returned string must be freed. Returns NULL on error. */ char * zfs_get_enclosure_sysfs_path(const char *dev_name) @@ -252,6 +382,16 @@ zfs_get_enclosure_sysfs_path(const char *dev_name) if (dp != NULL) closedir(dp); + if (!path) { + /* + * This particular disk isn't in a JBOD. It could be an NVMe + * drive. If so, look up the NVMe device's path in + * /sys/bus/pci/slots/. Within that directory is an 'attention' + * file which controls the NVMe fault LED. + */ + path = zfs_get_pci_slots_sys_path(dev_name); + } + return (path); } diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c index 1a19db0dfebc..4dcac1f855ff 100644 --- a/lib/libzutil/zutil_nicenum.c +++ b/lib/libzutil/zutil_nicenum.c @@ -27,6 +27,7 @@ #include #include #include +#include <string.h> /* * Return B_TRUE if "str" is a number string, B_FALSE otherwise. @@ -42,6 +43,14 @@ zfs_isnumber(const char *str) if (!(isdigit(*str) || (*str == '.'))) return (B_FALSE); + /* + * Numbers should not end with a period ("." ".." or "5." are + * not valid) + */ + if (str[strlen(str) - 1] == '.') { + return (B_FALSE); + } + return (B_TRUE); }