From 145a3d997a9f3423f6a483a7b3025bd3a68e2585 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 26 Jan 2022 16:51:40 -0800 Subject: [PATCH] zed: Misc multipath autoreplace fixes We recently had a case where our operators replaced a bad multipathed disk, only to see it fail to autoreplace. The zed logs showed that the multipath replacement disk did not pass the 'is_dm' test in zfs_process_add() even though it should have. is_dm is set if there exists a sysfs entry for to the underlying /dev/sd* paths for the multipath disk. It's possible this path didn't exist due to a race condition where the sysfs paths wern't created at the time the udev event came in to zed, but this was never verified. This patch updates the check to look for udev properties that indicate if the new autoreplace disk is an empty multipath disk, rather than looking for the underlying sysfs entries. It also adds in additional logging, and fixes a bug where zed allowed you to use an already zfs-formatted disk from another pool as a multipath auto-replacement disk. Furthermore, while testing this patch, I also ran across a case where a force-faulted disk did not have a ZPOOL_CONFIG_PHYS_PATH entry in its config. This prevented it from being autoreplaced. I added additional logic to derive the PHYS_PATH from the PATH if the PATH was a /dev/disk/by-vdev/ path. For example, if PATH was /dev/disk/by-vdev/L28, then PHYS_PATH would be L28. This is safe since by-vdev paths represent physical locations and do not change between boots. Signed-off-by: Tony Hutter --- cmd/zed/agents/zfs_mod.c | 171 +++++++++++++++++-- cmd/zed/zed_disk_event.c | 10 ++ lib/libzutil/os/linux/zutil_device_path_os.c | 38 +++-- 3 files changed, 194 insertions(+), 25 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index c62a976c2e3f..064051c59fac 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -183,7 +183,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) nvlist_t *nvroot, *newvd; pendingdev_t *device; uint64_t wholedisk = 0ULL; - uint64_t offline = 0ULL; + uint64_t offline = 0ULL, faulted = 0ULL; uint64_t guid = 0ULL; char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; char rawpath[PATH_MAX], fullpath[PATH_MAX]; @@ -191,6 +191,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) int ret; boolean_t is_dm = B_FALSE; boolean_t is_sd = B_FALSE; + boolean_t is_mpath_wholedisk = B_FALSE; uint_t c; vdev_stat_t *vs; @@ -211,15 +212,73 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) &enc_sysfs_path); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); - if (offline) - return; /* don't intervene if it was taken offline */ + /* + * Special case: + * + * We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH + * entry in their config. For example, on this force-faulted disk: + * + * children[0]: + * type: 'disk' + * id: 0 + * guid: 14309659774640089719 + * path: '/dev/disk/by-vdev/L28' + * whole_disk: 0 + * DTL: 654 + * create_txg: 4 + * com.delphix:vdev_zap_leaf: 1161 + * faulted: 1 + * aux_state: 'external' + * children[1]: + * type: 'disk' + * id: 1 + * guid: 16002508084177980912 + * path: '/dev/disk/by-vdev/L29' + * devid: 'dm-uuid-mpath-35000c500a61d68a3' + * phys_path: 'L29' + * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' + * whole_disk: 0 + * DTL: 1028 + * create_txg: 4 + * com.delphix:vdev_zap_leaf: 131 + * + * If the disk's path is a /dev/disk/by-vdev/ path, then we can infer + * the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name. + */ + if (physpath == NULL && path != NULL) { + /* If path begins with "/dev/disk/by-vdev/" ... */ + if (strstr(path, "/dev/disk/by-vdev/") == path) { + /* Set physpath to the char after "/dev/disk/by-vdev" */ + physpath = &path[strlen("/dev/disk/by-vdev/")]; + } + } + + /* + * We don't want to autoreplace offlined disks. However, we do want to + * replace force-faulted disks (`zpool offline -f`). Force-faulted + * disks have both offline=1 and faulted=1 in the nvlist. + */ + if (offline && !faulted) { + zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace", + __func__, path); + return; + } is_dm = zfs_dev_is_dm(path); + is_mpath_wholedisk = is_mpath_whole_disk(path); zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" - " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path, - physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not", + " %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', " + "(guid %llu)", + zpool_get_name(zhp), path, + physpath ? physpath : "NULL", + wholedisk ? "is" : "not", + is_mpath_wholedisk? "is" : "not", + labeled ? "is" : "not", + enc_sysfs_path, (long long unsigned int)guid); /* @@ -253,8 +312,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && (newstate == VDEV_STATE_HEALTHY || newstate == VDEV_STATE_DEGRADED)) { - zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s", - fullpath, (newstate == VDEV_STATE_HEALTHY) ? + zed_log_msg(LOG_INFO, + " zpool_vdev_online: vdev '%s' ('%s') is " + "%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ? "HEALTHY" : "DEGRADED"); return; } @@ -271,11 +331,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) * vdev online to trigger a FMA fault by posting an ereport. */ if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || - !(wholedisk || is_dm) || (physpath == NULL)) { + !(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) { (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, &newstate); zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " - "not a whole disk for '%s'", fullpath); + "not a blank disk for '%s' ('%s')", fullpath, + physpath); return; } @@ -303,6 +364,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && (vs->vs_state != VDEV_STATE_CANT_OPEN)) { + zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " + "a bad state (currently %d)\n", vs->vs_state); return; } @@ -522,8 +585,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) * the dp->dd_compare value. */ if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || - strcmp(dp->dd_compare, path) != 0) + strcmp(dp->dd_compare, path) != 0) { + zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)", + __func__, dp->dd_compare, path); return; + } zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", dp->dd_prop, path); @@ -571,6 +637,8 @@ zfs_iter_pool(zpool_handle_t *zhp, void *data) ZPOOL_CONFIG_VDEV_TREE, &nvl); zfs_iter_vdev(zhp, nvl, data); } + } else { + zed_log_msg(LOG_INFO, "%s: no config\n", __func__); } /* @@ -619,6 +687,71 @@ devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, return (data.dd_found); } +/* + * Given a device identifier, find any vdevs with a matching by-vdev + * path. Normally we shouldn't need this as the comparison would be + * made earlier in the devphys_iter(). For example, if we were replacing + * /dev/disk/by-vdev/L28, normally devphys_iter() would match the + * ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28" + * of the new disk config. However, we've seen cases where + * ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's + * an example of a real 2-disk mirror pool where one disk was force + * faulted: + * + * com.delphix:vdev_zap_top: 129 + * children[0]: + * type: 'disk' + * id: 0 + * guid: 14309659774640089719 + * path: '/dev/disk/by-vdev/L28' + * whole_disk: 0 + * DTL: 654 + * create_txg: 4 + * com.delphix:vdev_zap_leaf: 1161 + * faulted: 1 + * aux_state: 'external' + * children[1]: + * type: 'disk' + * id: 1 + * guid: 16002508084177980912 + * path: '/dev/disk/by-vdev/L29' + * devid: 'dm-uuid-mpath-35000c500a61d68a3' + * phys_path: 'L29' + * vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32' + * whole_disk: 0 + * DTL: 1028 + * create_txg: 4 + * com.delphix:vdev_zap_leaf: 131 + * + * So in the case above, the only thing we could compare is the path. + * + * We can do this because we assume by-vdev paths are authoritative as physical + * paths. We could not assume this for normal paths like /dev/sda since the + * physical location /dev/sda points to could change over time. + */ +static boolean_t +by_vdev_path_iter(const char *by_vdev_path, const char *devid, + zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = by_vdev_path; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_PATH; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + if (strstr(by_vdev_path, "/dev/disk/by-vdev") != by_vdev_path) { + /* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */ + return (B_FALSE); + } + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + /* * Given a device identifier, find any vdevs with a matching devid. * On Linux we can match devid directly which is always a whole disk. @@ -683,15 +816,17 @@ guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid, static int zfs_deliver_add(nvlist_t *nvl) { - char *devpath = NULL, *devid; + char *devpath = NULL, *devid = NULL; uint64_t pool_guid = 0, vdev_guid = 0; boolean_t is_slice; /* * Expecting a devid string and an optional physical location and guid */ - if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) { + zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__); return (-1); + } (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); @@ -707,6 +842,8 @@ zfs_deliver_add(nvlist_t *nvl) * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). * 3. ZPOOL_CONFIG_GUID (identifies unique vdev). + * 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since + * by-vdev paths represent physical paths). */ if (devid_iter(devid, zfs_process_add, is_slice)) return (0); @@ -717,6 +854,16 @@ zfs_deliver_add(nvlist_t *nvl) (void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add, is_slice); + if (devpath != NULL) { + /* Can we match a /dev/disk/by-vdev/ path? */ + char by_vdev_path[MAXPATHLEN]; + snprintf(by_vdev_path, sizeof (by_vdev_path), + "/dev/disk/by-vdev/%s", devpath); + if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add, + is_slice)) + return (0); + } + return (0); } diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 94e24236063c..52b80d8c4c93 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -215,6 +215,11 @@ zed_udev_monitor(void *arg) if (type != NULL && type[0] != '\0' && strcmp(type, "disk") == 0 && part != NULL && part[0] != '\0') { + zed_log_msg(LOG_INFO, + "%s: skip %s since it has a %s partition already", + __func__, + udev_device_get_property_value(dev, "DEVNAME"), + part); /* skip and wait for partition event */ udev_device_unref(dev); continue; @@ -229,6 +234,11 @@ zed_udev_monitor(void *arg) sectors = udev_device_get_sysattr_value(dev, "size"); if (sectors != NULL && strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) { + zed_log_msg(LOG_INFO, + "%s: %s sectors %s < %llu (minimum)", + __func__, + udev_device_get_property_value(dev, "DEVNAME"), + sectors, MINIMUM_SECTORS); udev_device_unref(dev); continue; } diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c index 17247233bcf2..31847df26fed 100644 --- a/lib/libzutil/os/linux/zutil_device_path_os.c +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -529,17 +529,19 @@ zfs_dev_is_whole_disk(const char *dev_name) { struct dk_gpt *label; int fd; + int rc; - if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) + if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0) { return (B_FALSE); + } + (void) close(fd); - if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { - (void) close(fd); + rc = efi_alloc_and_init(fd, EFI_NUMPAR, &label); + if (rc != 0) { return (B_FALSE); } efi_free(label); - (void) close(fd); return (B_TRUE); } @@ -613,22 +615,27 @@ zfs_get_underlying_path(const char *dev_name) /* * A disk is considered a multipath whole disk when: * DEVNAME key value has "dm-" - * DM_NAME key value has "mpath" prefix + * MPATH_DEVICE_READY is present * DM_UUID key exists * ID_PART_TABLE_TYPE key does not exist or is not gpt + * ID_FS_LABEL key does not exist (disk isn't labeled) */ static boolean_t -udev_mpath_whole_disk(struct udev_device *dev) +is_mpath_udev_sane(struct udev_device *dev) { - const char *devname, *type, *uuid; + const char *devname, *type, *uuid, *label, *mpath_ready; devname = udev_device_get_property_value(dev, "DEVNAME"); type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); uuid = udev_device_get_property_value(dev, "DM_UUID"); + label = udev_device_get_property_value(dev, "ID_FS_LABEL"); + mpath_ready = udev_device_get_property_value(dev, "MPATH_DEVICE_READY"); if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && ((type == NULL) || (strcmp(type, "gpt") != 0)) && - (uuid != NULL)) { + (uuid != NULL) && + (label == NULL) && + (mpath_ready != NULL && strncmp(mpath_ready, "1", 1) == 0)) { return (B_TRUE); } @@ -636,7 +643,11 @@ udev_mpath_whole_disk(struct udev_device *dev) } /* - * Check if a disk is effectively a multipath whole disk + * Check if a disk is a mutipath "blank" disk: + * + * 1. The disk has udev values that suggest it's a multipath disk + * 2. The disk is not currently labeled with a filesystem of any type + * 3. There are no partitions on the disk */ boolean_t is_mpath_whole_disk(const char *path) @@ -645,7 +656,7 @@ is_mpath_whole_disk(const char *path) struct udev_device *dev = NULL; char nodepath[MAXPATHLEN]; char *sysname; - boolean_t wholedisk = B_FALSE; + boolean_t is_sane; if (realpath(path, nodepath) == NULL) return (B_FALSE); @@ -660,10 +671,11 @@ is_mpath_whole_disk(const char *path) return (B_FALSE); } - wholedisk = udev_mpath_whole_disk(dev); - + /* Sanity check some udev values */ + is_sane = is_mpath_udev_sane(dev); udev_device_unref(dev); - return (wholedisk); + + return (is_sane); } #else /* HAVE_LIBUDEV */