Skip to content

Commit

Permalink
zed: Misc multipath autoreplace fixes
Browse files Browse the repository at this point in the history
We recently had a case where our operators replaced a bad
multipathed disk, only to see it fail to autoreplace.  The
zed logs showed that the multipath replacement disk did not pass
the 'is_dm' test in zfs_process_add() even though it should have.
is_dm is set if there exists a sysfs entry for to the
underlying /dev/sd* paths for the multipath disk.  It's
possible this path didn't exist due to a race condition where
the sysfs paths wern't created at the time the udev event came
in to zed, but this was never verified.

This patch updates the check to look for udev properties that
indicate if the new autoreplace disk is an empty multipath disk,
rather than looking for the underlying sysfs entries. It also
adds in additional logging, and fixes a bug where zed allowed
you to use an already zfs-formatted disk from another pool
as a multipath auto-replacement disk.

Signed-off-by: Tony Hutter <hutter2@llnl.gov>
  • Loading branch information
tonyhutter committed Jan 27, 2022
1 parent 4372e96 commit e696eb7
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 24 deletions.
30 changes: 22 additions & 8 deletions cmd/zed/agents/zfs_mod.c
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
int ret;
boolean_t is_dm = B_FALSE;
boolean_t is_sd = B_FALSE;
boolean_t is_mpath_wholedisk = B_FALSE;
uint_t c;
vdev_stat_t *vs;

Expand All @@ -217,9 +218,16 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
return; /* don't intervene if it was taken offline */

is_dm = zfs_dev_is_dm(path);
is_mpath_wholedisk = is_mpath_whole_disk(path);
zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
" wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
" %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
"(guid %llu)",
zpool_get_name(zhp), path,
physpath ? physpath : "NULL",
wholedisk ? "is" : "not",
is_mpath_wholedisk? "is" : "not",
labeled ? "is" : "not",
enc_sysfs_path,
(long long unsigned int)guid);

/*
Expand Down Expand Up @@ -253,8 +261,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
(newstate == VDEV_STATE_HEALTHY ||
newstate == VDEV_STATE_DEGRADED)) {
zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
fullpath, (newstate == VDEV_STATE_HEALTHY) ?
zed_log_msg(LOG_INFO,
" zpool_vdev_online: vdev '%s' ('%s') is "
"%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
"HEALTHY" : "DEGRADED");
return;
}
Expand All @@ -271,11 +280,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* vdev online to trigger a FMA fault by posting an ereport.
*/
if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
!(wholedisk || is_dm) || (physpath == NULL)) {
!(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
&newstate);
zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
"not a whole disk for '%s'", fullpath);
"not a blank disk for '%s' ('%s')", fullpath,
physpath);
return;
}

Expand Down Expand Up @@ -303,6 +313,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
(vs->vs_state != VDEV_STATE_FAULTED) &&
(vs->vs_state != VDEV_STATE_CANT_OPEN)) {
zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in "
"a bad state (currently %d)\n", vs->vs_state);
return;
}

Expand Down Expand Up @@ -683,15 +695,17 @@ guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
static int
zfs_deliver_add(nvlist_t *nvl)
{
char *devpath = NULL, *devid;
char *devpath = NULL, *devid = NULL;
uint64_t pool_guid = 0, vdev_guid = 0;
boolean_t is_slice;

/*
* Expecting a devid string and an optional physical location and guid
*/
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
return (-1);
}

(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/efi_partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,8 @@ struct partition64 {
#ifndef _KERNEL
_SYS_EFI_PARTITION_H int efi_debug;
_SYS_EFI_PARTITION_H int efi_alloc_and_init(int, uint32_t, struct dk_gpt **);
_SYS_EFI_PARTITION_H int efi_alloc_and_init_skip_check(int, uint32_t,
struct dk_gpt **);
_SYS_EFI_PARTITION_H int efi_alloc_and_read(int, struct dk_gpt **);
_SYS_EFI_PARTITION_H int efi_write(int, struct dk_gpt *);
_SYS_EFI_PARTITION_H int efi_rescan(int);
Expand Down
28 changes: 24 additions & 4 deletions lib/libefi/rdwr_efi.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,9 @@ efi_get_info(int fd, struct dk_cinfo *dki_info)
#define MAX_PARTS ((4294967295UL - sizeof (struct dk_gpt)) / \
sizeof (struct dk_part))

int
efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
static int
efi_alloc_and_init_impl(int fd, uint32_t nparts, struct dk_gpt **vtoc,
int skip_disk_check)
{
diskaddr_t capacity = 0;
uint_t lbsize = 0;
Expand All @@ -351,9 +352,10 @@ efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
if (dki_info.dki_partition != 0)
return (-1);

if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) ||
if (!skip_disk_check &&
((dki_info.dki_ctype == DKC_PCMCIA_MEM) ||
(dki_info.dki_ctype == DKC_VBD) ||
(dki_info.dki_ctype == DKC_UNKNOWN))
(dki_info.dki_ctype == DKC_UNKNOWN)))
return (-1);

nblocks = NBLOCKS(nparts, lbsize);
Expand Down Expand Up @@ -398,6 +400,24 @@ efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
return (0);
}

int
efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc)
{
return (efi_alloc_and_init_impl(fd, nparts, vtoc, 0));
}

/*
* This is the same as efi_alloc_and_init(), but it will not fail if the disk
* is a virtual block device, PCMCIA disk, or unknown disk type. It is used
* for doing an efi_alloc_and_init on multipath disks.
*/
int
efi_alloc_and_init_skip_check(int fd, uint32_t nparts,
struct dk_gpt **vtoc)
{
return (efi_alloc_and_init_impl(fd, nparts, vtoc, 1));
}

/*
* Read EFI - return partition number upon success.
*/
Expand Down
50 changes: 38 additions & 12 deletions lib/libzutil/os/linux/zutil_device_path_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,26 +524,38 @@ zfs_dev_is_dm(const char *dev_name)
* it's a viable device or not. Returns B_TRUE if it is and B_FALSE if
* it isn't.
*/
boolean_t
zfs_dev_is_whole_disk(const char *dev_name)
static boolean_t
zfs_dev_is_whole_disk_impl(const char *dev_name, int skip_checks)
{
struct dk_gpt *label;
int fd;
int rc;

if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
return (B_FALSE);

if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
(void) close(fd);
if (skip_checks) {
rc = efi_alloc_and_init_skip_check(fd, EFI_NUMPAR, &label);
} else {
rc = efi_alloc_and_init(fd, EFI_NUMPAR, &label);
}
(void) close(fd);

if (rc != 0) {
return (B_FALSE);
}

efi_free(label);
(void) close(fd);

return (B_TRUE);
}

boolean_t
zfs_dev_is_whole_disk(const char *dev_name)
{
return (zfs_dev_is_whole_disk_impl(dev_name, 0));
}

/*
* Lookup the underlying device for a device name
*
Expand Down Expand Up @@ -613,30 +625,37 @@ zfs_get_underlying_path(const char *dev_name)
/*
* A disk is considered a multipath whole disk when:
* DEVNAME key value has "dm-"
* DM_NAME key value has "mpath" prefix
* MPATH_DEVICE_READY is present
* DM_UUID key exists
* ID_PART_TABLE_TYPE key does not exist or is not gpt
* ID_FS_LABEL key does not exist (disk isn't labeled)
*/
static boolean_t
udev_mpath_whole_disk(struct udev_device *dev)
is_mpath_udev_sane(struct udev_device *dev)
{
const char *devname, *type, *uuid;
const char *devname, *type, *uuid, *label;

devname = udev_device_get_property_value(dev, "DEVNAME");
type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
uuid = udev_device_get_property_value(dev, "DM_UUID");
label = udev_device_get_property_value(dev, "ID_FS_LABEL");

if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
((type == NULL) || (strcmp(type, "gpt") != 0)) &&
(uuid != NULL)) {
(uuid != NULL) &&
(label == NULL)) {
return (B_TRUE);
}

return (B_FALSE);
}

/*
* Check if a disk is effectively a multipath whole disk
* Check if a disk is a mutipath "blank" disk:
*
* 1. The disk has udev values that suggest it's a multipath disk
* 2. The disk is not currently labeled with a filesystem of any type
* 3. There are no partitions on the disk
*/
boolean_t
is_mpath_whole_disk(const char *path)
Expand All @@ -645,6 +664,7 @@ is_mpath_whole_disk(const char *path)
struct udev_device *dev = NULL;
char nodepath[MAXPATHLEN];
char *sysname;
boolean_t is_sane;
boolean_t wholedisk = B_FALSE;

if (realpath(path, nodepath) == NULL)
Expand All @@ -660,9 +680,15 @@ is_mpath_whole_disk(const char *path)
return (B_FALSE);
}

wholedisk = udev_mpath_whole_disk(dev);

/* Sanity check some udev values */
is_sane = is_mpath_udev_sane(dev);
udev_device_unref(dev);
if (!is_sane)
return (B_FALSE);

/* Check for existing partitions on the disk */
wholedisk = zfs_dev_is_whole_disk_impl(path, 1);

return (wholedisk);
}

Expand Down

0 comments on commit e696eb7

Please sign in to comment.