Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zed: Misc multipath autoreplace fixes #13023

Merged
merged 1 commit into from
Feb 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 163 additions & 16 deletions cmd/zed/agents/zfs_mod.c
Original file line number Diff line number Diff line change
Expand Up @@ -183,14 +183,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
nvlist_t *nvroot, *newvd;
pendingdev_t *device;
uint64_t wholedisk = 0ULL;
uint64_t offline = 0ULL;
uint64_t offline = 0ULL, faulted = 0ULL;
uint64_t guid = 0ULL;
char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
char rawpath[PATH_MAX], fullpath[PATH_MAX];
char devpath[PATH_MAX];
int ret;
boolean_t is_dm = B_FALSE;
boolean_t is_sd = B_FALSE;
boolean_t is_mpath_wholedisk = B_FALSE;
uint_t c;
vdev_stat_t *vs;

Expand All @@ -211,15 +211,73 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
&enc_sysfs_path);
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk);
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline);
(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted);

(void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid);

if (offline)
return; /* don't intervene if it was taken offline */
/*
* Special case:
*
* We've seen times where a disk won't have a ZPOOL_CONFIG_PHYS_PATH
* entry in their config. For example, on this force-faulted disk:
*
* children[0]:
* type: 'disk'
* id: 0
* guid: 14309659774640089719
* path: '/dev/disk/by-vdev/L28'
* whole_disk: 0
* DTL: 654
* create_txg: 4
* com.delphix:vdev_zap_leaf: 1161
* faulted: 1
* aux_state: 'external'
* children[1]:
* type: 'disk'
* id: 1
* guid: 16002508084177980912
* path: '/dev/disk/by-vdev/L29'
* devid: 'dm-uuid-mpath-35000c500a61d68a3'
* phys_path: 'L29'
* vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
* whole_disk: 0
* DTL: 1028
* create_txg: 4
* com.delphix:vdev_zap_leaf: 131
*
* If the disk's path is a /dev/disk/by-vdev/ path, then we can infer
* the ZPOOL_CONFIG_PHYS_PATH from the by-vdev disk name.
*/
if (physpath == NULL && path != NULL) {
/* If path begins with "/dev/disk/by-vdev/" ... */
if (strncmp(path, DEV_BYVDEV_PATH,
strlen(DEV_BYVDEV_PATH)) == 0) {
/* Set physpath to the char after "/dev/disk/by-vdev" */
physpath = &path[strlen(DEV_BYVDEV_PATH)];
}
}

/*
* We don't want to autoreplace offlined disks. However, we do want to
* replace force-faulted disks (`zpool offline -f`). Force-faulted
* disks have both offline=1 and faulted=1 in the nvlist.
*/
if (offline && !faulted) {
zed_log_msg(LOG_INFO, "%s: %s is offline, skip autoreplace",
__func__, path);
return;
}

is_dm = zfs_dev_is_dm(path);
is_mpath_wholedisk = is_mpath_whole_disk(path);
zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'"
" wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path,
physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not",
" %s blank disk, %s mpath blank disk, %s labeled, enc sysfs '%s', "
"(guid %llu)",
zpool_get_name(zhp), path,
physpath ? physpath : "NULL",
wholedisk ? "is" : "not",
is_mpath_wholedisk? "is" : "not",
labeled ? "is" : "not",
enc_sysfs_path,
(long long unsigned int)guid);

/*
Expand Down Expand Up @@ -253,8 +311,9 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
(newstate == VDEV_STATE_HEALTHY ||
newstate == VDEV_STATE_DEGRADED)) {
zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s",
fullpath, (newstate == VDEV_STATE_HEALTHY) ?
zed_log_msg(LOG_INFO,
" zpool_vdev_online: vdev '%s' ('%s') is "
"%s", fullpath, physpath, (newstate == VDEV_STATE_HEALTHY) ?
"HEALTHY" : "DEGRADED");
return;
}
Expand All @@ -271,11 +330,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* vdev online to trigger a FMA fault by posting an ereport.
*/
if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) ||
!(wholedisk || is_dm) || (physpath == NULL)) {
!(wholedisk || is_mpath_wholedisk) || (physpath == NULL)) {
tonyhutter marked this conversation as resolved.
Show resolved Hide resolved
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
&newstate);
zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or "
"not a whole disk for '%s'", fullpath);
"not a blank disk for '%s' ('%s')", fullpath,
physpath);
return;
}

Expand All @@ -287,7 +347,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);

if (realpath(rawpath, devpath) == NULL && !is_dm) {
if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
rawpath, strerror(errno));

Expand All @@ -303,12 +363,14 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
if ((vs->vs_state != VDEV_STATE_DEGRADED) &&
(vs->vs_state != VDEV_STATE_FAULTED) &&
(vs->vs_state != VDEV_STATE_CANT_OPEN)) {
zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in "
"a bad state (currently %d)", vs->vs_state);
return;
}

nvlist_lookup_string(vdev, "new_devid", &new_devid);

if (is_dm) {
if (is_mpath_wholedisk) {
/* Don't label device mapper or multipath disks. */
} else if (!labeled) {
/*
Expand Down Expand Up @@ -522,8 +584,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
* the dp->dd_compare value.
*/
if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
strcmp(dp->dd_compare, path) != 0)
strcmp(dp->dd_compare, path) != 0) {
zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)",
__func__, dp->dd_compare, path);
return;
}

zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s",
dp->dd_prop, path);
Expand Down Expand Up @@ -571,6 +636,8 @@ zfs_iter_pool(zpool_handle_t *zhp, void *data)
ZPOOL_CONFIG_VDEV_TREE, &nvl);
zfs_iter_vdev(zhp, nvl, data);
}
} else {
zed_log_msg(LOG_INFO, "%s: no config\n", __func__);
}

/*
Expand Down Expand Up @@ -619,6 +686,72 @@ devphys_iter(const char *physical, const char *devid, zfs_process_func_t func,
return (data.dd_found);
}

/*
* Given a device identifier, find any vdevs with a matching by-vdev
* path. Normally we shouldn't need this as the comparison would be
* made earlier in the devphys_iter(). For example, if we were replacing
* /dev/disk/by-vdev/L28, normally devphys_iter() would match the
* ZPOOL_CONFIG_PHYS_PATH of "L28" from the old disk config to "L28"
* of the new disk config. However, we've seen cases where
* ZPOOL_CONFIG_PHYS_PATH was not in the config for the old disk. Here's
tonyhutter marked this conversation as resolved.
Show resolved Hide resolved
* an example of a real 2-disk mirror pool where one disk was force
* faulted:
*
* com.delphix:vdev_zap_top: 129
* children[0]:
* type: 'disk'
* id: 0
* guid: 14309659774640089719
* path: '/dev/disk/by-vdev/L28'
* whole_disk: 0
* DTL: 654
* create_txg: 4
* com.delphix:vdev_zap_leaf: 1161
* faulted: 1
* aux_state: 'external'
* children[1]:
* type: 'disk'
* id: 1
* guid: 16002508084177980912
* path: '/dev/disk/by-vdev/L29'
* devid: 'dm-uuid-mpath-35000c500a61d68a3'
* phys_path: 'L29'
* vdev_enc_sysfs_path: '/sys/class/enclosure/0:0:1:0/SLOT 30 32'
* whole_disk: 0
* DTL: 1028
* create_txg: 4
* com.delphix:vdev_zap_leaf: 131
*
* So in the case above, the only thing we could compare is the path.
*
* We can do this because we assume by-vdev paths are authoritative as physical
* paths. We could not assume this for normal paths like /dev/sda since the
* physical location /dev/sda points to could change over time.
*/
static boolean_t
by_vdev_path_iter(const char *by_vdev_path, const char *devid,
zfs_process_func_t func, boolean_t is_slice)
{
dev_data_t data = { 0 };

data.dd_compare = by_vdev_path;
data.dd_func = func;
data.dd_prop = ZPOOL_CONFIG_PATH;
data.dd_found = B_FALSE;
data.dd_islabeled = is_slice;
data.dd_new_devid = devid;

if (strncmp(by_vdev_path, DEV_BYVDEV_PATH,
strlen(DEV_BYVDEV_PATH)) != 0) {
/* by_vdev_path doesn't start with "/dev/disk/by-vdev/" */
return (B_FALSE);
}

(void) zpool_iter(g_zfshdl, zfs_iter_pool, &data);

return (data.dd_found);
}

/*
* Given a device identifier, find any vdevs with a matching devid.
* On Linux we can match devid directly which is always a whole disk.
Expand Down Expand Up @@ -683,15 +816,17 @@ guid_iter(uint64_t pool_guid, uint64_t vdev_guid, const char *devid,
static int
zfs_deliver_add(nvlist_t *nvl)
{
char *devpath = NULL, *devid;
char *devpath = NULL, *devid = NULL;
uint64_t pool_guid = 0, vdev_guid = 0;
boolean_t is_slice;

/*
* Expecting a devid string and an optional physical location and guid
*/
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0)
if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) {
zed_log_msg(LOG_INFO, "%s: no dev identifier\n", __func__);
return (-1);
}

(void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath);
(void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid);
Expand All @@ -707,6 +842,8 @@ zfs_deliver_add(nvlist_t *nvl)
* 1. ZPOOL_CONFIG_DEVID (identifies the unique disk)
* 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location).
* 3. ZPOOL_CONFIG_GUID (identifies unique vdev).
* 4. ZPOOL_CONFIG_PATH for /dev/disk/by-vdev devices only (since
* by-vdev paths represent physical paths).
*/
if (devid_iter(devid, zfs_process_add, is_slice))
return (0);
Expand All @@ -717,6 +854,16 @@ zfs_deliver_add(nvlist_t *nvl)
(void) guid_iter(pool_guid, vdev_guid, devid, zfs_process_add,
is_slice);

if (devpath != NULL) {
/* Can we match a /dev/disk/by-vdev/ path? */
char by_vdev_path[MAXPATHLEN];
snprintf(by_vdev_path, sizeof (by_vdev_path),
"/dev/disk/by-vdev/%s", devpath);
if (by_vdev_path_iter(by_vdev_path, devid, zfs_process_add,
is_slice))
return (0);
}

return (0);
}

Expand Down
10 changes: 10 additions & 0 deletions cmd/zed/zed_disk_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,11 @@ zed_udev_monitor(void *arg)
if (type != NULL && type[0] != '\0' &&
strcmp(type, "disk") == 0 &&
part != NULL && part[0] != '\0') {
zed_log_msg(LOG_INFO,
"%s: skip %s since it has a %s partition already",
__func__,
udev_device_get_property_value(dev, "DEVNAME"),
part);
/* skip and wait for partition event */
udev_device_unref(dev);
continue;
Expand All @@ -229,6 +234,11 @@ zed_udev_monitor(void *arg)
sectors = udev_device_get_sysattr_value(dev, "size");
if (sectors != NULL &&
strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) {
zed_log_msg(LOG_INFO,
"%s: %s sectors %s < %llu (minimum)",
__func__,
udev_device_get_property_value(dev, "DEVNAME"),
sectors, MINIMUM_SECTORS);
udev_device_unref(dev);
continue;
}
Expand Down
29 changes: 19 additions & 10 deletions lib/libzutil/os/linux/zutil_device_path_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,7 @@ zfs_dev_is_dm(const char *dev_name)
boolean_t
zfs_dev_is_whole_disk(const char *dev_name)
{
struct dk_gpt *label;
struct dk_gpt *label = NULL;
int fd;

if ((fd = open(dev_name, O_RDONLY | O_DIRECT | O_CLOEXEC)) < 0)
Expand Down Expand Up @@ -613,30 +613,39 @@ zfs_get_underlying_path(const char *dev_name)
/*
* A disk is considered a multipath whole disk when:
* DEVNAME key value has "dm-"
* DM_NAME key value has "mpath" prefix
* MPATH_DEVICE_READY is present
* DM_UUID key exists
* ID_PART_TABLE_TYPE key does not exist or is not gpt
* ID_FS_LABEL key does not exist (disk isn't labeled)
*/
static boolean_t
udev_mpath_whole_disk(struct udev_device *dev)
is_mpath_udev_sane(struct udev_device *dev)
{
const char *devname, *type, *uuid;
const char *devname, *type, *uuid, *label, *mpath_ready;

devname = udev_device_get_property_value(dev, "DEVNAME");
type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE");
uuid = udev_device_get_property_value(dev, "DM_UUID");
label = udev_device_get_property_value(dev, "ID_FS_LABEL");
mpath_ready = udev_device_get_property_value(dev, "MPATH_DEVICE_READY");

if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) &&
((type == NULL) || (strcmp(type, "gpt") != 0)) &&
(uuid != NULL)) {
(uuid != NULL) &&
(label == NULL) &&
(mpath_ready != NULL && strncmp(mpath_ready, "1", 1) == 0)) {
return (B_TRUE);
}

return (B_FALSE);
}

/*
* Check if a disk is effectively a multipath whole disk
* Check if a disk is a multipath "blank" disk:
*
* 1. The disk has udev values that suggest it's a multipath disk
* 2. The disk is not currently labeled with a filesystem of any type
* 3. There are no partitions on the disk
*/
boolean_t
is_mpath_whole_disk(const char *path)
Expand All @@ -645,7 +654,6 @@ is_mpath_whole_disk(const char *path)
struct udev_device *dev = NULL;
char nodepath[MAXPATHLEN];
char *sysname;
boolean_t wholedisk = B_FALSE;

if (realpath(path, nodepath) == NULL)
return (B_FALSE);
Expand All @@ -660,10 +668,11 @@ is_mpath_whole_disk(const char *path)
return (B_FALSE);
}

wholedisk = udev_mpath_whole_disk(dev);

/* Sanity check some udev values */
boolean_t is_sane = is_mpath_udev_sane(dev);
udev_device_unref(dev);
return (wholedisk);

return (is_sane);
}

#else /* HAVE_LIBUDEV */
Expand Down