From 891b2e770ce3efc75c40c2211445f3e849280001 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Tue, 16 Jan 2018 16:08:44 -0700 Subject: [PATCH 1/2] MMP: increase delay for busy pool before suspending In mmp_thread(), if the mmp_delay is increasing beyond the fixed mmp_interval, use the longer mmp_delay value when determining if the pool should be suspended. Otherwise, if the system is very busy (but still getting MMP writes to disk) it may incorrectly suspend the pool, even though a peer system would not import it. Signed-off-by: Andreas Dilger --- module/zfs/mmp.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index d13d1590f0fd..a4e1f13bd4b6 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -249,7 +249,7 @@ mmp_write_done(zio_t *zio) goto unlock; /* - * Mmp writes are queued on a fixed schedule, but under many + * MMP writes are queued on a fixed schedule, but under many * circumstances, such as a busy device or faulty hardware, * the writes will complete at variable, much longer, * intervals. In these cases, another node checking for @@ -269,8 +269,7 @@ mmp_write_done(zio_t *zio) if (delay > mts->mmp_delay) mts->mmp_delay = delay; else - mts->mmp_delay = (delay + mts->mmp_delay * 127) / - 128; + mts->mmp_delay = (delay + mts->mmp_delay * 127) / 128; } else { mts->mmp_delay = 0; } @@ -381,17 +380,17 @@ mmp_thread(void *arg) uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals; uint64_t mmp_interval = MSEC2NSEC( MAX(zfs_multihost_interval, MMP_MIN_INTERVAL)); + uint64_t fail_interval; + int vdev_leaves = MAX(vdev_count_leaves(spa), 1); boolean_t suspended = spa_suspended(spa); boolean_t multihost = spa_multihost(spa); hrtime_t start, next_time; start = gethrtime(); - if (multihost) { - next_time = start + mmp_interval / - MAX(vdev_count_leaves(spa), 1); - } else { + if (multihost) + next_time = start + mmp_interval / vdev_leaves; + else next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL); - } /* * When MMP goes off => on, or spa goes suspended => @@ -417,16 +416,16 @@ mmp_thread(void *arg) * immediately suspended before writes can occur at the new * higher frequency. */ - if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) { - max_fail_ns = ((31 * max_fail_ns) + (mmp_interval * - mmp_fail_intervals)) / 32; - } else { - max_fail_ns = mmp_interval * mmp_fail_intervals; - } + fail_interval = MAX(mmp_interval, + mmp->mmp_delay * vdev_leaves) * mmp_fail_intervals; + if (fail_interval < max_fail_ns) + max_fail_ns = ((31 * max_fail_ns) + fail_interval) / 32; + else + max_fail_ns = fail_interval; /* * Suspend the pool if no MMP write has succeeded in over - * mmp_interval * mmp_fail_intervals nanoseconds. + * mmp_delay * vdev_leaves * mmp_fail_intervals nanoseconds. */ if (!suspended && mmp_fail_intervals && multihost && (start - mmp->mmp_last_write) > max_fail_ns) { From abd17be2e0d03633a1f98b1fb031116fd8f768d1 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Tue, 16 Jan 2018 16:08:45 -0700 Subject: [PATCH 2/2] MMP: base mmp_fail_interval on mmp_import_interval Base zfs_multihost_fail_intervals on zfs_multihost_import_intervals by default, so that the number of failed writes for suspending a pool is not larger than the number of intervals a peer will use when checking if it is safe to import the pool. Otherwise, it would be possible (with poorly-set module parameters, fail_interval > import_interval) that the current importer writes MMP updates irregularly but eventually succeeds before mmp_fail_intervals (hence does not suspend the pool), yet the peer system imported the pool after waiting an insufficient number of update intervals. Signed-off-by: Andreas Dilger --- include/sys/mmp.h | 1 - module/zfs/mmp.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/sys/mmp.h b/include/sys/mmp.h index 5b2fea1a66b1..20b3c5f37a65 100644 --- a/include/sys/mmp.h +++ b/include/sys/mmp.h @@ -30,7 +30,6 @@ extern "C" { #define MMP_MIN_INTERVAL 100 /* ms */ #define MMP_DEFAULT_INTERVAL 1000 /* ms */ #define MMP_DEFAULT_IMPORT_INTERVALS 10 -#define MMP_DEFAULT_FAIL_INTERVALS 5 typedef struct mmp_thread { kmutex_t mmp_thread_lock; /* protect thread mgmt fields */ diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index a4e1f13bd4b6..8d298519f84d 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -115,14 +115,13 @@ uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; * configuration may take action such as suspending the pool or taking a * device offline. * - * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures will - * cause the pool to be suspended. This occurs when + * When zfs_multihost_fail_intervals > 0 then sequential mmp write failures + * will cause the pool to be suspended. This occurs when * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds have * passed since the last successful mmp write. This guarantees the activity - * test will see mmp writes if the - * pool is imported. + * test will see mmp writes if the pool is imported. */ -uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; +uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_IMPORT_INTERVALS / 2; static void mmp_thread(void *arg); @@ -392,6 +391,30 @@ mmp_thread(void *arg) else next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL); + /* + * Don't allow fail_intervals larger than import_intervals, + * as this could allow an active pool to be imported on a peer. + * + * We don't actually know what the import_interval is on the + * peer node, but the best assumption is that it is configured + * in the same way as the local node. + */ + if (mmp_fail_intervals >= zfs_multihost_import_intervals) { + static boolean_t warned = B_FALSE; + + if (!warned) { + cmn_err(CE_WARN, + "zfs_multihost_fail_intervals %u >= " + "zfs_multihost_import_interval %u, " + "using smaller value", + zfs_multihost_fail_intervals, + zfs_multihost_import_interval); + warned = B_TRUE; + } + + mmp_fail_intervals = zfs_multihost_import_intervals - 1; + } + /* * When MMP goes off => on, or spa goes suspended => * !suspended, we know no writes occurred recently. We