Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Report long delays in writing MMP blocks #7212

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/sys/mmp.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ typedef struct mmp_thread {
uberblock_t mmp_ub; /* last ub written by sync */
zio_t *mmp_zio_root; /* root of mmp write zios */
uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */
int mmp_skip_error; /* reason for last skipped write */
} mmp_thread_t;


Expand Down
6 changes: 4 additions & 2 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -887,10 +887,12 @@ extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t,
struct dsl_pool *);
extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *);
extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id);
extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
hrtime_t duration);
extern void spa_mmp_history_add(uint64_t txg, uint64_t timestamp,
uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id);
extern void *spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
int error);

/* Pool configuration locks */
extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
Expand Down
120 changes: 95 additions & 25 deletions module/zfs/mmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -200,43 +200,86 @@ mmp_thread_stop(spa_t *spa)
mmp->mmp_thread_exiting = 0;
}

/*
* Choose a leaf vdev to write an MMP block to. It must not have an
* outstanding mmp write (if so then there is a problem, and a new write will
* also block). If there is no usable leaf in this subtree return NULL,
* otherwise return a pointer to the leaf.
*
* When walking the subtree, a random child is chosen as the starting point so
* that when the tree is healthy, the leaf chosen will be random with even
* distribution. If there are unhealthy vdevs in the tree, the distribution
* will be really poor only if a large proportion of the vdevs are unhealthy,
* in which case there are other more pressing problems.
*/
typedef enum mmp_vdev_state_flag {
MMP_FAIL_NOT_WRITABLE = (1 << 0),
MMP_FAIL_WRITE_PENDING = (1 << 1),
} mmp_vdev_state_flag_t;

static vdev_t *
mmp_random_leaf(vdev_t *vd)
mmp_random_leaf_impl(vdev_t *vd, int *fail_mask)
{
int child_idx;

if (!vdev_writeable(vd))
if (!vdev_writeable(vd)) {
*fail_mask |= MMP_FAIL_NOT_WRITABLE;
return (NULL);
}

if (vd->vdev_ops->vdev_op_leaf) {
vdev_t *ret;

if (vd->vdev_mmp_pending != 0) {
*fail_mask |= MMP_FAIL_WRITE_PENDING;
ret = NULL;
} else {
ret = vd;
}

if (vd->vdev_ops->vdev_op_leaf)
return (vd->vdev_mmp_pending == 0 ? vd : NULL);
return (ret);
}

child_idx = spa_get_random(vd->vdev_children);
for (int offset = vd->vdev_children; offset > 0; offset--) {
vdev_t *leaf;
vdev_t *child = vd->vdev_child[(child_idx + offset) %
vd->vdev_children];

leaf = mmp_random_leaf(child);
leaf = mmp_random_leaf_impl(child, fail_mask);
if (leaf)
return (leaf);
}

return (NULL);
}

/*
* Find a leaf vdev to write an MMP block to. It must not have an outstanding
* mmp write (if so a new write will also likely block). If there is no usable
* leaf in the tree rooted at in_vd, a nonzero error value is returned, and
* *out_vd is unchanged.
*
* The error value returned is a bit field.
*
* MMP_FAIL_WRITE_PENDING
* If set, one or more leaf vdevs are writeable, but have an MMP write which has
* not yet completed.
*
* MMP_FAIL_NOT_WRITABLE
* If set, one or more vdevs are not writeable. The children of those vdevs
* were not examined.
*
* Assuming in_vd points to a tree, a random subtree will be chosen to start.
* That subtree, and successive ones, will be walked until a usable leaf has
* been found, or all subtrees have been examined (except that the children of
* un-writeable vdevs are not examined).
*
* If the leaf vdevs in the tree are healthy, the distribution of returned leaf
* vdevs will be even. If there are unhealthy leaves, the following leaves
* (child_index % index_children) will be chosen more often.
*/

static int
mmp_random_leaf(vdev_t *in_vd, vdev_t **out_vd)
{
int error_mask = 0;
vdev_t *vd = mmp_random_leaf_impl(in_vd, &error_mask);

if (error_mask == 0)
*out_vd = vd;

return (error_mask);
}

static void
mmp_write_done(zio_t *zio)
{
Expand Down Expand Up @@ -319,18 +362,44 @@ mmp_write_uberblock(spa_t *spa)
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
mmp_thread_t *mmp = &spa->spa_mmp;
uberblock_t *ub;
vdev_t *vd;
int label;
vdev_t *vd = NULL;
int label, error;
uint64_t offset;

hrtime_t lock_acquire_time = gethrtime();
spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
vd = mmp_random_leaf(spa->spa_root_vdev);
if (vd == NULL) {
lock_acquire_time = gethrtime() - lock_acquire_time;
if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
zfs_dbgmsg("SCL_STATE acquisition took %llu ns\n",
(u_longlong_t)lock_acquire_time);

error = mmp_random_leaf(spa->spa_root_vdev, &vd);

mutex_enter(&mmp->mmp_io_lock);

/*
* spa_mmp_history has two types of entries:
* Issued MMP write: records time issued, error status, etc.
* Skipped MMP write: an MMP write could not be issued because no
* suitable leaf vdev was available. See comment above struct
* spa_mmp_history for details.
*/

if (error) {
if (mmp->mmp_skip_error == error) {
spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
} else {
mmp->mmp_skip_error = error;
spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
gethrestime_sec(), mmp->mmp_delay, NULL, 0,
mmp->mmp_kstat_id++, error);
}
mutex_exit(&mmp->mmp_io_lock);
spa_config_exit(spa, SCL_STATE, FTAG);
return;
}

mutex_enter(&mmp->mmp_io_lock);
mmp->mmp_skip_error = 0;

if (mmp->mmp_zio_root == NULL)
mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
Expand All @@ -341,13 +410,14 @@ mmp_write_uberblock(spa_t *spa)
ub->ub_mmp_magic = MMP_MAGIC;
ub->ub_mmp_delay = mmp->mmp_delay;
vd->vdev_mmp_pending = gethrtime();
vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id++;
vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;

zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

mmp->mmp_kstat_id++;
mutex_exit(&mmp->mmp_io_lock);

offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
Expand All @@ -358,8 +428,8 @@ mmp_write_uberblock(spa_t *spa)
VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
flags | ZIO_FLAG_DONT_PROPAGATE);

spa_mmp_history_add(ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd,
label, vd->vdev_mmp_kstat_id);
(void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);

zio_nowait(zio);
}
Expand Down
89 changes: 74 additions & 15 deletions module/zfs/spa_stats.c
Original file line number Diff line number Diff line change
Expand Up @@ -718,18 +718,28 @@ spa_io_history_destroy(spa_t *spa)
*/

/*
* MMP statistics - Information exported regarding each MMP update
* MMP statistics - Information exported regarding attempted MMP writes
* For MMP writes issued, fields used as per comments below.
* For MMP writes skipped, an entry represents a span of time when
* writes were skipped for same reason (error from mmp_random_leaf).
* Differences are:
* timestamp time first write skipped, if >1 skipped in a row
* mmp_delay delay value at timestamp
* vdev_guid number of writes skipped
* io_error one of enum mmp_error
* duration time span (ns) of skipped writes
*/

typedef struct spa_mmp_history {
uint64_t mmp_kstat_id; /* unique # for updates */
uint64_t txg; /* txg of last sync */
uint64_t timestamp; /* UTC time of of last sync */
uint64_t mmp_delay; /* nanosec since last MMP write */
uint64_t timestamp; /* UTC time MMP write issued */
uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */
uint64_t vdev_guid; /* unique ID of leaf vdev */
char *vdev_path;
uint64_t vdev_label; /* vdev label */
int vdev_label; /* vdev label */
int io_error; /* error status of MMP write */
hrtime_t error_start; /* hrtime of start of error period */
hrtime_t duration; /* time from submission to completion */
list_node_t smh_link;
} spa_mmp_history_t;
Expand All @@ -747,9 +757,12 @@ static int
spa_mmp_history_data(char *buf, size_t size, void *data)
{
spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
"%-10lld %s\n";
char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
"%-10lld %s\n";

(void) snprintf(buf, size, "%-10llu %-10llu %-10llu %-6lld %-10lld "
"%-12llu %-24llu %-10llu %s\n",
(void) snprintf(buf, size, (smh->error_start ? skip_fmt : write_fmt),
(u_longlong_t)smh->mmp_kstat_id, (u_longlong_t)smh->txg,
(u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
(longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
Expand Down Expand Up @@ -869,8 +882,42 @@ spa_mmp_history_destroy(spa_t *spa)
mutex_destroy(&ssh->lock);
}

/*
* Set duration in existing "skip" record to how long we have waited for a leaf
* vdev to become available.
*
* Important that we start search at the head of the list where new
* records are inserted, so this is normally an O(1) operation.
*/
int
spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id)
{
spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
spa_mmp_history_t *smh;
int error = ENOENT;

if (zfs_multihost_history == 0 && ssh->size == 0)
return (0);

mutex_enter(&ssh->lock);
for (smh = list_head(&ssh->list); smh != NULL;
smh = list_next(&ssh->list, smh)) {
if (smh->mmp_kstat_id == mmp_kstat_id) {
ASSERT3U(smh->io_error, !=, 0);
smh->duration = gethrtime() - smh->error_start;
smh->vdev_guid++;
error = 0;
break;
}
}
mutex_exit(&ssh->lock);

return (error);
}

/*
* Set MMP write duration and error status in existing record.
* See comment re: search order above spa_mmp_history_set_skip().
*/
int
spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
Expand All @@ -887,6 +934,7 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
for (smh = list_head(&ssh->list); smh != NULL;
smh = list_next(&ssh->list, smh)) {
if (smh->mmp_kstat_id == mmp_kstat_id) {
ASSERT(smh->io_error == 0);
smh->io_error = io_error;
smh->duration = duration;
error = 0;
Expand All @@ -899,29 +947,39 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
}

/*
* Add a new MMP write to historical record.
* Add a new MMP historical record.
* error == 0 : a write was issued.
* error != 0 : a write was not issued because no leaves were found.
*/
void
spa_mmp_history_add(uint64_t txg, uint64_t timestamp, uint64_t mmp_delay,
vdev_t *vd, int label, uint64_t mmp_kstat_id)
void *
spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
int error)
{
spa_t *spa = vd->vdev_spa;
spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
spa_mmp_history_t *smh, *rm;

if (zfs_multihost_history == 0 && ssh->size == 0)
return;
return (NULL);

smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
smh->txg = txg;
smh->timestamp = timestamp;
smh->mmp_delay = mmp_delay;
smh->vdev_guid = vd->vdev_guid;
if (vd->vdev_path)
smh->vdev_path = strdup(vd->vdev_path);
if (vd) {
smh->vdev_guid = vd->vdev_guid;
if (vd->vdev_path)
smh->vdev_path = strdup(vd->vdev_path);
}
smh->vdev_label = label;
smh->mmp_kstat_id = mmp_kstat_id;

if (error) {
smh->io_error = error;
smh->error_start = gethrtime();
smh->vdev_guid = 1;
}

mutex_enter(&ssh->lock);

list_insert_head(&ssh->list, smh);
Expand All @@ -936,6 +994,7 @@ spa_mmp_history_add(uint64_t txg, uint64_t timestamp, uint64_t mmp_delay,
}

mutex_exit(&ssh->lock);
return ((void *)smh);
}

void
Expand Down
2 changes: 1 addition & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ tags = ['functional', 'mmap']
[tests/functional/mmp]
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval']
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history']
tags = ['functional', 'mmp']

[tests/functional/mount]
Expand Down
1 change: 1 addition & 0 deletions tests/zfs-tests/tests/functional/mmp/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/mmp
dist_pkgdata_SCRIPTS = \
multihost_history.ksh \
mmp_on_thread.ksh \
mmp_on_uberblocks.ksh \
mmp_on_off.ksh \
Expand Down
Loading