diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h
index 3c4c3fb5a279..48c1e7838f51 100644
--- a/include/sys/vdev_impl.h
+++ b/include/sys/vdev_impl.h
@@ -148,6 +148,9 @@ struct vdev_queue {
 	avl_tree_t	vq_write_offset_tree;
 	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
+	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
+	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
+	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 469963750a05..c1e074614392 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -2165,6 +2165,40 @@ See the section "ZFS I/O SCHEDULER".
 Default value: \fB1\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_vdev_nia_delay\fR (int)
+.ad
+.RS 12n
+To reduce the effects of non-interactive I/O on interactive I/O latency,
+non-interactive I/Os are limited to *_min_active while there are
+outstanding interactive I/Os, and until an additional zfs_vdev_nia_delay
+I/Os have completed after the last interactive I/O.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB5\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_nia_credit\fR (int)
+.ad
+.RS 12n
+Some HDDs tend to prioritize sequential I/O so high that concurrent
+random I/O latency reaches several seconds.  On some HDDs this happens
+even if sequential I/Os are submitted one at a time, and so setting
+*_max_active to 1 does not help.  To prevent non-interactive I/Os, like
+scrub, from monopolizing the device, no more than zfs_vdev_nia_credit
+I/Os can be sent while there are outstanding incomplete interactive
+I/Os.  This enforced wait ensures the HDD services the interactive I/O
+within a reasonable amount of time.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB5\fR.
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index a8ef3d7474c9..ce1d2a0b89a9 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -151,7 +151,7 @@ uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 2;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
-uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_scrub_max_active = 3;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
@@ -171,6 +171,26 @@ uint32_t zfs_vdev_rebuild_max_active = 3;
 int zfs_vdev_async_write_active_min_dirty_percent = 30;
 int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
+/*
+ * To reduce the effects of non-interactive I/O on interactive I/O latency,
+ * non-interactive I/Os are limited to *_min_active while there are
+ * outstanding interactive I/Os, and until an additional zfs_vdev_nia_delay
+ * I/Os have completed after the last interactive I/O.
+ */
+uint_t zfs_vdev_nia_delay = 5;
+
+/*
+ * Some HDDs tend to prioritize sequential I/O so high that concurrent
+ * random I/O latency reaches several seconds.  On some HDDs this happens
+ * even if sequential I/Os are submitted one at a time, and so setting
+ * *_max_active to 1 does not help.  To prevent non-interactive I/Os, like
+ * scrub, from monopolizing the device, no more than zfs_vdev_nia_credit
+ * I/Os can be sent while there are outstanding incomplete interactive
+ * I/Os.  This enforced wait ensures the HDD services the interactive I/O
+ * within a reasonable amount of time.
+ */
+uint_t zfs_vdev_nia_credit = 5;
+
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
@@ -261,7 +281,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2)
 }
 
 static int
-vdev_queue_class_min_active(zio_priority_t p)
+vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -273,15 +293,19 @@ vdev_queue_class_min_active(zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_SCRUB:
-		return (zfs_vdev_scrub_min_active);
+		return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
+		    MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
 	case ZIO_PRIORITY_REMOVAL:
-		return (zfs_vdev_removal_min_active);
+		return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
+		    MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
 	case ZIO_PRIORITY_INITIALIZING:
-		return (zfs_vdev_initializing_min_active);
+		return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active:
+		    MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_min_active);
 	case ZIO_PRIORITY_REBUILD:
-		return (zfs_vdev_rebuild_min_active);
+		return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
+		    MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -337,7 +361,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static int
-vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -349,14 +373,34 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (vdev_queue_max_async_writes(spa));
 	case ZIO_PRIORITY_SCRUB:
+		if (vq->vq_ia_active > 0) {
+			return (MIN(vq->vq_nia_credit,
+			    zfs_vdev_scrub_min_active));
+		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+			return (zfs_vdev_scrub_min_active);
 		return (zfs_vdev_scrub_max_active);
 	case ZIO_PRIORITY_REMOVAL:
+		if (vq->vq_ia_active > 0) {
+			return (MIN(vq->vq_nia_credit,
+			    zfs_vdev_removal_min_active));
+		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+			return (zfs_vdev_removal_min_active);
 		return (zfs_vdev_removal_max_active);
 	case ZIO_PRIORITY_INITIALIZING:
+		if (vq->vq_ia_active > 0) {
+			return (MIN(vq->vq_nia_credit,
+			    zfs_vdev_initializing_min_active));
+		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+			return (zfs_vdev_initializing_min_active);
 		return (zfs_vdev_initializing_max_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_max_active);
 	case ZIO_PRIORITY_REBUILD:
+		if (vq->vq_ia_active > 0) {
+			return (MIN(vq->vq_nia_credit,
+			    zfs_vdev_rebuild_min_active));
+		} else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+			return (zfs_vdev_rebuild_min_active);
 		return (zfs_vdev_rebuild_max_active);
 	default:
 		panic("invalid priority %u", p);
@@ -372,17 +416,24 @@ static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
 	spa_t *spa = vq->vq_vdev->vdev_spa;
-	zio_priority_t p;
+	zio_priority_t p, n;
 
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
-	/* find a queue that has not reached its minimum # outstanding i/os */
-	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+	/*
+	 * Find a queue that has not reached its minimum # outstanding i/os.
+	 * Do round-robin to reduce starvation due to zfs_vdev_max_active
+	 * and vq_nia_credit limits.
+	 */
+	for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
+		p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_min_active(p))
+		    vdev_queue_class_min_active(vq, p)) {
+			vq->vq_last_prio = p;
 			return (p);
+		}
 	}
 
 	/*
@@ -392,8 +443,10 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
 		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_max_active(spa, p))
+		    vdev_queue_class_max_active(spa, vq, p)) {
+			vq->vq_last_prio = p;
 			return (p);
+		}
 	}
 
 	/* No eligible queued i/os */
@@ -493,6 +546,20 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 	}
 }
 
+static boolean_t
+vdev_queue_is_interactive(zio_priority_t p)
+{
+	switch (p) {
+	case ZIO_PRIORITY_SCRUB:
+	case ZIO_PRIORITY_REMOVAL:
+	case ZIO_PRIORITY_INITIALIZING:
+	case ZIO_PRIORITY_REBUILD:
+		return (B_FALSE);
+	default:
+		return (B_TRUE);
+	}
+}
+
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
@@ -502,6 +569,11 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active++;
+	if (vdev_queue_is_interactive(zio->io_priority)) {
+		if (++vq->vq_ia_active == 1)
+			vq->vq_nia_credit = 1;
+	} else if (vq->vq_ia_active > 0)
+		vq->vq_nia_credit--;
 	avl_add(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
@@ -520,6 +592,13 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active--;
+	if (vdev_queue_is_interactive(zio->io_priority)) {
+		if (--vq->vq_ia_active == 0)
+			vq->vq_nia_credit = 0;
+		else
+			vq->vq_nia_credit = zfs_vdev_nia_credit;
+	} else if (vq->vq_ia_active == 0)
+		vq->vq_nia_credit++;
 	avl_remove(&vq->vq_active_tree, zio);
 
 	if (shk->kstat != NULL) {
@@ -1065,6 +1144,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
 	"Min active rebuild I/Os per vdev");
 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
+	"Number of non-interactive I/Os to allow in sequence");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
+	"Number of non-interactive I/Os before _max_active");
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
 	"Queue depth percentage for each top-level vdev");
 /* END CSTYLED */
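The throttle introduced by this patch is small, but its pieces are spread across
three functions, so the following stand-alone sketch (not part of the patch)
models how vq_ia_active and vq_nia_credit gate one non-interactive class, scrub.
The struct vq_model and the issue()/complete() helpers are invented for the
illustration; the real code runs under vq_lock and also involves the AVL trees,
zio_t, and the other I/O classes, all omitted here.

/*
 * Illustrative user-space model of the vq_ia_active / vq_nia_credit
 * accounting added by this patch.  Names mirror the patch, but this is
 * not ZFS code.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static unsigned zfs_vdev_nia_delay = 5;
static unsigned zfs_vdev_nia_credit = 5;
static unsigned zfs_vdev_scrub_min_active = 1;
static unsigned zfs_vdev_scrub_max_active = 3;

struct vq_model {
	uint32_t vq_ia_active;	/* outstanding interactive I/Os */
	uint32_t vq_nia_credit;	/* credit left for non-interactive I/Os */
};

/* Bookkeeping done in vdev_queue_pending_add() when an I/O is issued. */
static void
issue(struct vq_model *vq, bool interactive)
{
	if (interactive) {
		if (++vq->vq_ia_active == 1)
			vq->vq_nia_credit = 1;
	} else if (vq->vq_ia_active > 0) {
		vq->vq_nia_credit--;
	}
}

/* Bookkeeping done in vdev_queue_pending_remove() when an I/O completes. */
static void
complete(struct vq_model *vq, bool interactive)
{
	if (interactive) {
		if (--vq->vq_ia_active == 0)
			vq->vq_nia_credit = 0;
		else
			vq->vq_nia_credit = zfs_vdev_nia_credit;
	} else if (vq->vq_ia_active == 0) {
		vq->vq_nia_credit++;
	}
}

/* Effective scrub limit, as computed in vdev_queue_class_max_active(). */
static unsigned
scrub_limit(const struct vq_model *vq)
{
	if (vq->vq_ia_active > 0)
		return (MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
	else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
		return (zfs_vdev_scrub_min_active);
	return (zfs_vdev_scrub_max_active);
}

int
main(void)
{
	struct vq_model vq = { 0 };

	issue(&vq, true);		/* an interactive sync read arrives */
	printf("interactive outstanding:  scrub limit %u\n", scrub_limit(&vq));

	complete(&vq, true);		/* it completes; credit resets to 0 */
	printf("right after interactive:  scrub limit %u\n", scrub_limit(&vq));

	/* With no interactive traffic, each completion earns one credit. */
	for (unsigned i = 0; i < zfs_vdev_nia_delay; i++) {
		issue(&vq, false);
		complete(&vq, false);
	}
	printf("after %u idle completions: scrub limit %u\n",
	    zfs_vdev_nia_delay, scrub_limit(&vq));
	return (0);
}

Compiled on its own, this model prints a scrub limit of 1 while an interactive
I/O is outstanding (and immediately after it completes), and only returns to
zfs_vdev_scrub_max_active once zfs_vdev_nia_delay further I/Os have completed
with no interactive traffic, which is the behaviour the man page text above
describes.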