diff --git a/include/sys/zio.h b/include/sys/zio.h index 69b00d0f4029..278b138e6cea 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -440,6 +440,7 @@ struct zio { uint64_t io_child_count; uint64_t io_phys_children; uint64_t io_parent_count; + uint64_t io_recursion_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 34e4420da733..07649d086508 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -650,6 +650,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) } zio_add_child(dio, aio); + dio->io_recursion_count = aio->io_recursion_count + 1; vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); zio_execute(dio); @@ -661,7 +662,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) } static zio_t * -vdev_queue_io_to_issue(vdev_queue_t *vq) +vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t recursion_count) { zio_t *zio, *aio; zio_priority_t p; @@ -708,6 +709,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); + zio->io_recursion_count = recursion_count + 1; zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); @@ -750,7 +752,7 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); zio->io_timestamp = gethrtime(); vdev_queue_io_add(vq, zio); - nio = vdev_queue_io_to_issue(vq); + nio = vdev_queue_io_to_issue(vq, zio->io_recursion_count); mutex_exit(&vq->vq_lock); if (nio == NULL) @@ -781,7 +783,8 @@ vdev_queue_io_done(zio_t *zio) vq->vq_io_complete_ts = gethrtime(); vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; - while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { + while ((nio = vdev_queue_io_to_issue(vq, zio->io_recursion_count)) + != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { zio_nowait(nio); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0ba167c62b59..89978689da1a 100644 --- a/module/zfs/zio.c +++ 
b/module/zfs/zio.c @@ -60,6 +60,7 @@ kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; int zio_bulk_flags = 0; int zio_delay_max = ZIO_DELAY_MAX; +int zio_recursion_threshold = 16; /* Threshold to redispatch zio_t objects */ /* * The following actions directly effect the spa's sync-to-convergence logic. @@ -516,6 +517,7 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait) if (*countp == 0 && pio->io_stall == countp) { pio->io_stall = NULL; + pio->io_recursion_count = zio->io_recursion_count + 1; mutex_exit(&pio->io_lock); __zio_execute(pio); } else { @@ -975,6 +977,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) zio->io_logical->io_phys_children++; + zio->io_recursion_count = pio->io_recursion_count + 1; + return (zio); } @@ -1268,6 +1272,9 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) zio_type_t t = zio->io_type; int flags = (cutinline ? TQ_FRONT : 0); + /* Reset the recursion counter: dispatch to a taskq gives this zio a fresh stack */ + zio->io_recursion_count = 0; + /* * If we're a config writer or a probe, the normal issue and * interrupt threads may all be blocked waiting for the config lock. @@ -1391,6 +1398,16 @@ __zio_execute(zio_t *zio) cut = (stage == ZIO_STAGE_VDEV_IO_START) ? zio_requeue_io_start_cut_in_line : B_FALSE; + /* + * Deep call graphs can cause us to overrun the stack. + * Redispatch ZIO when we hit zio_recursion_threshold. + */ + if (zio->io_recursion_count && + (zio->io_recursion_count >= zio_recursion_threshold)) { + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); + return; + } + /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O,