From 43ff2584fd53ac0deadd4c3274e2bccff9e50510 Mon Sep 17 00:00:00 2001
From: Brian Atkinson
Date: Tue, 29 Jun 2021 14:00:49 -0600
Subject: [PATCH] Creating gang ABDs for Raidz optional IOs

In order to reduce contention on the vq_lock, optional no data blocks
for Raidz are put into gang ABDs. This allows for a reduction in the
number of IOs issued down to the children VDEVs, reducing contention on
the vq_lock when issuing IO for skip sectors.

Signed-off-by: Brian Atkinson
---
 module/zfs/vdev_queue.c | 12 ++++--
 module/zfs/vdev_raidz.c | 87 ++++++++++++++++++++++++++++++-----------
 2 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 06d22f6df4c5..f79be04e0b4a 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -778,9 +778,15 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 		if (dio->io_flags & ZIO_FLAG_NODATA) {
 			/* allocate a buffer for a write gap */
 			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
-			ASSERT3P(dio->io_abd, ==, NULL);
-			abd_gang_add(aio->io_abd,
-			    abd_get_zeros(dio->io_size), B_TRUE);
+			if (dio->io_abd == NULL) {
+				abd_gang_add(aio->io_abd,
+				    abd_get_zeros(dio->io_size),
+				    B_TRUE);
+			} else {
+				abd_gang_add(aio->io_abd,
+				    dio->io_abd,
+				    B_FALSE);
+			}
 		} else {
 			/*
 			 * We pass B_FALSE to abd_gang_add()
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 1feebf7089b4..18cc9659f931 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1485,6 +1485,18 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_error = zio->io_error;
 	rc->rc_tried = 1;
 	rc->rc_skipped = 0;
+
+	/*
+	 * If we created a gang ABD to aggregate I/Os for writes we will
+	 * free the gang ABD here and reset the column's ABD to the original
+	 * ABD.
+	 */
+	if (zio->io_type == ZIO_TYPE_WRITE && abd_is_gang(rc->rc_abd)) {
+		ASSERT3P(rc->rc_orig_data, !=, rc->rc_abd);
+		abd_free(rc->rc_abd);
+		rc->rc_abd = rc->rc_orig_data;
+		rc->rc_orig_data = NULL;
+	}
 }
 
 static void
@@ -1525,41 +1537,70 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
 {
 	vdev_t *vd = zio->io_vd;
 	raidz_map_t *rm = zio->io_vsd;
-	int c, i;
+	int c, i = 0, skip_first_cols = -1;
 
 	vdev_raidz_generate_parity_row(rm, rr);
 
-	for (int c = 0; c < rr->rr_cols; c++) {
+	IMPLY(rm->rm_nskip > 0, rm->rm_skipstart < rr->rr_scols);
+
+	if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
+		skip_first_cols =
+		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
+	}
+
+	ASSERT3S(skip_first_cols, <, rr->rr_scols);
+	IMPLY(rm->rm_nskip == 0, skip_first_cols == -1);
+
+	for (c = 0; c < rr->rr_scols; c++) {
+		abd_t *abd = NULL;
+		enum zio_flag flags = 0;
 		raidz_col_t *rc = &rr->rr_col[c];
-		if (rc->rc_size == 0)
-			continue;
 
 		/* Verify physical to logical translation */
 		vdev_raidz_io_verify(vd, rr, c);
 
-		zio_nowait(zio_vdev_child_io(zio, NULL,
-		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
-		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
-		    0, vdev_raidz_child_done, rc));
-	}
+		/*
+		 * Generate I/O for skip sectors to improve aggregation
+		 * contiguity. We will use gang ABDs to reduce contention
+		 * on the children VDEV queue locks (vq_lock) by issuing
+		 * a single I/O that contains the data and skip sectors.
+		 */
+		if (((skip_first_cols > -1 && c < skip_first_cols) ||
+		    c >= rm->rm_skipstart) && i < rm->rm_nskip) {
+			abd = abd_alloc_gang();
+			if (rc->rc_size > 0) {
+				abd_gang_add(abd, rc->rc_abd, B_FALSE);
+			} else {
+				ASSERT3P(rc->rc_abd, ==, NULL);
+				flags = ZIO_FLAG_OPTIONAL | ZIO_FLAG_NODATA;
+			}
+			abd_gang_add(abd, abd_get_zeros(1ULL << ashift),
+			    B_TRUE);
 
-	/*
-	 * Generate optional I/Os for skip sectors to improve aggregation
-	 * contiguity.
-	 */
-	for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
-		ASSERT(c <= rr->rr_scols);
-		if (c == rr->rr_scols)
-			c = 0;
+			/*
+			 * Store original ABD so the gang ABD can be freed in
+			 * vdev_raidz_child_done().
+			 */
+			ASSERT3P(rc->rc_orig_data, ==, NULL);
+			rc->rc_orig_data = rc->rc_abd;
+			rc->rc_abd = abd;
+			i++;
+		} else {
+			/*
+			 * I/O does not contain any skip sectors.
+			 */
+			abd = rc->rc_abd;
+		}
 
-		raidz_col_t *rc = &rr->rr_col[c];
-		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+		ASSERT3P(abd, !=, NULL);
 
-		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
-		    rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
-		    zio->io_type, zio->io_priority,
-		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+		    abd, abd_get_size(abd), zio->io_type, zio->io_priority,
+		    flags, vdev_raidz_child_done, rc));
 	}
+
+	ASSERT3S(i, ==, rm->rm_nskip);
 }
 
 static void