From e5d032c88efb1c29583437c00942e4de8d48925e Mon Sep 17 00:00:00 2001
From: Ameer Hamza
Date: Mon, 15 Jul 2024 20:07:13 +0500
Subject: [PATCH] zvol_os: Add copy offload support

Signed-off-by: Ameer Hamza
---
 config/kernel-blkdev.m4         |  32 ++++
 module/os/linux/zfs/zvol_os.c   | 262 ++++++++++++++++++++++++++++++++
 module/zfs/zfs_log.c            |  10 +-
 module/zfs/zfs_vnops.c          |   2 +-
 tests/zfs-tests/cmd/clonefile.c |   4 +-
 5 files changed, 304 insertions(+), 6 deletions(-)

diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4
index 83190c6fbe3f..554c16451aa8 100644
--- a/config/kernel-blkdev.m4
+++ b/config/kernel-blkdev.m4
@@ -132,6 +132,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [
 	])
 ])
 
+dnl #
+dnl # Upstream patch for blkdev copy offload support
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_COPY_OFFLOAD], [
+	ZFS_LINUX_TEST_SRC([blkdev_copy_offload], [
+		#include
+		#include
+	], [
+		struct block_device *bdev_in = NULL, *bdev_out = NULL;
+		loff_t pos_in = 0, pos_out = 0;
+		ssize_t ret __attribute__ ((unused));
+		ssize_t len = 0;
+		void *private = NULL;
+		void (*endio)(void *, int, ssize_t) = NULL;
+		ret = blkdev_copy_offload(bdev_in, pos_in, pos_out, len,
+		    endio, private, GFP_KERNEL, bdev_out);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_COPY_OFFLOAD], [
+	AC_MSG_CHECKING([whether blkdev_copy_offload exists])
+	ZFS_LINUX_TEST_RESULT([blkdev_copy_offload], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLKDEV_COPY_OFFLOAD, 1,
+		    [blkdev_copy_offload exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 2.6.38 API change,
 dnl # Added blkdev_put()
@@ -759,6 +789,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
 	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
 	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T
+	ZFS_AC_KERNEL_SRC_BLKDEV_COPY_OFFLOAD
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
@@ -781,4 +812,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
 	ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
 	ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T + ZFS_AC_KERNEL_BLKDEV_COPY_OFFLOAD ]) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 47aa6417068d..08f3237b6be5 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,7 @@ static boolean_t zvol_use_blk_mq = B_FALSE; static unsigned int zvol_blk_mq_blocks_per_thread = 8; static unsigned int zvol_num_taskqs = 0; +extern int zfs_bclone_wait_dirty; #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ @@ -139,6 +141,13 @@ typedef struct zv_request_task { taskq_ent_t ent; } zv_request_task_t; +#ifdef HAVE_BLKDEV_COPY_OFFLOAD +/* Keeps track of single outstanding copy offload IO */ +struct blkdev_copy_offload_io_zvol { + void *driver_private; +}; +#endif + static zv_request_task_t * zv_request_task_create(zv_request_t zvr) { @@ -496,6 +505,223 @@ zvol_read_task(void *arg) zv_request_task_free(task); } +#ifdef HAVE_BLKDEV_COPY_OFFLOAD +static void zvol_setup_copy_offload(zv_request_t *zvr) +{ + zvol_state_t *zv_src = zvr->zv, *zv_dst = zvr->zv; + struct request *req = zvr->rq; + struct bio *bio; + zilog_t *zilog_dst; + zfs_uio_t uio_src, uio_dst; + zfs_locked_range_t *inlr, *outlr; + objset_t *inos, *outos; + dmu_tx_t *tx; + blkptr_t *bps; + size_t maxblocks; + uint64_t inoff, outoff, len = 0; + int error = 0, seg = 1; + + memset(&uio_src, 0, sizeof (zfs_uio_t)); + memset(&uio_dst, 0, sizeof (zfs_uio_t)); + + /* + * First bio contains information about destination and + * the second contains information about the source + */ + __rq_for_each_bio(bio, req) { + if (seg == blk_rq_nr_phys_segments(req)) { + struct blkdev_copy_offload_io_zvol *offload_io = + bio->bi_private; + zfs_uio_bvec_init(&uio_src, bio, NULL); + if (len != bio->bi_iter.bi_size) { + rw_exit(&zv_src->zv_suspend_lock); + zvol_end_io(bio, req, -SET_ERROR(error)); + return; + } + if 
(offload_io && offload_io->driver_private) + zv_dst = offload_io->driver_private; + } else { + zfs_uio_bvec_init(&uio_dst, bio, NULL); + len = bio->bi_iter.bi_size; + } + seg++; + } + + if (!zv_src || !zv_dst) { + rw_exit(&zv_src->zv_suspend_lock); + zvol_end_io(bio, req, -SET_ERROR(error)); + return; + } + if (zv_src != zv_dst) + rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + + inoff = uio_src.uio_loffset; + outoff = uio_dst.uio_loffset; + inos = zv_src->zv_objset; + outos = zv_dst->zv_objset; + + /* + * Sanity checks + */ + if (!spa_feature_is_enabled(dmu_objset_spa(outos), + SPA_FEATURE_BLOCK_CLONING)) { + error = EOPNOTSUPP; + goto out; + } + if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { + error = EXDEV; + goto out; + } + if (inos->os_encrypted != outos->os_encrypted) { + error = EXDEV; + goto out; + } + if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { + error = EINVAL; + goto out; + } + if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { + error = 0; + goto out; + } + + /* + * Do not read beyond source boundary + */ + if (len > zv_src->zv_volsize - inoff) + len = zv_src->zv_volsize - inoff; + if (len == 0) { + error = 0; + goto out; + } + + /* + * No overlapping if we are cloning within the same file + */ + if (zv_src == zv_dst) { + if (inoff < outoff + len && outoff < inoff + len) { + error = EINVAL; + goto out; + } + } + + /* + * Block size must be power-of-2 if destination offset != 0. + * There can be no multiple blocks of non-power-of-2 size. 
+ */ + if (outoff != 0 && !ISP2(zv_src->zv_volblocksize)) { + error = EINVAL; + goto out; + } + + /* + * Offsets and length must be at block boundaries + */ + if ((inoff % zv_src->zv_volblocksize) != 0 || + (outoff % zv_dst->zv_volblocksize) != 0) { + error = EINVAL; + goto out; + } + + /* + * Length must be multiple of block size, except for the end of the file + */ + if ((len % zv_src->zv_volblocksize) != 0 && (len < zv_src->zv_volsize - + inoff || len < zv_dst->zv_volsize - outoff)) { + error = EINVAL; + goto out; + } + + /* + * ZIL Lock + */ + if (zv_dst->zv_zilog == NULL) { + rw_exit(&zv_dst->zv_suspend_lock); + rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); + if (zv_dst->zv_zilog == NULL) { + zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, + zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); + zv_dst->zv_flags |= ZVOL_WRITTEN_TO; + VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv_dst->zv_suspend_lock); + } + + zilog_dst = zv_dst->zv_zilog; + maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / + sizeof (bps[0]); + bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + while (len > 0) { + uint64_t size, last_synced_txg; + size_t nbps = maxblocks; + size = MIN(zv_src->zv_volblocksize * maxblocks, len); + last_synced_txg = spa_last_synced_txg( + dmu_objset_spa(zv_src->zv_objset)); + error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, + size, bps, &nbps); + if (error != 0) { + /* + * If we are trying to clone a block that was created + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
+ */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool + (zv_src->zv_objset), last_synced_txg + 1); + continue; + } + break; + } + + tx = dmu_tx_create(zv_dst->zv_objset); + dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + break; + } + error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, + tx, bps, nbps); + if (error != 0) { + dmu_tx_commit(tx); + break; + } + zfs_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, NULL, outoff, + size, zv_src->zv_volblocksize, bps, nbps); + dmu_tx_commit(tx); + inoff += size; + outoff += size; + len -= size; + } + vmem_free(bps, sizeof (bps[0]) * maxblocks); + zfs_rangelock_exit(outlr); + zfs_rangelock_exit(inlr); + if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { + zil_commit(zilog_dst, ZVOL_OBJ); + } +out: + if (zv_src != zv_dst) + rw_exit(&zv_dst->zv_suspend_lock); + + rw_exit(&zv_src->zv_suspend_lock); + zvol_end_io(bio, req, -SET_ERROR(error)); +} + +static void +zvol_copy_offload_task(void *arg) +{ + zv_request_task_t *task = arg; + zvol_setup_copy_offload(&task->zvr); + zv_request_task_free(task); +} +#endif /* * Process a BIO or request @@ -555,6 +781,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, blk_mq_hw_queue); tq_idx = taskq_hash % ztqs->tqs_cnt; +#ifdef HAVE_BLKDEV_COPY_OFFLOAD + if (zv->zv_zso->use_blk_mq && rq && op_is_copy(req_op(rq))) { + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { + zvol_end_io(bio, rq, -SET_ERROR(EROFS)); + goto out; + } + rw_enter(&zv->zv_suspend_lock, RW_READER); + + if (force_sync) { + zvol_setup_copy_offload(&zvr); + } else { + task = zv_request_task_create(zvr); + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], + zvol_copy_offload_task, task, 0, &task->ent); + } + goto out; + } else if (bio && op_is_copy(bio_op(bio))) { + zvol_end_io(bio, rq, -SET_ERROR(EOPNOTSUPP)); + goto out; + } +#endif + if (rw == WRITE) { 
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 			zvol_end_io(bio, rq, -SET_ERROR(EROFS));
@@ -1607,6 +1855,9 @@ zvol_os_create_minor(const char *name)
 	uint64_t hash = zvol_name_hash(name);
 	uint64_t volthreading;
 	bool replayed_zil = B_FALSE;
+#ifdef HAVE_BLKDEV_COPY_OFFLOAD
+	struct queue_limits *lim;
+#endif
 
 	if (zvol_inhibit_dev)
 		return (0);
@@ -1667,6 +1918,17 @@ zvol_os_create_minor(const char *name)
 
 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
 
+#ifdef HAVE_BLKDEV_COPY_OFFLOAD
+	if (zv->zv_zso->use_blk_mq) {
+		/*
+		 * Allow 256 MB copies; SCST sends XCOPY requests that large
+		 */
+		lim = &zv->zv_zso->zvo_queue->limits;
+		lim->max_copy_hw_sectors = (256 * 1024 * 1024) >> 9;
+		lim->max_copy_sectors = (256 * 1024 * 1024) >> 9;
+	}
+#endif
+
 #ifdef QUEUE_FLAG_DISCARD
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
 #endif
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index 16b07cc0a7d0..39bce1950380 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include
 
 /*
  * These zfs_log_* functions must be called within a dmu tx, in one
@@ -899,7 +900,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
 	uint64_t partlen, max_log_data;
 	size_t partnbps;
 
-	if (zil_replaying(zilog, tx) || zp->z_unlinked)
+	if (zil_replaying(zilog, tx) || (zp && zp->z_unlinked))
 		return;
 
 	max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));
@@ -913,14 +914,17 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
 		itx = zil_itx_create(txtype,
 		    sizeof (*lr) + sizeof (bps[0]) * partnbps);
 		lr = (lr_clone_range_t *)&itx->itx_lr;
-		lr->lr_foid = zp->z_id;
+		lr->lr_foid = (zp) ? zp->z_id : ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = partlen;
 		lr->lr_blksz = blksz;
 		lr->lr_nbps = partnbps;
 		memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);
 
-		itx->itx_sync = (zp->z_sync_cnt != 0);
+		if (zp)
+			itx->itx_sync = (zp->z_sync_cnt != 0);
+		else
+			itx->itx_sync = 0;
 
 		zil_itx_assign(zilog, itx, tx);
 
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 6c15a5c472ea..8c8ed255e686 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -71,7 +71,7 @@ int zfs_bclone_enabled = 1;
  * a copy of the file and is therefore not the default. However, in certain
  * scenarios this behavior may be desirable so a tunable is provided.
  */
-static int zfs_bclone_wait_dirty = 0;
+int zfs_bclone_wait_dirty = 0;
 
 /*
  * Enable Direct I/O. If this setting is 0, then all I/O requests will be
diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c
index bc30bb7798e9..e09286f44455 100644
--- a/tests/zfs-tests/cmd/clonefile.c
+++ b/tests/zfs-tests/cmd/clonefile.c
@@ -228,14 +228,14 @@ main(int argc, char **argv)
 		}
 	}
 
-	int sfd = open(argv[optind], O_RDONLY);
+	int sfd = open(argv[optind], O_RDONLY | O_DIRECT);
 	if (sfd < 0) {
 		fprintf(stderr, "open: %s: %s\n",
 		    argv[optind], strerror(errno));
 		return (1);
 	}
 
-	int dfd = open(argv[optind+1], O_WRONLY|O_CREAT,
+	int dfd = open(argv[optind+1], O_WRONLY|O_CREAT|O_DIRECT,
 	    S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
 	if (dfd < 0) {
 		fprintf(stderr, "open: %s: %s\n",