From 3995357a1426d8324802c0033fe24afa7708a1fa Mon Sep 17 00:00:00 2001
From: Tony Hutter
Date: Thu, 18 Sep 2025 09:37:04 -0700
Subject: [PATCH] zvol: Fix blk-mq sync

The zvol blk-mq codepath would erroneously send FLUSH and TRIM
commands down the read codepath rather than the write codepath. This
fixes the issue and updates the zvol_misc_fua test to verify that sync
writes are actually happening.

Fixes: #17761
Signed-off-by: Tony Hutter
---
 include/os/linux/kernel/linux/blkdev_compat.h | 18 ---------
 module/os/linux/zfs/zvol_os.c                 | 23 ++++++++++-
 .../zvol/zvol_misc/zvol_misc_fua.ksh          | 40 ++++++++++++++++++-
 3 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index 076dab8ba6dc..214f3ea0e787 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -542,24 +542,6 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id)
 }
 #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 
-/*
- * All the io_*() helper functions below can operate on a bio, or a rq, but
- * not both. The older submit_bio() codepath will pass a bio, and the
- * newer blk-mq codepath will pass a rq.
- */
-static inline int
-io_data_dir(struct bio *bio, struct request *rq)
-{
-	if (rq != NULL) {
-		if (op_is_write(req_op(rq))) {
-			return (WRITE);
-		} else {
-			return (READ);
-		}
-	}
-	return (bio_data_dir(bio));
-}
-
 static inline int
 io_is_flush(struct bio *bio, struct request *rq)
 {
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index bac166fcd89e..967a018640e1 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -484,7 +484,28 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
-	int rw = io_data_dir(bio, rq);
+	int rw;
+
+	if (rq != NULL) {
+		/*
+		 * Flush and trim requests go down the zvol_write codepath.
+		 * More specifically:
+		 *
+		 * If the request is a write, or if it's op_is_sync() and not
+		 * a read, or if it's a flush, or if it's a discard, then send
+		 * the request down the write path.
+		 */
+		if (op_is_write(rq->cmd_flags) ||
+		    (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) ||
+		    req_op(rq) == REQ_OP_FLUSH ||
+		    op_is_discard(rq->cmd_flags)) {
+			rw = WRITE;
+		} else {
+			rw = READ;
+		}
+	} else {
+		rw = bio_data_dir(bio);
+	}
 
 	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
 		zvol_end_io(bio, rq, SET_ERROR(ENXIO));
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
index 571a698eb63a..502ebada22dc 100755
--- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh
@@ -50,17 +50,53 @@ fi
 
 typeset datafile1="$(mktemp -t zvol_misc_fua1.XXXXXX)"
 typeset datafile2="$(mktemp -t zvol_misc_fua2.XXXXXX)"
+typeset datafile3="$(mktemp -t zvol_misc_fua3_log.XXXXXX)"
 typeset zvolpath=${ZVOL_DEVDIR}/$TESTPOOL/$TESTVOL
+typeset DISK1=${DISKS%% *}
 
 function cleanup
 {
-	rm "$datafile1" "$datafile2"
+	log_must zpool remove $TESTPOOL $datafile3
+	rm "$datafile1" "$datafile2" "$datafile3"
+}
+
+# Prints the total number of sync writes for a vdev
+# $1: vdev
+function get_sync
+{
+	zpool iostat -p -H -v -r $TESTPOOL $1 | \
+	    awk '/[0-9]+$/{s+=$4+$5} END{print s}'
 }
 
 function do_test
 {
 	# Wait for udev to create symlinks to our zvol
 	block_device_wait $zvolpath
 
+	# Write using sync (creates FLUSH calls after writes, but not FUA)
+	old_vdev_writes=$(get_sync $DISK1)
+	old_log_writes=$(get_sync $datafile3)
+
+	log_must fio --name=write_iops --size=5M \
+	    --ioengine=libaio --verify=0 --bs=4K \
+	    --iodepth=1 --rw=randwrite --group_reporting=1 \
+	    --filename=$zvolpath --sync=1
+
+	vdev_writes=$(( $(get_sync $DISK1) - $old_vdev_writes))
+	log_writes=$(( $(get_sync $datafile3) - $old_log_writes))
+
+	# When we're doing sync writes, we should see many more writes go to
+	# the log than to the first vdev. Experiments show anywhere from a
+	# 160-320x ratio of writes to the log vs the first vdev (due to some
+	# straggler writes to the first vdev).
+	#
+	# Check that we have a large ratio (100x) of sync writes going to the
+	# log device.
+	ratio=$(($log_writes / $vdev_writes))
+	log_note "Got $log_writes log writes, $vdev_writes vdev writes."
+	if [ $ratio -lt 100 ] ; then
+		log_fail "Expected > 100x more log writes than vdev writes."
+	fi
+
 	# Create a data file
 	log_must dd if=/dev/urandom of="$datafile1" bs=1M count=5
@@ -81,6 +117,8 @@ log_assert "Verify that a ZFS volume can do Force Unit Access (FUA)"
 log_onexit cleanup
 
 log_must zfs set compression=off $TESTPOOL/$TESTVOL
+log_must truncate -s 100M $datafile3
+log_must zpool add $TESTPOOL log $datafile3
 
 log_note "Testing without blk-mq"
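
--
Reviewer note (not part of the patch): below is a minimal userspace sketch of
the request classification this change introduces, for sanity-checking its
truth table without building the module. It assumes the usual REQ_OP_*
numbering from Linux blk_types.h, and it re-models req_op(), op_is_write(),
op_is_sync(), and op_is_discard() after their kernel semantics rather than
including the real headers; the REQ_* flag bit positions are illustrative
assumptions (only their distinctness from the op bits matters here).

/*
 * Standalone model of the new zvol blk-mq dispatch logic. REQ_OP_* values
 * assume the conventional Linux numbering; the flag bits are illustrative.
 */
#include <stdio.h>

typedef unsigned int blk_opf_t;

#define	REQ_OP_READ	0
#define	REQ_OP_WRITE	1
#define	REQ_OP_FLUSH	2
#define	REQ_OP_DISCARD	3
#define	REQ_OP_MASK	((1U << 8) - 1)

#define	REQ_SYNC	(1U << 11)	/* illustrative bit position */
#define	REQ_FUA		(1U << 17)	/* illustrative bit position */
#define	REQ_PREFLUSH	(1U << 18)	/* illustrative bit position */

static blk_opf_t
req_op(blk_opf_t cmd_flags)
{
	return (cmd_flags & REQ_OP_MASK);
}

static int
op_is_write(blk_opf_t op)
{
	return (op & 1);
}

static int
op_is_discard(blk_opf_t op)
{
	return (req_op(op) == REQ_OP_DISCARD);
}

/*
 * Reads always count as op_is_sync() in the kernel, which is why the patch
 * pairs op_is_sync() with a req_op() != REQ_OP_READ check.
 */
static int
op_is_sync(blk_opf_t op)
{
	return (req_op(op) == REQ_OP_READ ||
	    (op & (REQ_SYNC | REQ_FUA | REQ_PREFLUSH)) != 0);
}

/* The classification from the new zvol_request_impl() hunk above. */
static const char *
classify(blk_opf_t cmd_flags)
{
	if (op_is_write(cmd_flags) ||
	    (op_is_sync(cmd_flags) && req_op(cmd_flags) != REQ_OP_READ) ||
	    req_op(cmd_flags) == REQ_OP_FLUSH ||
	    op_is_discard(cmd_flags))
		return ("WRITE");
	return ("READ");
}

int
main(void)
{
	/* A plain read stays on the read path despite being op_is_sync(). */
	printf("READ          -> %s\n", classify(REQ_OP_READ));
	printf("WRITE         -> %s\n", classify(REQ_OP_WRITE));
	/* FLUSH and TRIM are the cases the commit message calls out. */
	printf("WRITE|REQ_FUA -> %s\n", classify(REQ_OP_WRITE | REQ_FUA));
	printf("FLUSH         -> %s\n", classify(REQ_OP_FLUSH | REQ_PREFLUSH));
	printf("DISCARD       -> %s\n", classify(REQ_OP_DISCARD));
	return (0);
}

Compiled with any C compiler, this prints READ for a plain read and WRITE
for writes, FUA writes, flushes, and discards, which is the behavior the
updated zvol_misc_fua test asserts indirectly through the log-device sync
write ratio.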