diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4
index ff5d2d370e98..354891a339bf 100644
--- a/config/kernel-blk-queue.m4
+++ b/config/kernel-blk-queue.m4
@@ -315,6 +315,36 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
 	])
 ])
 
+dnl #
+dnl # See if the kernel supports block multi-queue and blk_status_t.
+dnl # blk_status_t represents the new status codes introduced in the 4.13
+dnl # kernel patch:
+dnl #
+dnl #	block: introduce new block status code type
+dnl #
+dnl # We do not currently support the "old" block multi-queue interfaces from
+dnl # prior kernels.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
+	ZFS_LINUX_TEST_SRC([blk_mq], [
+		#include <linux/blk-mq.h>
+	], [
+		struct blk_mq_tag_set tag_set = {0};
+		(void) blk_mq_alloc_tag_set(&tag_set);
+		return BLK_STS_OK;
+	], [$NO_UNUSED_BUT_SET_VARIABLE])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
+	AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available])
+	ZFS_LINUX_TEST_RESULT([blk_mq], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
@@ -326,6 +356,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS
+	ZFS_AC_KERNEL_SRC_BLK_MQ
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
@@ -339,4 +370,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
 	ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
 	ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
+	ZFS_AC_KERNEL_BLK_MQ
 ])
diff --git a/configure.ac b/configure.ac
index 6f34b210d2b7..4d576cec815f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -400,6 +400,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/tests/functional/zpool_influxdb/Makefile
 	tests/zfs-tests/tests/functional/zvol/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile
+	tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile
 	tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile
diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h
index 019d5390adec..f0d86835d9ce 100644
--- a/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/include/os/linux/kernel/linux/blkdev_compat.h
@@ -236,7 +236,9 @@ bio_set_bi_status(struct bio *bio, int error)
 {
 	ASSERT3S(error, <=, 0);
 	bio->bi_status = errno_to_bi_status(-error);
+#ifndef HAVE_BLK_MQ
 	bio_endio(bio);
+#endif
 }
 #else
 #define	BIO_END_IO_ERROR(bio)	(-(bio->bi_error))
@@ -247,7 +249,9 @@ bio_set_bi_error(struct bio *bio, int error)
 {
 	ASSERT3S(error, <=, 0);
 	bio->bi_error = error;
+#ifndef HAVE_BLK_MQ
 	bio_endio(bio);
+#endif
 }
 #endif /* HAVE_BIO_BI_STATUS */
diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index d7fc31bfde10..bdd356d99d3a 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -2172,9 +2172,15 @@ for each I/O submitter.
 When unset, requests are handled asynchronously by a thread pool.
 The number of requests which can be handled concurrently is controlled by
 .Sy zvol_threads .
-.
-.It Sy zvol_threads Ns = Ns Sy 32 Pq uint
-Max number of threads which can handle zvol I/O requests concurrently.
+On blk-mq kernels, zvol_request_sync is ignored.
+.
+.It Sy zvol_threads Ns = Ns Sy 0 Pq uint
+The number of threads to use for processing zvol block IOs.
+On older, non-blk-mq kernels, zvol_threads is the total number of threads
+used for all zvols.
+On newer, blk-mq kernels, it is the number of threads per zvol.
+If zvol_threads = 0 (the default), the module internally sets it to the
+number of CPUs present.
 .
 .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
 Defines zvol block devices behaviour when
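Reviewer note on the blkdev_compat.h hunks above: they only suppress bio_endio(), because completion ownership differs between the two interfaces. On the bio-based paths the driver must end each bio itself; under blk-mq, the single blk_mq_end_request() call in zvol_mq_work_func() below completes the request and every bio attached to it. A minimal sketch of the rule, assuming HAVE_BLK_MQ from the new configure check (zvol_record_bio_error is a hypothetical name; errno_to_blk_status() is the stock kernel mapping corresponding to the compat errno_to_bi_status()):

	#include <linux/bio.h>

	/* Hypothetical sketch, not part of the patch; error is <= 0 */
	static inline void
	zvol_record_bio_error(struct bio *bio, int error)
	{
		bio->bi_status = errno_to_blk_status(error);
	#ifndef HAVE_BLK_MQ
		/* bio path: nothing else will complete this bio for us */
		bio_endio(bio);
	#endif
		/*
		 * blk-mq path: only record the status.  The request-level
		 * blk_mq_end_request() performs the one and only completion;
		 * calling bio_endio() here as well would complete the bio
		 * twice.
		 */
	}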
diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index c17423426319..8cfbc9ec7e49 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -40,17 +40,38 @@
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
 
+#include <linux/blk-mq.h>
+
+static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
+    boolean_t force_sync);
+
 unsigned int zvol_major = ZVOL_MAJOR;
 unsigned int zvol_request_sync = 0;
 unsigned int zvol_prefetch_bytes = (128 * 1024);
 unsigned long zvol_max_discard_blocks = 16384;
-unsigned int zvol_threads = 32;
+
+/*
+ * zvol_threads is the module param the user passes in.
+ *
+ * zvol_actual_threads is what we use internally, since the user can pass
+ * zvol_threads = 0 to mean "use all the CPUs" (the default).  So on a quad
+ * core system, you would have: zvol_threads = 0, zvol_actual_threads = 4.
+ */
+unsigned int zvol_threads = 0;
+unsigned int zvol_actual_threads;
+
+#ifdef HAVE_BLK_MQ
+kmem_cache_t *blk_mq_cache = NULL;
+#endif
 
 struct zvol_state_os {
 	struct gendisk *zvo_disk;	/* generic disk */
 	struct request_queue *zvo_queue;	/* request queue */
 	dev_t zvo_dev;			/* device id */
+
+#ifdef HAVE_BLK_MQ
+	struct blk_mq_tag_set tag_set;
+#endif
 };
 
 taskq_t *zvol_taskq;
@@ -61,6 +82,11 @@ typedef struct zv_request_stack {
 	struct bio	*bio;
 } zv_request_t;
 
+typedef struct zv_work {
+	struct request	*rq;
+	struct work_struct work;
+} zv_work_t;
+
 typedef struct zv_request_task {
 	zv_request_t zvr;
 	taskq_ent_t	ent;
@@ -82,6 +108,108 @@ zv_request_task_free(zv_request_task_t *task)
 	kmem_free(task, sizeof (*task));
 }
 
+#ifdef HAVE_BLK_MQ
+/*
+ * This is our blk-mq workqueue callback function.  It's here that
+ * we process all the BIOs in a request.
+ */
+static void
+zvol_mq_work_func(struct work_struct *work)
+{
+	zv_work_t *zv_work;
+	struct bio *bio = NULL;
+	struct request *rq;
+	zvol_state_t *zv;
+	blk_status_t res = BLK_STS_OK;
+
+	zv_work = container_of(work, zv_work_t, work);
+	rq = zv_work->rq;
+	zv = rq->q->queuedata;
+
+	/* Tell the kernel that we are starting to process this request */
+	blk_mq_start_request(rq);
+
+	if (blk_rq_is_passthrough(rq)) {
+		/* Skip non-filesystem requests */
+		blk_mq_end_request(rq, BLK_STS_IOERR);
+		goto out;
+	}
+
+	/* Execute the BIOs in this request. */
+	__rq_for_each_bio(bio, rq) {
+		zvol_request_impl(zv, bio, 1);
+
+		/* Did this BIO cause an error?  If so, stop the request */
+#ifdef HAVE_BIO_BI_STATUS
+		res = bio->bi_status;
+#else
+		res = bio->bi_error;
+#endif
+		if (res != BLK_STS_OK) {
+			/* Got an error */
+			break;
+		}
+	}
+
+	/* All done */
+	blk_mq_end_request(rq, res);
+
+out:
+	kmem_cache_free(blk_mq_cache, zv_work);
+}
+
+/*
+ * This is called when a new block multiqueue request comes in.  A request
+ * contains one or more BIOs.  This function is run from an atomic context,
+ * so anything that can sleep must be deferred.
+ */
+static blk_status_t
+zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+    const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	zv_work_t *zv_work;
+
+	/*
+	 * Add the request to our workqueue for processing later since we're
+	 * currently in an atomic context and our bio processing code can
+	 * sleep.
+	 */
+	zv_work = kmem_cache_alloc(blk_mq_cache, KM_NOSLEEP);
+	if (zv_work == NULL) {
+		/* Ask the block layer to retry later; memory may free up */
+		return (BLK_STS_RESOURCE);
+	}
+	INIT_WORK(&zv_work->work, zvol_mq_work_func);
+	zv_work->rq = rq;
+	schedule_work(&zv_work->work);
+
+	/* Acknowledge to the kernel that we got this request */
+	return (BLK_STS_OK);
+}
+
+static struct blk_mq_ops zvol_mq_ops = {
+	.queue_rq = zvol_mq_queue_rq,
+};
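Reviewer note: the KM_NOSLEEP allocation and workqueue bounce above exist only because .queue_rq() is normally entered in atomic context. For contrast, a hedged sketch of the inline alternative, assuming the tag set were created with BLK_MQ_F_BLOCKING (a standard blk-mq flag that makes the kernel call .queue_rq() from a context that is allowed to sleep); zvol_mq_queue_rq_blocking is a hypothetical name reusing this patch's zvol_request_impl():

	/* Sketch only: assumes zso->tag_set.flags |= BLK_MQ_F_BLOCKING */
	static blk_status_t
	zvol_mq_queue_rq_blocking(struct blk_mq_hw_ctx *hctx,
	    const struct blk_mq_queue_data *bd)
	{
		struct request *rq = bd->rq;
		zvol_state_t *zv = rq->q->queuedata;
		struct bio *bio = NULL;
		blk_status_t res = BLK_STS_OK;

		blk_mq_start_request(rq);

		if (blk_rq_is_passthrough(rq)) {
			blk_mq_end_request(rq, BLK_STS_IOERR);
			return (BLK_STS_OK);
		}

		__rq_for_each_bio(bio, rq) {
			/* BLK_MQ_F_BLOCKING makes sleeping legal here */
			zvol_request_impl(zv, bio, 1);

			/* bi_status always exists when HAVE_BLK_MQ is set */
			res = bio->bi_status;
			if (res != BLK_STS_OK)
				break;
		}

		blk_mq_end_request(rq, res);
		return (BLK_STS_OK);
	}

The trade-off is that the submitting thread then performs the zvol I/O itself, so a single submitter no longer fans out across a pool of workers.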
+
+/* Initialize our blk-mq struct */
+static int
+zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
+{
+	struct zvol_state_os *zso = zv->zv_zso;
+
+	memset(&zso->tag_set, 0, sizeof (zso->tag_set));
+
+	/* Initialize tag set. */
+	zso->tag_set.ops = &zvol_mq_ops;
+	zso->tag_set.nr_hw_queues = zvol_actual_threads;
+	zso->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	zso->tag_set.numa_node = NUMA_NO_NODE;
+	zso->tag_set.cmd_size = 0;
+	zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	zso->tag_set.driver_data = zv;
+
+	return (blk_mq_alloc_tag_set(&zso->tag_set));
+}
+#endif /* HAVE_BLK_MQ */
+
 /*
  * Given a path, return TRUE if path is a ZVOL.
  */
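A note on cmd_size = 0 in zvol_blk_mq_alloc_tag_set() above: blk-mq can instead preallocate a per-request driver payload. If the tag set were initialized with cmd_size = sizeof (zv_work_t), the block core would embed one zv_work_t in every request, retrievable with the standard blk_mq_rq_to_pdu() helper, and the KM_NOSLEEP allocation (and its failure path) in zvol_mq_queue_rq() would disappear. A hedged sketch under that assumption (hypothetical function name):

	/* Sketch only: assumes zso->tag_set.cmd_size = sizeof (zv_work_t) */
	static blk_status_t
	zvol_mq_queue_rq_pdu(struct blk_mq_hw_ctx *hctx,
	    const struct blk_mq_queue_data *bd)
	{
		struct request *rq = bd->rq;
		zv_work_t *zv_work = blk_mq_rq_to_pdu(rq);

		/* The payload was allocated with the request; cannot fail */
		zv_work->rq = rq;
		INIT_WORK(&zv_work->work, zvol_mq_work_func);
		schedule_work(&zv_work->work);

		return (BLK_STS_OK);
	}

The matching change in zvol_mq_work_func() would be to drop the kmem_cache_free(), since the payload's lifetime is the request's; the blk_mq_cache kmem cache would then be unnecessary.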
@@ -335,27 +463,24 @@ zvol_read_task(void *arg)
 	zv_request_task_free(task);
 }
 
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-static blk_qc_t
-zvol_submit_bio(struct bio *bio)
-#else
-static MAKE_REQUEST_FN_RET
-zvol_request(struct request_queue *q, struct bio *bio)
-#endif
+/*
+ * Process a BIO.
+ *
+ * force_sync:	Set to 0 to defer processing the BIO to a background taskq.
+ *		Set to 1 to process the BIO right now.
+ */
+static void
+zvol_request_impl(zvol_state_t *zv, struct bio *bio, boolean_t force_sync)
 {
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#if defined(HAVE_BIO_BDEV_DISK)
-	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
-#else
-	struct request_queue *q = bio->bi_disk->queue;
-#endif
-#endif
-	zvol_state_t *zv = q->queuedata;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
 	uint64_t size = BIO_BI_SIZE(bio);
 	int rw = bio_data_dir(bio);
 
+	if (zvol_request_sync) {
+		force_sync = 1;
+	}
+
 	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
 		printk(KERN_INFO
 		    "%s: bad access: offset=%llu, size=%lu\n",
@@ -437,7 +562,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
 		 * the i/o to complete).
 		 */
 		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
-			if (zvol_request_sync) {
+			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
@@ -445,7 +570,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
 				taskq_dispatch_ent(zvol_taskq,
 				    zvol_discard_task, task, 0, &task->ent);
 			}
 		} else {
-			if (zvol_request_sync) {
+			if (force_sync) {
 				zvol_write(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
@@ -467,7 +592,7 @@ zvol_request(struct request_queue *q, struct bio *bio)
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		/* See comment in WRITE case above. */
-		if (zvol_request_sync) {
+		if (force_sync) {
 			zvol_read(&zvr);
 		} else {
 			task = zv_request_task_create(zvr);
@@ -478,11 +603,34 @@ zvol_request(struct request_queue *q, struct bio *bio)
 
 out:
 	spl_fstrans_unmark(cookie);
+}
+
+#ifndef HAVE_BLK_MQ
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+#if defined(HAVE_BIO_BDEV_DISK)
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+#else
+	struct request_queue *q = bio->bi_disk->queue;
+#endif
+#endif
+	zvol_state_t *zv = q->queuedata;
+
+	zvol_request_impl(zv, bio, 0);
 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
     defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
 	return (BLK_QC_T_NONE);
 #endif
 }
+#endif /* !HAVE_BLK_MQ */
 
 static int
 zvol_open(struct block_device *bdev, fmode_t flag)
@@ -761,9 +909,12 @@ static struct block_device_operations zvol_ops = {
 #endif
 	.getgeo = zvol_getgeo,
 	.owner = THIS_MODULE,
+
+#ifndef HAVE_BLK_MQ
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 	.submit_bio = zvol_submit_bio,
 #endif
+#endif
 };
 
 /*
@@ -794,8 +945,58 @@ zvol_alloc(dev_t dev, const char *name)
 	list_link_init(&zv->zv_next);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 
-#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
-#ifdef HAVE_BLK_ALLOC_DISK
+	/*
+	 * Just to decode this #ifdef rat's nest a little...
+	 *
+	 * The block layer has 3 interfaces for getting BIOs:
+	 *
+	 * 1. blk-mq request queues (new)
+	 * 2. submit_bio() (oldest)
+	 * 3. regular request queues (old)
+	 *
+	 * Each of those interfaces has two permutations:
+	 *
+	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
+	 *    both the disk and its queue (5.14 kernel or newer).
+	 *
+	 * b) We don't have blk_*alloc_disk(), and have to allocate the
+	 *    disk and the queue separately (5.13 kernel or older).
+	 */
+#if defined(HAVE_BLK_MQ)
+	/* Allocate our blk-mq tag_set */
+	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
+		goto out_kmem;
+
+#if defined(HAVE_BLK_ALLOC_DISK)
+	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
+	if (IS_ERR(zso->zvo_disk)) {
+		zso->zvo_disk = NULL;
+		blk_mq_free_tag_set(&zso->tag_set);
+		goto out_kmem;
+	}
+	zso->zvo_queue = zso->zvo_disk->queue;
+	zso->zvo_disk->minors = ZVOL_MINORS;
+#else
+	/* Allocate the blk-mq queue first, then the disk */
+	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
+	if (IS_ERR(zso->zvo_queue)) {
+		blk_mq_free_tag_set(&zso->tag_set);
+		goto out_kmem;
+	}
+
+	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+	if (zso->zvo_disk == NULL) {
+		blk_cleanup_queue(zso->zvo_queue);
+		blk_mq_free_tag_set(&zso->tag_set);
+		goto out_kmem;
+	}
+
+	/* Our queue is now created, assign it to our disk */
+	zso->zvo_disk->queue = zso->zvo_queue;
+#endif /* HAVE_BLK_ALLOC_DISK */
+
+	/* Finish blk-mq init */
+	blk_queue_logical_block_size(zso->zvo_queue, 4096);
+
+#elif defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+#if defined(HAVE_BLK_ALLOC_DISK)
 	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
 	if (zso->zvo_disk == NULL)
 		goto out_kmem;
@@ -901,6 +1102,10 @@ zvol_free(zvol_state_t *zv)
 	ASSERT0(zv->zv_open_count);
 	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
 
+#ifdef HAVE_BLK_MQ
+	flush_scheduled_work();
+#endif
+
 	rw_destroy(&zv->zv_suspend_lock);
 	zfs_rangelock_fini(&zv->zv_rangelock);
 
@@ -913,6 +1118,10 @@ zvol_free(zvol_state_t *zv)
 	put_disk(zv->zv_zso->zvo_disk);
 #endif
 
+#ifdef HAVE_BLK_MQ
+	blk_mq_free_tag_set(&zv->zv_zso->tag_set);
+#endif
+
 	ida_simple_remove(&zvol_ida,
 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
@@ -1122,19 +1331,35 @@ int
 zvol_init(void)
 {
 	int error;
-	int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+	if (zvol_threads == 0) {
+		zvol_actual_threads = num_online_cpus();
+	} else {
+		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
+	}
 
 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
 		return (error);
 	}
-	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
-	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+#ifdef HAVE_BLK_MQ
+	/* Create a kmem cache for all our blk-mq work items */
+	blk_mq_cache = kmem_cache_create("blk_mq_cache",
+	    sizeof (zv_work_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (blk_mq_cache == NULL) {
+		unregister_blkdev(zvol_major, ZVOL_DRIVER);
+		return (-ENOMEM);
+	}
+#else
+	/* We're not using blk-mq, so set up taskqs */
+	zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads,
+	    maxclsyspri, zvol_actual_threads, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 	if (zvol_taskq == NULL) {
 		unregister_blkdev(zvol_major, ZVOL_DRIVER);
 		return (-ENOMEM);
 	}
+#endif
+
 	zvol_init_impl();
 	ida_init(&zvol_ida);
 	zvol_register_ops(&zvol_linux_ops);
@@ -1146,7 +1371,11 @@ zvol_fini(void)
 {
 	zvol_fini_impl();
 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
+#ifdef HAVE_BLK_MQ
+	kmem_cache_destroy(blk_mq_cache);
+#else
 	taskq_destroy(zvol_taskq);
+#endif
 	ida_destroy(&zvol_ida);
 }
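One more hedged aside on the teardown above: schedule_work() targets the kernel's shared system workqueue, so the flush_scheduled_work() in zvol_free() waits on every scheduled work item in the system, not just zvol's. A dedicated workqueue would scope both the dispatch and the flush; sketch only, with zvol_wq as a hypothetical name (queue_work(zvol_wq, ...) would replace schedule_work() in zvol_mq_queue_rq()):

	#include <linux/workqueue.h>

	static struct workqueue_struct *zvol_wq;	/* hypothetical */

	/* zvol_init() would call this before registering minors */
	static int
	zvol_wq_init(void)
	{
		zvol_wq = alloc_workqueue("zvol", WQ_UNBOUND, 0);
		return (zvol_wq == NULL ? -ENOMEM : 0);
	}

	/* zvol_free()/zvol_fini() would then wait on zvol work only */
	static void
	zvol_wq_fini(void)
	{
		flush_workqueue(zvol_wq);
		destroy_workqueue(zvol_wq);
	}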
@@ -1158,7 +1387,8 @@ module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
 module_param(zvol_threads, uint, 0444);
-MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. "
+	"Set to 0 to use all active CPUs");
 
 module_param(zvol_request_sync, uint, 0644);
 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 505131d2b9be..b8ea3e67bf38 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -926,6 +926,10 @@ tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
     'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil']
 tags = ['functional', 'zvol', 'zvol_misc']
 
+[tests/functional/zvol/zvol_stress]
+tests = ['zvol_stress']
+tags = ['functional', 'zvol', 'zvol_stress']
+
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
 tags = ['functional', 'zvol', 'zvol_swap']
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index ab0cd5270c99..e526b8110142 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -3335,18 +3335,26 @@ function is_te_enabled
 	fi
 }
 
+# Return the number of CPUs (cross-platform)
+function get_num_cpus
+{
+	if is_linux ; then
+		nproc
+	elif is_freebsd; then
+		sysctl -n kern.smp.cpus
+	else
+		psrinfo | wc -l
+	fi
+}
+
 # Utility function to determine if a system has multiple cpus.
 function is_mp
 {
-	if is_linux; then
-		(($(nproc) > 1))
-	elif is_freebsd; then
-		sysctl -n kern.smp.cpus
-	else
-		(($(psrinfo | wc -l) > 1))
-	fi
-
-	return $?
+	[[ $(get_num_cpus) -gt 1 ]]
 }
 
 function get_cpu_freq
diff --git a/tests/zfs-tests/tests/functional/zvol/Makefile.am b/tests/zfs-tests/tests/functional/zvol/Makefile.am
index e4910754bb81..9089a939abb0 100644
--- a/tests/zfs-tests/tests/functional/zvol/Makefile.am
+++ b/tests/zfs-tests/tests/functional/zvol/Makefile.am
@@ -5,6 +5,7 @@ dist_pkgdata_DATA = \
 
 SUBDIRS = \
 	zvol_ENOSPC \
 	zvol_cli \
 	zvol_misc \
+	zvol_stress \
 	zvol_swap
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am b/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am
new file mode 100644
index 000000000000..5ccd0c7b5619
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/Makefile.am
@@ -0,0 +1,5 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/zvol/zvol_stress
+dist_pkgdata_SCRIPTS = \
+	cleanup.ksh \
+	setup.ksh \
+	zvol_stress.ksh
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh
new file mode 100755
index 000000000000..b81a372638e3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/cleanup.ksh
@@ -0,0 +1,36 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh
new file mode 100755
index 000000000000..746ac307a755
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/setup.ksh
@@ -0,0 +1,38 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "global"
+
+default_setup "$DISKS"
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
new file mode 100755
index 000000000000..88b4798c43ce
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zvol/zvol_stress/zvol_stress.ksh
@@ -0,0 +1,141 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/reservation/reservation.shlib
+
+#
+# DESCRIPTION:
+# Stress test multithreaded transfers to multiple zvols.  Also verify
+# zvol errors show up in zpool status.
+#
+# STRATEGY:
+# 1. Create one zvol per CPU
+# 2. In parallel, spawn an fio "write and verify" for each zvol
+# 3. Inject write errors
+# 4. Write to one of the zvols with dd and verify the errors
+#
+
+verify_runnable "global"
+
+num_zvols=$(get_num_cpus)
+
+# If we were making one big zvol from all the pool space, it would
+# be this big:
+biggest_zvol_size_possible=$(largest_volsize_from_pool $TESTPOOL)
+
+# Crude calculation: take the biggest zvol size we could possibly
+# create, knock 10% off it (for overhead) and divide by the number
+# of zvols we want to make.
+each_zvol_size=$((($biggest_zvol_size_possible - ($biggest_zvol_size_possible / 10)) / $num_zvols))
+
+typeset tmpdir="$(mktemp -d zvol_stress_fio_state.XXXXXX)"
+
+function create_zvols
+{
+	log_note "Creating $num_zvols zvols that are ${each_zvol_size}B each"
+	for i in $(seq 1 $num_zvols) ; do
+		log_must zfs create -V $each_zvol_size $TESTPOOL/testvol$i
+	done
+}
+
+function destroy_zvols
+{
+	for i in $(seq 1 $num_zvols) ; do
+		log_must zfs destroy $TESTPOOL/testvol$i
+	done
+}
+
+function do_zvol_stress
+{
+	# Write 10% of each zvol, or 50MB, whichever is less
+	zvol_write_size=$((each_zvol_size / 10))
+	if [ $zvol_write_size -gt $((50 * 1048576)) ] ; then
+		zvol_write_size=$((50 * 1048576))
+	fi
+
+	# Spawn off one fio per zvol in parallel
+	pids=""
+	for i in $(seq 1 $num_zvols) ; do
+		# Spawn one fio per zvol as its own process
+		fio --ioengine=libaio --name=zvol_stress$i --direct=1 \
+		    --filename="$ZVOL_DEVDIR/$TESTPOOL/testvol$i" \
+		    --bs=1048576 --iodepth=10 --readwrite=randwrite \
+		    --size=${zvol_write_size} --verify_async=2 \
+		    --numjobs=1 --verify=sha1 --verify_fatal=1 \
+		    --continue_on_error=none --error_dump=1 \
+		    --exitall_on_error --aux-path="$tmpdir" --do_verify=1 &
+		pids="$pids $!"
+	done
+
+	# Wait for all the spawned fios to finish and look for errors
+	i=1
+	for pid in $pids ; do
+		log_note "waiting on $pid"
+		if ! wait $pid ; then
+			log_fail "fio error on $TESTPOOL/testvol$i"
+		fi
+		i=$(($i + 1))
+	done
+}
+
+function cleanup
+{
+	zinject -c all
+	log_must zpool clear $TESTPOOL
+	destroy_zvols
+
+	# Remove all fio's leftover state files
+	if [ -n "$tmpdir" ] ; then
+		rm -f "$tmpdir"/*.state
+		rmdir "$tmpdir"
+	fi
+}
+
+log_onexit cleanup
+log_assert "Stress test zvols"
+
+create_zvols
+
+# Do some fio write/verifies in parallel
+do_zvol_stress
+
+# Inject some errors, and verify we see some IO errors in zpool status
+for DISK in $DISKS ; do
+	log_must zinject -d $DISK -f 10 -e io -T write $TESTPOOL
+done
+dd if=/dev/zero of=$ZVOL_DEVDIR/$TESTPOOL/testvol1 bs=1 count=100
+log_must zinject -c all
+
+log_must zpool status
+write_errors=$(zpool status -pv | grep $DISK | awk '{print $4}')
+if [ -z "$write_errors" ] || [ $write_errors -le 0 ] ; then
+	log_fail "Expected to see some write errors (saw $write_errors)"
+else
+	log_note "Correctly saw $write_errors write errors"
+fi
+log_pass "Done with zvol_stress"