Skip to content

Commit

Permalink
Reinstate zvol_taskq to fix aio on zvol
Browse files Browse the repository at this point in the history
Commit 37f9dac removed the zvol_taskq used for processing zvol requests. I imagine
this was removed because, after we switched to being make_request_fn based, we no
longer receive requests from interrupt context.

However, this also made all bio requests synchronous, which caused a serious
performance issue: the bio submitter would wait for every bio it submitted,
effectively limiting the iodepth to 1.

This patch reinstates zvol_taskq, and refactors zvol_{read,write,discard} to
make them take a bio as their argument.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
  • Loading branch information
Chunwei Chen committed Feb 23, 2017
1 parent 6d82f98 commit f1edaaf
Showing 1 changed file with 131 additions and 75 deletions.
206 changes: 131 additions & 75 deletions module/zfs/zvol.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@

unsigned int zvol_inhibit_dev = 0;
unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_threads = 32;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;

static taskq_t *zvol_taskq;
static kmutex_t zvol_state_lock;
static list_t zvol_state_list;

Expand Down Expand Up @@ -636,21 +638,55 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
}
}

static int
zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
/*
 * Populate a uio_t describing the data payload of the given bio, so the
 * generic uio-based DMU routines (dmu_{read,write}_uio_dbuf) can consume
 * the bio's bvec array directly.  The bio itself is not modified.
 */
static void
uio_from_bio(uio_t *uio, struct bio *bio)
{
	unsigned int idx = BIO_BI_IDX(bio);	/* current bvec index */

	uio->uio_segflg = UIO_BVEC;		/* payload is a bvec array */
	uio->uio_bvec = &bio->bi_io_vec[idx];
	uio->uio_iovcnt = bio->bi_vcnt - idx;	/* remaining bvec entries */
	uio->uio_skip = BIO_BI_SKIP(bio);	/* offset into first bvec */
	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;	/* sectors -> bytes */
	uio->uio_resid = BIO_BI_SIZE(bio);	/* total bytes to transfer */
	uio->uio_limit = MAXOFFSET_T;
}

static void
zvol_write(void *arg)
{
struct bio *bio = arg;
uio_t uio;
zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
uint64_t volsize = zv->zv_volsize;
rl_t *rl;
boolean_t sync;
int error = 0;
#ifdef HAVE_GENERIC_IO_ACCT
unsigned long start_jif = jiffies;
#endif

uio_from_bio(&uio, bio);

ASSERT(zv && zv->zv_open_count > 0);

rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset,
uio->uio_resid, RL_WRITER);
generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
rw_enter(&zv->zv_suspend_lock, RW_READER);

/* bio marked as FLUSH need to flush before write */
if (bio_is_flush(bio))
zil_commit(zv->zv_zilog, ZVOL_OBJ);

/* Some requests are just for flush and nothing else. */
if (uio.uio_resid == 0)
goto out;

while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio->uio_loffset;
rl = zfs_range_lock(&zv->zv_range_lock, uio.uio_loffset,
uio.uio_resid, RL_WRITER);

sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
uint64_t off = uio.uio_loffset;
dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

if (bytes > volsize - off) /* don't write past the end */
Expand All @@ -664,7 +700,7 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
dmu_tx_abort(tx);
break;
}
error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
error = dmu_write_uio_dbuf(zv->zv_dbuf, &uio, bytes, tx);
if (error == 0)
zvol_log_write(zv, tx, off, bytes, sync);
dmu_tx_commit(tx);
Expand All @@ -675,7 +711,11 @@ zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
zfs_range_unlock(rl);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
return (error);

out:
rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
}

/*
Expand All @@ -702,21 +742,30 @@ zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
zil_itx_assign(zilog, itx, tx);
}

static int
zvol_discard(struct bio *bio)
static void
zvol_discard(void *arg)
{
struct bio *bio = arg;
zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
uint64_t start = BIO_BI_SECTOR(bio) << 9;
uint64_t size = BIO_BI_SIZE(bio);
uint64_t end = start + size;
int error;
int error = 0;
rl_t *rl;
dmu_tx_t *tx;
#ifdef HAVE_GENERIC_IO_ACCT
unsigned long start_jif = jiffies;
#endif

ASSERT(zv && zv->zv_open_count > 0);

if (end > zv->zv_volsize)
return (SET_ERROR(EIO));
generic_start_io_acct(WRITE, bio_sectors(bio), &zv->zv_disk->part0);
rw_enter(&zv->zv_suspend_lock, RW_READER);

if (end > zv->zv_volsize) {
error = SET_ERROR(EIO);
goto out;
}

/*
* Align the request to volume block boundaries when a secure erase is
Expand All @@ -731,7 +780,7 @@ zvol_discard(struct bio *bio)
}

if (start >= end)
return (0);
goto out;

rl = zfs_range_lock(&zv->zv_range_lock, start, size, RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
Expand All @@ -747,29 +796,42 @@ zvol_discard(struct bio *bio)
}

zfs_range_unlock(rl);

return (error);
out:
rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(WRITE, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
}

static int
zvol_read(zvol_state_t *zv, uio_t *uio)
static void
zvol_read(void *arg)
{
struct bio *bio = arg;
uio_t uio;
zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data;
uint64_t volsize = zv->zv_volsize;
rl_t *rl;
int error = 0;
#ifdef HAVE_GENERIC_IO_ACCT
unsigned long start_jif = jiffies;
#endif

uio_from_bio(&uio, bio);

ASSERT(zv && zv->zv_open_count > 0);

rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset,
uio->uio_resid, RL_READER);
while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
generic_start_io_acct(READ, bio_sectors(bio), &zv->zv_disk->part0);
rw_enter(&zv->zv_suspend_lock, RW_READER);

rl = zfs_range_lock(&zv->zv_range_lock, uio.uio_loffset,
uio.uio_resid, RL_READER);
while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

/* don't read past the end */
if (bytes > volsize - uio->uio_loffset)
bytes = volsize - uio->uio_loffset;
if (bytes > volsize - uio.uio_loffset)
bytes = volsize - uio.uio_loffset;

error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
error = dmu_read_uio_dbuf(zv->zv_dbuf, &uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
Expand All @@ -778,75 +840,54 @@ zvol_read(zvol_state_t *zv, uio_t *uio)
}
}
zfs_range_unlock(rl);
return (error);

rw_exit(&zv->zv_suspend_lock);
generic_end_io_acct(READ, &zv->zv_disk->part0, start_jif);
BIO_END_IO(bio, -error);
}

static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
{
uio_t uio;
zvol_state_t *zv = q->queuedata;
fstrans_cookie_t cookie = spl_fstrans_mark();
uint64_t offset = BIO_BI_SECTOR(bio)<<9;
uint64_t size = BIO_BI_SIZE(bio);
int rw = bio_data_dir(bio);
#ifdef HAVE_GENERIC_IO_ACCT
unsigned long start = jiffies;
#endif
int error = 0;

rw_enter(&zv->zv_suspend_lock, RW_READER);

uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
uio.uio_skip = BIO_BI_SKIP(bio);
uio.uio_resid = BIO_BI_SIZE(bio);
uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
uio.uio_loffset = BIO_BI_SECTOR(bio) << 9;
uio.uio_limit = MAXOFFSET_T;
uio.uio_segflg = UIO_BVEC;

if (bio_has_data(bio) && uio.uio_loffset + uio.uio_resid >
zv->zv_volsize) {
if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
printk(KERN_INFO
"%s: bad access: offset=%llu, size=%lu\n",
zv->zv_disk->disk_name,
(long long unsigned)uio.uio_loffset,
(long unsigned)uio.uio_resid);
error = SET_ERROR(EIO);
goto out1;
}
(long long unsigned)offset,
(long unsigned)size);

generic_start_io_acct(rw, bio_sectors(bio), &zv->zv_disk->part0);
BIO_END_IO(bio, -SET_ERROR(EIO));
goto out;
}

if (rw == WRITE) {
if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
error = SET_ERROR(EROFS);
goto out2;
BIO_END_IO(bio, -SET_ERROR(EROFS));
goto out;
}

if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
error = zvol_discard(bio);
goto out2;
}

/*
* Some requests are just for flush and nothing else.
*/
if (uio.uio_resid == 0) {
if (bio_is_flush(bio))
zil_commit(zv->zv_zilog, ZVOL_OBJ);
goto out2;
if (taskq_dispatch(zvol_taskq, zvol_discard, bio,
TQ_SLEEP) == TASKQID_INVALID)
zvol_discard(bio);
} else {
if (taskq_dispatch(zvol_taskq, zvol_write, bio,
TQ_SLEEP) == TASKQID_INVALID)
zvol_write(bio);
}
} else {
if (taskq_dispatch(zvol_taskq, zvol_read, bio,
TQ_SLEEP) == TASKQID_INVALID)
zvol_read(bio);
}

error = zvol_write(zv, &uio,
bio_is_flush(bio) || bio_is_fua(bio) ||
zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
} else
error = zvol_read(zv, &uio);

out2:
generic_end_io_acct(rw, &zv->zv_disk->part0, start);
out1:
BIO_END_IO(bio, -error);
rw_exit(&zv->zv_suspend_lock);
out:
spl_fstrans_unmark(cookie);
#ifdef HAVE_MAKE_REQUEST_FN_RET_INT
return (0);
Expand Down Expand Up @@ -2157,18 +2198,27 @@ zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
int
zvol_init(void)
{
int threads = MIN(MAX(zvol_threads, 1), 1024);
int i, error;

list_create(&zvol_state_list, sizeof (zvol_state_t),
offsetof(zvol_state_t, zv_next));
mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
ida_init(&zvol_ida);

zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
if (zvol_taskq == NULL) {
printk(KERN_INFO "ZFS: taskq_create() failed\n");
error = -ENOMEM;
goto out;
}

zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
KM_SLEEP);
if (!zvol_htable) {
error = ENOMEM;
goto out;
goto out_taskq;
}
for (i = 0; i < ZVOL_HT_SIZE; i++)
INIT_HLIST_HEAD(&zvol_htable[i]);
Expand All @@ -2186,6 +2236,8 @@ zvol_init(void)

out_free:
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
out_taskq:
taskq_destroy(zvol_taskq);
out:
mutex_destroy(&zvol_state_lock);
list_destroy(&zvol_state_list);
Expand All @@ -2202,6 +2254,7 @@ zvol_fini(void)
unregister_blkdev(zvol_major, ZVOL_DRIVER);
kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));

taskq_destroy(zvol_taskq);
list_destroy(&zvol_state_list);
mutex_destroy(&zvol_state_lock);

Expand All @@ -2215,6 +2268,9 @@ MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

Expand Down

0 comments on commit f1edaaf

Please sign in to comment.