Skip to content

Commit

Permalink
zfs_vnops: make zfs_get_data OS-independent
Browse files Browse the repository at this point in the history
Move zfs_get_data() in to platform-independent code. The only
platform-specific aspect of it is the way we release an inode 
(Linux) / vnode_t (FreeBSD). I am not aware of a platform that
could be supported by ZFS that couldn't implement zfs_rele_async 
itself. It's sibling zvol_get_data already is platform-independent.

Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Christian Schwarz <me@cschwarz.com>
Closes openzfs#10979
  • Loading branch information
problame authored and RageLtMan committed May 31, 2021
1 parent 0d3ff77 commit f010ded
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 316 deletions.
1 change: 0 additions & 1 deletion include/os/freebsd/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
extern void zfs_znode_free(struct znode *);

extern zil_get_data_t zfs_get_data;
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;

Expand Down
1 change: 0 additions & 1 deletion include/os/linux/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw);
extern void zfs_unmap_page(page_t *, caddr_t);
#endif /* HAVE_UIO_RW */

extern zil_get_data_t zfs_get_data;
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;

Expand Down
13 changes: 13 additions & 0 deletions include/sys/zfs_vnops.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,17 @@ extern int mappedread(znode_t *, int, uio_t *);
extern int mappedread_sf(znode_t *, int, uio_t *);
extern void update_pages(znode_t *, int64_t, int, objset_t *, uint64_t);

/*
* Platform code that asynchronously drops zp's inode / vnode_t.
*
* Asynchronous dropping ensures that the caller will never drop the
* last reference on an inode / vnode_t in the current context.
* Doing so while holding open a tx could result in a deadlock if
* the platform calls into filesystem again in the implementation
* of inode / vnode_t dropping (e.g. call from iput_final()).
*/
extern void zfs_zrele_async(znode_t *zp);

extern zil_get_data_t zfs_get_data;

#endif
1 change: 1 addition & 0 deletions module/os/freebsd/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
Expand Down
161 changes: 6 additions & 155 deletions module/os/freebsd/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */


#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
Expand Down Expand Up @@ -669,163 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}

static void
zfs_get_done(zgd_t *zgd, int error)
{
znode_t *zp = zgd->zgd_private;
objset_t *os = zp->z_zfsvfs->z_os;

if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);

zfs_rangelock_exit(zgd->zgd_lr);

/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));

kmem_free(zgd, sizeof (zgd_t));
}

#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

/*
* Get data to generate a TX_WRITE intent log record.
*/
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
void
zfs_zrele_async(znode_t *zp)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;

ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);

/*
* Nothing to do if the file has been removed
*/
if (zfs_zget(zfsvfs, object, &zp) != 0)
return (SET_ERROR(ENOENT));
if (zp->z_unlinked) {
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp),
dsl_pool_zrele_taskq(dmu_objset_pool(os)));
return (SET_ERROR(ENOENT));
}

zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;

/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
uint64_t blkoff;
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
offset, size, RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
zfs_rangelock_exit(zgd->zgd_lr);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
if (zil_fault_io) {
error = SET_ERROR(EIO);
zil_fault_io = 0;
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);

if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

zgd->zgd_db = db;
zgd->zgd_bp = bp;

ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);

error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
ASSERT(error || lr->lr_length <= size);

/*
* On success, we need to wait for the write I/O
* initiated by dmu_sync() to complete before we can
* release this dbuf. We will finish everything up
* in the zfs_get_done() callback.
*/
if (error == 0)
return (0);

if (error == EALREADY) {
lr->lr_common.lrc_txtype = TX_WRITE2;
/*
* TX_WRITE2 relies on the data previously
* written by the TX_WRITE that caused
* EALREADY. We zero out the BP because
* it is the old, currently-on-disk BP,
* so there's no need to zio_flush() its
* vdevs (flushing would needlesly hurt
* performance, and doesn't work on
* indirect vdevs).
*/
zgd->zgd_bp = NULL;
BP_ZERO(bp);
error = 0;
}
}
}

zfs_get_done(zgd, error);
vnode_t *vp = ZTOV(zp);
objset_t *os = ITOZSB(vp)->z_os;

return (error);
VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
}

static int
Expand Down
159 changes: 0 additions & 159 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -381,12 +381,6 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}

/*
* Drop a reference on the passed inode asynchronously. This ensures
* that the caller will never drop the last reference on an inode in
* the current context. Doing so while holding open a tx could result
* in a deadlock if iput_final() re-enters the filesystem code.
*/
void
zfs_zrele_async(znode_t *zp)
{
Expand All @@ -403,159 +397,6 @@ zfs_zrele_async(znode_t *zp)
zrele(zp);
}

/* ARGSUSED */
static void
zfs_get_done(zgd_t *zgd, int error)
{
znode_t *zp = zgd->zgd_private;

if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);

zfs_rangelock_exit(zgd->zgd_lr);

/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
zfs_zrele_async(zp);

kmem_free(zgd, sizeof (zgd_t));
}

#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

/*
* Get data to generate a TX_WRITE intent log record.
*/
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;

ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);

/*
* Nothing to do if the file has been removed
*/
if (zfs_zget(zfsvfs, object, &zp) != 0)
return (SET_ERROR(ENOENT));
if (zp->z_unlinked) {
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
zfs_zrele_async(zp);
return (SET_ERROR(ENOENT));
}

zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;

/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
uint64_t blkoff;
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
offset, size, RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
zfs_rangelock_exit(zgd->zgd_lr);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
if (zil_fault_io) {
error = SET_ERROR(EIO);
zil_fault_io = 0;
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);

if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

zgd->zgd_db = db;
zgd->zgd_bp = bp;

ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);

error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
ASSERT(error || lr->lr_length <= size);

/*
* On success, we need to wait for the write I/O
* initiated by dmu_sync() to complete before we can
* release this dbuf. We will finish everything up
* in the zfs_get_done() callback.
*/
if (error == 0)
return (0);

if (error == EALREADY) {
lr->lr_common.lrc_txtype = TX_WRITE2;
/*
* TX_WRITE2 relies on the data previously
* written by the TX_WRITE that caused
* EALREADY. We zero out the BP because
* it is the old, currently-on-disk BP.
*/
zgd->zgd_bp = NULL;
BP_ZERO(bp);
error = 0;
}
}
}

zfs_get_done(zgd, error);

return (error);
}

/*
* Lookup an entry in a directory, or an extended attribute directory.
Expand Down
Loading

0 comments on commit f010ded

Please sign in to comment.