zfs_vnops: make zfs_get_data OS-independent #10979

Merged (1 commit) on Nov 2, 2020
1 change: 0 additions & 1 deletion include/os/freebsd/zfs/sys/zfs_znode_impl.h
@@ -173,7 +173,6 @@ extern void zfs_tstamp_update_setup_ext(struct znode *,
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
extern void zfs_znode_free(struct znode *);

extern zil_get_data_t zfs_get_data;
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;

1 change: 0 additions & 1 deletion include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -163,7 +163,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw);
extern void zfs_unmap_page(page_t *, caddr_t);
#endif /* HAVE_UIO_RW */

extern zil_get_data_t zfs_get_data;
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;

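Both platform znode headers drop the same declaration here; the common sys/zfs_vnops.h (next file) picks it up instead. For reference, the callback type that declaration names, zil_get_data_t from sys/zil.h, matches the zfs_get_data definitions removed further down. Quoted from memory, so treat the exact formatting as approximate:

/* From sys/zil.h (approximate formatting): the type zfs_get_data must satisfy. */
typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf,
    struct lwb *lwb, zio_t *zio);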
13 changes: 13 additions & 0 deletions include/sys/zfs_vnops.h
@@ -39,4 +39,17 @@ extern int mappedread(znode_t *, int, uio_t *);
extern int mappedread_sf(znode_t *, int, uio_t *);
extern void update_pages(znode_t *, int64_t, int, objset_t *, uint64_t);

/*
* Platform code that asynchronously drops zp's inode / vnode_t.
*
* Asynchronous dropping ensures that the caller will never drop the
* last reference on an inode / vnode_t in the current context.
* Doing so while holding open a tx could result in a deadlock if
* the platform calls into the filesystem again in the implementation
* of inode / vnode_t dropping (e.g. call from iput_final()).
*/
extern void zfs_zrele_async(znode_t *zp);

extern zil_get_data_t zfs_get_data;

#endif
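The header now carries both halves of the split: zfs_zrele_async stays per-platform (FreeBSD gains an implementation below; Linux already had one), while zfs_get_data itself moves to common code that is not among the files shown here (presumably module/zfs/zfs_vnops.c in this PR). Below is a minimal sketch of the piece that ties them together, modeled on the per-platform zfs_get_done() copies deleted further down; treat it as an illustration of the intended shape, not the verbatim common code:

static void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_rangelock_exit(zgd->zgd_lr);

	/*
	 * The txg is stopped from syncing here, so the reference is
	 * dropped through the platform's async hook rather than inline.
	 */
	zfs_zrele_async(zp);

	kmem_free(zgd, sizeof (zgd_t));
}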
1 change: 1 addition & 0 deletions module/os/freebsd/zfs/zfs_vfsops.c
@@ -42,6 +42,7 @@
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
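The only change to the FreeBSD zfs_vfsops.c is the new include: zfs_get_data used to be declared in the FreeBSD znode header trimmed above, and this file references it when wiring up the intent log. The presumed consumer, in zfsvfs_setup() and not part of the hunk shown, looks roughly like this:

/* Presumed registration point for the callback; not part of this diff. */
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);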
161 changes: 6 additions & 155 deletions module/os/freebsd/zfs/zfs_vnops_os.c
@@ -29,6 +29,7 @@
/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */


#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
@@ -669,163 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}

static void
zfs_get_done(zgd_t *zgd, int error)
{
znode_t *zp = zgd->zgd_private;
objset_t *os = zp->z_zfsvfs->z_os;

if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);

zfs_rangelock_exit(zgd->zgd_lr);

/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));

kmem_free(zgd, sizeof (zgd_t));
}

#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

/*
* Get data to generate a TX_WRITE intent log record.
*/
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
void
zfs_zrele_async(znode_t *zp)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;

ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);

/*
* Nothing to do if the file has been removed
*/
if (zfs_zget(zfsvfs, object, &zp) != 0)
return (SET_ERROR(ENOENT));
if (zp->z_unlinked) {
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
VN_RELE_ASYNC(ZTOV(zp),
dsl_pool_zrele_taskq(dmu_objset_pool(os)));
return (SET_ERROR(ENOENT));
}

zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;

/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
uint64_t blkoff;
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
offset, size, RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
zfs_rangelock_exit(zgd->zgd_lr);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
if (zil_fault_io) {
error = SET_ERROR(EIO);
zil_fault_io = 0;
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);

if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

zgd->zgd_db = db;
zgd->zgd_bp = bp;

ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);

error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
ASSERT(error || lr->lr_length <= size);

/*
* On success, we need to wait for the write I/O
* initiated by dmu_sync() to complete before we can
* release this dbuf. We will finish everything up
* in the zfs_get_done() callback.
*/
if (error == 0)
return (0);

if (error == EALREADY) {
lr->lr_common.lrc_txtype = TX_WRITE2;
/*
* TX_WRITE2 relies on the data previously
* written by the TX_WRITE that caused
* EALREADY. We zero out the BP because
* it is the old, currently-on-disk BP,
* so there's no need to zio_flush() its
* vdevs (flushing would needlesly hurt
* performance, and doesn't work on
* indirect vdevs).
*/
zgd->zgd_bp = NULL;
BP_ZERO(bp);
error = 0;
}
}
}

zfs_get_done(zgd, error);
vnode_t *vp = ZTOV(zp);
objset_t *os = ITOZSB(vp)->z_os;

return (error);
VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
}

static int
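The FreeBSD zfs_get_data() deleted above and the Linux copy deleted below were effectively identical apart from how they released the znode, which is exactly what this PR factors out. The "immediate vs. indirect" comment in that code describes the contract imposed by the ZIL commit path; the sketch below shows the caller's side of that contract as I understand it (zl_get_data, itx write states) in heavily simplified form. It is background only, not code from this PR:

/*
 * Simplified sketch of how the ZIL invokes the registered get-data
 * callback when committing a TX_WRITE itx.  The real logic lives in
 * zil_lwb_commit().
 */
static int
commit_write_itx(zilog_t *zilog, itx_t *itx, struct lwb *lwb, zio_t *zio)
{
	lr_write_t *lr = (lr_write_t *)&itx->itx_lr;

	switch (itx->itx_wr_state) {
	case WR_COPIED:
		/* data was embedded in the record at itx creation time */
		return (0);
	case WR_NEED_COPY:
		/* immediate write: copy the file data into the log record */
		return (zilog->zl_get_data(itx->itx_private, lr,
		    (char *)(lr + 1), lwb, zio));
	case WR_INDIRECT:
	default:
		/* indirect write: buf == NULL, dmu_sync() fills lr_blkptr */
		return (zilog->zl_get_data(itx->itx_private, lr, NULL,
		    lwb, zio));
	}
}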
159 changes: 0 additions & 159 deletions module/os/linux/zfs/zfs_vnops_os.c
@@ -381,12 +381,6 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
return (error);
}

/*
* Drop a reference on the passed inode asynchronously. This ensures
* that the caller will never drop the last reference on an inode in
* the current context. Doing so while holding open a tx could result
* in a deadlock if iput_final() re-enters the filesystem code.
*/
void
zfs_zrele_async(znode_t *zp)
{
@@ -403,159 +397,6 @@ zfs_zrele_async(znode_t *zp)
zrele(zp);
}

/* ARGSUSED */
static void
zfs_get_done(zgd_t *zgd, int error)
{
znode_t *zp = zgd->zgd_private;

if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);

zfs_rangelock_exit(zgd->zgd_lr);

/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
zfs_zrele_async(zp);

kmem_free(zgd, sizeof (zgd_t));
}

#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

/*
* Get data to generate a TX_WRITE intent log record.
*/
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
{
zfsvfs_t *zfsvfs = arg;
objset_t *os = zfsvfs->z_os;
znode_t *zp;
uint64_t object = lr->lr_foid;
uint64_t offset = lr->lr_offset;
uint64_t size = lr->lr_length;
dmu_buf_t *db;
zgd_t *zgd;
int error = 0;

ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);

/*
* Nothing to do if the file has been removed
*/
if (zfs_zget(zfsvfs, object, &zp) != 0)
return (SET_ERROR(ENOENT));
if (zp->z_unlinked) {
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
*/
zfs_zrele_async(zp);
return (SET_ERROR(ENOENT));
}

zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_lwb = lwb;
zgd->zgd_private = zp;

/*
* Write records come in two flavors: immediate and indirect.
* For small writes it's cheaper to store the data with the
* log record (immediate); for large writes it's cheaper to
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
if (buf != NULL) { /* immediate write */
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
offset, size, RL_READER);
/* test for truncation needs to be done while range locked */
if (offset >= zp->z_size) {
error = SET_ERROR(ENOENT);
} else {
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
for (;;) {
uint64_t blkoff;
size = zp->z_blksz;
blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
offset -= blkoff;
zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
offset, size, RL_READER);
if (zp->z_blksz == size)
break;
offset += blkoff;
zfs_rangelock_exit(zgd->zgd_lr);
}
/* test for truncation needs to be done while range locked */
if (lr->lr_offset >= zp->z_size)
error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
if (zil_fault_io) {
error = SET_ERROR(EIO);
zil_fault_io = 0;
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);

if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;

zgd->zgd_db = db;
zgd->zgd_bp = bp;

ASSERT(db->db_offset == offset);
ASSERT(db->db_size == size);

error = dmu_sync(zio, lr->lr_common.lrc_txg,
zfs_get_done, zgd);
ASSERT(error || lr->lr_length <= size);

/*
* On success, we need to wait for the write I/O
* initiated by dmu_sync() to complete before we can
* release this dbuf. We will finish everything up
* in the zfs_get_done() callback.
*/
if (error == 0)
return (0);

if (error == EALREADY) {
lr->lr_common.lrc_txtype = TX_WRITE2;
/*
* TX_WRITE2 relies on the data previously
* written by the TX_WRITE that caused
* EALREADY. We zero out the BP because
* it is the old, currently-on-disk BP.
*/
zgd->zgd_bp = NULL;
BP_ZERO(bp);
error = 0;
}
}
}

zfs_get_done(zgd, error);

return (error);
}

/*
* Lookup an entry in a directory, or an extended attribute directory.
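On Linux only the old comment, zfs_get_done(), and zfs_get_data() go away; zfs_zrele_async() itself stays (its body is collapsed in the hunk above). The rationale spelled out in the new header comment boils down to the pattern below. This is a hypothetical, stripped-down illustration, not either platform's actual implementation; the real ones also avoid the taskq round-trip when the reference being dropped is provably not the last one:

/*
 * Hypothetical illustration of asynchronous reference dropping: push
 * the zrele() onto a taskq so a possible iput_final()/vrele() never
 * re-enters the filesystem from a context holding a tx open.
 */
static void
zrele_async_cb(void *arg)
{
	zrele((znode_t *)arg);
}

static void
example_zrele_async(znode_t *zp, taskq_t *tq)
{
	VERIFY(taskq_dispatch(tq, zrele_async_cb, zp, TQ_SLEEP) !=
	    TASKQID_INVALID);
}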