From 288ed4a6f4f0ee354bdd47ed40855d2d4883fca1 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <me@cschwarz.com>
Date: Fri, 25 Sep 2020 15:34:03 +0200
Subject: [PATCH] zfs_vnops: make zfs_get_data OS-independent

closes #10979

Signed-off-by: Christian Schwarz <me@cschwarz.com>
---
 include/os/freebsd/zfs/sys/zfs_znode_impl.h |   1 -
 include/os/linux/zfs/sys/zfs_znode_impl.h   |   1 -
 include/sys/zfs_vnops.h                     |  13 ++
 module/os/freebsd/zfs/zfs_vfsops.c          |   1 +
 module/os/freebsd/zfs/zfs_vnops_os.c        | 161 +-------------------
 module/os/linux/zfs/zfs_vnops_os.c          | 159 -------------------
 module/zfs/zfs_vnops.c                      | 159 +++++++++++++++++++
 7 files changed, 179 insertions(+), 316 deletions(-)

diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h
index d87938f0979f..62570da58fbb 100644
--- a/include/os/freebsd/zfs/sys/zfs_znode_impl.h
+++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h
@@ -173,7 +173,6 @@ extern void	zfs_tstamp_update_setup_ext(struct znode *,
     uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
 extern void zfs_znode_free(struct znode *);
 
-extern zil_get_data_t zfs_get_data;
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
 extern int zfsfstype;
 
diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h
index a4a136f010d2..5c31d8309ad8 100644
--- a/include/os/linux/zfs/sys/zfs_znode_impl.h
+++ b/include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -163,7 +163,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw);
 extern void zfs_unmap_page(page_t *, caddr_t);
 #endif /* HAVE_UIO_RW */
 
-extern zil_get_data_t zfs_get_data;
 extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
 extern int zfsfstype;
 
diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h
index b2767cb0e697..5aaa64b342fa 100644
--- a/include/sys/zfs_vnops.h
+++ b/include/sys/zfs_vnops.h
@@ -39,4 +39,17 @@ extern int mappedread(znode_t *, int, uio_t *);
 extern int mappedread_sf(znode_t *, int, uio_t *);
 extern void update_pages(znode_t *, int64_t, int, objset_t *, uint64_t);
 
+/*
+ * Platform code that asynchronously drops zp's inode / vnode_t.
+ *
+ * Asynchronous dropping ensures that the caller will never drop the
+ * last reference on an inode / vnode_t in the current context.
+ * Doing so while holding open a tx could result in a deadlock if
+ * the platform calls into filesystem again in the implementation
+ * of inode / vnode_t dropping (e.g. call from iput_final()).
+ */
+extern void zfs_zrele_async(znode_t *zp);
+
+extern zil_get_data_t zfs_get_data;
+
 #endif
diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c
index 55ed2e25d5c3..7587d02ae5ea 100644
--- a/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/module/os/freebsd/zfs/zfs_vfsops.c
@@ -42,6 +42,7 @@
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 758cea21fbae..484ab3d648ed 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -29,6 +29,7 @@
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
+
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
@@ -669,163 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
 	return (error);
 }
 
-static void
-zfs_get_done(zgd_t *zgd, int error)
-{
-	znode_t *zp = zgd->zgd_private;
-	objset_t *os = zp->z_zfsvfs->z_os;
-
-	if (zgd->zgd_db)
-		dmu_buf_rele(zgd->zgd_db, zgd);
-
-	zfs_rangelock_exit(zgd->zgd_lr);
-
-	/*
-	 * Release the vnode asynchronously as we currently have the
-	 * txg stopped from syncing.
-	 */
-	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
-
-	kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef ZFS_DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+void
+zfs_zrele_async(znode_t *zp)
 {
-	zfsvfs_t *zfsvfs = arg;
-	objset_t *os = zfsvfs->z_os;
-	znode_t *zp;
-	uint64_t object = lr->lr_foid;
-	uint64_t offset = lr->lr_offset;
-	uint64_t size = lr->lr_length;
-	dmu_buf_t *db;
-	zgd_t *zgd;
-	int error = 0;
-
-	ASSERT3P(lwb, !=, NULL);
-	ASSERT3P(zio, !=, NULL);
-	ASSERT3U(size, !=, 0);
-
-	/*
-	 * Nothing to do if the file has been removed
-	 */
-	if (zfs_zget(zfsvfs, object, &zp) != 0)
-		return (SET_ERROR(ENOENT));
-	if (zp->z_unlinked) {
-		/*
-		 * Release the vnode asynchronously as we currently have the
-		 * txg stopped from syncing.
-		 */
-		VN_RELE_ASYNC(ZTOV(zp),
-		    dsl_pool_zrele_taskq(dmu_objset_pool(os)));
-		return (SET_ERROR(ENOENT));
-	}
-
-	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_lwb = lwb;
-	zgd->zgd_private = zp;
-
-	/*
-	 * Write records come in two flavors: immediate and indirect.
-	 * For small writes it's cheaper to store the data with the
-	 * log record (immediate); for large writes it's cheaper to
-	 * sync the data and get a pointer to it (indirect) so that
-	 * we don't have to write the data twice.
-	 */
-	if (buf != NULL) { /* immediate write */
-		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
-		    size, RL_READER);
-		/* test for truncation needs to be done while range locked */
-		if (offset >= zp->z_size) {
-			error = SET_ERROR(ENOENT);
-		} else {
-			error = dmu_read(os, object, offset, size, buf,
-			    DMU_READ_NO_PREFETCH);
-		}
-		ASSERT(error == 0 || error == ENOENT);
-	} else { /* indirect write */
-		/*
-		 * Have to lock the whole block to ensure when it's
-		 * written out and its checksum is being calculated
-		 * that no one can change the data. We need to re-check
-		 * blocksize after we get the lock in case it's changed!
-		 */
-		for (;;) {
-			uint64_t blkoff;
-			size = zp->z_blksz;
-			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
-			offset -= blkoff;
-			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
-			    offset, size, RL_READER);
-			if (zp->z_blksz == size)
-				break;
-			offset += blkoff;
-			zfs_rangelock_exit(zgd->zgd_lr);
-		}
-		/* test for truncation needs to be done while range locked */
-		if (lr->lr_offset >= zp->z_size)
-			error = SET_ERROR(ENOENT);
-#ifdef ZFS_DEBUG
-		if (zil_fault_io) {
-			error = SET_ERROR(EIO);
-			zil_fault_io = 0;
-		}
-#endif
-		if (error == 0)
-			error = dmu_buf_hold(os, object, offset, zgd, &db,
-			    DMU_READ_NO_PREFETCH);
-
-		if (error == 0) {
-			blkptr_t *bp = &lr->lr_blkptr;
-
-			zgd->zgd_db = db;
-			zgd->zgd_bp = bp;
-
-			ASSERT(db->db_offset == offset);
-			ASSERT(db->db_size == size);
-
-			error = dmu_sync(zio, lr->lr_common.lrc_txg,
-			    zfs_get_done, zgd);
-			ASSERT(error || lr->lr_length <= size);
-
-			/*
-			 * On success, we need to wait for the write I/O
-			 * initiated by dmu_sync() to complete before we can
-			 * release this dbuf.  We will finish everything up
-			 * in the zfs_get_done() callback.
-			 */
-			if (error == 0)
-				return (0);
-
-			if (error == EALREADY) {
-				lr->lr_common.lrc_txtype = TX_WRITE2;
-				/*
-				 * TX_WRITE2 relies on the data previously
-				 * written by the TX_WRITE that caused
-				 * EALREADY.  We zero out the BP because
-				 * it is the old, currently-on-disk BP,
-				 * so there's no need to zio_flush() its
-				 * vdevs (flushing would needlesly hurt
-				 * performance, and doesn't work on
-				 * indirect vdevs).
-				 */
-				zgd->zgd_bp = NULL;
-				BP_ZERO(bp);
-				error = 0;
-			}
-		}
-	}
-
-	zfs_get_done(zgd, error);
+	vnode_t *vp = ZTOV(zp);
+	objset_t *os = ITOZSB(vp)->z_os;
 
-	return (error);
+	VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
 }
 
 static int
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 5a56534b9396..056b8900a41f 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -381,12 +381,6 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len,
 	return (error);
 }
 
-/*
- * Drop a reference on the passed inode asynchronously. This ensures
- * that the caller will never drop the last reference on an inode in
- * the current context. Doing so while holding open a tx could result
- * in a deadlock if iput_final() re-enters the filesystem code.
- */
 void
 zfs_zrele_async(znode_t *zp)
 {
@@ -403,159 +397,6 @@ zfs_zrele_async(znode_t *zp)
 		zrele(zp);
 }
 
-/* ARGSUSED */
-static void
-zfs_get_done(zgd_t *zgd, int error)
-{
-	znode_t *zp = zgd->zgd_private;
-
-	if (zgd->zgd_db)
-		dmu_buf_rele(zgd->zgd_db, zgd);
-
-	zfs_rangelock_exit(zgd->zgd_lr);
-
-	/*
-	 * Release the vnode asynchronously as we currently have the
-	 * txg stopped from syncing.
-	 */
-	zfs_zrele_async(zp);
-
-	kmem_free(zgd, sizeof (zgd_t));
-}
-
-#ifdef ZFS_DEBUG
-static int zil_fault_io = 0;
-#endif
-
-/*
- * Get data to generate a TX_WRITE intent log record.
- */
-int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
-{
-	zfsvfs_t *zfsvfs = arg;
-	objset_t *os = zfsvfs->z_os;
-	znode_t *zp;
-	uint64_t object = lr->lr_foid;
-	uint64_t offset = lr->lr_offset;
-	uint64_t size = lr->lr_length;
-	dmu_buf_t *db;
-	zgd_t *zgd;
-	int error = 0;
-
-	ASSERT3P(lwb, !=, NULL);
-	ASSERT3P(zio, !=, NULL);
-	ASSERT3U(size, !=, 0);
-
-	/*
-	 * Nothing to do if the file has been removed
-	 */
-	if (zfs_zget(zfsvfs, object, &zp) != 0)
-		return (SET_ERROR(ENOENT));
-	if (zp->z_unlinked) {
-		/*
-		 * Release the vnode asynchronously as we currently have the
-		 * txg stopped from syncing.
-		 */
-		zfs_zrele_async(zp);
-		return (SET_ERROR(ENOENT));
-	}
-
-	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
-	zgd->zgd_lwb = lwb;
-	zgd->zgd_private = zp;
-
-	/*
-	 * Write records come in two flavors: immediate and indirect.
-	 * For small writes it's cheaper to store the data with the
-	 * log record (immediate); for large writes it's cheaper to
-	 * sync the data and get a pointer to it (indirect) so that
-	 * we don't have to write the data twice.
-	 */
-	if (buf != NULL) { /* immediate write */
-		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
-		    offset, size, RL_READER);
-		/* test for truncation needs to be done while range locked */
-		if (offset >= zp->z_size) {
-			error = SET_ERROR(ENOENT);
-		} else {
-			error = dmu_read(os, object, offset, size, buf,
-			    DMU_READ_NO_PREFETCH);
-		}
-		ASSERT(error == 0 || error == ENOENT);
-	} else { /* indirect write */
-		/*
-		 * Have to lock the whole block to ensure when it's
-		 * written out and its checksum is being calculated
-		 * that no one can change the data. We need to re-check
-		 * blocksize after we get the lock in case it's changed!
-		 */
-		for (;;) {
-			uint64_t blkoff;
-			size = zp->z_blksz;
-			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
-			offset -= blkoff;
-			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
-			    offset, size, RL_READER);
-			if (zp->z_blksz == size)
-				break;
-			offset += blkoff;
-			zfs_rangelock_exit(zgd->zgd_lr);
-		}
-		/* test for truncation needs to be done while range locked */
-		if (lr->lr_offset >= zp->z_size)
-			error = SET_ERROR(ENOENT);
-#ifdef ZFS_DEBUG
-		if (zil_fault_io) {
-			error = SET_ERROR(EIO);
-			zil_fault_io = 0;
-		}
-#endif
-		if (error == 0)
-			error = dmu_buf_hold(os, object, offset, zgd, &db,
-			    DMU_READ_NO_PREFETCH);
-
-		if (error == 0) {
-			blkptr_t *bp = &lr->lr_blkptr;
-
-			zgd->zgd_db = db;
-			zgd->zgd_bp = bp;
-
-			ASSERT(db->db_offset == offset);
-			ASSERT(db->db_size == size);
-
-			error = dmu_sync(zio, lr->lr_common.lrc_txg,
-			    zfs_get_done, zgd);
-			ASSERT(error || lr->lr_length <= size);
-
-			/*
-			 * On success, we need to wait for the write I/O
-			 * initiated by dmu_sync() to complete before we can
-			 * release this dbuf.  We will finish everything up
-			 * in the zfs_get_done() callback.
-			 */
-			if (error == 0)
-				return (0);
-
-			if (error == EALREADY) {
-				lr->lr_common.lrc_txtype = TX_WRITE2;
-				/*
-				 * TX_WRITE2 relies on the data previously
-				 * written by the TX_WRITE that caused
-				 * EALREADY.  We zero out the BP because
-				 * it is the old, currently-on-disk BP.
-				 */
-				zgd->zgd_bp = NULL;
-				BP_ZERO(bp);
-				error = 0;
-			}
-		}
-	}
-
-	zfs_get_done(zgd, error);
-
-	return (error);
-}
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index a4237f8f3463..c2a2d98d17c8 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -52,6 +52,8 @@
 #include <sys/policy.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
 
 
 static ulong_t zfs_fsync_sync_cnt = 4;
@@ -717,6 +719,163 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
 	return (error);
 }
 
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+static void zfs_get_done(zgd_t *zgd, int error);
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+	zfsvfs_t *zfsvfs = arg;
+	objset_t *os = zfsvfs->z_os;
+	znode_t *zp;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error = 0;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	/*
+	 * Nothing to do if the file has been removed
+	 */
+	if (zfs_zget(zfsvfs, object, &zp) != 0)
+		return (SET_ERROR(ENOENT));
+	if (zp->z_unlinked) {
+		/*
+		 * Release the vnode asynchronously as we currently have the
+		 * txg stopped from syncing.
+		 */
+		zfs_zrele_async(zp);
+		return (SET_ERROR(ENOENT));
+	}
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+	zgd->zgd_private = zp;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+		    offset, size, RL_READER);
+		/* test for truncation needs to be done while range locked */
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
+		}
+		ASSERT(error == 0 || error == ENOENT);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+		 * that no one can change the data. We need to re-check
+		 * blocksize after we get the lock in case it's changed!
+		 */
+		for (;;) {
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+			offset -= blkoff;
+			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
+			if (zp->z_blksz == size)
+				break;
+			offset += blkoff;
+			zfs_rangelock_exit(zgd->zgd_lr);
+		}
+		/* test for truncation needs to be done while range locked */
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+		if (zil_fault_io) {
+			error = SET_ERROR(EIO);
+			zil_fault_io = 0;
+		}
+#endif
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= size);
+
+			/*
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf.  We will finish everything up
+			 * in the zfs_get_done() callback.
+			 */
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY.  We zero out the BP because
+				 * it is the old, currently-on-disk BP.
+				 */
+				zgd->zgd_bp = NULL;
+				BP_ZERO(bp);
+				error = 0;
+			}
+		}
+	}
+
+	zfs_get_done(zgd, error);
+
+	return (error);
+}
+
+
+/* ARGSUSED */
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+	znode_t *zp = zgd->zgd_private;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	zfs_rangelock_exit(zgd->zgd_lr);
+
+	/*
+	 * Release the vnode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+	zfs_zrele_async(zp);
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
 EXPORT_SYMBOL(zfs_access);
 EXPORT_SYMBOL(zfs_fsync);
 EXPORT_SYMBOL(zfs_holey);