Skip to content

Commit

Permalink
zfs_rename: pick up and finish renameat2 flags support
Browse files Browse the repository at this point in the history
warning: the resolution of one merge is uncertain

Removing new txtypes in favor of compound ZIL operations, see comment in
module/zfs/zfs_log.c.

Other notable changes:

- unlock after the inodes are updated
- pass whiteout znode pointer to zfs_log_rename_whiteout
- don't wrap code directly in ASSERT*(), as it turns into a no-op on
  non-debug builds
- update configure time tests for rename2 to support kernels
  from 3.5 to 4.8

Fixes openzfs#2256
Fixes openzfs#8648
Fixes openzfs#8774

Signed-off-by: Pavel Snajdr <snajpa@snajpa.net>
  • Loading branch information
snajpa authored and wuxxin committed Apr 23, 2020
1 parent 2b15a3c commit f12d717
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 87 deletions.
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ CONTRIBUTORS:
Paul Dagnelie <pcd@delphix.com>
Paul Zuchowski <pzuchowski@datto.com>
Pavel Boldin <boldin.pavel@gmail.com>
Pavel Snajdr <snajpa@snajpa.net>
Pavel Zakharov <pavel.zakharov@delphix.com>
Pawel Jakub Dawidek <pjd@FreeBSD.org>
Pedro Giffuni <pfg@freebsd.org>
Expand Down
31 changes: 26 additions & 5 deletions config/kernel-rename.m4
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ dnl # 4.9 API change,
dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants
dnl # flags.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS], [
ZFS_LINUX_TEST_SRC([inode_operations_rename], [
AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [
ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [
#include <linux/fs.h>
int rename_fn(struct inode *sip, struct dentry *sdp,
struct inode *tip, struct dentry *tdp,
Expand All @@ -15,15 +15,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS], [
.rename = rename_fn,
};
],[])
ZFS_LINUX_TEST_SRC([inode_operations_rename], [
#include <linux/fs.h>
int rename2_fn(struct inode *sip, struct dentry *sdp,
struct inode *tip, struct dentry *tdp,
unsigned int flags) { return 0; }
static const struct inode_operations
iops __attribute__ ((unused)) = {
.rename2 = rename2_fn,
};
],[])
])

AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [
AC_DEFUN([ZFS_AC_KERNEL_RENAME], [
AC_MSG_CHECKING([whether iops->rename() wants flags])
ZFS_LINUX_TEST_RESULT([inode_operations_rename], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
[iops->rename() wants flags])
AC_DEFINE(HAVE_RENAME2, 1, [iops->rename2() exists])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether iops->rename() wants flags])
ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
[iops->rename() wants flags])
],[
AC_MSG_RESULT(no)
])
])
])

4 changes: 2 additions & 2 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_KUIDGID_T
ZFS_AC_KERNEL_SRC_KUID_HELPERS
ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST
ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS
ZFS_AC_KERNEL_SRC_RENAME
ZFS_AC_KERNEL_SRC_CURRENT_TIME
ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES
ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL
Expand Down Expand Up @@ -250,7 +250,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_KUIDGID_T
ZFS_AC_KERNEL_KUID_HELPERS
ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST
ZFS_AC_KERNEL_RENAME_WANTS_FLAGS
ZFS_AC_KERNEL_RENAME
ZFS_AC_KERNEL_CURRENT_TIME
ZFS_AC_KERNEL_USERNS_CAPABILITIES
ZFS_AC_KERNEL_IN_COMPAT_SYSCALL
Expand Down
7 changes: 7 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,13 @@ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name);
extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *dzp, znode_t *zp, char *name, char *link);
/*
 * Logging entry points for the renameat2(2) RENAME_EXCHANGE and
 * RENAME_WHITEOUT variants.  Both take the same source/target directory
 * and name arguments as zfs_log_rename().  The whiteout variant also takes
 * the whiteout znode (wzp) plus the vsecp/fuidp/vap arguments needed to log
 * the creation of that znode.
 */
extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx,
uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp,
char *dname, znode_t *szp);
extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx,
uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp,
char *dname, znode_t *szp, znode_t *wzp, vsecattr_t *vsecp,
zfs_fuid_info_t *fuidp, vattr_t *vap);
extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
Expand Down
4 changes: 1 addition & 3 deletions include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,7 @@ typedef enum zil_create {
#define TX_MKDIR_ATTR 18 /* mkdir with attr */
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
#define TX_WRITE2 20 /* dmu_sync EALREADY write */
#define TX_EXCHANGE 21 /* Exchange two paths */
#define TX_WHITEOUT 22 /* Rename a file, leaving a whiteout */
#define TX_MAX_TYPE 23 /* Max transaction type */
#define TX_MAX_TYPE 21 /* Max transaction type */

/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
Expand Down
3 changes: 2 additions & 1 deletion module/zfs/zfs_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -1015,7 +1015,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
}

/* The only error is !zfs_dirempty() and we checked earlier. */
ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0);
error = zfs_drop_nlink_locked(zp, tx, &unlinked);
ASSERT3U(error, ==, 0);
mutex_exit(&zp->z_lock);
} else {
error = zfs_dropname(dl, zp, dzp, tx, flag);
Expand Down
101 changes: 98 additions & 3 deletions module/zfs/zfs_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,9 +474,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}

/*
* Handles TX_{RENAME,EXCHANGE,WHITEOUT} transactions. They all have the same
* underlying structure (lr_rename_t) but have different txtypes to indicate
* different renameat2(2) flags.
* Handles TX_RENAME transactions.
*/
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
Expand All @@ -490,6 +488,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
if (zil_replaying(zilog, tx))
return;

txtype |= TX_RENAME;
itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
lr = (lr_rename_t *)&itx->itx_lr;
lr->lr_sdoid = sdzp->z_id;
Expand All @@ -501,6 +500,102 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
zil_itx_assign(zilog, itx, tx);
}

/*
* At the moment, only Linux supports the renameat2 variant of renameat, which
* adds three new flags of interest for us:
*
* RENAME_NOREPLACE: if the target name already exists at the time of the
* call, don't replace it and return an error
* RENAME_EXCHANGE: atomically swap the two names on the filesystem
* RENAME_WHITEOUT: creates a whiteout inode in place of renamed file as
* an atomic operation
*
* Ideally, these operations should be represented as new ZFS Intent Log
* txtypes, which would mandate a new ZFS feature flag due to the on-disk
* format change. One would then use spa_feature_incr/decr functions to
* indicate that the on-disk log contains these new txtypes. However, these
* functions are only supposed to be called from the txg syncing context.
*
* This means that we would need to force out an in-progress txg to disk and
* start a new one before writing any ZIL records. This would ensure that
* previous versions of ZFS which do not support these log txtypes would
* never encounter them during ZIL replay. Doing this would hurt performance.
*
* Alternatively, we could just activate the feature on a pool when these
* renameat2 flags get first used and leave it at that. This would render
* the pool read-only importable on implementations without the new feature
* flag, even when no new txtypes were present on-disk. This could be almost
* all of the time, so it'd be a shame to render the pool read-only on
* non-Linux platforms.
*
* Instead, we choose to rely on the fact that the ZIL is replayed in single-
* threaded mode before the dataset is mounted. This means we can represent
* the otherwise atomic operations as a series of plain good old txtypes
* known to all current OpenZFS implementations. To do that, we use the
* following functions (at least until more platforms implement renameat2).
*
* zfs_log_rename_exchange
* zfs_log_rename_whiteout
*/

/*
 * Log a RENAME_EXCHANGE of sdzp/sname and tdzp/dname as three plain rename
 * records routed through a temporary name in the target directory:
 *
 *	dst -> tmp, src -> dst, tmp -> src
 *
 * The temporary name is "<dname>.zfs_renameat2_emul_" followed by up to 32
 * random hex digits (fewer only if the buffer would otherwise overflow).
 * A dirent lock is taken on that name in tdzp with ZNEW and held until all
 * three records have been handed to zfs_log_rename().
 *
 * txtype is passed unchanged to every zfs_log_rename() call.  If a
 * temporary name cannot be locked after ten attempts, VERIFY3U() fires.
 */
void
zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
	zfs_dirlock_t *tmpdl;
	znode_t *tmpzp = NULL;
	char *tmpname;
	int retries = 0;
	int pos;
	int error;

	tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);

	/*
	 * To represent atomic rename with old non-atomic operations, we need
	 * a temporary new name; so we try picking a name until we succeed,
	 * then we hold a dirent lock for that temp name until the final itx
	 * gets queued.
	 */
	do {
		retries++;
		pos = snprintf(tmpname, MAXPATHLEN, "%s.zfs_renameat2_emul_",
		    dname);

		/*
		 * snprintf() returns the untruncated length, so stop
		 * appending once the buffer is full and always pass the
		 * space that is actually left rather than the whole size.
		 */
		for (int i = 0; i < 16 && pos < MAXPATHLEN - 1; i++) {
			/*
			 * Exactly one byte is requested, so store it in a
			 * one-byte object; writing it into an int would
			 * land in the high byte on big-endian hosts.
			 */
			uint8_t r = 0xFF;

			(void) random_get_pseudo_bytes(&r, sizeof (r));
			pos += snprintf(tmpname + pos, MAXPATHLEN - pos,
			    "%02x", (unsigned int)r);
		}

		error = zfs_dirent_lock(&tmpdl, tdzp, tmpname,
		    &tmpzp, ZNEW, NULL, NULL);

		/* Only a failed attempt counts against the retry budget. */
		if (error != 0)
			VERIFY3U(retries, <, 10);
	} while (error != 0);

	/* dst -> tmp */
	zfs_log_rename(zilog, tx, txtype, tdzp, dname, tdzp, tmpname, szp);
	/* src -> dst */
	zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);
	/* tmp -> src */
	zfs_log_rename(zilog, tx, txtype, tdzp, tmpname, sdzp, sname, szp);

	zfs_dirent_unlock(tmpdl);
	kmem_free(tmpname, MAXPATHLEN);
}

/*
 * Log a RENAME_WHITEOUT as two ordinary records: a rename of sdzp/sname to
 * tdzp/dname, followed by a create of the whiteout znode (wzp) under the
 * source name.  See the comment above zfs_log_rename_exchange for why
 * existing txtypes are used.
 */
void
zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
    znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp,
    znode_t *wzp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp, vattr_t *vap)
{
	/* The create record carries the caller's txtype bits plus TX_CREATE. */
	uint64_t create_txtype = txtype | TX_CREATE;

	/* First record: move the source entry to the target name. */
	zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp);

	/* Second record: create the whiteout in the source directory. */
	zfs_log_create(zilog, tx, create_txtype, sdzp, wzp, sname, vsecp,
	    fuidp, vap);
}

/*
* zfs_log_write() handles TX_WRITE transactions. The specified callback is
* called as soon as the write is on stable storage (be it via a DMU sync or a
Expand Down
23 changes: 2 additions & 21 deletions module/zfs/zfs_replay.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,14 +625,15 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
}

static int
_zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg)
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
zfsvfs_t *zfsvfs = arg1;
lr_rename_t *lr = arg2;
char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
char *tname = sname + strlen(sname) + 1;
znode_t *sdzp, *tdzp;
int error;
int vflg = 0;

if (byteswap)
byteswap_uint64_array(lr, sizeof (*lr));
Expand All @@ -656,24 +657,6 @@ _zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg)
return (error);
}

static int
zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
{
return (_zfs_replay_renameat2(arg1, arg2, byteswap, 0));
}

static int
zfs_replay_exchange(void *arg1, void *arg2, boolean_t byteswap)
{
return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_EXCHANGE));
}

static int
zfs_replay_whiteout(void *arg1, void *arg2, boolean_t byteswap)
{
return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_WHITEOUT));
}

static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
Expand Down Expand Up @@ -998,6 +981,4 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
zfs_replay_create, /* TX_MKDIR_ATTR */
zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
zfs_replay_write2, /* TX_WRITE2 */
zfs_replay_exchange, /* TX_EXCHANGE */
zfs_replay_whiteout, /* TX_WHITEOUT */
};
Loading

0 comments on commit f12d717

Please sign in to comment.