Skip to content

Commit

Permalink
zfs_rename: support RENAME_* flags
Browse files Browse the repository at this point in the history
Implement support for Linux's RENAME_* flags (for renameat2). Aside from
being quite useful for userspace (providing race-free ways to exchange
paths and implement mv --no-clobber), they are used by overlayfs and are
thus required in order to use overlayfs-on-ZFS.

In order for us to represent the new renameat2(2) flags in the ZIL, we
create two new transaction types for the two flags which need
transactional-level support (RENAME_EXCHANGE and RENAME_WHITEOUT).
RENAME_NOREPLACE does not need any ZIL support because we know that if
the operation succeeded before creating the ZIL entry, there was no file
to be clobbered and thus it can be treated as a regular TX_RENAME.

Cc: Pavel Snajdr <snajpa@snajpa.net>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
  • Loading branch information
cyphar authored and Ryan Moeller committed Oct 21, 2022
1 parent d524f3c commit 4299712
Show file tree
Hide file tree
Showing 33 changed files with 929 additions and 71 deletions.
2 changes: 2 additions & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ CONTRIBUTORS:
Alec Salazar <alec.j.salazar@gmail.com>
Alejandro R. Sedeño <asedeno@mit.edu>
Alek Pinchuk <alek@nexenta.com>
Aleksa Sarai <cyphar@cyphar.com>
Alex Braunegg <alex.braunegg@gmail.com>
Alex McWhirter <alexmcwhirter@triadic.us>
Alex Reece <alex@delphix.com>
Expand Down Expand Up @@ -236,6 +237,7 @@ CONTRIBUTORS:
Paul Dagnelie <pcd@delphix.com>
Paul Zuchowski <pzuchowski@datto.com>
Pavel Boldin <boldin.pavel@gmail.com>
Pavel Snajdr <snajpa@snajpa.net>
Pavel Zakharov <pavel.zakharov@delphix.com>
Pawel Jakub Dawidek <pjd@FreeBSD.org>
Pedro Giffuni <pfg@freebsd.org>
Expand Down
10 changes: 10 additions & 0 deletions cmd/zdb/zdb_il.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,14 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, const void *arg)
(void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix,
(u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid);
(void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm);
switch (txtype) {
case TX_RENAME_EXCHANGE:
(void) printf("%sflags RENAME_EXCHANGE\n", tab_prefix);
break;
case TX_RENAME_WHITEOUT:
(void) printf("%sflags RENAME_WHITEOUT\n", tab_prefix);
break;
}
}

static int
Expand Down Expand Up @@ -330,6 +338,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
{.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "},
{.zri_print = zil_prt_rec_setsaxattr,
.zri_name = "TX_SETSAXATTR "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "},
{.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "},
};

static int
Expand Down
2 changes: 2 additions & 0 deletions cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -2368,6 +2368,8 @@ static zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
NULL, /* TX_MKDIR_ACL_ATTR */
NULL, /* TX_WRITE2 */
NULL, /* TX_SETSAXATTR */
NULL, /* TX_RENAME_EXCHANGE */
NULL, /* TX_RENAME_WHITEOUT */
};

/*
Expand Down
71 changes: 63 additions & 8 deletions config/kernel-rename.m4
Original file line number Diff line number Diff line change
@@ -1,8 +1,28 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [
dnl #
dnl # 3.15 (to 4.9) API change,
dnl #
dnl # A new version of iops->rename() was added (rename2) that takes a flag
dnl # argument (to support renameat2). However this separate function was
dnl # merged back into iops->rename() in Linux 4.9.
dnl #
ZFS_LINUX_TEST_SRC([inode_operations_rename2], [
#include <linux/fs.h>
int rename2_fn(struct inode *sip, struct dentry *sdp,
struct inode *tip, struct dentry *tdp,
unsigned int flags) { return 0; }
static const struct inode_operations
iops __attribute__ ((unused)) = {
.rename2 = rename2_fn,
};
],[])
dnl #
dnl # 4.9 API change,
dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants
dnl # flags.
dnl #
dnl # iops->rename2() merged into iops->rename(), and iops->rename() now
dnl # wants flags.
dnl #
ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [
#include <linux/fs.h>
Expand All @@ -16,11 +36,29 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [
};
],[])
dnl #
dnl # EL7 compatibility
dnl #
dnl # EL7 has backported renameat2 support, but it's done by defining a
dnl # separate iops wrapper structure that takes the .rename2 function.
dnl #
ZFS_LINUX_TEST_SRC([dir_inode_operations_wrapper_rename2], [
#include <linux/fs.h>
int rename2_fn(struct inode *sip, struct dentry *sdp,
struct inode *tip, struct dentry *tdp,
unsigned int flags) { return 0; }
static const struct inode_operations_wrapper
iops __attribute__ ((unused)) = {
.rename2 = rename2_fn,
};
],[])
dnl #
dnl # 5.12 API change,
dnl #
dnl # Linux 5.12 introduced passing struct user_namespace* as the first argument
dnl # of the rename() and other inode_operations members.
dnl # Linux 5.12 introduced passing struct user_namespace* as the first
dnl # argument of the rename() and other inode_operations members.
dnl #
ZFS_LINUX_TEST_SRC([inode_operations_rename_userns], [
#include <linux/fs.h>
Expand All @@ -44,13 +82,30 @@ AC_DEFUN([ZFS_AC_KERNEL_RENAME], [
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether iop->rename() wants flags])
ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [
AC_MSG_CHECKING([whether iops->rename2() exists])
ZFS_LINUX_TEST_RESULT([inode_operations_rename2], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
[iops->rename() wants flags])
AC_DEFINE(HAVE_RENAME2, 1, [iops->rename2() exists])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether iops->rename() wants flags])
ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
[iops->rename() wants flags])
],[
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether struct inode_operations_wrapper takes .rename2()])
ZFS_LINUX_TEST_RESULT([dir_inode_operations_wrapper_rename2], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RENAME2_OPERATIONS_WRAPPER, 1,
[struct inode_operations_wrapper takes .rename2()])
],[
AC_MSG_RESULT(no)
])
])
])
])
])
3 changes: 2 additions & 1 deletion include/os/freebsd/zfs/sys/zfs_vnops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ extern int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd,
extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr,
zuserns_t *mnt_ns);
extern int zfs_rename(znode_t *sdzp, const char *snm, znode_t *tdzp,
const char *tnm, cred_t *cr, int flags, zuserns_t *mnt_ns);
const char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap,
zuserns_t *mnt_ns);
extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
const char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns);
extern int zfs_link(znode_t *tdzp, znode_t *sp,
Expand Down
13 changes: 13 additions & 0 deletions include/os/linux/kernel/linux/vfs_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,19 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid)
ip->i_gid = make_kgid(kcred->user_ns, gid);
}

/*
 * 3.15 API change
 *
 * Fallback definitions of the renameat2() RENAME_* flags. Each flag is
 * defined here only when no earlier header has already defined it, so a
 * definition supplied by the kernel headers always takes precedence.
 */
#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */
#endif
#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */
#endif
#ifndef RENAME_WHITEOUT
#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */
#endif

/*
* 4.9 API change
*/
Expand Down
10 changes: 10 additions & 0 deletions include/os/linux/spl/sys/sysmacros.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ extern uint32_t zone_get_hostid(void *zone);
extern void spl_setup(void);
extern void spl_cleanup(void);

/*
 * Minimal stand-in for libc's makedev(), which is not available to the
 * kernel module. The major number is truncated to its low 12 bits (0-4095)
 * and the minor number to its low 8 bits (0-255); the major occupies the
 * bits directly above the minor.
 */
static inline dev_t
makedev(unsigned int major, unsigned int minor)
{
	const unsigned int major_part = (major & 0xFFF) << 8;
	const unsigned int minor_part = minor & 0xFF;

	return (major_part | minor_part);
}

#define highbit(x) __fls(x)
#define lowbit(x) __ffs(x)

Expand Down
3 changes: 2 additions & 1 deletion include/os/linux/zfs/sys/zfs_vnops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ extern int zfs_getattr_fast(struct user_namespace *, struct inode *ip,
extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr,
zuserns_t *mnt_ns);
extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp,
char *tnm, cred_t *cr, int flags, zuserns_t *mnt_ns);
char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap,
zuserns_t *mnt_ns);
extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap,
char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns);
extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr);
Expand Down
4 changes: 4 additions & 0 deletions include/os/linux/zfs/sys/zpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,11 @@ extern void zpl_vap_init(vattr_t *vap, struct inode *dir,
umode_t mode, cred_t *cr, zuserns_t *mnt_ns);

extern const struct inode_operations zpl_inode_operations;
/*
 * The type of the directory operations table depends on how the kernel
 * exposes .rename2(). When configure defines
 * HAVE_RENAME2_OPERATIONS_WRAPPER (struct inode_operations_wrapper takes
 * .rename2(); the configure check describes this as the EL7 backport of
 * renameat2 support), the table is declared with the wrapper type.
 * Otherwise it is a plain struct inode_operations.
 */
#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
extern const struct inode_operations_wrapper zpl_dir_inode_operations;
#else
extern const struct inode_operations zpl_dir_inode_operations;
#endif
extern const struct inode_operations zpl_symlink_inode_operations;
extern const struct inode_operations zpl_special_inode_operations;

Expand Down
6 changes: 6 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,12 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
znode_t *sdzp, const char *sname, znode_t *tdzp, const char *dname,
znode_t *szp);
/*
 * Logging entry points for renames performed with renameat2() flags. Both
 * take the same arguments as zfs_log_rename(); the whiteout variant takes
 * one additional znode, wzp.
 * NOTE(review): presumably these emit TX_RENAME_EXCHANGE and
 * TX_RENAME_WHITEOUT records respectively, and wzp is the newly created
 * whiteout file -- confirm against the definitions.
 */
extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx,
uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp,
const char *dname, znode_t *szp);
extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx,
uint64_t txtype, znode_t *sdzp, const char *sname, znode_t *tdzp,
const char *dname, znode_t *szp, znode_t *wzp);
extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t len, int ioflag,
zil_callback_t callback, void *callback_data);
Expand Down
17 changes: 16 additions & 1 deletion include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,9 @@ typedef enum zil_create {
#define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */
#define TX_WRITE2 20 /* dmu_sync EALREADY write */
#define TX_SETSAXATTR 21 /* Set sa xattrs on file */
#define TX_MAX_TYPE 22 /* Max transaction type */
#define TX_RENAME_EXCHANGE 22 /* Atomic swap via renameat2 */
#define TX_RENAME_WHITEOUT 23 /* Atomic whiteout via renameat2 */
#define TX_MAX_TYPE 24 /* Max transaction type */

/*
* The transactions for mkdir, symlink, remove, rmdir, link, and rename
Expand Down Expand Up @@ -317,6 +319,19 @@ typedef struct {
/* 2 strings: names of source and destination follow this */
} lr_rename_t;

/*
 * Log record for a rename that also creates a whiteout file: the common
 * lr_rename_t portion followed by the attributes of the whiteout object.
 * NOTE(review): presumably the record body used for TX_RENAME_WHITEOUT --
 * confirm against the code that writes and replays it.
 */
typedef struct {
lr_rename_t lr_rename; /* common rename portion */
/* members related to the whiteout file (based on lr_create_t) */
uint64_t lr_wfoid; /* obj id of the new whiteout file */
uint64_t lr_wmode; /* mode of object */
uint64_t lr_wuid; /* uid of whiteout */
uint64_t lr_wgid; /* gid of whiteout */
uint64_t lr_wgen; /* generation (txg of creation) */
uint64_t lr_wcrtime[2]; /* creation time */
uint64_t lr_wrdev; /* always makedev(0, 0) */
/* 2 strings: names of source and destination follow this */
} lr_rename_whiteout_t;

typedef struct {
lr_t lr_common; /* common portion of log record */
uint64_t lr_foid; /* file object to write */
Expand Down
5 changes: 4 additions & 1 deletion module/os/freebsd/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -3420,14 +3420,17 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,

int
zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
cred_t *cr, int flags, zuserns_t *mnt_ns)
cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns)
{
struct componentname scn, tcn;
vnode_t *sdvp, *tdvp;
vnode_t *svp, *tvp;
int error;
svp = tvp = NULL;

if (rflags != 0 || wo_vap != NULL)
return (SET_ERROR(EINVAL));

sdvp = ZTOV(sdzp);
tdvp = ZTOV(tdzp);
error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
Expand Down
3 changes: 2 additions & 1 deletion module/os/linux/zfs/zfs_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -1035,7 +1035,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
}

/* The only error is !zfs_dirempty() and we checked earlier. */
ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0);
error = zfs_drop_nlink_locked(zp, tx, &unlinked);
ASSERT3U(error, ==, 0);
mutex_exit(&zp->z_lock);
} else {
error = zfs_dropname(dl, zp, dzp, tx, flag);
Expand Down
Loading

0 comments on commit 4299712

Please sign in to comment.