Skip to content

Commit

Permalink
Fix atime handling and relatime
Browse files Browse the repository at this point in the history
The problem for atime:

We have 3 places for atime: inode->i_atime, znode->z_atime and SA. And its
handling is a mess. A huge part of mess regarding atime comes from
zfs_tstamp_update_setup, zfs_inode_update, and zfs_getattr, which behave
inconsistently with those three values.

zfs_tstamp_update_setup clears z_atime_dirty unconditionally as long as you
don't pass ATTR_ATIME. Which means every write(2) operation which only updates
ctime and mtime will cause atime changes to not be written to disk.

Also zfs_inode_update from write(2) will replace inode->i_atime with what's
inside SA(stale). But doesn't touch z_atime. So after read(2) and write(2).
You'll have i_atime(stale), z_atime(new), SA(stale) and z_atime_dirty=0.

Now, if you do stat(2), zfs_getattr will actually replace i_atime with what's
inside, z_atime. So you will have now you'll have i_atime(new), z_atime(new),
SA(stale) and z_atime_dirty=0. These will all gone after umount. And you'll
leave with a stale atime.

The problem for relatime:

We do have a relatime config inside ZFS dataset, but how it should interact
with the mount flag MS_RELATIME is not well defined. It seems it wanted
relatime mount option to override the dataset config by showing it as
temporary in `zfs get`. But at the same time, `zfs set relatime=on|off` would
also seems to want to override the mount option. Not to mention that
MS_RELATIME flag is actually never passed into ZFS, so it never really worked.

How Linux handles atime:

The Linux kernel actually handles atime completely in VFS, except for writing
it to disk. So if we remove the atime handling in ZFS, things would just work,
no matter it's strictatime, relatime, noatime, or even O_NOATIME. And whenever
VFS updates the i_atime, it will notify the underlying filesystem via
sb->dirty_inode().

And also there's one thing to note about atime flags like MS_RELATIME and
other flags like MS_NODEV, etc. They are mount point flags rather than
filesystem(sb) flags. Since native linux filesystem can be mounted at multiple
places at the same time, they can all have different atime settings. So these
flags are never passed down to filesystem drivers.

What this patch tries to do:

We remove znode->z_atime, since we won't gain anything from it. We remove most
of the atime handling and leave it to VFS. The only thing we do with atime is
to write it when dirty_inode() or setattr() is called. We also add
file_accessed() in zpl_read() since it's not provided in vfs_read().

After this patch, only the MS_RELATIME flag will have effect. The setting in
dataset won't do anything. We will make zfstuil to mount ZFS with MS_RELATIME
set according to the setting in dataset in future patch.

Signed-off-by: Chunwei Chen <david.chen@osnexus.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue openzfs#4482
  • Loading branch information
Chunwei Chen committed Nov 11, 2016
1 parent ee49e20 commit cb0b802
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 108 deletions.
7 changes: 2 additions & 5 deletions include/sys/trace_acl.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__field(uint64_t, z_mapcnt)
__field(uint64_t, z_gen)
__field(uint64_t, z_size)
__array(uint64_t, z_atime, 2)
__field(uint64_t, z_links)
__field(uint64_t, z_pflags)
__field(uint64_t, z_uid)
Expand Down Expand Up @@ -94,8 +93,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__entry->z_mapcnt = zn->z_mapcnt;
__entry->z_gen = zn->z_gen;
__entry->z_size = zn->z_size;
__entry->z_atime[0] = zn->z_atime[0];
__entry->z_atime[1] = zn->z_atime[1];
__entry->z_links = zn->z_links;
__entry->z_pflags = zn->z_pflags;
__entry->z_uid = zn->z_uid;
Expand Down Expand Up @@ -124,7 +121,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
),
TP_printk("zn { id %llu unlinked %u atime_dirty %u "
"zn_prefetch %u moved %u blksz %u seq %u "
"mapcnt %llu gen %llu size %llu atime 0x%llx:0x%llx "
"mapcnt %llu gen %llu size %llu "
"links %llu pflags %llu uid %llu gid %llu "
"sync_cnt %u mode 0x%x is_sa %d "
"is_mapped %d is_ctldir %d is_stale %d inode { "
Expand All @@ -134,7 +131,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty,
__entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz,
__entry->z_seq, __entry->z_mapcnt, __entry->z_gen,
__entry->z_size, __entry->z_atime[0], __entry->z_atime[1],
__entry->z_size,
__entry->z_links, __entry->z_pflags, __entry->z_uid,
__entry->z_gid, __entry->z_sync_cnt, __entry->z_mode,
__entry->z_is_sa, __entry->z_is_mapped,
Expand Down
7 changes: 1 addition & 6 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ typedef struct znode {
uint64_t z_mapcnt; /* number of pages mapped to file */
uint64_t z_gen; /* generation (cached) */
uint64_t z_size; /* file size (cached) */
uint64_t z_atime[2]; /* atime (cached) */
uint64_t z_links; /* file links (cached) */
uint64_t z_pflags; /* pflags (cached) */
uint64_t z_uid; /* uid fuid (cached) */
Expand Down Expand Up @@ -304,16 +303,12 @@ extern unsigned int zfs_object_mutex_size;
#define STATE_CHANGED (ATTR_CTIME)
#define CONTENT_MODIFIED (ATTR_MTIME | ATTR_CTIME)

#define ZFS_ACCESSTIME_STAMP(zsb, zp) \
if ((zsb)->z_atime && !(zfs_is_readonly(zsb))) \
zfs_tstamp_update_setup(zp, ACCESSED, NULL, NULL, B_FALSE);

extern int zfs_init_fs(zfs_sb_t *, znode_t **);
extern void zfs_set_dataprop(objset_t *);
extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
dmu_tx_t *tx);
extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2],
uint64_t [2], boolean_t);
uint64_t [2]);
extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
extern void zfs_znode_init(void);
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/zfs_acl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1457,7 +1457,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
zp->z_pflags |= ZFS_ACL_TRIVIAL;

zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime, B_TRUE);
zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
}

Expand Down
2 changes: 0 additions & 2 deletions module/zfs/zfs_ctldir.c
Original file line number Diff line number Diff line change
Expand Up @@ -478,8 +478,6 @@ zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
zp->z_mapcnt = 0;
zp->z_gen = 0;
zp->z_size = 0;
zp->z_atime[0] = 0;
zp->z_atime[1] = 0;
zp->z_links = 0;
zp->z_pflags = 0;
zp->z_uid = 0;
Expand Down
10 changes: 5 additions & 5 deletions module/zfs/zfs_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
ctime, sizeof (ctime));
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
ctime, B_TRUE);
ctime);
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
ASSERT(error == 0);
Expand All @@ -781,7 +781,7 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
ctime, sizeof (ctime));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb), NULL,
&dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
Expand Down Expand Up @@ -876,8 +876,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, &ctime, sizeof (ctime));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb),
NULL, &zp->z_pflags, sizeof (zp->z_pflags));
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
B_TRUE);
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
ctime);
}
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zsb),
NULL, &zp->z_links, sizeof (zp->z_links));
Expand All @@ -904,7 +904,7 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
NULL, mtime, sizeof (mtime));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zsb),
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
ASSERT(error == 0);
mutex_exit(&dzp->z_lock);
Expand Down
5 changes: 3 additions & 2 deletions module/zfs/zfs_sa.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
sa_bulk_attr_t *bulk, *sa_attrs;
zfs_acl_locator_cb_t locate = { 0 };
uint64_t uid, gid, mode, rdev, xattr, parent;
uint64_t crtime[2], mtime[2], ctime[2];
uint64_t crtime[2], mtime[2], ctime[2], atime[2];
zfs_acl_phys_t znode_acl;
char scanstamp[AV_SCANSTAMP_SZ];
boolean_t drop_lock = B_FALSE;
Expand Down Expand Up @@ -309,6 +309,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)

/* First do a bulk query of the attributes that aren't cached */
bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * 20, KM_SLEEP);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL, &atime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zsb), NULL, &crtime, 16);
Expand Down Expand Up @@ -344,7 +345,7 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zsb), NULL,
&zp->z_pflags, 8);
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zsb), NULL,
zp->z_atime, 16);
&atime, 16);
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zsb), NULL,
&mtime, 16);
SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zsb), NULL,
Expand Down
38 changes: 14 additions & 24 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,6 @@ zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
out:
zfs_range_unlock(rl);

ZFS_ACCESSTIME_STAMP(zsb, zp);
ZFS_EXIT(zsb);
return (error);
}
Expand Down Expand Up @@ -865,8 +864,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
}
mutex_exit(&zp->z_acl_lock);

zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

/*
* Update the file size (zp_size) if it has changed;
Expand Down Expand Up @@ -2140,9 +2138,6 @@ zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr)
zap_cursor_fini(&zc);
if (error == ENOENT)
error = 0;

ZFS_ACCESSTIME_STAMP(zsb, zp);

out:
ZFS_EXIT(zsb);

Expand Down Expand Up @@ -2195,18 +2190,19 @@ zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
zfs_sb_t *zsb = ITOZSB(ip);
int error = 0;
uint64_t links;
uint64_t mtime[2], ctime[2];
uint64_t atime[2], mtime[2], ctime[2];
xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
xoptattr_t *xoap = NULL;
boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
sa_bulk_attr_t bulk[2];
sa_bulk_attr_t bulk[3];
int count = 0;

ZFS_ENTER(zsb);
ZFS_VERIFY_ZP(zp);

zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL, &atime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zsb), NULL, &mtime, 16);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL, &ctime, 16);

Expand Down Expand Up @@ -2355,7 +2351,7 @@ zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
}
}

ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
ZFS_TIME_DECODE(&vap->va_atime, atime);
ZFS_TIME_DECODE(&vap->va_mtime, mtime);
ZFS_TIME_DECODE(&vap->va_ctime, ctime);

Expand Down Expand Up @@ -2402,7 +2398,6 @@ zfs_getattr_fast(struct inode *ip, struct kstat *sp)
mutex_enter(&zp->z_lock);

generic_fillattr(ip, sp);
ZFS_TIME_DECODE(&sp->atime, zp->z_atime);

sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
sp->blksize = blksize;
Expand Down Expand Up @@ -2466,7 +2461,7 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
uint64_t new_mode;
uint64_t new_uid, new_gid;
uint64_t xattr_obj;
uint64_t mtime[2], ctime[2];
uint64_t mtime[2], ctime[2], atime[2];
znode_t *attrzp;
int need_policy = FALSE;
int err, err2;
Expand Down Expand Up @@ -2940,9 +2935,9 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)


if (mask & ATTR_ATIME) {
ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
ZFS_TIME_ENCODE(&vap->va_atime, atime);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zsb), NULL,
&zp->z_atime, sizeof (zp->z_atime));
&atime, sizeof (atime));
}

if (mask & ATTR_MTIME) {
Expand All @@ -2957,19 +2952,17 @@ zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
NULL, mtime, sizeof (mtime));
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
&ctime, sizeof (ctime));
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
B_TRUE);
zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
} else if (mask != 0) {
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zsb), NULL,
&ctime, sizeof (ctime));
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
B_TRUE);
zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
if (attrzp) {
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_CTIME(zsb), NULL,
&ctime, sizeof (ctime));
zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
mtime, ctime, B_TRUE);
mtime, ctime);
}
}
/*
Expand Down Expand Up @@ -3690,7 +3683,6 @@ zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
error = zfs_sa_readlink(zp, uio);
mutex_exit(&zp->z_lock);

ZFS_ACCESSTIME_STAMP(zsb, zp);
ZFS_EXIT(zsb);
return (error);
}
Expand Down Expand Up @@ -4089,7 +4081,6 @@ zfs_dirty_inode(struct inode *ip, int flags)
mode = ip->i_mode;

zp->z_mode = mode;
zp->z_atime_dirty = 0;

error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
mutex_exit(&zp->z_lock);
Expand All @@ -4107,6 +4098,7 @@ zfs_inactive(struct inode *ip)
{
znode_t *zp = ITOZ(ip);
zfs_sb_t *zsb = ITOZSB(ip);
uint64_t atime[2];
int error;
int need_unlock = 0;

Expand All @@ -4130,9 +4122,10 @@ zfs_inactive(struct inode *ip)
if (error) {
dmu_tx_abort(tx);
} else {
ZFS_TIME_ENCODE(&ip->i_atime, atime);
mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zsb),
(void *)&zp->z_atime, sizeof (zp->z_atime), tx);
(void *)&atime, sizeof (atime), tx);
zp->z_atime_dirty = 0;
mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
Expand Down Expand Up @@ -4241,9 +4234,6 @@ zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)

err = zfs_fillpage(ip, pl, nr_pages);

if (!err)
ZFS_ACCESSTIME_STAMP(zsb, zp);

ZFS_EXIT(zsb);
return (err);
}
Expand Down
Loading

0 comments on commit cb0b802

Please sign in to comment.