Skip to content

Commit

Permalink
Add two new counters: z_sync_writes_cnt & z_async_writes_cnt to the
Browse files Browse the repository at this point in the history
znode to keep track of active sync & non-sync page writebacks

Do a commit when:
- the current writeback is not intended to be synced and there are
active sync writebacks

- the current writeback is intended to be synced and there are active
non-sync page writebacks

This prevents sync page writebacks from accidentally blocking on
non-sync page writebacks which can take several seconds to complete.

Signed-off-by: Shaan Nobee <sniper111@gmail.com>
  • Loading branch information
shaan1337 committed Dec 5, 2021
1 parent 82b6d7b commit a53d9af
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 18 deletions.
14 changes: 10 additions & 4 deletions include/os/linux/zfs/sys/trace_acl.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__field(uint64_t, z_size)
__field(uint64_t, z_pflags)
__field(uint32_t, z_sync_cnt)
__field(uint32_t, z_sync_writes_cnt)
__field(uint32_t, z_async_writes_cnt)
__field(mode_t, z_mode)
__field(boolean_t, z_is_sa)
__field(boolean_t, z_is_mapped)
Expand Down Expand Up @@ -91,6 +93,8 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
__entry->z_size = zn->z_size;
__entry->z_pflags = zn->z_pflags;
__entry->z_sync_cnt = zn->z_sync_cnt;
__entry->z_sync_writes_cnt = zn->z_sync_writes_cnt;
__entry->z_async_writes_cnt = zn->z_async_writes_cnt;
__entry->z_mode = zn->z_mode;
__entry->z_is_sa = zn->z_is_sa;
__entry->z_is_mapped = zn->z_is_mapped;
Expand All @@ -116,16 +120,18 @@ DECLARE_EVENT_CLASS(zfs_ace_class,
TP_printk("zn { id %llu unlinked %u atime_dirty %u "
"zn_prefetch %u blksz %u seq %u "
"mapcnt %llu size %llu pflags %llu "
"sync_cnt %u mode 0x%x is_sa %d "
"is_mapped %d is_ctldir %d is_stale %d inode { "
"sync_cnt %u sync_writes_cnt %u async_writes_cnt %u "
"mode 0x%x is_sa %d is_mapped %d "
"is_ctldir %d is_stale %d inode { "
"uid %u gid %u ino %lu nlink %u size %lli "
"blkbits %u bytes %u mode 0x%x generation %x } } "
"ace { type %u flags %u access_mask %u } mask_matched %u",
__entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty,
__entry->z_zn_prefetch, __entry->z_blksz,
__entry->z_seq, __entry->z_mapcnt, __entry->z_size,
__entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode,
__entry->z_is_sa, __entry->z_is_mapped,
__entry->z_pflags, __entry->z_sync_cnt,
__entry->z_sync_writes_cnt, __entry->z_async_writes_cnt,
__entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped,
__entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid,
__entry->i_gid, __entry->i_ino, __entry->i_nlink,
__entry->i_size, __entry->i_blkbits,
Expand Down
2 changes: 1 addition & 1 deletion include/os/linux/zfs/sys/zfs_vnops_os.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
extern int zfs_fid(struct inode *ip, fid_t *fidp);
extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages);
extern int zfs_putpage(struct inode *ip, struct page *pp,
struct writeback_control *wbc);
struct writeback_control *wbc, boolean_t for_sync);
extern int zfs_dirty_inode(struct inode *ip, int flags);
extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp,
size_t len, unsigned long vm_flags);
Expand Down
2 changes: 2 additions & 0 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ typedef struct znode {
uint64_t z_size; /* file size (cached) */
uint64_t z_pflags; /* pflags (cached) */
uint32_t z_sync_cnt; /* synchronous open count */
uint32_t z_sync_writes_cnt; /* synchronous write count */
uint32_t z_async_writes_cnt; /* asynchronous write count */
mode_t z_mode; /* mode (cached) */
kmutex_t z_acl_lock; /* acl data lock */
zfs_acl_t *z_acl_cached; /* cached acl */
Expand Down
2 changes: 2 additions & 0 deletions module/os/freebsd/zfs/zfs_znode.c
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;
#if __FreeBSD_version >= 1300139
atomic_store_ptr(&zp->z_cached_symlink, NULL);
#endif
Expand Down
2 changes: 2 additions & 0 deletions module/os/linux/zfs/zfs_ctldir.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
zp->z_pflags = 0;
zp->z_mode = 0;
zp->z_sync_cnt = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;
ip->i_generation = 0;
ip->i_ino = id;
ip->i_mode = (S_IFDIR | S_IRWXUGO);
Expand Down
58 changes: 52 additions & 6 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -3408,21 +3408,43 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
}

static void
zfs_putpage_commit_cb(void *arg)
zfs_putpage_sync_commit_cb(void *arg)
{
struct page *pp = arg;

ClearPageError(pp);
end_page_writeback(pp);
}

static int
zfs_putpage_async_commit_cb(void *arg)
{
struct page *pp = arg;

ClearPageError(pp);
end_page_writeback(pp);

struct inode *ip = pp->mapping->host;
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);

ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
atomic_dec_32(&zp->z_async_writes_cnt);
ZFS_EXIT(zfsvfs);

return (0);
}

/*
* Push a page out to disk, once the page is on stable storage the
* registered commit callback will be run as notification of completion.
*
* IN: ip - page mapped for inode.
* pp - page to push (page is locked)
* wbc - writeback control data
* IN: ip - page mapped for inode.
* pp - page to push (page is locked)
* wbc - writeback control data
* for_sync - does the caller intend to wait synchronously for the
* page writeback to complete?
*
* RETURN: 0 if success
* error code if failure
Expand All @@ -3432,7 +3454,8 @@ zfs_putpage_commit_cb(void *arg)
*/
/* ARGSUSED */
int
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
boolean_t for_sync)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
Expand Down Expand Up @@ -3530,6 +3553,14 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
zfs_rangelock_exit(lr);

if (wbc->sync_mode != WB_SYNC_NONE) {
/*
* speed up any non-sync page writebacks since
* they may take several seconds to complete
*/
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
zil_commit(zfsvfs->z_log, zp->z_id);
}

if (PageWriteback(pp))
wait_on_page_bit(pp, PG_writeback);
}
Expand All @@ -3551,6 +3582,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
* was in fact not skipped and should not be counted as if it were.
*/
wbc->pages_skipped--;
if (!for_sync)
atomic_inc_32(&zp->z_async_writes_cnt);
set_page_writeback(pp);
unlock_page(pp);

Expand All @@ -3568,6 +3601,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
__set_page_dirty_nobuffers(pp);
ClearPageError(pp);
end_page_writeback(pp);
if (!for_sync)
atomic_dec_32(&zp->z_async_writes_cnt);
zfs_rangelock_exit(lr);
ZFS_EXIT(zfsvfs);
return (err);
Expand All @@ -3592,7 +3627,9 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);

zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
zfs_putpage_commit_cb, pp);
for_sync ? zfs_putpage_sync_commit_cb :
(void *) zfs_putpage_async_commit_cb, pp);

dmu_tx_commit(tx);

zfs_rangelock_exit(lr);
Expand All @@ -3604,6 +3641,15 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
* performance reasons.
*/
zil_commit(zfsvfs->z_log, zp->z_id);
} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
/*
* If the caller does not intend to wait synchronously
* for this page writeback to complete and there are active
* synchronous calls on this file, do a commit so that
* the latter don't accidentally end up waiting for
* our writeback to complete.
*/
zil_commit(zfsvfs->z_log, zp->z_id);
}

ZFS_EXIT(zfsvfs);
Expand Down
2 changes: 2 additions & 0 deletions module/os/linux/zfs/zfs_znode.c
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
zp->z_sync_cnt = 0;
zp->z_sync_writes_cnt = 0;
zp->z_async_writes_cnt = 0;

zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

Expand Down
49 changes: 42 additions & 7 deletions module/os/linux/zfs/zpl_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,29 @@ static int
zpl_fsync(struct file *filp, int datasync)
{
struct inode *inode = filp->f_mapping->host;
znode_t *zp = ITOZ(inode);
zfsvfs_t *zfsvfs = ITOZSB(inode);
cred_t *cr = CRED();
int error;
fstrans_cookie_t cookie;

ZPL_ENTER(zfsvfs);
ZPL_VERIFY_ZP(zp);
atomic_inc_32(&zp->z_sync_writes_cnt);
ZPL_EXIT(zfsvfs);

crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_fsync(ITOZ(inode), datasync, cr);
error = -zfs_fsync(zp, datasync, cr);
spl_fstrans_unmark(cookie);
crfree(cr);
ASSERT3S(error, <=, 0);

ZPL_ENTER(zfsvfs);
ZPL_VERIFY_ZP(zp);
atomic_dec_32(&zp->z_sync_writes_cnt);
ZPL_EXIT(zfsvfs);

return (error);
}

Expand All @@ -161,21 +173,40 @@ static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
struct inode *inode = filp->f_mapping->host;
znode_t *zp = ITOZ(inode);
zfsvfs_t *zfsvfs = ITOZSB(inode);
cred_t *cr = CRED();
int error;
fstrans_cookie_t cookie;

ZPL_ENTER(zfsvfs);
ZPL_VERIFY_ZP(zp);
atomic_inc_32(&zp->z_sync_writes_cnt);
/*
* speed up any non-sync page writebacks since
* they may take several seconds to complete
*/
if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
zil_commit(zfsvfs->z_log, zp->z_id);
}
ZPL_EXIT(zfsvfs);

error = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (error)
return (error);

crhold(cr);
cookie = spl_fstrans_mark();
error = -zfs_fsync(ITOZ(inode), datasync, cr);
error = -zfs_fsync(zp, datasync, cr);
spl_fstrans_unmark(cookie);
crfree(cr);
ASSERT3S(error, <=, 0);

ZPL_ENTER(zfsvfs);
ZPL_VERIFY_ZP(zp);
atomic_dec_32(&zp->z_sync_writes_cnt);
ZPL_EXIT(zfsvfs);

return (error);
}

Expand Down Expand Up @@ -651,14 +682,14 @@ zpl_readpages(struct file *filp, struct address_space *mapping,
static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
struct address_space *mapping = data;
boolean_t *for_sync = data;
fstrans_cookie_t cookie;

ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));

cookie = spl_fstrans_mark();
(void) zfs_putpage(mapping->host, pp, wbc);
(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
spl_fstrans_unmark(cookie);

return (0);
Expand All @@ -685,8 +716,9 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
* we run it once in non-SYNC mode so that the ZIL gets all the data,
* and then we commit it all in one go.
*/
boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
wbc->sync_mode = WB_SYNC_NONE;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync);
if (sync_mode != wbc->sync_mode) {
ZPL_ENTER(zfsvfs);
ZPL_VERIFY_ZP(zp);
Expand All @@ -702,7 +734,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
* details). That being said, this is a no-op in most cases.
*/
wbc->sync_mode = sync_mode;
result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
result = write_cache_pages(mapping, wbc, zpl_putpage,
&for_sync);
}
return (result);
}
Expand All @@ -719,7 +752,9 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
wbc->sync_mode = WB_SYNC_ALL;

return (zpl_putpage(pp, wbc, pp->mapping));
boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

return (zpl_putpage(pp, wbc, &for_sync));
}

/*
Expand Down

0 comments on commit a53d9af

Please sign in to comment.