From 740b8e0d891ac03b4dc452c195c285a377001865 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Sun, 13 Oct 2024 16:13:26 +0000 Subject: [PATCH 1/2] Fix a potential page leak in mappedread_sf() mappedread_sf() may allocate pages; if it fails to populate a page can't free it, it needs to ensure that it's placed into a page queue, otherwise it can't be reclaimed until the vnode is destroyed. I think this is quite unlikely to happen in practice, it was noticed by code inspection. Signed-off-by: Mark Johnston --- module/os/freebsd/zfs/zfs_vnops_os.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index a2222a899380..3ddd05667b86 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -452,8 +452,10 @@ mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); - else + else { + vm_page_deactivate_noreuse(pp); vm_page_sunbusy(pp); + } zfs_vmobject_wunlock(obj); } } else { From 55cb2e7c5cd574968886ec660c6717e28d4d7889 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Sun, 13 Oct 2024 16:17:08 +0000 Subject: [PATCH 2/2] Grab the rangelock unconditionally in zfs_getpages() As a deadlock avoidance measure, zfs_getpages() would only try to acquire a rangelock, falling back to a single-page read if this was not possible. However, this is incompatible with direct I/O. Instead, release the busy lock before trying to acquire the rangelock in blocking mode. This means that it's possible for the page to be replaced, so we have to re-lookup. Signed-off-by: Mark Johnston --- module/os/freebsd/zfs/zfs_vnops_os.c | 68 +++++++++++++++++++++------- 1 file changed, 51 insertions(+), 17 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 3ddd05667b86..576b4800efab 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -3930,6 +3930,7 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) return (zfs_vm_pagerret_error); + object = ma[0]->object; start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); @@ -3938,33 +3939,47 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * Note that we need to handle the case of the block size growing. */ for (;;) { + uint64_t len; + blksz = zp->z_blksz; + len = roundup(end, blksz) - rounddown(start, blksz); + lr = zfs_rangelock_tryenter(&zp->z_rangelock, - rounddown(start, blksz), - roundup(end, blksz) - rounddown(start, blksz), RL_READER); + rounddown(start, blksz), len, RL_READER); if (lr == NULL) { - if (rahead != NULL) { - *rahead = 0; - rahead = NULL; - } - if (rbehind != NULL) { - *rbehind = 0; - rbehind = NULL; + /* + * Avoid a deadlock with update_pages(). We need to + * hold the range lock when copying from the DMU, so + * give up the busy lock to allow update_pages() to + * proceed. We might need to allocate new pages, which + * isn't quite right since this allocation isn't subject + * to the page fault handler's OOM logic, but this is + * the best we can do for now. + */ + for (int i = 0; i < count; i++) { + ASSERT(vm_page_none_valid(ma[i])); + vm_page_xunbusy(ma[i]); } - break; + + lr = zfs_rangelock_enter(&zp->z_rangelock, + rounddown(start, blksz), len, RL_READER); + + zfs_vmobject_wlock(object); + (void) vm_page_grab_pages(object, OFF_TO_IDX(start), + VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, + ma, count); + zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } - object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { - if (lr != NULL) - zfs_rangelock_exit(lr); + zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (zfs_vm_pagerret_bad); } @@ -3989,11 +4004,30 @@ zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ - error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b, - &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); + for (int i = 0; i < count; i++) { + int count1, j, last_size; - if (lr != NULL) - zfs_rangelock_exit(lr); + if (vm_page_any_valid(ma[i])) { + ASSERT(vm_page_all_valid(ma[i])); + continue; + } + for (j = i + 1; j < count; j++) { + if (vm_page_any_valid(ma[j])) { + ASSERT(vm_page_all_valid(ma[j])); + break; + } + } + count1 = j - i; + last_size = j == count ? + MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE; + error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1, + i == 0 ? &pgsin_b : NULL, j == count ? &pgsin_a : NULL, + last_size); + if (error != 0) + break; + } + + zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);