From 18ca9ecf609bc248901752057d0ffeac2b2d2cb1 Mon Sep 17 00:00:00 2001 From: Jitendra Patidar Date: Mon, 7 Dec 2020 08:36:16 -0800 Subject: [PATCH] ZFS traverse_visitbp optimization to limit prefetch. The traversal code, traverse_visitbp(), visits blocks recursively. An indirect (non-L0) block of size 128k can contain 1024 block pointers of 128 bytes each. In the case of a full traversal, or an incremental traversal where all blocks were modified, it could traverse a large number of blocks pointed to by the indirect block. The traversal code issues prefetches for the blocks below the indirect block. This could result in a large number of async reads being queued on the vdev queue. So, account for the prefetches issued for blocks pointed to by an indirect block and limit the maximum number of prefetches issued in one go. Module Param: zfs_traverse_indirect_prefetch_limit: Limit on the number of prefetches issued while traversing an indirect block. Local counters: prefetched: Local counter to account for the number of prefetches done. pidx: Index for which the next prefetch is to be issued. ptidx: Index at which the next prefetch is to be triggered. Keep "ptidx" somewhere in the middle of the prefetched blocks, so that the prefetch reads get a sufficient time window before the corresponding demand reads are issued. Signed-off-by: Jitendra Patidar Closes #11802 --- man/man5/zfs-module-parameters.5 | 13 +++++++ module/zfs/dmu_traverse.c | 67 +++++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index e3e19481aab7..ef7d17a41848 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2895,6 +2895,19 @@ The number of bytes which should be prefetched during a pool traversal Default value: \fB52,428,800\fR. .RE +.sp +.ne 2 +.na +\fBzfs_traverse_indirect_prefetch_limit\fR (int) +.ad +.RS 12n +The number of blocks pointed to by an indirect (non-L0) block which should be +prefetched during a pool traversal (e.g. \fBzfs send\fR or other data +crawling operations) +.sp +Default value: \fB32\fR. 
+.RE + .sp .ne 2 .na diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 31db49dae68c..862c0bf404ad 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -41,6 +41,7 @@ int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ int32_t send_holes_without_birth_time = 1; +int32_t zfs_traverse_indirect_prefetch_limit = 32; typedef struct prefetch_data { kmutex_t pd_mtx; @@ -176,7 +177,10 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, return (RESUME_SKIP_NONE); } -static void +/* + * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE. + */ +static boolean_t traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { @@ -184,18 +188,18 @@ traverse_prefetch_metadata(traverse_data_t *td, int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) - return; + return (B_FALSE); /* * If we are in the process of resuming, don't prefetch, because * some children will not be needed (and in fact may have already * been freed). 
*/ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) - return; + return (B_FALSE); if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) - return; + return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) - return; + return (B_FALSE); ASSERT(!BP_IS_REDACTED(bp)); if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) @@ -203,6 +207,7 @@ traverse_prefetch_metadata(traverse_data_t *td, (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + return (B_TRUE); } static boolean_t @@ -295,7 +300,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_FLAG_WAIT; - int32_t i; + int32_t i, ptidx, pidx; + uint32_t prefetchlimit; int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; zbookmark_phys_t *czb; @@ -308,16 +314,46 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); + /* + * When performing a traversal it is beneficial to + * asynchronously read-ahead the upcoming indirect + * blocks since they will be needed shortly. However, + * since a 128k indirect (non-L0) block may contain up + * to 1024 128-byte block pointers, it's preferable to not + * prefetch them all at once. Issuing a large number of + * async reads may affect performance, and the earlier + * the indirect blocks are prefetched the less likely + * they are to still be resident in the ARC when needed. + * Therefore, prefetching indirect blocks is limited to + * zfs_traverse_indirect_prefetch_limit=32 blocks by + * default. + * + * pidx: Index for which the next prefetch is to be issued. + * ptidx: Index at which the next prefetch is to be triggered. 
+ */ + ptidx = 0; + pidx = 1; + prefetchlimit = zfs_traverse_indirect_prefetch_limit; for (i = 0; i < epb; i++) { - SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, - zb->zb_level - 1, - zb->zb_blkid * epb + i); - traverse_prefetch_metadata(td, - &((blkptr_t *)buf->b_data)[i], czb); - } + if (prefetchlimit && i == ptidx) { + ASSERT3S(ptidx, <=, pidx); + for (uint32_t prefetched = 0; pidx < epb && + prefetched < prefetchlimit; pidx++) { + SET_BOOKMARK(czb, zb->zb_objset, + zb->zb_object, zb->zb_level - 1, + zb->zb_blkid * epb + pidx); + if (traverse_prefetch_metadata(td, + &((blkptr_t *)buf->b_data)[pidx], + czb) == B_TRUE) { + prefetched++; + if (prefetched == + MAX(prefetchlimit / 2, 1)) + ptidx = pidx; + } + } + } - /* recursively visitbp() blocks below this */ - for (i = 0; i < epb; i++) { + /* recursively visitbp() blocks below this */ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); @@ -777,6 +813,9 @@ EXPORT_SYMBOL(traverse_pool); ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, "Max number of bytes to prefetch"); +ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW, + "Traverse prefetch number of blocks pointed by indirect block"); + #if defined(_KERNEL) module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644); MODULE_PARM_DESC(ignore_hole_birth,