Skip to content

Commit

Permalink
ZFS traverse_visitbp optimization to limit prefetch.
Browse files Browse the repository at this point in the history
The traversal code, traverse_visitbp(), visits blocks recursively.
An indirect (non-L0) block of size 128k can contain 1024 block pointers
of 128 bytes each. In a full traverse, or an incremental traverse where
all blocks were modified, it can traverse a large number of blocks
pointed to by the indirect block. The traversal code issues a prefetch
for the blocks traversed below the indirect block. This can result in a
large number of async reads queued on the vdev queue. So, account for the
prefetches issued for blocks pointed to by an indirect block and limit the
maximum number of prefetches issued in one go.

Module Param:
zfs_traverse_indirect_prefetch_limit: Limit on the number of prefetches
issued while traversing an indirect block.

Local counters:
prefetched: Local counter to account for the number of prefetches done.
pidx: Index for which the next prefetch is to be issued.
ptidx: Index at which the next prefetch is to be triggered.

Keep "ptidx" somewhere in the middle of the blocks prefetched, so that
the prefetch reads get a sufficient time window before the corresponding
demand reads are issued.

Signed-off-by: Jitendra Patidar <jitendra.patidar@nutanix.com>
Closes #11802
  • Loading branch information
jsai20 committed Apr 14, 2021
1 parent fe6babc commit 18ca9ec
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 14 deletions.
13 changes: 13 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -2895,6 +2895,19 @@ The number of bytes which should be prefetched during a pool traversal
Default value: \fB52,428,800\fR.
.RE

.sp
.ne 2
.na
\fBzfs_traverse_indirect_prefetch_limit\fR (int)
.ad
.RS 12n
The number of blocks pointed to by an indirect (non-L0) block which should be
prefetched during a pool traversal (e.g. \fBzfs send\fR or other data
crawling operations).
.sp
Default value: \fB32\fR.
.RE

.sp
.ne 2
.na
Expand Down
67 changes: 53 additions & 14 deletions module/zfs/dmu_traverse.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
int32_t send_holes_without_birth_time = 1;
int32_t zfs_traverse_indirect_prefetch_limit = 32;

typedef struct prefetch_data {
kmutex_t pd_mtx;
Expand Down Expand Up @@ -176,33 +177,37 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
return (RESUME_SKIP_NONE);
}

static void
/*
* Returns B_TRUE if a prefetch read is issued, otherwise B_FALSE.
*/
static boolean_t
traverse_prefetch_metadata(traverse_data_t *td,
const blkptr_t *bp, const zbookmark_phys_t *zb)
{
arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;

if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return;
return (B_FALSE);
/*
* If we are in the process of resuming, don't prefetch, because
* some children will not be needed (and in fact may have already
* been freed).
*/
if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
return;
return (B_FALSE);
if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
return;
return (B_FALSE);
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return;
return (B_FALSE);
ASSERT(!BP_IS_REDACTED(bp));

if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
zio_flags |= ZIO_FLAG_RAW;

(void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
return (B_TRUE);
}

static boolean_t
Expand Down Expand Up @@ -295,7 +300,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,

if (BP_GET_LEVEL(bp) > 0) {
uint32_t flags = ARC_FLAG_WAIT;
int32_t i;
int32_t i, ptidx, pidx;
uint32_t prefetchlimit;
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
zbookmark_phys_t *czb;

Expand All @@ -308,16 +314,46 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,

czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);

/*
* When performing a traversal it is beneficial to
* asynchronously read-ahead the upcoming indirect
* blocks since they will be needed shortly. However,
* since a 128k indirect (non-L0) block may contain up
* to 1024 128-byte block pointers, it's preferable to not
* prefetch them all at once. Issuing a large number of
* async reads may affect performance, and the earlier
* the indirect blocks are prefetched the less likely
* they are to still be resident in the ARC when needed.
* Therefore, prefetching indirect blocks is limited to
* zfs_traverse_indirect_prefetch_limit=32 blocks by
* default.
*
* pidx: Index for which the next prefetch is to be issued.
* ptidx: Index at which the next prefetch is to be triggered.
*/
ptidx = 0;
pidx = 1;
prefetchlimit = zfs_traverse_indirect_prefetch_limit;
for (i = 0; i < epb; i++) {
SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
traverse_prefetch_metadata(td,
&((blkptr_t *)buf->b_data)[i], czb);
}
if (prefetchlimit && i == ptidx) {
ASSERT3S(ptidx, <=, pidx);
for (uint32_t prefetched = 0; pidx < epb &&
prefetched < prefetchlimit; pidx++) {
SET_BOOKMARK(czb, zb->zb_objset,
zb->zb_object, zb->zb_level - 1,
zb->zb_blkid * epb + pidx);
if (traverse_prefetch_metadata(td,
&((blkptr_t *)buf->b_data)[pidx],
czb) == B_TRUE) {
prefetched++;
if (prefetched ==
MAX(prefetchlimit / 2, 1))
ptidx = pidx;
}
}
}

/* recursively visitbp() blocks below this */
for (i = 0; i < epb; i++) {
/* recursively visitbp() blocks below this */
SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
Expand Down Expand Up @@ -777,6 +813,9 @@ EXPORT_SYMBOL(traverse_pool);
ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
"Max number of bytes to prefetch");

ZFS_MODULE_PARAM(zfs, zfs_, traverse_indirect_prefetch_limit, INT, ZMOD_RW,
"Traverse prefetch number of blocks pointed by indirect block");

#if defined(_KERNEL)
module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
MODULE_PARM_DESC(ignore_hole_birth,
Expand Down

0 comments on commit 18ca9ec

Please sign in to comment.