Skip to content

Commit

Permalink
Refactor dmu_prefetch().
Browse files Browse the repository at this point in the history
- Split dmu_prefetch_dnode() from dmu_prefetch() into a separate
function.  It is quite inconvenient to read the code where len = 0
means dnode prefetch instead of indirect/data prefetch.  One function
doing both has no benefits, since the code paths are independent.
 - Improve dmu_prefetch() handling of long block ranges.  Instead
of limiting the L0 data length to prefetch to dmu_prefetch_max,
make dmu_prefetch_max limit the actual amount of prefetch at the
specified level, and, if there is more, prefetch all the rest at
higher indirection level.  It should improve random access times
within the prefetched range of any length, reducing the importance of
the specific dmu_prefetch_max value.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #15076
  • Loading branch information
amotin authored Aug 7, 2023
1 parent a97b8fc commit 6c94e64
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 57 deletions.
1 change: 1 addition & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,7 @@ extern uint_t zfs_max_recordsize;
*/
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);

typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
Expand Down
4 changes: 1 addition & 3 deletions module/os/freebsd/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,

ASSERT3S(outcount, <=, bufsize);

/* Prefetch znode */
if (prefetch)
dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

/*
* Move to the next entry, fill in the previous offset.
Expand Down
7 changes: 2 additions & 5 deletions module/os/linux/zfs/zfs_vnops_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
if (done)
break;

/* Prefetch znode */
if (prefetch) {
dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
}
if (prefetch)
dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);

/*
* Move to the next entry, fill in the previous offset.
Expand Down
103 changes: 61 additions & 42 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -694,74 +694,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
}

/*
* Issue prefetch i/os for the given blocks. If level is greater than 0, the
* Issue prefetch I/Os for the given blocks. If level is greater than 0, the
* indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
* the data starting at offset, and continuing to offset + len. If the range
* is too long, prefetch the first dmu_prefetch_max bytes as requested, while
* for the rest prefetch only a higher level, also fitting within dmu_prefetch_max. It
* should primarily help random reads, since for long sequential reads there is
* a speculative prefetcher.
*
* Note that if the indirect blocks above the blocks being prefetched are not
* in cache, they will be asynchronously read in.
* in cache, they will be asynchronously read in. Dnode read by dnode_hold()
* is currently synchronous.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
uint64_t blkid;
int nblks, err;

if (len == 0) { /* they're interested in the bonus buffer */
dn = DMU_META_DNODE(os);
int64_t level2 = level;
uint64_t start, end, start2, end2;

if (object == 0 || object >= DN_MAX_OBJECT)
return;

rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, level,
object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
if (dmu_prefetch_max == 0 || len == 0) {
dmu_prefetch_dnode(os, object, pri);
return;
}

/*
* See comment before the definition of dmu_prefetch_max.
*/
len = MIN(len, dmu_prefetch_max);

/*
* XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
if (dnode_hold(os, object, FTAG, &dn) != 0)
return;

/*
* offset + len - 1 is the last byte we want to prefetch for, and offset
* is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
* last block we want to prefetch, and dbuf_whichblock(dn, level,
* offset) is the first. Then the number we need to prefetch is the
* last - first + 1.
* Depending on len we may do two prefetches: blocks [start, end) at
* level, and following blocks [start2, end2) at higher level2.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (level > 0 || dn->dn_datablkshift != 0) {
nblks = dbuf_whichblock(dn, level, offset + len - 1) -
dbuf_whichblock(dn, level, offset) + 1;
if (dn->dn_datablkshift != 0) {
/*
* The object has multiple blocks. Calculate the full range
* of blocks [start, end2) and then split it into two parts,
* so that the first [start, end) fits into dmu_prefetch_max.
*/
start = dbuf_whichblock(dn, level, offset);
end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
uint8_t ibs = dn->dn_indblkshift;
uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
start2 = end = MIN(end2, start + limit);

/*
* Find level2 where [start2, end2) fits into dmu_prefetch_max.
*/
uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
do {
level2++;
start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
} while (end2 - start2 > limit);
} else {
nblks = (offset < dn->dn_datablksz);
/* There is only one block. Prefetch it or nothing. */
start = start2 = end2 = 0;
end = start + (level == 0 && offset < dn->dn_datablksz);
}

if (nblks != 0) {
blkid = dbuf_whichblock(dn, level, offset);
for (int i = 0; i < nblks; i++)
dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
for (uint64_t i = start; i < end; i++)
dbuf_prefetch(dn, level, i, pri, 0);
for (uint64_t i = start2; i < end2; i++)
dbuf_prefetch(dn, level2, i, pri, 0);
rw_exit(&dn->dn_struct_rwlock);

dnode_rele(dn, FTAG);
}

/*
 * Prefetch the on-disk dnode of the given object.
 *
 * The dnode is located through the objset's meta dnode: the level-0 block
 * of the meta dnode that covers byte offset object * sizeof (dnode_phys_t)
 * is handed to dbuf_prefetch() at the requested I/O priority.
 *
 * os     - objset the object belongs to.
 * object - object number whose dnode should be prefetched.  Object 0 and
 *          any number at or above DN_MAX_OBJECT are silently ignored.
 * pri    - I/O priority passed through to dbuf_prefetch().
 *
 * Nothing is returned; no error is reported to the caller.
 */
void
dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
{
	dnode_t *mdn;
	uint64_t mdn_offset, mdn_blkid;

	/* Reject object numbers outside the range (0, DN_MAX_OBJECT). */
	if (object == 0 || object >= DN_MAX_OBJECT)
		return;

	mdn = DMU_META_DNODE(os);
	/* Byte offset of this object's dnode_phys_t within the meta dnode. */
	mdn_offset = object * sizeof (dnode_phys_t);

	/*
	 * The block lookup and the prefetch call are both made with the
	 * meta dnode's dn_struct_rwlock held as reader.
	 */
	rw_enter(&mdn->dn_struct_rwlock, RW_READER);
	mdn_blkid = dbuf_whichblock(mdn, 0, mdn_offset);
	/* NOTE(review): trailing 0 is presumably "no extra flags" - confirm. */
	dbuf_prefetch(mdn, 0, mdn_blkid, pri, 0);
	rw_exit(&mdn->dn_struct_rwlock);
}

/*
* Get the next "chunk" of file data to free. We traverse the file from
* the end so that the file gets shorter over time (if we crash in the
Expand Down
8 changes: 4 additions & 4 deletions module/zfs/dsl_deadlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
* in parallel. Then open them all in a second pass.
*/
dle->dle_bpobj.bpo_object = za.za_first_integer;
dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
ZIO_PRIORITY_SYNC_READ);

avl_add(&dl->dl_tree, dle);
}
Expand Down Expand Up @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
* in parallel. Then open them all in a second pass.
*/
dlce->dlce_bpobj = za.za_first_integer;
dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
ZIO_PRIORITY_SYNC_READ);
avl_add(&dl->dl_cache, dlce);
}
VERIFY3U(error, ==, ENOENT);
Expand Down
4 changes: 2 additions & 2 deletions module/zfs/spa_log_spacemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa)
/* Prefetch log spacemaps dnodes. */
for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
ZIO_PRIORITY_SYNC_READ);
}

uint_t pn = 0;
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/zvol.c
Original file line number Diff line number Diff line change
Expand Up @@ -981,7 +981,7 @@ zvol_prefetch_minors_impl(void *arg)
job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
FTAG, &os);
if (job->error == 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ);
dmu_objset_disown(os, B_TRUE, FTAG);
}
}
Expand Down

0 comments on commit 6c94e64

Please sign in to comment.