From d33e2ca2ec27861f665ea90537c52e1347737c5f Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sun, 21 Jul 2024 21:04:38 -0400 Subject: [PATCH 1/2] Cleanup DB_DNODE() macros usage - Use the macros in few places it was missed. - Reduce scope of DB_DNODE_ENTER/EXIT() and inline some DB_DNODE() uses to make it more obvious what exactly is protected there and make unprotected accesses by mistake more difficult. - Make use of zrl_owner(). Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. --- module/zfs/dbuf.c | 11 +++----- module/zfs/dmu.c | 61 +++++++++++++++++++------------------------- module/zfs/dmu_tx.c | 5 +--- module/zfs/dnode.c | 2 +- module/zfs/sa.c | 4 +-- module/zfs/zfs_log.c | 2 +- 6 files changed, 34 insertions(+), 51 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 56fe2c4dbe30..7cffaebf346d 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4390,7 +4390,7 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr) dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; VERIFY3U(parent_db->db_level, ==, 1); - VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); + VERIFY3P(DB_DNODE(parent_db), ==, dn); VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); blkptr_t *bp = parent_db->db.db_data; return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); @@ -4813,14 +4813,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; - dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_EXIT(db); ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ @@ -4843,7 +4842,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } - DB_DNODE_EXIT(db); } static void @@ -5062,8 +5060,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; - ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, - DMU_OT_DNODE); + ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8b440aafba43..9b96988616b0 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -276,13 +276,14 @@ dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) dnode_t *dn; int error; + if (newsize < 0 || newsize > db_fake->db_size) + return (SET_ERROR(EINVAL)); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); - } else if (newsize < 0 || newsize > db_fake->db_size) { - error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; @@ -299,12 +300,13 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) dnode_t *dn; int error; + if (!DMU_OT_IS_VALID(type)) + return (SET_ERROR(EINVAL)); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (!DMU_OT_IS_VALID(type)) { - error = SET_ERROR(EINVAL); - } else if (dn->dn_bonus != db) { + if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); @@ -319,12 +321,10 @@ dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - type = dn->dn_bonustype; + type = DB_DNODE(db)->dn_bonustype; DB_DNODE_EXIT(db); return (type); @@ -486,7 +486,6 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; - dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; @@ -494,8 +493,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); + err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); @@ -668,13 +666,11 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; int err; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); + err = dmu_buf_hold_array_by_dnode(DB_DNODE(db), offset, length, read, + tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); @@ -1301,15 +1297,13 @@ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_read_uio_dnode(dn, uio, size); + err = dmu_read_uio_dnode(DB_DNODE(db), uio, size); DB_DNODE_EXIT(db); return (err); @@ -1403,15 +1397,13 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); + err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx); DB_DNODE_EXIT(db); return (err); @@ -1539,11 +1531,11 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - DB_DNODE_ENTER(dbuf); - err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); - DB_DNODE_EXIT(dbuf); + DB_DNODE_ENTER(db); + err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx); + DB_DNODE_EXIT(db); return (err); } @@ -1782,7 +1774,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; - dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); @@ -1791,8 +1782,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* @@ -1877,11 +1867,14 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + if (dr_next != NULL) { zp.zp_nopwrite = B_FALSE; - DB_DNODE_EXIT(db); + } else { + DB_DNODE_ENTER(db); + if (dnode_block_freed(DB_DNODE(db), db->db_blkid)) + zp.zp_nopwrite = B_FALSE; + DB_DNODE_EXIT(db); + } ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || @@ -2487,11 +2480,9 @@ void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - *dnsize = dn->dn_num_slots << DNODE_SHIFT; + *dnsize = DB_DNODE(db)->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 8451b5082e86..2c2a6c7642a5 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1520,11 +1520,8 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { - dnode_t *dn; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_have_spill) { + if (DB_DNODE(db)->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index a703fd414f87..2904c7bfe101 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1757,7 +1757,7 @@ dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting) * handle. */ #ifdef ZFS_DEBUG - ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread); #endif /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 0ae4c331dd36..32ac287eb758 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1852,7 +1852,6 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; - dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; @@ -1872,8 +1871,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, /* First make of copy of the old data */ DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_bonuslen != 0) { + if (DB_DNODE(db)->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); memcpy(old_data[0], hdl->sa_bonus->db_data, diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index fa4e7093ca46..399f5a0117bb 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -665,13 +665,13 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, DB_DNODE_ENTER(db); err = dmu_read_by_dnode(DB_DNODE(db), off, len, lr + 1, DMU_READ_NO_PREFETCH); + DB_DNODE_EXIT(db); if (err != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } - DB_DNODE_EXIT(db); } itx->itx_wr_state = wr_state; From 64341eadc5272d29414e9e9704327f3197e7db0e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sun, 21 Jul 2024 21:13:42 -0400 Subject: [PATCH 2/2] Skip dnode handles use when not needed Neither FreeBSD nor Linux currently implement kmem_cache_set_move(), which means dnode_move() is never called. In such situation use of dnode handles with respective locking to access dnode from dbuf is a waste of time for no benefit. This patch implements optional simplified code for such platforms, saving at least 3 dnode lock/dereference/unlock per dbuf life cycle. Originally I hoped to drop the handles completely to save memory, but they are still used in dnodes allocation code, so left for now. Before this change in CPU profiles of some workloads I saw 4-20% of CPU time spent in zrl_add_impl()/zrl_remove(), which are gone now. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. --- include/sys/dbuf.h | 16 +++++++++++++++- module/zfs/dbuf.c | 8 ++++++++ module/zfs/dnode.c | 13 +++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 3808a04cba80..8b03b1f895f8 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -214,9 +214,15 @@ typedef struct dmu_buf_impl { struct objset *db_objset; /* - * handle to safely access the dnode we belong to (NULL when evicted) + * Handle to safely access the dnode we belong to (NULL when evicted) + * if dnode_move() is used on the platform, or just dnode otherwise. */ +#if !defined(__linux__) && !defined(__FreeBSD__) +#define USE_DNODE_HANDLE 1 struct dnode_handle *db_dnode_handle; +#else + struct dnode *db_dnode; +#endif /* * our parent buffer; if the dnode points to us directly, @@ -417,11 +423,19 @@ void dbuf_stats_destroy(void); int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift); +#ifdef USE_DNODE_HANDLE #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) #define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) #define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) +#else +#define DB_DNODE(_db) ((_db)->db_dnode) +#define DB_DNODE_LOCK(_db) +#define DB_DNODE_ENTER(_db) +#define DB_DNODE_EXIT(_db) +#define DB_DNODE_HELD(_db) (B_TRUE) +#endif void dbuf_init(void); void dbuf_fini(void); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 7cffaebf346d..3173e069749a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3103,7 +3103,11 @@ dbuf_destroy(dmu_buf_impl_t *db) */ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); +#ifdef USE_DNODE_HANDLE db->db_dnode_handle = NULL; +#else + db->db_dnode = NULL; +#endif dbuf_hash_remove(db); } else { @@ -3252,7 +3256,11 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_level = level; db->db_blkid = blkid; db->db_dirtycnt = 0; +#ifdef USE_DNODE_HANDLE db->db_dnode_handle = dn->dn_handle; +#else + db->db_dnode = dn; +#endif db->db_parent = parent; db->db_blkptr = blkptr; db->db_hash = hash; diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 2904c7bfe101..5058ca374a27 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1020,6 +1020,19 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) int64_t refcount; uint32_t dbufs; +#ifndef USE_DNODE_HANDLE + /* + * We can't move dnodes if dbufs reference them directly without + * using handles and respecitve locking. Unless USE_DNODE_HANDLE + * is defined the code below is only to make sure it still builds, + * but it should never be used, since it is unsafe. + */ +#ifdef ZFS_DEBUG + PANIC("dnode_move() called without USE_DNODE_HANDLE"); +#endif + return (KMEM_CBRC_NO); +#endif + /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when