From 1a3e32e6a2d1eac74e4b5601b5d265a2fd6099ba Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Sun, 21 Jul 2024 21:04:38 -0400
Subject: [PATCH 1/5] Cleanup DB_DNODE() macros usage

- Use the macros in a few places where they were missed.
- Reduce the scope of DB_DNODE_ENTER/EXIT() and inline some DB_DNODE()
  uses to make it more obvious what exactly is protected there, and to
  make accidental unprotected accesses more difficult.
- Make use of zrl_owner().

Reviewed-by: Rob Wing
Reviewed-by: Allan Jude
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #16374
---
 module/zfs/dbuf.c    | 11 +++-----
 module/zfs/dmu.c     | 61 +++++++++++++++++++-------------------
 module/zfs/dmu_tx.c  |  5 +---
 module/zfs/dnode.c   |  2 +-
 module/zfs/sa.c      |  4 +--
 module/zfs/zfs_log.c |  2 +-
 6 files changed, 34 insertions(+), 51 deletions(-)

diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 56fe2c4dbe30..7cffaebf346d 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -4390,7 +4390,7 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
 	dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 	VERIFY3U(parent_db->db_level, ==, 1);
-	VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
+	VERIFY3P(DB_DNODE(parent_db), ==, dn);
 	VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
 	blkptr_t *bp = parent_db->db.db_data;
 	return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
@@ -4813,14 +4813,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	(void) zio, (void) buf;
 	dmu_buf_impl_t *db = vdb;
-	dnode_t *dn;
 	blkptr_t *bp;
 	unsigned int epbs, i;

 	ASSERT3U(db->db_level, >, 0);
 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	DB_DNODE_EXIT(db);
 	ASSERT3U(epbs, <, 31);

 	/* Determine if all our children are holes */
@@ -4843,7 +4842,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 		memset(db->db.db_data, 0, db->db.db_size);
 		rw_exit(&db->db_rwlock);
 	}
-	DB_DNODE_EXIT(db);
 }

 static void
@@ -5062,8 +5060,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 		}
 	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dnode_phys_t *dnp = db->db.db_data;
-		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
-		    DMU_OT_DNODE);
+		ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE);
 		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
 		    i += dnp[i].dn_extra_slots + 1) {
 			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index c3b6b97f8a0e..f45354a7f303 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -276,13 +276,14 @@ dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 	dnode_t *dn;
 	int error;

+	if (newsize < 0 || newsize > db_fake->db_size)
+		return (SET_ERROR(EINVAL));
+
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

 	if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
-	} else if (newsize < 0 || newsize > db_fake->db_size) {
-		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonuslen(dn, newsize, tx);
 		error = 0;
@@ -299,12 +300,13 @@ dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 	dnode_t *dn;
 	int error;

+	if (!DMU_OT_IS_VALID(type))
+		return (SET_ERROR(EINVAL));
+
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);

-	if (!DMU_OT_IS_VALID(type)) {
-		error = SET_ERROR(EINVAL);
-	} else if (dn->dn_bonus != db) {
+	if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonus_type(dn, type, tx);
@@ -319,12 +321,10 @@ dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dnode_t *dn;
 	dmu_object_type_t type;

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	type = dn->dn_bonustype;
+	type = DB_DNODE(db)->dn_bonustype;
 	DB_DNODE_EXIT(db);

 	return (type);
@@ -486,7 +486,6 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
-	dnode_t *dn;
 	int err;
 	uint32_t db_flags = DB_RF_CANFAIL;

@@ -494,8 +493,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag,
 		db_flags |= DB_RF_NO_DECRYPT;

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp);
+	err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp);
 	DB_DNODE_EXIT(db);

 	return (err);
@@ -668,13 +666,11 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dnode_t *dn;
 	int err;

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
-	    numbufsp, dbpp, DMU_READ_PREFETCH);
+	err = dmu_buf_hold_array_by_dnode(DB_DNODE(db), offset, length, read,
+	    tag, numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(db);

 	return (err);
@@ -1408,15 +1404,13 @@ int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
-	dnode_t *dn;
 	int err;

 	if (size == 0)
 		return (0);

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	err = dmu_read_uio_dnode(dn, uio, size);
+	err = dmu_read_uio_dnode(DB_DNODE(db), uio, size);
 	DB_DNODE_EXIT(db);

 	return (err);
@@ -1510,15 +1504,13 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
-	dnode_t *dn;
 	int err;

 	if (size == 0)
 		return (0);

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	err = dmu_write_uio_dnode(dn, uio, size, tx);
+	err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx);
 	DB_DNODE_EXIT(db);

 	return (err);
@@ -1754,11 +1746,11 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	int err;
-	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;

-	DB_DNODE_ENTER(dbuf);
-	err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx);
-	DB_DNODE_EXIT(dbuf);
+	DB_DNODE_ENTER(db);
+	err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx);
+	DB_DNODE_EXIT(db);

 	return (err);
 }
@@ -1997,7 +1989,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
-	dnode_t *dn;

 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
@@ -2006,8 +1997,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	    db->db.db_object, db->db_level, db->db_blkid);

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+	dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);

 	/*
@@ -2092,11 +2082,14 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
-	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
+	if (dr_next != NULL) {
 		zp.zp_nopwrite = B_FALSE;
-	DB_DNODE_EXIT(db);
+	} else {
+		DB_DNODE_ENTER(db);
+		if (dnode_block_freed(DB_DNODE(db), db->db_blkid))
+			zp.zp_nopwrite = B_FALSE;
+		DB_DNODE_EXIT(db);
+	}

 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
@@ -2702,11 +2695,9 @@ void
 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-	dnode_t *dn;

 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	*dnsize = dn->dn_num_slots << DNODE_SHIFT;
+	*dnsize = DB_DNODE(db)->dn_num_slots << DNODE_SHIFT;
 	DB_DNODE_EXIT(db);
 }

diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 8451b5082e86..2c2a6c7642a5 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1520,11 +1520,8 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 		ASSERT(tx->tx_txg == 0);
 		dmu_tx_hold_spill(tx, object);
 	} else {
-		dnode_t *dn;
-
 		DB_DNODE_ENTER(db);
-		dn = DB_DNODE(db);
-		if (dn->dn_have_spill) {
+		if (DB_DNODE(db)->dn_have_spill) {
 			ASSERT(tx->tx_txg == 0);
 			dmu_tx_hold_spill(tx, object);
 		}
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index a703fd414f87..2904c7bfe101 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1757,7 +1757,7 @@ dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)
 	 * handle.
 	 */
 #ifdef ZFS_DEBUG
-	ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+	ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread);
 #endif

 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 0ae4c331dd36..32ac287eb758 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -1852,7 +1852,6 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
 {
 	sa_os_t *sa = hdl->sa_os->os_sa;
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
-	dnode_t *dn;
 	sa_bulk_attr_t *attr_desc;
 	void *old_data[2];
 	int bonus_attr_count = 0;
@@ -1872,8 +1871,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,

 	/* First make of copy of the old data */
 	DB_DNODE_ENTER(db);
-	dn = DB_DNODE(db);
-	if (dn->dn_bonuslen != 0) {
+	if (DB_DNODE(db)->dn_bonuslen != 0) {
 		bonus_data_size = hdl->sa_bonus->db_size;
 		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
 		memcpy(old_data[0], hdl->sa_bonus->db_data,
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index fa4e7093ca46..399f5a0117bb 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -665,13 +665,13 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 			DB_DNODE_ENTER(db);
 			err = dmu_read_by_dnode(DB_DNODE(db), off, len, lr + 1,
 			    DMU_READ_NO_PREFETCH);
+			DB_DNODE_EXIT(db);
 			if (err != 0) {
 				zil_itx_destroy(itx);
 				itx = zil_itx_create(txtype, sizeof (*lr));
 				lr = (lr_write_t *)&itx->itx_lr;
 				wr_state = WR_NEED_COPY;
 			}
-			DB_DNODE_EXIT(db);
 		}

 		itx->itx_wr_state = wr_state;

From ed87d456e4ebcd031d43c29b93d0a664581a51f7 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Sun, 21 Jul 2024 21:13:42 -0400
Subject: [PATCH 2/5] Skip dnode handle use when not needed

Neither FreeBSD nor Linux currently implement kmem_cache_set_move(),
which means dnode_move() is never called. In such a situation, using
dnode handles with the respective locking to reach a dnode from its
dbuf is a waste of time for no benefit. This patch implements optional
simplified code for such platforms, saving at least 3 dnode
lock/dereference/unlock operations per dbuf life cycle.
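For illustration, the access pattern that the handles impose on every
dbuf-to-dnode dereference looks roughly like the sketch below
(illustrative C only, not code from these diffs; dn_num_slots stands in
for any protected dnode field):

	DB_DNODE_ENTER(db);		/* zrl_add() on the handle's zrlock */
	dnode_t *dn = DB_DNODE(db);	/* dnh_dnode, stable while held */
	uint64_t slots = dn->dn_num_slots;	/* any protected access */
	DB_DNODE_EXIT(db);	/* zrl_remove(); dnode may be moved again */

It is this per-access zrl_add()/zrl_remove() pair that the simplified
code removes, and that shows up in the profiles mentioned below.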
Originally I hoped to drop the handles completely to save memory, but
they are still used in the dnode allocation code, so they are left in
place for now.

Before this change, CPU profiles of some workloads showed 4-20% of CPU
time spent in zrl_add_impl()/zrl_remove(), which are gone now.

Reviewed-by: Rob Wing
Reviewed-by: Allan Jude
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #16374
---
 include/sys/dbuf.h | 16 +++++++++++++++-
 module/zfs/dbuf.c  |  8 ++++++++
 module/zfs/dnode.c | 13 +++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 3808a04cba80..8b03b1f895f8 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -214,9 +214,15 @@ typedef struct dmu_buf_impl {
 	struct objset *db_objset;

 	/*
-	 * handle to safely access the dnode we belong to (NULL when evicted)
+	 * Handle to safely access the dnode we belong to (NULL when evicted)
+	 * if dnode_move() is used on the platform, or just dnode otherwise.
 	 */
+#if !defined(__linux__) && !defined(__FreeBSD__)
+#define	USE_DNODE_HANDLE	1
 	struct dnode_handle *db_dnode_handle;
+#else
+	struct dnode *db_dnode;
+#endif

 	/*
 	 * our parent buffer; if the dnode points to us directly,
@@ -417,11 +423,19 @@ void dbuf_stats_destroy(void);
 int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
     blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift);

+#ifdef USE_DNODE_HANDLE
 #define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
 #define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
 #define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
+#else
+#define	DB_DNODE(_db)		((_db)->db_dnode)
+#define	DB_DNODE_LOCK(_db)
+#define	DB_DNODE_ENTER(_db)
+#define	DB_DNODE_EXIT(_db)
+#define	DB_DNODE_HELD(_db)	(B_TRUE)
+#endif

 void dbuf_init(void);
 void dbuf_fini(void);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 7cffaebf346d..3173e069749a 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -3103,7 +3103,11 @@ dbuf_destroy(dmu_buf_impl_t *db)
 		 */
 		mutex_enter(&dn->dn_mtx);
 		dnode_rele_and_unlock(dn, db, B_TRUE);
+#ifdef USE_DNODE_HANDLE
 		db->db_dnode_handle = NULL;
+#else
+		db->db_dnode = NULL;
+#endif

 		dbuf_hash_remove(db);
 	} else {
@@ -3252,7 +3256,11 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_dirtycnt = 0;
+#ifdef USE_DNODE_HANDLE
 	db->db_dnode_handle = dn->dn_handle;
+#else
+	db->db_dnode = dn;
+#endif
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 	db->db_hash = hash;
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 2904c7bfe101..5058ca374a27 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1020,6 +1020,19 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 	int64_t refcount;
 	uint32_t dbufs;

+#ifndef USE_DNODE_HANDLE
+	/*
+	 * We can't move dnodes if dbufs reference them directly without
+	 * using handles and the respective locking. Unless USE_DNODE_HANDLE
+	 * is defined, the code below exists only to make sure it still
+	 * builds; it should never be used, since it is unsafe.
+	 */
+#ifdef ZFS_DEBUG
+	PANIC("dnode_move() called without USE_DNODE_HANDLE");
+#endif
+	return (KMEM_CBRC_NO);
+#endif
+
 	/*
 	 * The dnode is on the objset's list of known dnodes if the objset
 	 * pointer is valid. We set the low bit of the objset pointer when

From e26b3771ee43256a0347b7ec2503abbed7b5ecd5 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 27 Jun 2023 11:03:29 +1000
Subject: [PATCH 3/5] spa_preferred_class: pass the entire zio

Rather than picking specific values out of the properties, just pass
the entire zio in, to make it easier in the future to use more of its
information to decide on the storage class. I would rather have just
passed io_prop in, but having spa.h include zio.h gets a bit tricky.

Reviewed-by: Alexander Motin
Reviewed-by: Tony Hutter
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15894
---
 include/sys/spa.h     |  3 +--
 module/zfs/spa_misc.c | 13 ++++++++-----
 module/zfs/zio.c      |  7 ++-----
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/include/sys/spa.h b/include/sys/spa.h
index 12321c6562be..3998f5a6de73 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1049,8 +1049,7 @@ extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_special_class(spa_t *spa);
 extern metaslab_class_t *spa_dedup_class(spa_t *spa);
-extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
-    dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
+extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio);
 extern boolean_t spa_special_has_ddt(spa_t *spa);

 extern void spa_evicting_os_register(spa_t *, objset_t *os);
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 439e56f0d0df..c9ee36c0f9b2 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2007,9 +2007,11 @@ spa_special_has_ddt(spa_t *spa)
  * Locate an appropriate allocation class
  */
 metaslab_class_t *
-spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
-    uint_t level, uint_t special_smallblk)
+spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
+	const zio_prop_t *zp = &zio->io_prop;
+	dmu_object_type_t objtype = zp->zp_type;
+
 	/*
 	 * ZIL allocations determine their class in zio_alloc_zil().
 	 */
@@ -2027,14 +2029,15 @@ spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
 	}

 	/* Indirect blocks for user data can land in special if allowed */
-	if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+	if (zp->zp_level > 0 &&
+	    (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
 		if (has_special_class && zfs_user_indirect_is_special)
 			return (spa_special_class(spa));
 		else
 			return (spa_normal_class(spa));
 	}

-	if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+	if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) {
 		if (has_special_class)
 			return (spa_special_class(spa));
 		else
@@ -2047,7 +2050,7 @@ spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
 	 * zfs_special_class_metadata_reserve_pct exclusively for metadata.
 	 */
 	if (DMU_OT_IS_FILE(objtype) &&
-	    has_special_class && size <= special_smallblk) {
+	    has_special_class && zio->io_size <= zp->zp_zpl_smallblk) {
 		metaslab_class_t *special = spa_special_class(spa);
 		uint64_t alloc = metaslab_class_get_alloc(special);
 		uint64_t space = metaslab_class_get_space(special);
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index bc5a3c9b709b..781011373793 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3637,8 +3637,7 @@ zio_dva_throttle(zio_t *zio)
 	metaslab_class_t *mc;

 	/* locate an appropriate allocation class */
-	mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
-	    zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
+	mc = spa_preferred_class(spa, zio);

 	if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
 	    !mc->mc_alloc_throttle_enabled ||
@@ -3710,9 +3709,7 @@ zio_dva_allocate(zio_t *zio)
 	 */
 	mc = zio->io_metaslab_class;
 	if (mc == NULL) {
-		mc = spa_preferred_class(spa, zio->io_size,
-		    zio->io_prop.zp_type, zio->io_prop.zp_level,
-		    zio->io_prop.zp_zpl_smallblk);
+		mc = spa_preferred_class(spa, zio);
 		zio->io_metaslab_class = mc;
 	}

From d54d0fff392a8889e794a10c1e90746f4493cf30 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Tue, 27 Jun 2023 12:50:18 +1000
Subject: [PATCH 4/5] dnode: allow storage class to be overridden by object
 type

spa_preferred_class() selects a storage class based on (among other
things) the DMU object type. This only works for old-style object
types that match only one specific kind of thing. For DMU_OTN_ types
we need another way to signal the storage class.

This commit allows the object type to be overridden in the IO policy
for the purposes of choosing a storage class. It then adds the ability
to set the storage type on a dnode hold, such that all writes
generated under that hold will get it.

This method has two shortcomings:

- it would be better if we could "name" a set of storage class
  preferences rather than having them implied by the object type.
- it would be better if this info were stored in the dnode on disk.

In the absence of those things, this seems like the smallest possible
change.

Reviewed-by: Alexander Motin
Reviewed-by: Tony Hutter
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15894
---
 include/sys/dnode.h   |  5 +++++
 include/sys/zio.h     |  1 +
 module/zfs/dmu.c      |  1 +
 module/zfs/dnode.c    | 17 +++++++++++++++++
 module/zfs/spa_misc.c | 11 ++++++++++-
 module/zfs/zio.c      |  2 +-
 6 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/include/sys/dnode.h b/include/sys/dnode.h
index dbe7350d4da7..5d0f0fb26d02 100644
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -380,6 +380,9 @@ struct dnode {

 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
+
+	/* Not in dn_phys, but should be. Set it after taking a hold. */
+	dmu_object_type_t dn_storage_type;	/* type for storage class */
 };

 /*
@@ -462,6 +465,8 @@ void dnode_evict_dbufs(dnode_t *dn);
 void dnode_evict_bonus(dnode_t *dn);
 void dnode_free_interior_slots(dnode_t *dn);

+void dnode_set_storage_type(dnode_t *dn, dmu_object_type_t type);
+
 #define	DNODE_IS_DIRTY(_dn)	\
 	((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa))

diff --git a/include/sys/zio.h b/include/sys/zio.h
index 77c70b9b481c..446b64ccd8ab 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -356,6 +356,7 @@ typedef struct zio_prop {
 	uint8_t		zp_iv[ZIO_DATA_IV_LEN];
 	uint8_t		zp_mac[ZIO_DATA_MAC_LEN];
 	uint32_t	zp_zpl_smallblk;
+	dmu_object_type_t zp_storage_type;
 } zio_prop_t;

 typedef struct zio_cksum_report zio_cksum_report_t;
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index f45354a7f303..3dcf49ceb64e 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2362,6 +2362,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
 	zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
 	    os->os_zpl_special_smallblock : 0;
+	zp->zp_storage_type = dn ? dn->dn_storage_type : DMU_OT_NONE;

 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 }
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 5058ca374a27..b6cc512cbb85 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -543,6 +543,17 @@ dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 	rw_exit(&dn->dn_struct_rwlock);
 }

+void
+dnode_set_storage_type(dnode_t *dn, dmu_object_type_t newtype)
+{
+	/*
+	 * This is not in the dnode_phys, but it should be, and perhaps one day
+	 * will. For now we require it be set after taking a hold.
+	 */
+	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+	dn->dn_storage_type = newtype;
+}
+
 void
 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
@@ -604,6 +615,8 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 	dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 	dn->dn_id_flags = 0;

+	dn->dn_storage_type = DMU_OT_NONE;
+
 	dmu_zfetch_init(&dn->dn_zfetch, dn);

 	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
@@ -687,6 +700,8 @@ dnode_destroy(dnode_t *dn)
 	dn->dn_newprojid = ZFS_DEFAULT_PROJID;
 	dn->dn_id_flags = 0;

+	dn->dn_storage_type = DMU_OT_NONE;
+
 	dmu_zfetch_fini(&dn->dn_zfetch);
 	kmem_cache_free(dnode_cache, dn);
 	arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
@@ -946,6 +961,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 	ndn->dn_newgid = odn->dn_newgid;
 	ndn->dn_newprojid = odn->dn_newprojid;
 	ndn->dn_id_flags = odn->dn_id_flags;
+	ndn->dn_storage_type = odn->dn_storage_type;
 	dmu_zfetch_init(&ndn->dn_zfetch, ndn);

 	/*
@@ -1004,6 +1020,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 	odn->dn_newgid = 0;
 	odn->dn_newprojid = ZFS_DEFAULT_PROJID;
 	odn->dn_id_flags = 0;
+	odn->dn_storage_type = DMU_OT_NONE;

 	/*
 	 * Mark the dnode.
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index c9ee36c0f9b2..97191e768549 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -2010,7 +2010,16 @@ metaslab_class_t *
 spa_preferred_class(spa_t *spa, const zio_t *zio)
 {
 	const zio_prop_t *zp = &zio->io_prop;
-	dmu_object_type_t objtype = zp->zp_type;
+
+	/*
+	 * Override object type for the purposes of selecting a storage class.
+	 * Primarily for DMU_OTN_ types where we can't explicitly control their
+	 * storage class; instead, choose a static type that most closely
+	 * matches what we want.
+	 */
+	dmu_object_type_t objtype =
+	    zp->zp_storage_type == DMU_OT_NONE ?
+	    zp->zp_type : zp->zp_storage_type;

 	/*
 	 * ZIL allocations determine their class in zio_alloc_zil().
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 781011373793..00c93e76ae25 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3069,7 +3069,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
 		zp.zp_checksum = gio->io_prop.zp_checksum;
 		zp.zp_compress = ZIO_COMPRESS_OFF;
 		zp.zp_complevel = gio->io_prop.zp_complevel;
-		zp.zp_type = DMU_OT_NONE;
+		zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
 		zp.zp_dedup = B_FALSE;

From d4b5517ef95948d22a26376768442a59116fa4e7 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Tue, 30 Jul 2024 14:40:47 -0400
Subject: [PATCH 5/5] Linux: Report reclaimable memory to kernel as such
 (#16385)

Linux provides the SLAB_RECLAIM_ACCOUNT and __GFP_RECLAIMABLE flags to
mark memory allocations that can be freed via shrinker calls. They
should allow the kernel to tune and group such allocations for lower
memory fragmentation and better reclamation under pressure.

This patch marks as reclaimable most of the ARC memory, which is
directly evictable via the ZFS shrinker, plus the dnode/znode/sa
memory, which is indirectly evictable via the kernel's superblock
shrinker.

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Reviewed-by: Tony Hutter
Reviewed-by: Allan Jude
---
 include/os/freebsd/spl/sys/kmem.h     | 1 +
 include/os/linux/spl/sys/kmem_cache.h | 5 ++---
 include/sys/zfs_context.h             | 1 +
 module/os/freebsd/zfs/abd_os.c        | 2 +-
 module/os/freebsd/zfs/zfs_znode.c     | 2 +-
 module/os/linux/spl/spl-kmem-cache.c  | 8 ++++++++
 module/os/linux/spl/spl-zlib.c        | 2 +-
 module/os/linux/zfs/abd_os.c          | 6 +++---
 module/os/linux/zfs/zfs_znode.c       | 3 ++-
 module/zfs/arc.c                      | 2 +-
 module/zfs/dnode.c                    | 2 +-
 module/zfs/lz4_zfs.c                  | 3 ++-
 module/zfs/sa.c                       | 2 +-
 module/zfs/zio.c                      | 4 ++++
 14 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h
index c633799318d5..ae786f0e20ca 100644
--- a/include/os/freebsd/spl/sys/kmem.h
+++ b/include/os/freebsd/spl/sys/kmem.h
@@ -49,6 +49,7 @@ MALLOC_DECLARE(M_SOLARIS);
 #define	KM_NOSLEEP	M_NOWAIT
 #define	KM_NORMALPRI	0
 #define	KMC_NODEBUG	UMA_ZONE_NODUMP
+#define	KMC_RECLAIMABLE	0x0

 typedef struct vmem vmem_t;

diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h
index 905ff57a1434..2b4f120e6427 100644
--- a/include/os/linux/spl/sys/kmem_cache.h
+++ b/include/os/linux/spl/sys/kmem_cache.h
@@ -45,6 +45,7 @@ typedef enum kmc_bit {
 	KMC_BIT_TOTAL = 18,		/* Proc handler helper bit */
 	KMC_BIT_ALLOC = 19,		/* Proc handler helper bit */
 	KMC_BIT_MAX = 20,		/* Proc handler helper bit */
+	KMC_BIT_RECLAIMABLE = 21,	/* Can be freed by shrinker */
 } kmc_bit_t;

 /* kmem move callback return values */
@@ -66,9 +67,7 @@ typedef enum kmem_cbrc {
 #define	KMC_TOTAL	(1 << KMC_BIT_TOTAL)
 #define	KMC_ALLOC	(1 << KMC_BIT_ALLOC)
 #define	KMC_MAX		(1 << KMC_BIT_MAX)
-
-#define	KMC_REAP_CHUNK	INT_MAX
-#define	KMC_DEFAULT_SEEKS	1
+#define	KMC_RECLAIMABLE	(1 << KMC_BIT_RECLAIMABLE)

 extern struct list_head spl_kmem_cache_list;
 extern struct rw_semaphore spl_kmem_cache_sem;
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index e4711ce4194a..998eaa5dd813 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -413,6 +413,7 @@ void procfs_list_add(procfs_list_t *procfs_list, void *p);
 #define	KM_NORMALPRI	0	/* not needed with UMEM_DEFAULT */
 #define	KMC_NODEBUG	UMC_NODEBUG
 #define	KMC_KVMEM	0x0
+#define	KMC_RECLAIMABLE	0x0
 #define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
 #define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
 #define	kmem_free(_b, _s)	umem_free(_b, _s)
diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c
index 3b812271f98b..fb5c46ecf7c2 100644
--- a/module/os/freebsd/zfs/abd_os.c
+++ b/module/os/freebsd/zfs/abd_os.c
@@ -300,7 +300,7 @@ void
 abd_init(void)
 {
 	abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0,
-	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
+	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG | KMC_RECLAIMABLE);

 	wmsum_init(&abd_sums.abdstat_struct_size, 0);
 	wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c
index 0eea2a849416..afbea9798c33 100644
--- a/module/os/freebsd/zfs/zfs_znode.c
+++ b/module/os/freebsd/zfs/zfs_znode.c
@@ -236,7 +236,7 @@ zfs_znode_init(void)
 	ASSERT3P(znode_cache, ==, NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
-	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE);
 }

 static znode_t *
diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c
index 737c2e063f71..16412bc9e6cf 100644
--- a/module/os/linux/spl/spl-kmem-cache.c
+++ b/module/os/linux/spl/spl-kmem-cache.c
@@ -144,6 +144,8 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 	gfp_t lflags = kmem_flags_convert(flags);
 	void *ptr;

+	if (skc->skc_flags & KMC_RECLAIMABLE)
+		lflags |= __GFP_RECLAIMABLE;
 	ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);

 	/* Resulting allocated memory will be page aligned */
@@ -424,6 +426,8 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 	if (!empty)
 		return (-EEXIST);

+	if (skc->skc_flags & KMC_RECLAIMABLE)
+		lflags |= __GFP_RECLAIMABLE;
 	ske = kmalloc(sizeof (*ske), lflags);
 	if (ske == NULL)
 		return (-ENOMEM);
@@ -663,6 +667,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc)
  *	KMC_KVMEM	Force kvmem backed SPL cache
  *	KMC_SLAB	Force Linux slab backed cache
  *	KMC_NODEBUG	Disable debugging (unsupported)
+ *	KMC_RECLAIMABLE	Memory can be freed under pressure
  */
 spl_kmem_cache_t *
 spl_kmem_cache_create(const char *name, size_t size, size_t align,
@@ -780,6 +785,9 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align,
 		if (size > spl_kmem_cache_slab_limit)
 			goto out;

+		if (skc->skc_flags & KMC_RECLAIMABLE)
+			slabflags |= SLAB_RECLAIM_ACCOUNT;
+
 #if defined(SLAB_USERCOPY)
 		/*
 		 * Required for PAX-enabled kernels if the slab is to be
diff --git a/module/os/linux/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c
index 8c6282ee5d16..a7b6c14ee150 100644
--- a/module/os/linux/spl/spl-zlib.c
+++ b/module/os/linux/spl/spl-zlib.c
@@ -202,7 +202,7 @@ spl_zlib_init(void)
 	zlib_workspace_cache = kmem_cache_create(
 	    "spl_zlib_workspace_cache",
 	    size, 0, NULL, NULL, NULL, NULL, NULL,
-	    KMC_KVMEM);
+	    KMC_KVMEM | KMC_RECLAIMABLE);
 	if (!zlib_workspace_cache)
 		return (-ENOMEM);

diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 4bf9eaf771b5..f7af20c619a4 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -281,7 +281,7 @@ abd_alloc_chunks(abd_t *abd, size_t size)
 	struct sg_table table;
 	struct scatterlist *sg;
 	struct page *page, *tmp_page = NULL;
-	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
 	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
 	unsigned int max_order = MIN(zfs_abd_scatter_max_order,
 	    ABD_MAX_ORDER - 1);
@@ -403,7 +403,7 @@ abd_alloc_chunks(abd_t *abd, size_t size)
 	struct scatterlist *sg = NULL;
 	struct sg_table table;
 	struct page *page;
-	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+	gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO;
 	int nr_pages = abd_chunkcnt_for_bytes(size);
 	int i = 0;

@@ -762,7 +762,7 @@ abd_init(void)
 	int i;

 	abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
-	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	    0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE);

 	wmsum_init(&abd_sums.abdstat_struct_size, 0);
 	wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c
index b99df188c64b..265153e011e7 100644
--- a/module/os/linux/zfs/zfs_znode.c
+++ b/module/os/linux/zfs/zfs_znode.c
@@ -194,7 +194,8 @@ zfs_znode_init(void)
 	ASSERT(znode_cache == NULL);
 	znode_cache = kmem_cache_create("zfs_znode_cache",
 	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
-	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
+	    zfs_znode_cache_destructor, NULL, NULL, NULL,
+	    KMC_SLAB | KMC_RECLAIMABLE);

 	ASSERT(znode_hold_cache == NULL);
 	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 6605243bac47..d01bf0947df6 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1258,7 +1258,7 @@ buf_init(void)
 	}

 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
-	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
+	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 	    NULL, NULL, 0);
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index b6cc512cbb85..ecc6761f8fa4 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -306,7 +306,7 @@ dnode_init(void)
 {
 	ASSERT(dnode_cache == NULL);
 	dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
-	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+	    0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE);
 	kmem_cache_set_move(dnode_cache, dnode_move);

 	wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c
index e28215cf3501..a3b9e7070373 100644
--- a/module/zfs/lz4_zfs.c
+++ b/module/zfs/lz4_zfs.c
@@ -867,7 +867,8 @@ void
 lz4_init(void)
 {
 	lz4_cache = kmem_cache_create("lz4_cache",
-	    sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	    sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL,
+	    KMC_RECLAIMABLE);
 }

 void
diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 32ac287eb758..a1b2ffa8705c 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -236,7 +236,7 @@ sa_cache_init(void)
 {
 	sa_cache = kmem_cache_create("sa_cache",
 	    sizeof (sa_handle_t), 0, sa_cache_constructor,
-	    sa_cache_destructor, NULL, NULL, NULL, 0);
+	    sa_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE);
 }

 void
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 00c93e76ae25..26ffc597f1ec 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -194,6 +194,10 @@ zio_init(void)
 		cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
 		    KMC_NODEBUG : 0;
 		data_cflags = KMC_NODEBUG;
+		if (abd_size_alloc_linear(size)) {
+			cflags |= KMC_RECLAIMABLE;
+			data_cflags |= KMC_RECLAIMABLE;
+		}
 		if (cflags == data_cflags) {
 			/*
 			 * Resulting kmem caches would be identical.
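As a closing usage sketch for the new flag (illustrative only; the cache
name and object size below are hypothetical, not taken from the patches),
a consumer whose objects can be freed under memory pressure would now
create its cache like this:

	static kmem_cache_t *example_cache;	/* hypothetical cache */

	void
	example_init(void)
	{
		/*
		 * Same nine-argument kmem_cache_create() used throughout
		 * this patch; KMC_RECLAIMABLE maps to SLAB_RECLAIM_ACCOUNT
		 * or __GFP_RECLAIMABLE on Linux, and is defined as 0x0 (a
		 * no-op) on FreeBSD and in userspace.
		 */
		example_cache = kmem_cache_create("example_cache",
		    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL,
		    KMC_RECLAIMABLE);
	}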