diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h index c633799318d5..ae786f0e20ca 100644 --- a/include/os/freebsd/spl/sys/kmem.h +++ b/include/os/freebsd/spl/sys/kmem.h @@ -49,6 +49,7 @@ MALLOC_DECLARE(M_SOLARIS); #define KM_NOSLEEP M_NOWAIT #define KM_NORMALPRI 0 #define KMC_NODEBUG UMA_ZONE_NODUMP +#define KMC_RECLAIMABLE 0x0 typedef struct vmem vmem_t; diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index 905ff57a1434..2b4f120e6427 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -45,6 +45,7 @@ typedef enum kmc_bit { KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ KMC_BIT_MAX = 20, /* Proc handler helper bit */ + KMC_BIT_RECLAIMABLE = 21, /* Can be freed by shrinker */ } kmc_bit_t; /* kmem move callback return values */ @@ -66,9 +67,7 @@ typedef enum kmem_cbrc { #define KMC_TOTAL (1 << KMC_BIT_TOTAL) #define KMC_ALLOC (1 << KMC_BIT_ALLOC) #define KMC_MAX (1 << KMC_BIT_MAX) - -#define KMC_REAP_CHUNK INT_MAX -#define KMC_DEFAULT_SEEKS 1 +#define KMC_RECLAIMABLE (1 << KMC_BIT_RECLAIMABLE) extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 3808a04cba80..8b03b1f895f8 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -214,9 +214,15 @@ typedef struct dmu_buf_impl { struct objset *db_objset; /* - * handle to safely access the dnode we belong to (NULL when evicted) + * Handle to safely access the dnode we belong to (NULL when evicted) + * if dnode_move() is used on the platform, or just dnode otherwise. 
*/ +#if !defined(__linux__) && !defined(__FreeBSD__) +#define USE_DNODE_HANDLE 1 struct dnode_handle *db_dnode_handle; +#else + struct dnode *db_dnode; +#endif /* * our parent buffer; if the dnode points to us directly, @@ -417,11 +423,19 @@ void dbuf_stats_destroy(void); int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift); +#ifdef USE_DNODE_HANDLE #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) #define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) #define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) +#else +#define DB_DNODE(_db) ((_db)->db_dnode) +#define DB_DNODE_LOCK(_db) +#define DB_DNODE_ENTER(_db) +#define DB_DNODE_EXIT(_db) +#define DB_DNODE_HELD(_db) (B_TRUE) +#endif void dbuf_init(void); void dbuf_fini(void); diff --git a/include/sys/dnode.h b/include/sys/dnode.h index dbe7350d4da7..5d0f0fb26d02 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -380,6 +380,9 @@ struct dnode { /* holds prefetch structure */ struct zfetch dn_zfetch; + + /* Not in dn_phys, but should be. 
set it after taking a hold */ + dmu_object_type_t dn_storage_type; /* type for storage class */ }; /* @@ -462,6 +465,8 @@ void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); void dnode_free_interior_slots(dnode_t *dn); +void dnode_set_storage_type(dnode_t *dn, dmu_object_type_t type); + #define DNODE_IS_DIRTY(_dn) \ ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) diff --git a/include/sys/spa.h b/include/sys/spa.h index 12321c6562be..3998f5a6de73 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1049,8 +1049,7 @@ extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); -extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, - dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); +extern metaslab_class_t *spa_preferred_class(spa_t *spa, const zio_t *zio); extern boolean_t spa_special_has_ddt(spa_t *spa); extern void spa_evicting_os_register(spa_t *, objset_t *os); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index e4711ce4194a..998eaa5dd813 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -413,6 +413,7 @@ void procfs_list_add(procfs_list_t *procfs_list, void *p); #define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ #define KMC_NODEBUG UMC_NODEBUG #define KMC_KVMEM 0x0 +#define KMC_RECLAIMABLE 0x0 #define kmem_alloc(_s, _f) umem_alloc(_s, _f) #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) #define kmem_free(_b, _s) umem_free(_b, _s) diff --git a/include/sys/zio.h b/include/sys/zio.h index 77c70b9b481c..446b64ccd8ab 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -356,6 +356,7 @@ typedef struct zio_prop { uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; uint32_t zp_zpl_smallblk; + dmu_object_type_t zp_storage_type; } zio_prop_t; typedef struct 
zio_cksum_report zio_cksum_report_t; diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 3b812271f98b..fb5c46ecf7c2 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -300,7 +300,7 @@ void abd_init(void) { abd_chunk_cache = kmem_cache_create("abd_chunk", PAGE_SIZE, 0, - NULL, NULL, NULL, NULL, 0, KMC_NODEBUG); + NULL, NULL, NULL, NULL, 0, KMC_NODEBUG | KMC_RECLAIMABLE); wmsum_init(&abd_sums.abdstat_struct_size, 0); wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index 0eea2a849416..afbea9798c33 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -236,7 +236,7 @@ zfs_znode_init(void) ASSERT3P(znode_cache, ==, NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, - zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE); } static znode_t * diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 737c2e063f71..16412bc9e6cf 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -144,6 +144,8 @@ kv_alloc(spl_kmem_cache_t *skc, int size, int flags) gfp_t lflags = kmem_flags_convert(flags); void *ptr; + if (skc->skc_flags & KMC_RECLAIMABLE) + lflags |= __GFP_RECLAIMABLE; ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); /* Resulting allocated memory will be page aligned */ @@ -424,6 +426,8 @@ spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) if (!empty) return (-EEXIST); + if (skc->skc_flags & KMC_RECLAIMABLE) + lflags |= __GFP_RECLAIMABLE; ske = kmalloc(sizeof (*ske), lflags); if (ske == NULL) return (-ENOMEM); @@ -663,6 +667,7 @@ spl_magazine_destroy(spl_kmem_cache_t *skc) * KMC_KVMEM Force kvmem backed SPL cache * KMC_SLAB Force Linux slab backed cache * KMC_NODEBUG Disable debugging 
(unsupported) + * KMC_RECLAIMABLE Memory can be freed under pressure */ spl_kmem_cache_t * spl_kmem_cache_create(const char *name, size_t size, size_t align, @@ -780,6 +785,9 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, if (size > spl_kmem_cache_slab_limit) goto out; + if (skc->skc_flags & KMC_RECLAIMABLE) + slabflags |= SLAB_RECLAIM_ACCOUNT; + #if defined(SLAB_USERCOPY) /* * Required for PAX-enabled kernels if the slab is to be diff --git a/module/os/linux/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c index 8c6282ee5d16..a7b6c14ee150 100644 --- a/module/os/linux/spl/spl-zlib.c +++ b/module/os/linux/spl/spl-zlib.c @@ -202,7 +202,7 @@ spl_zlib_init(void) zlib_workspace_cache = kmem_cache_create( "spl_zlib_workspace_cache", size, 0, NULL, NULL, NULL, NULL, NULL, - KMC_KVMEM); + KMC_KVMEM | KMC_RECLAIMABLE); if (!zlib_workspace_cache) return (-ENOMEM); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 4bf9eaf771b5..f7af20c619a4 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -281,7 +281,7 @@ abd_alloc_chunks(abd_t *abd, size_t size) struct sg_table table; struct scatterlist *sg; struct page *page, *tmp_page = NULL; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; unsigned int max_order = MIN(zfs_abd_scatter_max_order, ABD_MAX_ORDER - 1); @@ -403,7 +403,7 @@ abd_alloc_chunks(abd_t *abd, size_t size) struct scatterlist *sg = NULL; struct sg_table table; struct page *page; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp = __GFP_RECLAIMABLE | __GFP_NOWARN | GFP_NOIO; int nr_pages = abd_chunkcnt_for_bytes(size); int i = 0; @@ -762,7 +762,7 @@ abd_init(void) int i; abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); + 0, NULL, NULL, NULL, NULL, NULL, KMC_RECLAIMABLE); wmsum_init(&abd_sums.abdstat_struct_size, 0); 
wmsum_init(&abd_sums.abdstat_linear_cnt, 0); diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index b99df188c64b..265153e011e7 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -194,7 +194,8 @@ zfs_znode_init(void) ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, - zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); + zfs_znode_cache_destructor, NULL, NULL, NULL, + KMC_SLAB | KMC_RECLAIMABLE); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9f39df87e45b..f9673087086c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1258,7 +1258,7 @@ buf_init(void) } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, - 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 56fe2c4dbe30..3173e069749a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3103,7 +3103,11 @@ dbuf_destroy(dmu_buf_impl_t *db) */ mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, db, B_TRUE); +#ifdef USE_DNODE_HANDLE db->db_dnode_handle = NULL; +#else + db->db_dnode = NULL; +#endif dbuf_hash_remove(db); } else { @@ -3252,7 +3256,11 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_level = level; db->db_blkid = blkid; db->db_dirtycnt = 0; +#ifdef USE_DNODE_HANDLE db->db_dnode_handle = dn->dn_handle; +#else + db->db_dnode = dn; +#endif db->db_parent = parent; db->db_blkptr = blkptr; db->db_hash = hash; @@ -4390,7 +4398,7 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr) dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; int epbs = dn->dn_indblkshift 
- SPA_BLKPTRSHIFT; VERIFY3U(parent_db->db_level, ==, 1); - VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); + VERIFY3P(DB_DNODE(parent_db), ==, dn); VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); blkptr_t *bp = parent_db->db.db_data; return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); @@ -4813,14 +4821,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { (void) zio, (void) buf; dmu_buf_impl_t *db = vdb; - dnode_t *dn; blkptr_t *bp; unsigned int epbs, i; ASSERT3U(db->db_level, >, 0); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + epbs = DB_DNODE(db)->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + DB_DNODE_EXIT(db); ASSERT3U(epbs, <, 31); /* Determine if all our children are holes */ @@ -4843,7 +4850,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) memset(db->db.db_data, 0, db->db.db_size); rw_exit(&db->db_rwlock); } - DB_DNODE_EXIT(db); } static void @@ -5062,8 +5068,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) } } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { dnode_phys_t *dnp = db->db.db_data; - ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, - DMU_OT_DNODE); + ASSERT3U(dn->dn_type, ==, DMU_OT_DNODE); for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i += dnp[i].dn_extra_slots + 1) { for (int j = 0; j < dnp[i].dn_nblkptr; j++) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index c3b6b97f8a0e..3dcf49ceb64e 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -276,13 +276,14 @@ dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) dnode_t *dn; int error; + if (newsize < 0 || newsize > db_fake->db_size) + return (SET_ERROR(EINVAL)); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); - } else if (newsize < 0 || newsize > db_fake->db_size) { - error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; @@ -299,12 +300,13 @@ 
dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) dnode_t *dn; int error; + if (!DMU_OT_IS_VALID(type)) + return (SET_ERROR(EINVAL)); + DB_DNODE_ENTER(db); dn = DB_DNODE(db); - if (!DMU_OT_IS_VALID(type)) { - error = SET_ERROR(EINVAL); - } else if (dn->dn_bonus != db) { + if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); @@ -319,12 +321,10 @@ dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - type = dn->dn_bonustype; + type = DB_DNODE(db)->dn_bonustype; DB_DNODE_EXIT(db); return (type); @@ -486,7 +486,6 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; - dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; @@ -494,8 +493,7 @@ dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); + err = dmu_spill_hold_by_dnode(DB_DNODE(db), db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); @@ -668,13 +666,11 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; int err; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp, DMU_READ_PREFETCH); + err = dmu_buf_hold_array_by_dnode(DB_DNODE(db), offset, length, read, + tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); @@ -1408,15 +1404,13 @@ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_read_uio_dnode(dn, uio, size); + err = 
dmu_read_uio_dnode(DB_DNODE(db), uio, size); DB_DNODE_EXIT(db); return (err); @@ -1510,15 +1504,13 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; - dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - err = dmu_write_uio_dnode(dn, uio, size, tx); + err = dmu_write_uio_dnode(DB_DNODE(db), uio, size, tx); DB_DNODE_EXIT(db); return (err); @@ -1754,11 +1746,11 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; - dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - DB_DNODE_ENTER(dbuf); - err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); - DB_DNODE_EXIT(dbuf); + DB_DNODE_ENTER(db); + err = dmu_assign_arcbuf_by_dnode(DB_DNODE(db), offset, buf, tx); + DB_DNODE_EXIT(db); return (err); } @@ -1997,7 +1989,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; - dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); @@ -2006,8 +1997,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* @@ -2092,11 +2082,14 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. 
*/ - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + if (dr_next != NULL) { zp.zp_nopwrite = B_FALSE; - DB_DNODE_EXIT(db); + } else { + DB_DNODE_ENTER(db); + if (dnode_block_freed(DB_DNODE(db), db->db_blkid)) + zp.zp_nopwrite = B_FALSE; + DB_DNODE_EXIT(db); + } ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || @@ -2369,6 +2362,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; + zp->zp_storage_type = dn ? dn->dn_storage_type : DMU_OT_NONE; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } @@ -2702,11 +2696,9 @@ void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - dnode_t *dn; DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - *dnsize = dn->dn_num_slots << DNODE_SHIFT; + *dnsize = DB_DNODE(db)->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 8451b5082e86..2c2a6c7642a5 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1520,11 +1520,8 @@ dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { - dnode_t *dn; - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_have_spill) { + if (DB_DNODE(db)->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index a703fd414f87..ecc6761f8fa4 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -306,7 +306,7 @@ dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), - 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_RECLAIMABLE); kmem_cache_set_move(dnode_cache, dnode_move); 
wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0); @@ -543,6 +543,17 @@ dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); } +void +dnode_set_storage_type(dnode_t *dn, dmu_object_type_t newtype) +{ + /* + * This is not in the dnode_phys, but it should be, and perhaps one day + * will. For now we require it be set after taking a hold. + */ + ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1); + dn->dn_storage_type = newtype; +} + void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { @@ -604,6 +615,8 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; + dn->dn_storage_type = DMU_OT_NONE; + dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); @@ -687,6 +700,8 @@ dnode_destroy(dnode_t *dn) dn->dn_newprojid = ZFS_DEFAULT_PROJID; dn->dn_id_flags = 0; + dn->dn_storage_type = DMU_OT_NONE; + dmu_zfetch_fini(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); @@ -946,6 +961,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ndn->dn_newgid = odn->dn_newgid; ndn->dn_newprojid = odn->dn_newprojid; ndn->dn_id_flags = odn->dn_id_flags; + ndn->dn_storage_type = odn->dn_storage_type; dmu_zfetch_init(&ndn->dn_zfetch, ndn); /* @@ -1004,6 +1020,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) odn->dn_newgid = 0; odn->dn_newprojid = ZFS_DEFAULT_PROJID; odn->dn_id_flags = 0; + odn->dn_storage_type = DMU_OT_NONE; /* * Mark the dnode. @@ -1020,6 +1037,19 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg) int64_t refcount; uint32_t dbufs; +#ifndef USE_DNODE_HANDLE + /* + * We can't move dnodes if dbufs reference them directly without + * using handles and respective locking. Unless USE_DNODE_HANDLE + * is defined the code below is only to make sure it still builds, + * but it should never be used, since it is unsafe. 
+ */ +#ifdef ZFS_DEBUG + PANIC("dnode_move() called without USE_DNODE_HANDLE"); +#endif + return (KMEM_CBRC_NO); +#endif + /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when @@ -1757,7 +1787,7 @@ dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting) * handle. */ #ifdef ZFS_DEBUG - ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + ASSERT(refs > 0 || zrl_owner(&dnh->dnh_zrlock) != curthread); #endif /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c index e28215cf3501..a3b9e7070373 100644 --- a/module/zfs/lz4_zfs.c +++ b/module/zfs/lz4_zfs.c @@ -867,7 +867,8 @@ void lz4_init(void) { lz4_cache = kmem_cache_create("lz4_cache", - sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); + sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, + KMC_RECLAIMABLE); } void diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 0ae4c331dd36..a1b2ffa8705c 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -236,7 +236,7 @@ sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, - sa_cache_destructor, NULL, NULL, NULL, 0); + sa_cache_destructor, NULL, NULL, NULL, KMC_RECLAIMABLE); } void @@ -1852,7 +1852,6 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; - dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; @@ -1872,8 +1871,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, /* First make of copy of the old data */ DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (dn->dn_bonuslen != 0) { + if (DB_DNODE(db)->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); memcpy(old_data[0], hdl->sa_bonus->db_data, diff --git a/module/zfs/spa_misc.c 
b/module/zfs/spa_misc.c index 439e56f0d0df..97191e768549 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2007,9 +2007,20 @@ spa_special_has_ddt(spa_t *spa) * Locate an appropriate allocation class */ metaslab_class_t * -spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, - uint_t level, uint_t special_smallblk) +spa_preferred_class(spa_t *spa, const zio_t *zio) { + const zio_prop_t *zp = &zio->io_prop; + + /* + * Override object type for the purposes of selecting a storage class. + * Primarily for DMU_OTN_ types where we can't explicitly control their + * storage class; instead, choose a static type that most closely matches + * what we want. + */ + dmu_object_type_t objtype = + zp->zp_storage_type == DMU_OT_NONE ? + zp->zp_type : zp->zp_storage_type; + /* * ZIL allocations determine their class in zio_alloc_zil(). */ @@ -2027,14 +2038,15 @@ spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, } /* Indirect blocks for user data can land in special if allowed */ - if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { + if (zp->zp_level > 0 && + (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { if (has_special_class && zfs_user_indirect_is_special) return (spa_special_class(spa)); else return (spa_normal_class(spa)); } - if (DMU_OT_IS_METADATA(objtype) || level > 0) { + if (DMU_OT_IS_METADATA(objtype) || zp->zp_level > 0) { if (has_special_class) return (spa_special_class(spa)); else @@ -2047,7 +2059,7 @@ spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, * zfs_special_class_metadata_reserve_pct exclusively for metadata. 
*/ if (DMU_OT_IS_FILE(objtype) && - has_special_class && size <= special_smallblk) { + has_special_class && zio->io_size <= zp->zp_zpl_smallblk) { metaslab_class_t *special = spa_special_class(spa); uint64_t alloc = metaslab_class_get_alloc(special); uint64_t space = metaslab_class_get_space(special); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index fa4e7093ca46..399f5a0117bb 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -665,13 +665,13 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, DB_DNODE_ENTER(db); err = dmu_read_by_dnode(DB_DNODE(db), off, len, lr + 1, DMU_READ_NO_PREFETCH); + DB_DNODE_EXIT(db); if (err != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } - DB_DNODE_EXIT(db); } itx->itx_wr_state = wr_state; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 45f3f5866280..9debb6f0ec9f 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -203,6 +203,10 @@ zio_init(void) if (align != 0) { char name[36]; + if (abd_size_alloc_linear(size)) { + cflags |= KMC_RECLAIMABLE; + data_cflags |= KMC_RECLAIMABLE; + } if (cflags == data_cflags) { /* * Resulting kmem caches would be identical. 
@@ -3080,7 +3084,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_complevel = gio->io_prop.zp_complevel; - zp.zp_type = DMU_OT_NONE; + zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; zp.zp_dedup = B_FALSE; @@ -3648,8 +3652,7 @@ zio_dva_throttle(zio_t *zio) metaslab_class_t *mc; /* locate an appropriate allocation class */ - mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, - zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); + mc = spa_preferred_class(spa, zio); if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || @@ -3721,9 +3724,7 @@ zio_dva_allocate(zio_t *zio) */ mc = zio->io_metaslab_class; if (mc == NULL) { - mc = spa_preferred_class(spa, zio->io_size, - zio->io_prop.zp_type, zio->io_prop.zp_level, - zio->io_prop.zp_zpl_smallblk); + mc = spa_preferred_class(spa, zio); zio->io_metaslab_class = mc; }