From b11ec637e7b01eccbc281aad775a9d01c64edf9e Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Sun, 10 Nov 2024 17:29:25 -0500
Subject: [PATCH] BRT: Rework structures and locks to be per-vdev

While the block cloning operation was made per-vdev from the beginning,
before this change most of its data were protected by two pool-wide
locks, which created a lot of lock contention in many workloads.

This change makes most of the block cloning data structures per-vdev,
which allows them to be locked separately.  The only remaining
pool-wide lock is spa_brt_lock, which protects the array of per-vdev
pointers and is taken as reader in most cases.

This change also splits the per-vdev locks into three: bv_pending_lock
protects the AVL tree of pending operations in open context,
bv_mos_entries_lock protects the BRT ZAP object while it is being
prefetched, and bv_lock protects the rest of the per-vdev context
during the TXG commit process.

There should be no functional difference aside from some optimizations.

Reviewed-by: Brian Behlendorf
Reviewed-by: Pawel Jakub Dawidek
Reviewed-by: Brian Atkinson
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #16740
---
 cmd/zdb/zdb.c          |  34 +-
 include/sys/brt_impl.h |  91 +++--
 include/sys/spa.h      |   1 +
 include/sys/spa_impl.h |   6 +-
 module/zfs/brt.c       | 783 +++++++++++++++++------------------
 module/zfs/spa_misc.c  |  17 +-
 6 files changed, 400 insertions(+), 532 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 46587671202a..0179a2714cab 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2119,9 +2119,6 @@ dump_brt(spa_t *spa)
 		return;
 	}
 
-	brt_t *brt = spa->spa_brt;
-	VERIFY(brt);
-
 	char count[32], used[32], saved[32];
 	zdb_nicebytes(brt_get_used(spa), used, sizeof (used));
 	zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved));
@@ -2132,11 +2129,8 @@ dump_brt(spa_t *spa)
 	if (dump_opt['T'] < 2)
 		return;
-	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-		if (brtvd == NULL)
-			continue;
-
+	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
 		if (!brtvd->bv_initiated) {
 			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
 		}
@@ -2160,20 +2154,21 @@ dump_brt(spa_t *spa)
 	if (!do_histo)
 		printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
-	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-		if (brtvd == NULL || !brtvd->bv_initiated)
+	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+		brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid];
+		if (!brtvd->bv_initiated)
 			continue;
 
 		uint64_t counts[64] = {};
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
-		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
+		for (zap_cursor_init(&zc, spa->spa_meta_objset,
+		    brtvd->bv_mos_entries);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t refcnt;
-			VERIFY0(zap_lookup_uint64(brt->brt_mos,
+			VERIFY0(zap_lookup_uint64(spa->spa_meta_objset,
 			    brtvd->bv_mos_entries,
 			    (const uint64_t *)za->za_name, 1,
 			    za->za_integer_length, za->za_num_integers,
 			    &refcnt));
@@ -8227,14 +8222,11 @@ dump_mos_leaks(spa_t *spa)
 		}
 	}
 
-	if (spa->spa_brt != NULL) {
-		brt_t *brt = spa->spa_brt;
-		for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
-			brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
-			if (brtvd != NULL && brtvd->bv_initiated) {
-				mos_obj_refd(brtvd->bv_mos_brtvdev);
-				mos_obj_refd(brtvd->bv_mos_entries);
-			}
+	for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) {
+		
brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + if (brtvd->bv_initiated) { + mos_obj_refd(brtvd->bv_mos_brtvdev); + mos_obj_refd(brtvd->bv_mos_entries); } } diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h index 9cc06fbb2c3a..6f02e4257a85 100644 --- a/include/sys/brt_impl.h +++ b/include/sys/brt_impl.h @@ -86,28 +86,38 @@ typedef struct brt_vdev_phys { uint64_t bvp_savedspace; } brt_vdev_phys_t; -typedef struct brt_vdev { +struct brt_vdev { /* - * VDEV id. + * Pending changes from open contexts. */ - uint64_t bv_vdevid; + kmutex_t bv_pending_lock; + avl_tree_t bv_pending_tree[TXG_SIZE]; /* - * Is the structure initiated? - * (bv_entcount and bv_bitmap are allocated?) + * Protects bv_mos_*. */ - boolean_t bv_initiated; + krwlock_t bv_mos_entries_lock ____cacheline_aligned; + /* + * Protects all the fields starting from bv_initiated. + */ + krwlock_t bv_lock ____cacheline_aligned; + /* + * VDEV id. + */ + uint64_t bv_vdevid ____cacheline_aligned; /* * Object number in the MOS for the entcount array and brt_vdev_phys. */ uint64_t bv_mos_brtvdev; /* - * Object number in the MOS for the entries table. + * Object number in the MOS and dnode for the entries table. */ uint64_t bv_mos_entries; + dnode_t *bv_mos_entries_dnode; /* - * Entries to sync. + * Is the structure initiated? + * (bv_entcount and bv_bitmap are allocated?) */ - avl_tree_t bv_tree; + boolean_t bv_initiated; /* * Does the bv_entcount[] array needs byte swapping? */ @@ -120,6 +130,26 @@ typedef struct brt_vdev { * This is the array with BRT entry count per BRT_RANGESIZE. */ uint16_t *bv_entcount; + /* + * bv_entcount[] potentially can be a bit too big to sychronize it all + * when we just changed few entcounts. The fields below allow us to + * track updates to bv_entcount[] array since the last sync. + * A single bit in the bv_bitmap represents as many entcounts as can + * fit into a single BRT_BLOCKSIZE. + * For example we have 65536 entcounts in the bv_entcount array + * (so the whole array is 128kB). We updated bv_entcount[2] and + * bv_entcount[5]. In that case only first bit in the bv_bitmap will + * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + */ + ulong_t *bv_bitmap; + /* + * bv_entcount[] needs updating on disk. + */ + boolean_t bv_entcount_dirty; + /* + * brt_vdev_phys needs updating on disk. + */ + boolean_t bv_meta_dirty; /* * Sum of all bv_entcount[]s. */ @@ -133,45 +163,10 @@ typedef struct brt_vdev { */ uint64_t bv_savedspace; /* - * brt_vdev_phys needs updating on disk. - */ - boolean_t bv_meta_dirty; - /* - * bv_entcount[] needs updating on disk. - */ - boolean_t bv_entcount_dirty; - /* - * bv_entcount[] potentially can be a bit too big to sychronize it all - * when we just changed few entcounts. The fields below allow us to - * track updates to bv_entcount[] array since the last sync. - * A single bit in the bv_bitmap represents as many entcounts as can - * fit into a single BRT_BLOCKSIZE. - * For example we have 65536 entcounts in the bv_entcount array - * (so the whole array is 128kB). We updated bv_entcount[2] and - * bv_entcount[5]. In that case only first bit in the bv_bitmap will - * be set and we will write only first BRT_BLOCKSIZE out of 128kB. + * Entries to sync. 
*/ - ulong_t *bv_bitmap; - uint64_t bv_nblocks; -} brt_vdev_t; - -/* - * In-core brt - */ -typedef struct brt { - krwlock_t brt_lock; - spa_t *brt_spa; -#define brt_mos brt_spa->spa_meta_objset - uint64_t brt_rangesize; - uint64_t brt_usedspace; - uint64_t brt_savedspace; - avl_tree_t brt_pending_tree[TXG_SIZE]; - kmutex_t brt_pending_lock[TXG_SIZE]; - /* Sum of all entries across all bv_trees. */ - uint64_t brt_nentries; - brt_vdev_t *brt_vdevs; - uint64_t brt_nvdevs; -} brt_t; + avl_tree_t bv_tree; +}; /* Size of bre_offset / sizeof (uint64_t). */ #define BRT_KEY_WORDS (1) @@ -188,7 +183,7 @@ typedef struct brt_entry { typedef struct brt_pending_entry { blkptr_t bpe_bp; - int bpe_count; + uint64_t bpe_count; avl_node_t bpe_node; } brt_pending_entry_t; diff --git a/include/sys/spa.h b/include/sys/spa.h index ca30b60c0af7..52601921fc3c 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -53,6 +53,7 @@ extern "C" { /* * Forward references that lots of things need. */ +typedef struct brt_vdev brt_vdev_t; typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 7811abbb9ce3..d1da87105103 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -412,8 +412,12 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + uint64_t spa_rdspace; /* raw (non-dedup) --//-- */ boolean_t spa_active_ddt_prune; /* ddt prune process active */ - struct brt *spa_brt; /* in-core BRT */ + brt_vdev_t **spa_brt_vdevs; /* array of per-vdev BRTs */ + uint64_t spa_brt_nvdevs; /* number of vdevs in BRT */ + uint64_t spa_brt_rangesize; /* pool's BRT range size */ + krwlock_t spa_brt_lock; /* Protects brt_vdevs/nvdevs */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ diff --git a/module/zfs/brt.c b/module/zfs/brt.c index ea8c0735c4b7..b1894e3fed49 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -317,23 +317,24 @@ struct { static int brt_entry_compare(const void *x1, const void *x2); static int brt_pending_entry_compare(const void *x1, const void *x2); +static void brt_vdevs_expand(spa_t *spa, uint64_t nvdevs); static void -brt_rlock(brt_t *brt) +brt_rlock(spa_t *spa) { - rw_enter(&brt->brt_lock, RW_READER); + rw_enter(&spa->spa_brt_lock, RW_READER); } static void -brt_wlock(brt_t *brt) +brt_wlock(spa_t *spa) { - rw_enter(&brt->brt_lock, RW_WRITER); + rw_enter(&spa->spa_brt_lock, RW_WRITER); } static void -brt_unlock(brt_t *brt) +brt_unlock(spa_t *spa) { - rw_exit(&brt->brt_lock); + rw_exit(&spa->spa_brt_lock); } static uint16_t @@ -394,14 +395,15 @@ brt_vdev_dump(brt_vdev_t *brtvd) { uint64_t idx; + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); zfs_dbgmsg(" BRT vdevid=%llu meta_dirty=%d entcount_dirty=%d " - "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu", (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, - (u_longlong_t)brtvd->bv_nblocks, - (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + (u_longlong_t)nblocks, + (size_t)BT_SIZEOFMAP(nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { @@ -415,51 +417,59 @@ 
brt_vdev_dump(brt_vdev_t *brtvd) if (brtvd->bv_entcount_dirty) { char *bitmap; - bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); - for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap = kmem_alloc(nblocks + 1, KM_SLEEP); + for (idx = 0; idx < nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" dirty: %s", bitmap); - kmem_free(bitmap, brtvd->bv_nblocks + 1); + kmem_free(bitmap, nblocks + 1); } } #endif static brt_vdev_t * -brt_vdev(brt_t *brt, uint64_t vdevid) +brt_vdev(spa_t *spa, uint64_t vdevid, boolean_t alloc) { - brt_vdev_t *brtvd; + brt_vdev_t *brtvd = NULL; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - if (vdevid < brt->brt_nvdevs) { - brtvd = &brt->brt_vdevs[vdevid]; - } else { - brtvd = NULL; + brt_rlock(spa); + if (vdevid < spa->spa_brt_nvdevs) { + brtvd = spa->spa_brt_vdevs[vdevid]; + } else if (alloc) { + /* New VDEV was added. */ + brt_unlock(spa); + brt_wlock(spa); + if (vdevid >= spa->spa_brt_nvdevs) + brt_vdevs_expand(spa, vdevid + 1); + brtvd = spa->spa_brt_vdevs[vdevid]; } - + brt_unlock(spa); return (brtvd); } static void -brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_create(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_bitmap != NULL); - ASSERT(brtvd->bv_nblocks > 0); - brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, + uint64_t mos_entries = zap_create_flags(spa->spa_meta_objset, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); - VERIFY(brtvd->bv_mos_entries != 0); + VERIFY(mos_entries != 0); + VERIFY0(dnode_hold(spa->spa_meta_objset, mos_entries, brtvd, + &brtvd->bv_mos_entries_dnode)); + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); + brtvd->bv_mos_entries = mos_entries; + rw_exit(&brtvd->bv_mos_entries_lock); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -468,7 +478,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. 
*/ - brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, + brtvd->bv_mos_brtvdev = dmu_object_alloc(spa->spa_meta_objset, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); @@ -477,66 +487,62 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, + VERIFY0(zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); - spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); + spa_feature_incr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void -brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_realloc(spa_t *spa, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; - uint64_t nblocks, size; + uint64_t nblocks, onblocks, size; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); - spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); - vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); - size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; - spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, brtvd->bv_vdevid); + size = (vdev_get_min_asize(vd) - 1) / spa->spa_brt_rangesize + 1; + spa_config_exit(spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); if (!brtvd->bv_initiated) { + ASSERT0(avl_numnodes(&brtvd->bv_tree)); ASSERT0(brtvd->bv_size); ASSERT(brtvd->bv_entcount == NULL); ASSERT(brtvd->bv_bitmap == NULL); - ASSERT0(brtvd->bv_nblocks); - - avl_create(&brtvd->bv_tree, brt_entry_compare, - sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); } else { ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); - ASSERT(brtvd->bv_nblocks > 0); /* * TODO: Allow vdev shrinking. We only need to implement * shrinking the on-disk BRT VDEV object. 
- * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, - * size, tx); + * dmu_free_range(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + * offset, size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); - memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), - BT_SIZEOFMAP(brtvd->bv_nblocks))); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); - kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + onblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), + BT_SIZEOFMAP(onblocks))); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(onblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; - brtvd->bv_nblocks = nblocks; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; @@ -546,7 +552,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) } static void -brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_load(spa_t *spa, brt_vdev_t *brtvd) { char name[64]; dmu_buf_t *db; @@ -555,26 +561,27 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); if (error != 0) return; ASSERT(brtvd->bv_mos_brtvdev != 0); - error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); + error = dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + FTAG, &db); ASSERT0(error); if (error != 0) return; bvphys = db->db_data; - if (brt->brt_rangesize == 0) { - brt->brt_rangesize = bvphys->bvp_rangesize; + if (spa->spa_brt_rangesize == 0) { + spa->spa_brt_rangesize = bvphys->bvp_rangesize; } else { - ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); + ASSERT3U(spa->spa_brt_rangesize, ==, bvphys->bvp_rangesize); } ASSERT(!brtvd->bv_initiated); - brt_vdev_realloc(brt, brtvd); + brt_vdev_realloc(spa, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); @@ -582,20 +589,22 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
*/ - error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + error = dmu_read(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); ASSERT0(error); + ASSERT(bvphys->bvp_mos_entries != 0); + VERIFY0(dnode_hold(spa->spa_meta_objset, bvphys->bvp_mos_entries, brtvd, + &brtvd->bv_mos_entries_dnode)); + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; - ASSERT(brtvd->bv_mos_entries != 0); + rw_exit(&brtvd->bv_mos_entries_lock); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; - brt->brt_usedspace += brtvd->bv_usedspace; - brt->brt_savedspace += brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); @@ -605,107 +614,120 @@ brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) } static void -brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) +brt_vdev_dealloc(brt_vdev_t *brtvd) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd->bv_initiated); + ASSERT0(avl_numnodes(&brtvd->bv_tree)); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; - kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(nblocks)); brtvd->bv_bitmap = NULL; - ASSERT0(avl_numnodes(&brtvd->bv_tree)); - avl_destroy(&brtvd->bv_tree); brtvd->bv_size = 0; - brtvd->bv_nblocks = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void -brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_destroy(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; - dmu_buf_t *db; - brt_vdev_phys_t *bvphys; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(brtvd->bv_initiated); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); + ASSERT0(brtvd->bv_totalcount); + ASSERT0(brtvd->bv_usedspace); + ASSERT0(brtvd->bv_savedspace); - VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); - VERIFY0(count); - VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); - BRT_DEBUG("MOS entries destroyed, object=%llu", - (u_longlong_t)brtvd->bv_mos_entries); + uint64_t mos_entries = brtvd->bv_mos_entries; + rw_enter(&brtvd->bv_mos_entries_lock, RW_WRITER); brtvd->bv_mos_entries = 0; + rw_exit(&brtvd->bv_mos_entries_lock); + dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); + brtvd->bv_mos_entries_dnode = NULL; + ASSERT0(zap_count(spa->spa_meta_objset, mos_entries, &count)); + ASSERT0(count); + VERIFY0(zap_destroy(spa->spa_meta_objset, mos_entries, tx)); + BRT_DEBUG("MOS entries destroyed, object=%llu", + (u_longlong_t)mos_entries); - VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); - bvphys = db->db_data; - ASSERT0(bvphys->bvp_totalcount); - ASSERT0(bvphys->bvp_usedspace); - ASSERT0(bvphys->bvp_savedspace); - dmu_buf_rele(db, FTAG); - - VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); + VERIFY0(dmu_object_free(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); - 
VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); - brt_vdev_dealloc(brt, brtvd); + brtvd->bv_meta_dirty = FALSE; + + rw_enter(&brtvd->bv_lock, RW_WRITER); + brt_vdev_dealloc(brtvd); + rw_exit(&brtvd->bv_lock); - spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); + spa_feature_decr(spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void -brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) +brt_vdevs_expand(spa_t *spa, uint64_t nvdevs) { - brt_vdev_t *brtvd, *vdevs; - uint64_t vdevid; + brt_vdev_t **vdevs; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT3U(nvdevs, >, brt->brt_nvdevs); + ASSERT(RW_WRITE_HELD(&spa->spa_brt_lock)); + ASSERT3U(nvdevs, >, spa->spa_brt_nvdevs); - vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); - if (brt->brt_nvdevs > 0) { - ASSERT(brt->brt_vdevs != NULL); + vdevs = kmem_zalloc(sizeof (*spa->spa_brt_vdevs) * nvdevs, KM_SLEEP); + if (spa->spa_brt_nvdevs > 0) { + ASSERT(spa->spa_brt_vdevs != NULL); - memcpy(vdevs, brt->brt_vdevs, - sizeof (brt_vdev_t) * brt->brt_nvdevs); - kmem_free(brt->brt_vdevs, - sizeof (brt_vdev_t) * brt->brt_nvdevs); + memcpy(vdevs, spa->spa_brt_vdevs, + sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); + kmem_free(spa->spa_brt_vdevs, + sizeof (*spa->spa_brt_vdevs) * spa->spa_brt_nvdevs); } - for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { - brtvd = &vdevs[vdevid]; + spa->spa_brt_vdevs = vdevs; + for (uint64_t vdevid = spa->spa_brt_nvdevs; vdevid < nvdevs; vdevid++) { + brt_vdev_t *brtvd = kmem_zalloc(sizeof (*brtvd), KM_SLEEP); + rw_init(&brtvd->bv_lock, NULL, RW_DEFAULT, NULL); brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; + rw_init(&brtvd->bv_mos_entries_lock, NULL, RW_DEFAULT, NULL); + avl_create(&brtvd->bv_tree, brt_entry_compare, + sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); + for (int i = 0; i < TXG_SIZE; i++) { + avl_create(&brtvd->bv_pending_tree[i], + brt_pending_entry_compare, + sizeof (brt_pending_entry_t), + offsetof(brt_pending_entry_t, bpe_node)); + } + mutex_init(&brtvd->bv_pending_lock, NULL, MUTEX_DEFAULT, NULL); + spa->spa_brt_vdevs[vdevid] = brtvd; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", - (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); - - brt->brt_vdevs = vdevs; - brt->brt_nvdevs = nvdevs; + (u_longlong_t)spa->spa_brt_nvdevs, (u_longlong_t)nvdevs); + spa->spa_brt_nvdevs = nvdevs; } static boolean_t -brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) +brt_vdev_lookup(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre) { uint64_t idx; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(RW_LOCK_HELD(&brtvd->bv_lock)); - idx = bre->bre_offset / brt->brt_rangesize; + idx = bre->bre_offset / spa->spa_brt_rangesize; if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { /* VDEV wasn't expanded. 
*/ return (brt_vdev_entcount_get(brtvd, idx) > 0); @@ -715,30 +737,27 @@ brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) } static void -brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, - uint64_t dsize) +brt_vdev_addref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, + uint64_t dsize, uint64_t count) { uint64_t idx; - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); + ASSERT(RW_LOCK_HELD(&brtvd->bv_lock)); ASSERT(brtvd != NULL); ASSERT(brtvd->bv_entcount != NULL); - brt->brt_savedspace += dsize; - brtvd->bv_savedspace += dsize; + brtvd->bv_savedspace += dsize * count; brtvd->bv_meta_dirty = TRUE; - if (bre->bre_refcount > 1) { + if (bre->bre_refcount > 0) return; - } - brt->brt_usedspace += dsize; brtvd->bv_usedspace += dsize; - idx = bre->bre_offset / brt->brt_rangesize; + idx = bre->bre_offset / spa->spa_brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. */ - brt_vdev_realloc(brt, brtvd); + brt_vdev_realloc(spa, brtvd); } ASSERT3U(idx, <, brtvd->bv_size); @@ -748,35 +767,27 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); - -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_BRT) - brt_vdev_dump(brtvd); -#endif } static void -brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, +brt_vdev_decref(spa_t *spa, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); + ASSERT(RW_WRITE_HELD(&brtvd->bv_lock)); ASSERT(brtvd != NULL); ASSERT(brtvd->bv_entcount != NULL); - brt->brt_savedspace -= dsize; brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; - if (bre->bre_refcount > 0) { + if (bre->bre_refcount > 0) return; - } - brt->brt_usedspace -= dsize; brtvd->bv_usedspace -= dsize; - idx = bre->bre_offset / brt->brt_rangesize; + idx = bre->bre_offset / spa->spa_brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); @@ -785,15 +796,10 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); - -#ifdef ZFS_DEBUG - if (zfs_flags & ZFS_DEBUG_BRT) - brt_vdev_dump(brtvd); -#endif } static void -brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) +brt_vdev_sync(spa_t *spa, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; @@ -802,16 +808,18 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); - VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); + VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, + FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
*/ - dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, + dmu_write(spa->spa_meta_objset, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); - memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); + uint64_t nblocks = BRT_RANGESIZE_TO_NBLOCKS(brtvd->bv_size); + memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(nblocks)); brtvd->bv_entcount_dirty = FALSE; } @@ -825,7 +833,7 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; - bvphys->bvp_rangesize = brt->brt_rangesize; + bvphys->bvp_rangesize = spa->spa_brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); @@ -834,47 +842,49 @@ brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) } static void -brt_vdevs_alloc(brt_t *brt, boolean_t load) +brt_vdevs_alloc(spa_t *spa, boolean_t load) { - brt_vdev_t *brtvd; - uint64_t vdevid; - - brt_wlock(brt); - - brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); + brt_wlock(spa); + brt_vdevs_expand(spa, spa->spa_root_vdev->vdev_children); if (load) { - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; + vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + rw_enter(&brtvd->bv_lock, RW_WRITER); ASSERT(brtvd->bv_entcount == NULL); - - brt_vdev_load(brt, brtvd); + brt_vdev_load(spa, brtvd); + rw_exit(&brtvd->bv_lock); } } - if (brt->brt_rangesize == 0) { - brt->brt_rangesize = BRT_RANGESIZE; + if (spa->spa_brt_rangesize == 0) { + spa->spa_brt_rangesize = BRT_RANGESIZE; } - - brt_unlock(brt); + brt_unlock(spa); } static void -brt_vdevs_free(brt_t *brt) +brt_vdevs_free(spa_t *spa) { - brt_vdev_t *brtvd; - uint64_t vdevid; - - brt_wlock(brt); - - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + rw_enter(&brtvd->bv_lock, RW_WRITER); if (brtvd->bv_initiated) - brt_vdev_dealloc(brt, brtvd); + brt_vdev_dealloc(brtvd); + rw_exit(&brtvd->bv_lock); + rw_destroy(&brtvd->bv_lock); + if (brtvd->bv_mos_entries != 0) + dnode_rele(brtvd->bv_mos_entries_dnode, brtvd); + rw_destroy(&brtvd->bv_mos_entries_lock); + avl_destroy(&brtvd->bv_tree); + for (int i = 0; i < TXG_SIZE; i++) + avl_destroy(&brtvd->bv_pending_tree[i]); + mutex_destroy(&brtvd->bv_pending_lock); + kmem_free(brtvd, sizeof (*brtvd)); } - kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); - - brt_unlock(brt); + kmem_free(spa->spa_brt_vdevs, sizeof (*spa->spa_brt_vdevs) * + spa->spa_brt_nvdevs); } static void @@ -897,53 +907,27 @@ brt_entry_compare(const void *x1, const void *x2) } static int -brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) +brt_entry_lookup(spa_t *spa, brt_vdev_t *brtvd, brt_entry_t *bre, krw_t rw) { - uint64_t mos_entries; - int error; + ASSERT(RW_LOCK_HELD(&brtvd->bv_lock)); - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - - if (!brt_vdev_lookup(brt, brtvd, bre)) + if (!brt_vdev_lookup(spa, brtvd, bre)) return (SET_ERROR(ENOENT)); - /* - * Remember mos_entries object number. After we reacquire the BRT lock, - * the brtvd pointer may be invalid. 
- */ - mos_entries = brtvd->bv_mos_entries; - if (mos_entries == 0) + if (brtvd->bv_mos_entries == 0) return (SET_ERROR(ENOENT)); - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); - error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); + int error = zap_lookup_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &bre->bre_offset, BRT_KEY_WORDS, 1, + sizeof (bre->bre_refcount), &bre->bre_refcount); - brt_wlock(brt); + rw_enter(&brtvd->bv_lock, rw); return (error); } -static void -brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) -{ - brt_vdev_t *brtvd; - uint64_t mos_entries = 0; - - brt_rlock(brt); - brtvd = brt_vdev(brt, vdevid); - if (brtvd != NULL) - mos_entries = brtvd->bv_mos_entries; - brt_unlock(brt); - - if (mos_entries == 0) - return; - - (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -952,25 +936,24 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t bre_search; boolean_t mayexists = FALSE; uint64_t vdevid; - brt_entry_fill(bp, &bre_search, &vdevid); + if (spa->spa_brt_nvdevs == 0) + return (B_FALSE); - brt_rlock(brt); + brt_entry_fill(bp, &bre_search, &vdevid); - brtvd = brt_vdev(brt, vdevid); - if (brtvd != NULL && brtvd->bv_initiated) { - if (!avl_is_empty(&brtvd->bv_tree) || - brt_vdev_lookup(brt, brtvd, &bre_search)) { - mayexists = TRUE; - } - } + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); + if (brtvd == NULL) + return (FALSE); - brt_unlock(brt); + rw_enter(&brtvd->bv_lock, RW_READER); + if (brtvd->bv_initiated && (!avl_is_empty(&brtvd->bv_tree) || + brt_vdev_lookup(spa, brtvd, &bre_search))) + mayexists = TRUE; + rw_exit(&brtvd->bv_lock); return (mayexists); } @@ -978,46 +961,44 @@ brt_maybe_exists(spa_t *spa, const blkptr_t *bp) uint64_t brt_get_dspace(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_nvdevs == 0) return (0); - return (brt->brt_savedspace); + brt_rlock(spa); + uint64_t s = 0; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) + s += spa->spa_brt_vdevs[vdevid]->bv_savedspace; + brt_unlock(spa); + return (s); } uint64_t brt_get_used(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) + if (spa->spa_brt_nvdevs == 0) return (0); - return (brt->brt_usedspace); + brt_rlock(spa); + uint64_t s = 0; + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) + s += spa->spa_brt_vdevs[vdevid]->bv_usedspace; + brt_unlock(spa); + return (s); } uint64_t brt_get_saved(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) - return (0); - - return (brt->brt_savedspace); + return (brt_get_dspace(spa)); } uint64_t brt_get_ratio(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt->brt_usedspace == 0) + uint64_t used = brt_get_used(spa); + if (used == 0) return (100); - - return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / - brt->brt_usedspace); + return ((used + brt_get_saved(spa)) * 100 / used); } static int @@ -1148,82 +1129,58 @@ brt_entry_free(brt_entry_t *bre) } static void -brt_entry_addref(brt_t *brt, const blkptr_t *bp) +brt_entry_addref(spa_t *spa, brt_vdev_t *brtvd, const blkptr_t *bp, + uint64_t count) { - brt_vdev_t *brtvd; 
brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; - ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); - brt_entry_fill(bp, &bre_search, &vdevid); + ASSERT3U(brtvd->bv_vdevid, ==, vdevid); - brt_wlock(brt); - - brtvd = brt_vdev(brt, vdevid); - if (brtvd == NULL) { - ASSERT3U(vdevid, >=, brt->brt_nvdevs); - - /* New VDEV was added. */ - brt_vdevs_expand(brt, vdevid + 1); - brtvd = brt_vdev(brt, vdevid); - } - ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_WRITER); if (!brtvd->bv_initiated) - brt_vdev_realloc(brt, brtvd); + brt_vdev_realloc(spa, brtvd); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_addref_entry_in_memory); } else { - /* - * brt_entry_lookup() may drop the BRT (read) lock and - * reacquire it (write). - */ - error = brt_entry_lookup(brt, brtvd, &bre_search); + /* brt_entry_lookup() may drop the lock */ + error = brt_entry_lookup(spa, brtvd, &bre_search, RW_WRITER); /* bre_search now contains correct bre_refcount */ ASSERT(error == 0 || error == ENOENT); if (error == 0) BRTSTAT_BUMP(brt_addref_entry_on_disk); else BRTSTAT_BUMP(brt_addref_entry_not_on_disk); - /* - * When the BRT lock was dropped, brt_vdevs[] may have been - * expanded and reallocated, we need to update brtvd's pointer. - */ - brtvd = brt_vdev(brt, vdevid); - ASSERT(brtvd != NULL); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre == NULL) { bre = brt_entry_alloc(&bre_search); - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); avl_insert(&brtvd->bv_tree, bre, where); - brt->brt_nentries++; } else { /* - * The entry was added when the BRT lock was dropped in + * The entry was added when the lock was dropped in * brt_entry_lookup(). */ BRTSTAT_BUMP(brt_addref_entry_read_lost_race); bre = racebre; } } - bre->bre_refcount++; - brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + brt_vdev_addref(spa, brtvd, bre, bp_get_dsize(spa, bp), count); + bre->bre_refcount += count; - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); } /* Return TRUE if block should be freed immediately. */ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; @@ -1232,11 +1189,11 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) brt_entry_fill(bp, &bre_search, &vdevid); - brt_wlock(brt); - - brtvd = brt_vdev(brt, vdevid); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_WRITER); + ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); @@ -1245,18 +1202,10 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } - /* - * brt_entry_lookup() may drop the BRT lock and reacquire it. - */ - error = brt_entry_lookup(brt, brtvd, &bre_search); + /* brt_entry_lookup() may drop the lock. */ + error = brt_entry_lookup(spa, brtvd, &bre_search, RW_WRITER); /* bre_search now contains correct bre_refcount */ ASSERT(error == 0 || error == ENOENT); - /* - * When the BRT lock was dropped, brt_vdevs[] may have been expanded - * and reallocated, we need to update brtvd's pointer. 
- */ - brtvd = brt_vdev(brt, vdevid); - ASSERT(brtvd != NULL); if (error == ENOENT) { BRTSTAT_BUMP(brt_decref_entry_not_on_disk); @@ -1267,7 +1216,7 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { /* - * The entry was added when the BRT lock was dropped in + * The entry was added when the lock was dropped in * brt_entry_lookup(). */ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); @@ -1277,21 +1226,19 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); bre = brt_entry_alloc(&bre_search); - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); avl_insert(&brtvd->bv_tree, bre, where); - brt->brt_nentries++; out: if (bre == NULL) { /* * This is a free of a regular (not cloned) block. */ - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); BRTSTAT_BUMP(brt_decref_no_entry); return (B_TRUE); } if (bre->bre_refcount == 0) { - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } @@ -1302,9 +1249,9 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); - brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); + brt_vdev_decref(spa, brtvd, bre, bp_get_dsize_sync(spa, bp)); - brt_unlock(brt); + rw_exit(&brtvd->bv_lock); return (B_FALSE); } @@ -1312,22 +1259,20 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { - brt_t *brt = spa->spa_brt; - brt_vdev_t *brtvd; brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); - brt_rlock(brt); - - brtvd = brt_vdev(brt, vdevid); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_FALSE); ASSERT(brtvd != NULL); + rw_enter(&brtvd->bv_lock, RW_READER); + ASSERT(brtvd->bv_initiated); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { - error = brt_entry_lookup(brt, brtvd, &bre_search); + error = brt_entry_lookup(spa, brtvd, &bre_search, RW_READER); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) refcnt = 0; @@ -1335,25 +1280,24 @@ brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) refcnt = bre_search.bre_refcount; } else refcnt = bre->bre_refcount; + rw_exit(&brtvd->bv_lock); - brt_unlock(brt); return (refcnt); } static void -brt_prefetch(brt_t *brt, const blkptr_t *bp) +brt_prefetch(brt_vdev_t *brtvd, const blkptr_t *bp) { - brt_entry_t bre; - uint64_t vdevid; - - ASSERT(bp != NULL); - - if (!brt_zap_prefetch) + if (!brt_zap_prefetch || brtvd->bv_mos_entries == 0) return; - brt_entry_fill(bp, &bre, &vdevid); - - brt_entry_prefetch(brt, vdevid, &bre); + rw_enter(&brtvd->bv_mos_entries_lock, RW_READER); + if (brtvd->bv_mos_entries != 0) { + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[0]); + (void) zap_prefetch_uint64_by_dnode(brtvd->bv_mos_entries_dnode, + &offset, BRT_KEY_WORDS); + } + rw_exit(&brtvd->bv_mos_entries_lock); } static int @@ -1363,15 +1307,10 @@ brt_pending_entry_compare(const void *x1, const void *x2) const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); - if (unlikely(cmp == 0)) { - cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); - } - } + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if 
(unlikely(cmp == 0)) + cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); return (cmp); } @@ -1379,25 +1318,22 @@ brt_pending_entry_compare(const void *x1, const void *x2) void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { - brt_t *brt; - avl_tree_t *pending_tree; - kmutex_t *pending_lock; brt_pending_entry_t *bpe, *newbpe; avl_index_t where; uint64_t txg; - brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; + + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); + avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); newbpe->bpe_bp = *bp; newbpe->bpe_count = 1; - mutex_enter(pending_lock); - + mutex_enter(&brtvd->bv_pending_lock); bpe = avl_find(pending_tree, newbpe, &where); if (bpe == NULL) { avl_insert(pending_tree, newbpe, where); @@ -1405,8 +1341,7 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) } else { bpe->bpe_count++; } - - mutex_exit(pending_lock); + mutex_exit(&brtvd->bv_pending_lock); if (newbpe != NULL) { ASSERT(bpe != NULL); @@ -1416,82 +1351,86 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) ASSERT(bpe == NULL); /* Prefetch BRT entry for the syncing context. */ - brt_prefetch(brt, bp); + brt_prefetch(brtvd, bp); } } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { - brt_t *brt; - avl_tree_t *pending_tree; - kmutex_t *pending_lock; brt_pending_entry_t *bpe, bpe_search; uint64_t txg; - brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - bpe_search.bpe_bp = *bp; + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[0]); + brt_vdev_t *brtvd = brt_vdev(spa, vdevid, B_TRUE); + avl_tree_t *pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; - mutex_enter(pending_lock); + bpe_search.bpe_bp = *bp; + mutex_enter(&brtvd->bv_pending_lock); bpe = avl_find(pending_tree, &bpe_search, NULL); /* I believe we should always find bpe when this function is called. */ if (bpe != NULL) { ASSERT(bpe->bpe_count > 0); - bpe->bpe_count--; - if (bpe->bpe_count == 0) { + if (bpe->bpe_count == 0) avl_remove(pending_tree, bpe); - kmem_cache_free(brt_pending_entry_cache, bpe); - } + else + bpe = NULL; } + mutex_exit(&brtvd->bv_pending_lock); - mutex_exit(pending_lock); + if (bpe) + kmem_cache_free(brt_pending_entry_cache, bpe); } void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt = spa->spa_brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; - void *c; ASSERT3U(txg, !=, 0); - /* - * We are in syncing context, so no other brt_pending_tree accesses - * are possible for the TXG. Don't need to acquire brt_pending_lock. - */ - pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); - c = NULL; - while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { - boolean_t added_to_ddt; + /* + * We are in syncing context, so no other bv_pending_tree + * accesses are possible for the TXG. So we don't need to + * acquire bv_pending_lock. 
+ */ + pending_tree = &brtvd->bv_pending_tree[txg & TXG_MASK]; - for (int i = 0; i < bpe->bpe_count; i++) { + void *c = NULL; + while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { /* * If the block has DEDUP bit set, it means that it * already exists in the DEDUP table, so we can just - * use that instead of creating new entry in - * the BRT table. + * use that instead of creating new entry in the BRT. */ if (BP_GET_DEDUP(&bpe->bpe_bp)) { - added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); + for (uint64_t c = bpe->bpe_count; c > 0; c--) { + if (ddt_addref(spa, &bpe->bpe_bp)) + continue; + brt_entry_addref(spa, brtvd, + &bpe->bpe_bp, c); + break; + } } else { - added_to_ddt = B_FALSE; + brt_entry_addref(spa, brtvd, &bpe->bpe_bp, + bpe->bpe_count); } - if (!added_to_ddt) - brt_entry_addref(brt, &bpe->bpe_bp); + kmem_cache_free(brt_pending_entry_cache, bpe); } - - kmem_cache_free(brt_pending_entry_cache, bpe); + brt_rlock(spa); } + brt_unlock(spa); } static void @@ -1509,25 +1448,19 @@ brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) } static void -brt_sync_table(brt_t *brt, dmu_tx_t *tx) +brt_sync_table(spa_t *spa, dmu_tx_t *tx) { - brt_vdev_t *brtvd; brt_entry_t *bre; - dnode_t *dn; - uint64_t vdevid; - void *c; - - brt_wlock(brt); - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - brtvd = &brt->brt_vdevs[vdevid]; - - if (!brtvd->bv_initiated) - continue; + brt_rlock(spa); + for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; + brt_unlock(spa); if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); + brt_rlock(spa); continue; } @@ -1535,132 +1468,80 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) - brt_vdev_create(brt, brtvd, tx); - - VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, - FTAG, &dn)); + brt_vdev_create(spa, brtvd, tx); - c = NULL; + void *c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(dn, bre, tx); + brt_sync_entry(brtvd->bv_mos_entries_dnode, bre, tx); brt_entry_free(bre); - ASSERT(brt->brt_nentries > 0); - brt->brt_nentries--; } - dnode_rele(dn, FTAG); - - brt_vdev_sync(brt, brtvd, tx); - +#ifdef ZFS_DEBUG + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); +#endif if (brtvd->bv_totalcount == 0) - brt_vdev_destroy(brt, brtvd, tx); + brt_vdev_destroy(spa, brtvd, tx); + else + brt_vdev_sync(spa, brtvd, tx); + brt_rlock(spa); } - - ASSERT0(brt->brt_nentries); - - brt_unlock(brt); + brt_unlock(spa); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; - brt_t *brt; + uint64_t vdevid; ASSERT(spa_syncing_txg(spa) == txg); - brt = spa->spa_brt; - brt_rlock(brt); - if (brt->brt_nentries == 0) { - /* No changes. 
*/ - brt_unlock(brt); + brt_rlock(spa); + for (vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { + if (spa->spa_brt_vdevs[vdevid]->bv_meta_dirty) + break; + } + if (vdevid >= spa->spa_brt_nvdevs) { + brt_unlock(spa); return; } - brt_unlock(brt); + brt_unlock(spa); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); - - brt_sync_table(brt, tx); - + brt_sync_table(spa, tx); dmu_tx_commit(tx); } -static void -brt_table_alloc(brt_t *brt) -{ - - for (int i = 0; i < TXG_SIZE; i++) { - avl_create(&brt->brt_pending_tree[i], - brt_pending_entry_compare, - sizeof (brt_pending_entry_t), - offsetof(brt_pending_entry_t, bpe_node)); - mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, - NULL); - } -} - -static void -brt_table_free(brt_t *brt) -{ - - for (int i = 0; i < TXG_SIZE; i++) { - ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); - - avl_destroy(&brt->brt_pending_tree[i]); - mutex_destroy(&brt->brt_pending_lock[i]); - } -} - static void brt_alloc(spa_t *spa) { - brt_t *brt; - - ASSERT(spa->spa_brt == NULL); - - brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); - rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); - brt->brt_spa = spa; - brt->brt_rangesize = 0; - brt->brt_nentries = 0; - brt->brt_vdevs = NULL; - brt->brt_nvdevs = 0; - brt_table_alloc(brt); - - spa->spa_brt = brt; + rw_init(&spa->spa_brt_lock, NULL, RW_DEFAULT, NULL); + spa->spa_brt_vdevs = NULL; + spa->spa_brt_nvdevs = 0; + spa->spa_brt_rangesize = 0; } void brt_create(spa_t *spa) { - brt_alloc(spa); - brt_vdevs_alloc(spa->spa_brt, B_FALSE); + brt_vdevs_alloc(spa, B_FALSE); } int brt_load(spa_t *spa) { - brt_alloc(spa); - brt_vdevs_alloc(spa->spa_brt, B_TRUE); - + brt_vdevs_alloc(spa, B_TRUE); return (0); } void brt_unload(spa_t *spa) { - brt_t *brt = spa->spa_brt; - - if (brt == NULL) - return; - - brt_vdevs_free(brt); - brt_table_free(brt); - rw_destroy(&brt->brt_lock); - kmem_free(brt, sizeof (*brt)); - spa->spa_brt = NULL; + brt_vdevs_free(spa); + rw_destroy(&spa->spa_brt_lock); } /* BEGIN CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f486513fcaf9..32542e7ce701 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1870,13 +1870,7 @@ spa_get_slop_space(spa_t *spa) if (spa->spa_dedup_dspace == ~0ULL) spa_update_dspace(spa); - /* - * spa_get_dspace() includes the space only logically "used" by - * deduplicated data, so since it's not useful to reserve more - * space with more deduplicated data, we subtract that out here. - */ - space = - spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa); + space = spa->spa_rdspace; slop = MIN(space >> spa_slop_shift, spa_max_slop); /* @@ -1912,8 +1906,7 @@ spa_get_checkpoint_space(spa_t *spa) void spa_update_dspace(spa_t *spa) { - spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + - ddt_get_dedup_dspace(spa) + brt_get_dspace(spa); + spa->spa_rdspace = metaslab_class_get_dspace(spa_normal_class(spa)); if (spa->spa_nonallocating_dspace > 0) { /* * Subtract the space provided by all non-allocating vdevs that @@ -1933,9 +1926,11 @@ spa_update_dspace(spa_t *spa) * doesn't matter that the data we are moving may be * allocated twice (on the old device and the new device). */ - ASSERT3U(spa->spa_dspace, >=, spa->spa_nonallocating_dspace); - spa->spa_dspace -= spa->spa_nonallocating_dspace; + ASSERT3U(spa->spa_rdspace, >=, spa->spa_nonallocating_dspace); + spa->spa_rdspace -= spa->spa_nonallocating_dspace; } + spa->spa_dspace = spa->spa_rdspace + ddt_get_dedup_dspace(spa) + + brt_get_dspace(spa); } /*
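
For readers who want the new locking scheme at a glance, below is a minimal
user-space sketch of the lookup-or-expand pattern that brt_vdev() and
brt_vdevs_expand() implement in this patch.  It is illustrative only and not
part of the change: pthread_rwlock_t stands in for the kernel's krwlock_t,
the structure keeps just the lock fields, and allocation failure handling is
omitted.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the per-vdev BRT context from brt_impl.h. */
typedef struct brt_vdev {
	pthread_mutex_t bv_pending_lock;	/* open-context pending ops */
	pthread_rwlock_t bv_mos_entries_lock;	/* ZAP object vs. prefetch */
	pthread_rwlock_t bv_lock;		/* the rest, TXG commit side */
	uint64_t bv_vdevid;
} brt_vdev_t;

/* Pool-wide state: only the pointer array itself is globally locked. */
static pthread_rwlock_t spa_brt_lock = PTHREAD_RWLOCK_INITIALIZER;
static brt_vdev_t **spa_brt_vdevs;
static uint64_t spa_brt_nvdevs;

/* Grow the pointer array; caller must hold spa_brt_lock as writer. */
static void
brt_vdevs_expand(uint64_t nvdevs)
{
	brt_vdev_t **vdevs = calloc(nvdevs, sizeof (*vdevs));
	if (spa_brt_nvdevs > 0) {
		memcpy(vdevs, spa_brt_vdevs,
		    sizeof (*vdevs) * spa_brt_nvdevs);
		free(spa_brt_vdevs);
	}
	for (uint64_t i = spa_brt_nvdevs; i < nvdevs; i++) {
		brt_vdev_t *brtvd = calloc(1, sizeof (*brtvd));
		brtvd->bv_vdevid = i;
		pthread_mutex_init(&brtvd->bv_pending_lock, NULL);
		pthread_rwlock_init(&brtvd->bv_mos_entries_lock, NULL);
		pthread_rwlock_init(&brtvd->bv_lock, NULL);
		vdevs[i] = brtvd;
	}
	spa_brt_vdevs = vdevs;
	spa_brt_nvdevs = nvdevs;
}

/*
 * Mirror of the patch's brt_vdev(): take the pool-wide lock as reader on
 * the fast path, and only upgrade to writer when a newly added top-level
 * vdev forces the pointer array to grow.
 */
static brt_vdev_t *
brt_vdev(uint64_t vdevid, int alloc)
{
	brt_vdev_t *brtvd = NULL;

	pthread_rwlock_rdlock(&spa_brt_lock);
	if (vdevid < spa_brt_nvdevs) {
		brtvd = spa_brt_vdevs[vdevid];
	} else if (alloc) {
		/* Upgrade: drop the read lock, retake as writer, recheck. */
		pthread_rwlock_unlock(&spa_brt_lock);
		pthread_rwlock_wrlock(&spa_brt_lock);
		if (vdevid >= spa_brt_nvdevs)
			brt_vdevs_expand(vdevid + 1);
		brtvd = spa_brt_vdevs[vdevid];
	}
	pthread_rwlock_unlock(&spa_brt_lock);
	return (brtvd);
}

int
main(void)
{
	brt_vdev_t *brtvd = brt_vdev(2, 1);	/* grows the array to 3 */

	/* Per-vdev work now contends only on this vdev's own locks. */
	pthread_mutex_lock(&brtvd->bv_pending_lock);
	printf("vdev %llu: pending tree locked, pool lock not held\n",
	    (unsigned long long)brtvd->bv_vdevid);
	pthread_mutex_unlock(&brtvd->bv_pending_lock);
	return (0);
}

The recheck after retaking spa_brt_lock as writer matters: another thread may
have expanded the array while the lock was dropped.  Once obtained, a
per-vdev pointer stays valid, so all further work contends only on that
vdev's locks, which is the point of the change.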
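
The reworked brt_pending_apply() loop is also worth modeling: a pending
entry's whole bpe_count is now applied at once, which is why bpe_count grew
from int to uint64_t.  Here is a toy sketch with hypothetical
ddt_addref()/brt_entry_addref() stand-ins for the real functions (which take
spa/brtvd/blkptr arguments); the loop shape matches the patch.

#include <stdint.h>
#include <stdio.h>

/* Toy DDT: accepts references only while it still has an entry. */
static int ddt_has_entry = 1;
static uint64_t ddt_refs, brt_refs;

static int
ddt_addref(void)
{
	if (!ddt_has_entry)
		return (0);
	ddt_refs++;
	if (ddt_refs == 2)	/* simulate the DDT entry going away */
		ddt_has_entry = 0;
	return (1);
}

static void
brt_entry_addref(uint64_t count)
{
	brt_refs += count;
}

int
main(void)
{
	uint64_t bpe_count = 5;	/* five pending clones of one block */

	/*
	 * Same shape as the loop in brt_pending_apply(): feed references
	 * to the DDT one by one; on the first failure, hand the remainder
	 * (including the reference that just failed) to the BRT at once.
	 */
	for (uint64_t c = bpe_count; c > 0; c--) {
		if (ddt_addref())
			continue;
		brt_entry_addref(c);
		break;
	}
	printf("ddt_refs=%llu brt_refs=%llu\n",	/* 2 and 3: none lost */
	    (unsigned long long)ddt_refs, (unsigned long long)brt_refs);
	return (0);
}

Splitting the count this way means no reference is dropped when the DDT
entry disappears partway through, while still letting DEDUP-flagged blocks
prefer the existing DDT entry over creating a new BRT entry.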