Skip to content

Commit

Permalink
Fix dnode allocation race
Browse files Browse the repository at this point in the history
When performing concurrent object allocations using the new
multi-threaded allocator and large dnodes it's possible to
allocate overlapping large dnodes.

This case should have been handled by detecting an error
returned by dnode_hold_impl().  But that logic only checked
the returned dnp was not-NULL, and the dnp variable was not
reset to NULL when retrying.  Resolve this issue by properly
checking the return value of dnode_hold_impl().

Additionally, it was possible that dnode_hold_impl() would
misreport a dnode as free when it was in fact in use.  This
could occurs for two reasons:

* The per-slot zrl_lock must be held over the entire critical
  section which includes the alloc/free until the new dnode
  is assigned to children_dnodes.  Additionally, all of the
  zrl_lock's in the range must be held to protect moving
  dnodes.

* The dn->dn_ot_type cannot be solely relied upon to check
  the type.  When allocating a new dnode its type will be
  DMU_OT_NONE after dnode_create().  Only latter when
  dnode_allocate() is called will it transition to the new
  type.  This means there's a window when allocating where
  it can mistaken for a free dnode.

Reviewed-by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed-by: Ned Bass <bass6@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes openzfs#6414 
Closes openzfs#6439
  • Loading branch information
behlendorf authored and FransUrbo committed Apr 28, 2019
1 parent 5887a80 commit 0603d69
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 34 deletions.
2 changes: 1 addition & 1 deletion module/zfs/dmu_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1252,7 +1252,7 @@ dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
THT_SPILL, 0, 0);
if (txh != NULL)
(void) zfs_refcount_add_many(&txh->txh_space_towrite,
(void) refcount_add_many(&txh->txh_space_towrite,
SPA_OLD_MAXBLOCKSIZE, FTAG);
}

Expand Down
155 changes: 144 additions & 11 deletions module/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ dnode_setdblksz(dnode_t *dn, int size)
}

static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
dnode_t *dn;
Expand Down Expand Up @@ -459,11 +459,15 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;

if (slots && dn->dn_type == DMU_OT_NONE)
dn->dn_num_slots = slots;
else
dn->dn_num_slots = dnp->dn_extra_slots + 1;

dmu_zfetch_init(&dn->dn_zfetch, dn);

ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
Expand Down Expand Up @@ -1168,6 +1172,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
{
dnode_t *dn;

dn = dnode_create(os, dnp, 0, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
zrl_tryenter(&dnh->dnh_zrlock);

Expand Down Expand Up @@ -1214,8 +1219,124 @@ dnode_buf_evict_async(void *dbu)
zrl_destroy(&dnh->dnh_zrlock);
dnh->dnh_dnode = DN_SLOT_UNINIT;
}
kmem_free(dnc, sizeof (dnode_children_t) +
dnc->dnc_count * sizeof (dnode_handle_t));
kmem_free(children_dnodes, sizeof (dnode_children_t) +
children_dnodes->dnc_count * sizeof (dnode_handle_t));
}

/*
* Return true if the given index is interior to a dnode already
* allocated in the block. That is, the index is neither free nor
* allocated, but is consumed by a large dnode.
*
* The dnode_phys_t buffer may not be in sync with the in-core dnode
* structure, so we try to check the dnode structure first and fall back
* to the dnode_phys_t buffer it doesn't exist. When an in-code dnode
* exists we can always trust dn->dn_num_slots to be accurate, even for
* a held dnode which has not yet been fully allocated.
*/
static boolean_t
dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
{
int skip, i;

for (i = 0; i < idx; i += skip) {
dnode_handle_t *dnh = &children->dnc_children[i];

if (dnh->dnh_dnode != NULL) {
skip = dnh->dnh_dnode->dn_num_slots;
} else {
if (dn_block[i].dn_type != DMU_OT_NONE)
skip = dn_block[i].dn_extra_slots + 1;
else
skip = 1;
}
}

return (i > idx);
}

/*
* Return true if the given index in the dnode block is a valid
* allocated dnode. That is, the index is not consumed by a large
* dnode and is not free.
*
* The dnode_phys_t buffer may not be in sync with the in-core dnode
* structure, so we try to check the dnode structure first and fall back
* to the dnode_phys_t buffer it doesn't exist.
*/
static boolean_t
dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
{
dnode_handle_t *dnh;
dmu_object_type_t ot;

if (dnode_is_consumed(children, dn_block, idx))
return (B_FALSE);

dnh = &children->dnc_children[idx];
if (dnh->dnh_dnode != NULL)
ot = dnh->dnh_dnode->dn_type;
else
ot = dn_block[idx].dn_type;

return (ot != DMU_OT_NONE);
}

/*
* Return true if the given range of indices in the dnode block are
* free. That is, the starting index is not consumed by a large dnode
* and none of the indices are allocated.
*
* The dnode_phys_t buffer may not be in sync with the in-core dnode
* structure, so we try to check the dnode structure first and fall back
* to the dnode_phys_t buffer it doesn't exist.
*/
static boolean_t
dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
int slots)
{
if (idx + slots > DNODES_PER_BLOCK)
return (B_FALSE);

if (dnode_is_consumed(children, dn_block, idx))
return (B_FALSE);

for (int i = idx; i < idx + slots; i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
dmu_object_type_t ot;

if (dnh->dnh_dnode != NULL) {
if (dnh->dnh_dnode->dn_num_slots > 1)
return (B_FALSE);

ot = dnh->dnh_dnode->dn_type;
} else {
ot = dn_block[i].dn_type;
}

if (ot != DMU_OT_NONE)
return (B_FALSE);
}

return (B_TRUE);
}

static void
dnode_hold_slots(dnode_children_t *children, int idx, int slots)
{
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
zrl_add(&dnh->dnh_zrlock);
}
}

static void
dnode_rele_slots(dnode_children_t *children, int idx, int slots)
{
for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
dnode_handle_t *dnh = &children->dnc_children[i];
zrl_remove(&dnh->dnh_zrlock);
}
}

/*
Expand Down Expand Up @@ -1358,20 +1479,32 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
ASSERT(dnc->dnc_count == epb);
dn = DN_SLOT_UNINIT;

if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
dnode_hold_slots(children_dnodes, idx, slots);

if ((flag & DNODE_MUST_BE_FREE) &&
!dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOSPC));
} else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
!dnode_is_allocated(db, idx)) {
!dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(ENOENT));
}

if (dn->dn_free_txg) {
DNODE_STAT_BUMP(dnode_hold_free_txg);
type = dn->dn_type;
dnh = &children_dnodes->dnc_children[idx];
dn = dnh->dnh_dnode;
if (dn == NULL)
dn = dnode_create(os, dn_block_begin + idx, slots, db,
object, dnh);

mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
mutex_exit(&dn->dn_mtx);
dnode_slots_rele(dnc, idx, slots);
dnode_rele_slots(children_dnodes, idx, slots);
dbuf_rele(db, FTAG);
return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
}
Expand All @@ -1382,7 +1515,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
mutex_exit(&dn->dn_mtx);

/* Now we can rely on the hold to prevent the dnode from moving. */
dnode_slots_rele(dnc, idx, slots);
dnode_rele_slots(children_dnodes, idx, slots);

DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
Expand Down
3 changes: 1 addition & 2 deletions tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -435,8 +435,7 @@ tags = ['functional', 'features', 'async_destroy']
[tests/functional/features/large_dnode]
tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
'large_dnode_007_neg', 'large_dnode_008_pos', 'large_dnode_009_pos']
tags = ['functional', 'features', 'large_dnode']
'large_dnode_007_neg', 'large_dnode_008_pos']

[tests/functional/grow_pool]
tests = ['grow_pool_001_pos']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ dist_pkgdata_SCRIPTS = \
large_dnode_005_pos.ksh \
large_dnode_006_pos.ksh \
large_dnode_007_neg.ksh \
large_dnode_008_pos.ksh \
large_dnode_009_pos.ksh
large_dnode_008_pos.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,6 @@ function cleanup
datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
}

function verify_dnode_packing
{
zdb -dd $TEST_FS | grep -A 3 'Dnode slots' | awk '
/Total used:/ {total_used=$NF}
/Max used:/ {max_used=$NF}
/Percent empty:/ {print total_used, max_used, int($NF)}
' | while read total_used max_used pct_empty
do
log_note "total_used $total_used max_used $max_used pct_empty $pct_empty"
if [ $pct_empty -gt 5 ]; then
log_fail "Holes in dnode array: pct empty $pct_empty > 5"
fi
done
}

log_onexit cleanup
log_assert "xattrtest runs concurrently on dataset with large dnodes"

Expand All @@ -67,11 +52,9 @@ log_must zfs set xattr=sa $TEST_FS
for ((i=0; i < 100; i++)); do
dir="/$TEST_FS/dir.$i"
log_must mkdir "$dir"
log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir >/dev/null 2>&1 &"
log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
done

log_must wait

verify_dnode_packing

log_pass

0 comments on commit 0603d69

Please sign in to comment.