Skip to content

Commit

Permalink
4390 i/o errors when deleting filesystem/zvol can lead to space map c…
Browse files Browse the repository at this point in the history
…orruption

Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>
  • Loading branch information
ahrens authored and Christopher Siden committed Jun 5, 2014
1 parent 5d7b4d4 commit 7fd05ac
Show file tree
Hide file tree
Showing 19 changed files with 297 additions and 160 deletions.
1 change: 0 additions & 1 deletion usr/src/cmd/mdb/common/modules/zfs/zfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1501,7 +1501,6 @@ space_cb(uintptr_t addr, const void *unknown, void *arg)
return (WALK_ERR);

for (i = 0; i < TXG_SIZE; i++) {

if (mdb_ctf_vread(&rt, "range_tree_t",
"mdb_range_tree_t", ms.ms_alloctree[i], 0) == -1)
sd->ms_alloctree[i] += rt.rt_space;
Expand Down
4 changes: 2 additions & 2 deletions usr/src/cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@
DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))

#ifndef lint
extern int zfs_recover;
extern boolean_t zfs_recover;
#else
int zfs_recover;
boolean_t zfs_recover;
#endif

const char cmdname[] = "zdb";
Expand Down
2 changes: 2 additions & 0 deletions usr/src/common/zfs/zpool_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ zpool_prop_init(void)
ZFS_TYPE_POOL, "<size>", "FREE");
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "FREEING");
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "LEAKED");
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
Expand Down
1 change: 1 addition & 0 deletions usr/src/lib/libzfs/common/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
case ZPOOL_PROP_ALLOCATED:
case ZPOOL_PROP_FREE:
case ZPOOL_PROP_FREEING:
case ZPOOL_PROP_LEAKED:
case ZPOOL_PROP_EXPANDSZ:
if (literal) {
(void) snprintf(buf, len, "%llu",
Expand Down
101 changes: 82 additions & 19 deletions usr/src/uts/common/fs/zfs/bptree.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
return (dmu_object_free(os, obj, tx));
}

boolean_t
bptree_is_empty(objset_t *os, uint64_t obj)
{
dmu_buf_t *db;
bptree_phys_t *bt;
boolean_t rv;

VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
bt = db->db_data;
rv = (bt->bt_begin == bt->bt_end);
dmu_buf_rele(db, FTAG);
return (rv);
}

void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
{
dmu_buf_t *db;
bptree_phys_t *bt;
bptree_entry_phys_t bte;
bptree_entry_phys_t bte = { 0 };

/*
* bptree objects are in the pool mos, therefore they can only be
Expand All @@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,

bte.be_birth_txg = birth_txg;
bte.be_bp = *bp;
bzero(&bte.be_zb, sizeof (bte.be_zb));
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);

dmu_buf_will_dirty(db, tx);
Expand Down Expand Up @@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
return (err);
}

/*
* If "free" is set:
* - It is assumed that "func" will be freeing the block pointers.
* - If "func" returns nonzero, the bookmark will be remembered and
* iteration will be restarted from this point on next invocation.
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
* bptree_iterate will remember the bookmark, continue traversing
* any additional entries, and return 0.
*
* If "free" is not set, traversal will stop and return an error if
* an i/o error is encountered.
*
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
* traverse_dataset_destroyed()).
*/
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx)
{
boolean_t ioerr = B_FALSE;
int err;
uint64_t i;
dmu_buf_t *db;
Expand All @@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
bptree_entry_phys_t bte;
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;

ASSERT(!free || i == ba.ba_phys->bt_begin);

err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
if (err != 0)
break;

if (zfs_recover)
if (zfs_free_leak_on_eio)
flags |= TRAVERSE_HARD;
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
"bookmark %lld/%lld/%lld/%lld",
i, (longlong_t)bte.be_birth_txg,
(longlong_t)bte.be_zb.zb_objset,
(longlong_t)bte.be_zb.zb_object,
(longlong_t)bte.be_zb.zb_level,
(longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba);
if (free) {
if (err == ERESTART) {
/*
* The callback has freed the visited block pointers.
* Record our traversal progress on disk, either by
* updating this record's bookmark, or by logically
* removing this record by advancing bt_begin.
*/
if (err != 0) {
/* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==,
ZB_DESTROYED_OBJSET);
ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
break;
}
if (err != 0) {
if (err == EIO || err == ECKSUM ||
err == ENXIO) {
/*
* Skip the rest of this tree and
* continue on to the next entry.
*/
err = 0;
ioerr = B_TRUE;
} else {
break;
}
} else if (ioerr) {
/*
* We can not properly handle an i/o
* error, because the traversal code
* does not know how to resume from an
* arbitrary bookmark.
* This entry is finished, but there were
* i/o errors on previous entries, so we
* can't adjust bt_begin. Set this entry's
* be_birth_txg such that it will be
* treated as a no-op in future traversals.
*/
zfs_panic_recover("error %u from "
"traverse_dataset_destroyed()", err);
bte.be_birth_txg = UINT64_MAX;
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
}

ba.ba_phys->bt_begin++;
(void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
if (!ioerr) {
ba.ba_phys->bt_begin++;
(void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
}
} else if (err != 0) {
break;
}
}

ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
ASSERT(!free || err != 0 || ioerr ||
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);

/* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
if (zfs_free_leak_on_eio) {
ba.ba_phys->bt_bytes = 0;
ba.ba_phys->bt_comp = 0;
ba.ba_phys->bt_uncomp = 0;
}

ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp);
Expand Down
Loading

0 comments on commit 7fd05ac

Please sign in to comment.