Skip to content

Commit 7fd05ac

Browse files
ahrens authored and Christopher Siden committed
4390 i/o errors when deleting filesystem/zvol can lead to space map corruption
Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Dan McDonald <danmcd@omniti.com> Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com> Approved by: Dan McDonald <danmcd@omniti.com>
1 parent 5d7b4d4 commit 7fd05ac

File tree

19 files changed

+297
-160
lines changed

19 files changed

+297
-160
lines changed

usr/src/cmd/mdb/common/modules/zfs/zfs.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1501,7 +1501,6 @@ space_cb(uintptr_t addr, const void *unknown, void *arg)
15011501
return (WALK_ERR);
15021502

15031503
for (i = 0; i < TXG_SIZE; i++) {
1504-
15051504
if (mdb_ctf_vread(&rt, "range_tree_t",
15061505
"mdb_range_tree_t", ms.ms_alloctree[i], 0) == -1)
15071506
sd->ms_alloctree[i] += rt.rt_space;

usr/src/cmd/zdb/zdb.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,9 +75,9 @@
7575
DMU_OT_ZAP_OTHER : DMU_OT_NUMTYPES))
7676

7777
#ifndef lint
78-
extern int zfs_recover;
78+
extern boolean_t zfs_recover;
7979
#else
80-
int zfs_recover;
80+
boolean_t zfs_recover;
8181
#endif
8282

8383
const char cmdname[] = "zdb";

usr/src/common/zfs/zpool_prop.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ zpool_prop_init(void)
8181
ZFS_TYPE_POOL, "<size>", "FREE");
8282
zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
8383
ZFS_TYPE_POOL, "<size>", "FREEING");
84+
zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
85+
ZFS_TYPE_POOL, "<size>", "LEAKED");
8486
zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
8587
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
8688
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,

usr/src/lib/libzfs/common/libzfs_pool.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
275275
case ZPOOL_PROP_ALLOCATED:
276276
case ZPOOL_PROP_FREE:
277277
case ZPOOL_PROP_FREEING:
278+
case ZPOOL_PROP_LEAKED:
278279
case ZPOOL_PROP_EXPANDSZ:
279280
if (literal) {
280281
(void) snprintf(buf, len, "%llu",

usr/src/uts/common/fs/zfs/bptree.c

Lines changed: 82 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
102102
return (dmu_object_free(os, obj, tx));
103103
}
104104

105+
/*
 * Report whether the bptree object "obj" in objset "os" is empty.
 *
 * The object's bonus buffer is held and read as a bptree_phys_t; the tree
 * is considered empty when its bt_begin and bt_end fields are equal.
 * The bonus buffer hold is released before returning.
 *
 * Returns true if bt_begin == bt_end, false otherwise.
 */
boolean_t
106+
bptree_is_empty(objset_t *os, uint64_t obj)
107+
{
108+
dmu_buf_t *db;
109+
bptree_phys_t *bt;
110+
boolean_t rv;
111+
112+
VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));	/* a failed hold is not tolerated here */
113+
bt = db->db_data;
114+
rv = (bt->bt_begin == bt->bt_end);	/* compute the result while the hold is still active */
115+
dmu_buf_rele(db, FTAG);
116+
return (rv);
117+
}
118+
105119
void
106120
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
107121
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
108122
{
109123
dmu_buf_t *db;
110124
bptree_phys_t *bt;
111-
bptree_entry_phys_t bte;
125+
bptree_entry_phys_t bte = { 0 };
112126

113127
/*
114128
* bptree objects are in the pool mos, therefore they can only be
@@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
122136

123137
bte.be_birth_txg = birth_txg;
124138
bte.be_bp = *bp;
125-
bzero(&bte.be_zb, sizeof (bte.be_zb));
126139
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
127140

128141
dmu_buf_will_dirty(db, tx);
@@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
153166
return (err);
154167
}
155168

169+
/*
170+
* If "free" is set:
171+
* - It is assumed that "func" will be freeing the block pointers.
172+
* - If "func" returns nonzero, the bookmark will be remembered and
173+
* iteration will be restarted from this point on next invocation.
174+
* - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
175+
* bptree_iterate will remember the bookmark, continue traversing
176+
* any additional entries, and return 0.
177+
*
178+
* If "free" is not set, traversal will stop and return an error if
179+
* an i/o error is encountered.
180+
*
181+
* In either case, if zfs_free_leak_on_eio is set, i/o errors will be
182+
* ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
183+
* traverse_dataset_destroyed()).
184+
*/
156185
int
157186
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
158187
void *arg, dmu_tx_t *tx)
159188
{
189+
boolean_t ioerr = B_FALSE;
160190
int err;
161191
uint64_t i;
162192
dmu_buf_t *db;
@@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
182212
bptree_entry_phys_t bte;
183213
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
184214

185-
ASSERT(!free || i == ba.ba_phys->bt_begin);
186-
187215
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
188216
&bte, DMU_READ_NO_PREFETCH);
189217
if (err != 0)
190218
break;
191219

192-
if (zfs_recover)
220+
if (zfs_free_leak_on_eio)
193221
flags |= TRAVERSE_HARD;
222+
zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
223+
"bookmark %lld/%lld/%lld/%lld",
224+
i, (longlong_t)bte.be_birth_txg,
225+
(longlong_t)bte.be_zb.zb_objset,
226+
(longlong_t)bte.be_zb.zb_object,
227+
(longlong_t)bte.be_zb.zb_level,
228+
(longlong_t)bte.be_zb.zb_blkid);
194229
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
195230
bte.be_birth_txg, &bte.be_zb, flags,
196231
bptree_visit_cb, &ba);
197232
if (free) {
198-
if (err == ERESTART) {
233+
/*
234+
* The callback has freed the visited block pointers.
235+
* Record our traversal progress on disk, either by
236+
* updating this record's bookmark, or by logically
237+
* removing this record by advancing bt_begin.
238+
*/
239+
if (err != 0) {
199240
/* save bookmark for future resume */
200241
ASSERT3U(bte.be_zb.zb_objset, ==,
201242
ZB_DESTROYED_OBJSET);
202243
ASSERT0(bte.be_zb.zb_level);
203244
dmu_write(os, obj, i * sizeof (bte),
204245
sizeof (bte), &bte, tx);
205-
break;
206-
}
207-
if (err != 0) {
246+
if (err == EIO || err == ECKSUM ||
247+
err == ENXIO) {
248+
/*
249+
* Skip the rest of this tree and
250+
* continue on to the next entry.
251+
*/
252+
err = 0;
253+
ioerr = B_TRUE;
254+
} else {
255+
break;
256+
}
257+
} else if (ioerr) {
208258
/*
209-
* We can not properly handle an i/o
210-
* error, because the traversal code
211-
* does not know how to resume from an
212-
* arbitrary bookmark.
259+
* This entry is finished, but there were
260+
* i/o errors on previous entries, so we
261+
* can't adjust bt_begin. Set this entry's
262+
* be_birth_txg such that it will be
263+
* treated as a no-op in future traversals.
213264
*/
214-
zfs_panic_recover("error %u from "
215-
"traverse_dataset_destroyed()", err);
265+
bte.be_birth_txg = UINT64_MAX;
266+
dmu_write(os, obj, i * sizeof (bte),
267+
sizeof (bte), &bte, tx);
216268
}
217269

218-
ba.ba_phys->bt_begin++;
219-
(void) dmu_free_range(os, obj,
220-
i * sizeof (bte), sizeof (bte), tx);
270+
if (!ioerr) {
271+
ba.ba_phys->bt_begin++;
272+
(void) dmu_free_range(os, obj,
273+
i * sizeof (bte), sizeof (bte), tx);
274+
}
275+
} else if (err != 0) {
276+
break;
221277
}
222278
}
223279

224-
ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
280+
ASSERT(!free || err != 0 || ioerr ||
281+
ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
225282

226283
/* if all blocks are free there should be no used space */
227284
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
285+
if (zfs_free_leak_on_eio) {
286+
ba.ba_phys->bt_bytes = 0;
287+
ba.ba_phys->bt_comp = 0;
288+
ba.ba_phys->bt_uncomp = 0;
289+
}
290+
228291
ASSERT0(ba.ba_phys->bt_bytes);
229292
ASSERT0(ba.ba_phys->bt_comp);
230293
ASSERT0(ba.ba_phys->bt_uncomp);

0 commit comments

Comments
 (0)