@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
102
102
return (dmu_object_free (os , obj , tx ));
103
103
}
104
104
105
+ boolean_t
106
+ bptree_is_empty (objset_t * os , uint64_t obj )
107
+ {
108
+ dmu_buf_t * db ;
109
+ bptree_phys_t * bt ;
110
+ boolean_t rv ;
111
+
112
+ VERIFY0 (dmu_bonus_hold (os , obj , FTAG , & db ));
113
+ bt = db -> db_data ;
114
+ rv = (bt -> bt_begin == bt -> bt_end );
115
+ dmu_buf_rele (db , FTAG );
116
+ return (rv );
117
+ }
118
+
105
119
void
106
120
bptree_add (objset_t * os , uint64_t obj , blkptr_t * bp , uint64_t birth_txg ,
107
121
uint64_t bytes , uint64_t comp , uint64_t uncomp , dmu_tx_t * tx )
108
122
{
109
123
dmu_buf_t * db ;
110
124
bptree_phys_t * bt ;
111
- bptree_entry_phys_t bte ;
125
+ bptree_entry_phys_t bte = { 0 } ;
112
126
113
127
/*
114
128
* bptree objects are in the pool mos, therefore they can only be
@@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
122
136
123
137
bte .be_birth_txg = birth_txg ;
124
138
bte .be_bp = * bp ;
125
- bzero (& bte .be_zb , sizeof (bte .be_zb ));
126
139
dmu_write (os , obj , bt -> bt_end * sizeof (bte ), sizeof (bte ), & bte , tx );
127
140
128
141
dmu_buf_will_dirty (db , tx );
@@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
153
166
return (err );
154
167
}
155
168
169
+ /*
170
+ * If "free" is set:
171
+ * - It is assumed that "func" will be freeing the block pointers.
172
+ * - If "func" returns nonzero, the bookmark will be remembered and
173
+ * iteration will be restarted from this point on next invocation.
174
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
175
+ * bptree_iterate will remember the bookmark, continue traversing
176
+ * any additional entries, and return 0.
177
+ *
178
+ * If "free" is not set, traversal will stop and return an error if
179
+ * an i/o error is encountered.
180
+ *
181
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
182
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
183
+ * traverse_dataset_destroyed()).
184
+ */
156
185
int
157
186
bptree_iterate (objset_t * os , uint64_t obj , boolean_t free , bptree_itor_t func ,
158
187
void * arg , dmu_tx_t * tx )
159
188
{
189
+ boolean_t ioerr = B_FALSE ;
160
190
int err ;
161
191
uint64_t i ;
162
192
dmu_buf_t * db ;
@@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
182
212
bptree_entry_phys_t bte ;
183
213
int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST ;
184
214
185
- ASSERT (!free || i == ba .ba_phys -> bt_begin );
186
-
187
215
err = dmu_read (os , obj , i * sizeof (bte ), sizeof (bte ),
188
216
& bte , DMU_READ_NO_PREFETCH );
189
217
if (err != 0 )
190
218
break ;
191
219
192
- if (zfs_recover )
220
+ if (zfs_free_leak_on_eio )
193
221
flags |= TRAVERSE_HARD ;
222
+ zfs_dbgmsg ("bptree index %d: traversing from min_txg=%lld "
223
+ "bookmark %lld/%lld/%lld/%lld" ,
224
+ i , (longlong_t )bte .be_birth_txg ,
225
+ (longlong_t )bte .be_zb .zb_objset ,
226
+ (longlong_t )bte .be_zb .zb_object ,
227
+ (longlong_t )bte .be_zb .zb_level ,
228
+ (longlong_t )bte .be_zb .zb_blkid );
194
229
err = traverse_dataset_destroyed (os -> os_spa , & bte .be_bp ,
195
230
bte .be_birth_txg , & bte .be_zb , flags ,
196
231
bptree_visit_cb , & ba );
197
232
if (free ) {
198
- if (err == ERESTART ) {
233
+ /*
234
+ * The callback has freed the visited block pointers.
235
+ * Record our traversal progress on disk, either by
236
+ * updating this record's bookmark, or by logically
237
+ * removing this record by advancing bt_begin.
238
+ */
239
+ if (err != 0 ) {
199
240
/* save bookmark for future resume */
200
241
ASSERT3U (bte .be_zb .zb_objset , = = ,
201
242
ZB_DESTROYED_OBJSET );
202
243
ASSERT0 (bte .be_zb .zb_level );
203
244
dmu_write (os , obj , i * sizeof (bte ),
204
245
sizeof (bte ), & bte , tx );
205
- break ;
206
- }
207
- if (err != 0 ) {
246
+ if (err == EIO || err == ECKSUM ||
247
+ err == ENXIO ) {
248
+ /*
249
+ * Skip the rest of this tree and
250
+ * continue on to the next entry.
251
+ */
252
+ err = 0 ;
253
+ ioerr = B_TRUE ;
254
+ } else {
255
+ break ;
256
+ }
257
+ } else if (ioerr ) {
208
258
/*
209
- * We can not properly handle an i/o
210
- * error, because the traversal code
211
- * does not know how to resume from an
212
- * arbitrary bookmark.
259
+ * This entry is finished, but there were
260
+ * i/o errors on previous entries, so we
261
+ * can't adjust bt_begin. Set this entry's
262
+ * be_birth_txg such that it will be
263
+ * treated as a no-op in future traversals.
213
264
*/
214
- zfs_panic_recover ("error %u from "
215
- "traverse_dataset_destroyed()" , err );
265
+ bte .be_birth_txg = UINT64_MAX ;
266
+ dmu_write (os , obj , i * sizeof (bte ),
267
+ sizeof (bte ), & bte , tx );
216
268
}
217
269
218
- ba .ba_phys -> bt_begin ++ ;
219
- (void ) dmu_free_range (os , obj ,
220
- i * sizeof (bte ), sizeof (bte ), tx );
270
+ if (!ioerr ) {
271
+ ba .ba_phys -> bt_begin ++ ;
272
+ (void ) dmu_free_range (os , obj ,
273
+ i * sizeof (bte ), sizeof (bte ), tx );
274
+ }
275
+ } else if (err != 0 ) {
276
+ break ;
221
277
}
222
278
}
223
279
224
- ASSERT (!free || err != 0 || ba .ba_phys -> bt_begin == ba .ba_phys -> bt_end );
280
+ ASSERT (!free || err != 0 || ioerr ||
281
+ ba .ba_phys -> bt_begin == ba .ba_phys -> bt_end );
225
282
226
283
/* if all blocks are free there should be no used space */
227
284
if (ba .ba_phys -> bt_begin == ba .ba_phys -> bt_end ) {
285
+ if (zfs_free_leak_on_eio ) {
286
+ ba .ba_phys -> bt_bytes = 0 ;
287
+ ba .ba_phys -> bt_comp = 0 ;
288
+ ba .ba_phys -> bt_uncomp = 0 ;
289
+ }
290
+
228
291
ASSERT0 (ba .ba_phys -> bt_bytes );
229
292
ASSERT0 (ba .ba_phys -> bt_comp );
230
293
ASSERT0 (ba .ba_phys -> bt_uncomp );
0 commit comments