@@ -102,13 +102,27 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
102102 return (dmu_object_free (os , obj , tx ));
103103}
104104
/*
 * Report whether the bptree object "obj" in objset "os" has no pending
 * entries, i.e. whether bt_begin == bt_end in its bptree_phys_t.
 *
 * The bptree_phys_t is read from the buffer obtained with dmu_bonus_hold();
 * the hold is taken and dropped within this function (tagged with FTAG),
 * and the comparison result is captured before the buffer is released.
 *
 * Returns the result of the bt_begin == bt_end comparison as a boolean_t.
 */
105+ boolean_t
106+ bptree_is_empty (objset_t * os , uint64_t obj )
107+ {
108+ dmu_buf_t * db ;
109+ bptree_phys_t * bt ;
110+ boolean_t rv ;
111+
112+ VERIFY0 (dmu_bonus_hold (os , obj , FTAG , & db )); /* hold is expected to return 0 */
113+ bt = db -> db_data ;
114+ rv = (bt -> bt_begin == bt -> bt_end ); /* read while db is still held */
115+ dmu_buf_rele (db , FTAG );
116+ return (rv );
117+ }
118+
105119void
106120bptree_add (objset_t * os , uint64_t obj , blkptr_t * bp , uint64_t birth_txg ,
107121 uint64_t bytes , uint64_t comp , uint64_t uncomp , dmu_tx_t * tx )
108122{
109123 dmu_buf_t * db ;
110124 bptree_phys_t * bt ;
111- bptree_entry_phys_t bte ;
125+ bptree_entry_phys_t bte = { 0 } ;
112126
113127 /*
114128 * bptree objects are in the pool mos, therefore they can only be
@@ -122,7 +136,6 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
122136
123137 bte .be_birth_txg = birth_txg ;
124138 bte .be_bp = * bp ;
125- bzero (& bte .be_zb , sizeof (bte .be_zb ));
126139 dmu_write (os , obj , bt -> bt_end * sizeof (bte ), sizeof (bte ), & bte , tx );
127140
128141 dmu_buf_will_dirty (db , tx );
@@ -153,10 +166,27 @@ bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
153166 return (err );
154167}
155168
169+ /*
170+ * If "free" is set:
171+ * - It is assumed that "func" will be freeing the block pointers.
172+ * - If "func" returns nonzero, the bookmark will be remembered and
173+ * iteration will be restarted from this point on next invocation.
174+ * - If an i/o error is encountered (e.g. "func" returns EIO, ECKSUM, or
175+ * ENXIO), bptree_iterate will remember the bookmark, continue
176+ * traversing any additional entries, and return 0.
177+ *
178+ * If "free" is not set, traversal will stop and return an error if
179+ * an i/o error is encountered.
180+ *
181+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
182+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
183+ * traverse_dataset_destroyed()).
184+ */
156185int
157186bptree_iterate (objset_t * os , uint64_t obj , boolean_t free , bptree_itor_t func ,
158187 void * arg , dmu_tx_t * tx )
159188{
189+ boolean_t ioerr = B_FALSE ;
160190 int err ;
161191 uint64_t i ;
162192 dmu_buf_t * db ;
@@ -182,49 +212,82 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
182212 bptree_entry_phys_t bte ;
183213 int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST ;
184214
185- ASSERT (!free || i == ba .ba_phys -> bt_begin );
186-
187215 err = dmu_read (os , obj , i * sizeof (bte ), sizeof (bte ),
188216 & bte , DMU_READ_NO_PREFETCH );
189217 if (err != 0 )
190218 break ;
191219
192- if (zfs_recover )
220+ if (zfs_free_leak_on_eio )
193221 flags |= TRAVERSE_HARD ;
222+ zfs_dbgmsg ("bptree index %d: traversing from min_txg=%lld "
223+ "bookmark %lld/%lld/%lld/%lld" ,
224+ i , (longlong_t )bte .be_birth_txg ,
225+ (longlong_t )bte .be_zb .zb_objset ,
226+ (longlong_t )bte .be_zb .zb_object ,
227+ (longlong_t )bte .be_zb .zb_level ,
228+ (longlong_t )bte .be_zb .zb_blkid );
194229 err = traverse_dataset_destroyed (os -> os_spa , & bte .be_bp ,
195230 bte .be_birth_txg , & bte .be_zb , flags ,
196231 bptree_visit_cb , & ba );
197232 if (free ) {
198- if (err == ERESTART ) {
233+ /*
234+ * The callback has freed the visited block pointers.
235+ * Record our traversal progress on disk, either by
236+ * updating this record's bookmark, or by logically
237+ * removing this record by advancing bt_begin.
238+ */
239+ if (err != 0 ) {
199240 /* save bookmark for future resume */
200241 ASSERT3U (bte .be_zb .zb_objset , = = ,
201242 ZB_DESTROYED_OBJSET );
202243 ASSERT0 (bte .be_zb .zb_level );
203244 dmu_write (os , obj , i * sizeof (bte ),
204245 sizeof (bte ), & bte , tx );
205- break ;
206- }
207- if (err != 0 ) {
246+ if (err == EIO || err == ECKSUM ||
247+ err == ENXIO ) {
248+ /*
249+ * Skip the rest of this tree and
250+ * continue on to the next entry.
251+ */
252+ err = 0 ;
253+ ioerr = B_TRUE ;
254+ } else {
255+ break ;
256+ }
257+ } else if (ioerr ) {
208258 /*
209- * We can not properly handle an i/o
210- * error, because the traversal code
211- * does not know how to resume from an
212- * arbitrary bookmark.
259+ * This entry is finished, but there were
260+ * i/o errors on previous entries, so we
261+ * can't adjust bt_begin. Set this entry's
262+ * be_birth_txg such that it will be
263+ * treated as a no-op in future traversals.
213264 */
214- zfs_panic_recover ("error %u from "
215- "traverse_dataset_destroyed()" , err );
265+ bte .be_birth_txg = UINT64_MAX ;
266+ dmu_write (os , obj , i * sizeof (bte ),
267+ sizeof (bte ), & bte , tx );
216268 }
217269
218- ba .ba_phys -> bt_begin ++ ;
219- (void ) dmu_free_range (os , obj ,
220- i * sizeof (bte ), sizeof (bte ), tx );
270+ if (!ioerr ) {
271+ ba .ba_phys -> bt_begin ++ ;
272+ (void ) dmu_free_range (os , obj ,
273+ i * sizeof (bte ), sizeof (bte ), tx );
274+ }
275+ } else if (err != 0 ) {
276+ break ;
221277 }
222278 }
223279
224- ASSERT (!free || err != 0 || ba .ba_phys -> bt_begin == ba .ba_phys -> bt_end );
280+ ASSERT (!free || err != 0 || ioerr ||
281+ ba .ba_phys -> bt_begin == ba .ba_phys -> bt_end );
225282
226283 /* if all blocks are free there should be no used space */
227284 if (ba .ba_phys -> bt_begin == ba .ba_phys -> bt_end ) {
285+ if (zfs_free_leak_on_eio ) {
286+ ba .ba_phys -> bt_bytes = 0 ;
287+ ba .ba_phys -> bt_comp = 0 ;
288+ ba .ba_phys -> bt_uncomp = 0 ;
289+ }
290+
228291 ASSERT0 (ba .ba_phys -> bt_bytes );
229292 ASSERT0 (ba .ba_phys -> bt_comp );
230293 ASSERT0 (ba .ba_phys -> bt_uncomp );
0 commit comments