@@ -412,25 +412,29 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
412
412
413
413
static void __btrfs_remove_delayed_item (struct btrfs_delayed_item * delayed_item )
414
414
{
415
+ struct btrfs_delayed_node * delayed_node = delayed_item -> delayed_node ;
415
416
struct rb_root_cached * root ;
416
417
struct btrfs_delayed_root * delayed_root ;
417
418
418
419
/* Not inserted, ignore it. */
419
420
if (RB_EMPTY_NODE (& delayed_item -> rb_node ))
420
421
return ;
421
422
422
- delayed_root = delayed_item -> delayed_node -> root -> fs_info -> delayed_root ;
423
+ /* If it's in a rbtree, then we need to have delayed node locked. */
424
+ lockdep_assert_held (& delayed_node -> mutex );
425
+
426
+ delayed_root = delayed_node -> root -> fs_info -> delayed_root ;
423
427
424
428
BUG_ON (!delayed_root );
425
429
426
430
if (delayed_item -> type == BTRFS_DELAYED_INSERTION_ITEM )
427
- root = & delayed_item -> delayed_node -> ins_root ;
431
+ root = & delayed_node -> ins_root ;
428
432
else
429
- root = & delayed_item -> delayed_node -> del_root ;
433
+ root = & delayed_node -> del_root ;
430
434
431
435
rb_erase_cached (& delayed_item -> rb_node , root );
432
436
RB_CLEAR_NODE (& delayed_item -> rb_node );
433
- delayed_item -> delayed_node -> count -- ;
437
+ delayed_node -> count -- ;
434
438
435
439
finish_one_item (delayed_root );
436
440
}
@@ -1153,20 +1157,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
1153
1157
ret = __btrfs_commit_inode_delayed_items (trans , path ,
1154
1158
curr_node );
1155
1159
if (ret ) {
1156
- btrfs_release_delayed_node (curr_node );
1157
- curr_node = NULL ;
1158
1160
btrfs_abort_transaction (trans , ret );
1159
1161
break ;
1160
1162
}
1161
1163
1162
1164
prev_node = curr_node ;
1163
1165
curr_node = btrfs_next_delayed_node (curr_node );
1166
+ /*
1167
+ * See the comment below about releasing path before releasing
1168
+ * node. If the commit of delayed items was successful the path
1169
+ * should always be released, but in case of an error, it may
1170
+ * point to locked extent buffers (a leaf at the very least).
1171
+ */
1172
+ ASSERT (path -> nodes [0 ] == NULL );
1164
1173
btrfs_release_delayed_node (prev_node );
1165
1174
}
1166
1175
1176
+ /*
1177
+ * Release the path to avoid a potential deadlock and lockdep splat when
1178
+ * releasing the delayed node, as that requires taking the delayed node's
1179
+ * mutex. If another task starts running delayed items before we take
1180
+ * the mutex, it will first lock the mutex and then it may try to lock
1181
+ * the same btree path (leaf).
1182
+ */
1183
+ btrfs_free_path (path );
1184
+
1167
1185
if (curr_node )
1168
1186
btrfs_release_delayed_node (curr_node );
1169
- btrfs_free_path (path );
1170
1187
trans -> block_rsv = block_rsv ;
1171
1188
1172
1189
return ret ;
@@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
1413
1430
btrfs_wq_run_delayed_node (delayed_root , fs_info , BTRFS_DELAYED_BATCH );
1414
1431
}
1415
1432
1416
- /* Will return 0 or -ENOMEM */
1433
+ static void btrfs_release_dir_index_item_space (struct btrfs_trans_handle * trans )
1434
+ {
1435
+ struct btrfs_fs_info * fs_info = trans -> fs_info ;
1436
+ const u64 bytes = btrfs_calc_insert_metadata_size (fs_info , 1 );
1437
+
1438
+ if (test_bit (BTRFS_FS_LOG_RECOVERING , & fs_info -> flags ))
1439
+ return ;
1440
+
1441
+ /*
1442
+ * Adding the new dir index item does not require touching another
1443
+ * leaf, so we can release 1 unit of metadata that was previously
1444
+ * reserved when starting the transaction. This applies only to
1445
+ * the case where we had a transaction start and excludes the
1446
+ * transaction join case (when replaying log trees).
1447
+ */
1448
+ trace_btrfs_space_reservation (fs_info , "transaction" ,
1449
+ trans -> transid , bytes , 0 );
1450
+ btrfs_block_rsv_release (fs_info , trans -> block_rsv , bytes , NULL );
1451
+ ASSERT (trans -> bytes_reserved >= bytes );
1452
+ trans -> bytes_reserved -= bytes ;
1453
+ }
1454
+
1455
+ /* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */
1417
1456
int btrfs_insert_delayed_dir_index (struct btrfs_trans_handle * trans ,
1418
1457
const char * name , int name_len ,
1419
1458
struct btrfs_inode * dir ,
@@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1455
1494
1456
1495
mutex_lock (& delayed_node -> mutex );
1457
1496
1497
+ /*
1498
+ * First attempt to insert the delayed item. This is to make the error
1499
+ * handling path simpler in case we fail (-EEXIST). There's no risk of
1500
+ * any other task coming in and running the delayed item before we do
1501
+ * the metadata space reservation below, because we are holding the
1502
+ * delayed node's mutex and that mutex must also be locked before the
1503
+ * node's delayed items can be run.
1504
+ */
1505
+ ret = __btrfs_add_delayed_item (delayed_node , delayed_item );
1506
+ if (unlikely (ret )) {
1507
+ btrfs_err (trans -> fs_info ,
1508
+ "error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d" ,
1509
+ name_len , name , index , btrfs_root_id (delayed_node -> root ),
1510
+ delayed_node -> inode_id , dir -> index_cnt ,
1511
+ delayed_node -> index_cnt , ret );
1512
+ btrfs_release_delayed_item (delayed_item );
1513
+ btrfs_release_dir_index_item_space (trans );
1514
+ mutex_unlock (& delayed_node -> mutex );
1515
+ goto release_node ;
1516
+ }
1517
+
1458
1518
if (delayed_node -> index_item_leaves == 0 ||
1459
1519
delayed_node -> curr_index_batch_size + data_len > leaf_data_size ) {
1460
1520
delayed_node -> curr_index_batch_size = data_len ;
@@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1472
1532
* impossible.
1473
1533
*/
1474
1534
if (WARN_ON (ret )) {
1475
- mutex_unlock (& delayed_node -> mutex );
1476
1535
btrfs_release_delayed_item (delayed_item );
1536
+ mutex_unlock (& delayed_node -> mutex );
1477
1537
goto release_node ;
1478
1538
}
1479
1539
1480
1540
delayed_node -> index_item_leaves ++ ;
1481
- } else if (!test_bit (BTRFS_FS_LOG_RECOVERING , & fs_info -> flags )) {
1482
- const u64 bytes = btrfs_calc_insert_metadata_size (fs_info , 1 );
1483
-
1484
- /*
1485
- * Adding the new dir index item does not require touching another
1486
- * leaf, so we can release 1 unit of metadata that was previously
1487
- * reserved when starting the transaction. This applies only to
1488
- * the case where we had a transaction start and excludes the
1489
- * transaction join case (when replaying log trees).
1490
- */
1491
- trace_btrfs_space_reservation (fs_info , "transaction" ,
1492
- trans -> transid , bytes , 0 );
1493
- btrfs_block_rsv_release (fs_info , trans -> block_rsv , bytes , NULL );
1494
- ASSERT (trans -> bytes_reserved >= bytes );
1495
- trans -> bytes_reserved -= bytes ;
1496
- }
1497
-
1498
- ret = __btrfs_add_delayed_item (delayed_node , delayed_item );
1499
- if (unlikely (ret )) {
1500
- btrfs_err (trans -> fs_info ,
1501
- "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)" ,
1502
- name_len , name , delayed_node -> root -> root_key .objectid ,
1503
- delayed_node -> inode_id , ret );
1504
- BUG ();
1541
+ } else {
1542
+ btrfs_release_dir_index_item_space (trans );
1505
1543
}
1506
1544
mutex_unlock (& delayed_node -> mutex );
1507
1545
0 commit comments