Skip to content

Commit b2f963b

Browse files
Vladimir DavydovJeff Kirsher
authored andcommitted
e1000: fix lockdep warning in e1000_reset_task
The patch fixes the following lockdep warning, which is 100% reproducible on network restart: ====================================================== [ INFO: possible circular locking dependency detected ] 3.12.0+ torvalds#47 Tainted: GF ------------------------------------------------------- kworker/1:1/27 is trying to acquire lock: ((&(&adapter->watchdog_task)->work)){+.+...}, at: [<ffffffff8108a5b0>] flush_work+0x0/0x70 but task is already holding lock: (&adapter->mutex){+.+...}, at: [<ffffffffa0177c0a>] e1000_reset_task+0x4a/0xa0 [e1000] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&adapter->mutex){+.+...}: [<ffffffff810bdb5d>] lock_acquire+0x9d/0x120 [<ffffffff816b8cbc>] mutex_lock_nested+0x4c/0x390 [<ffffffffa017233d>] e1000_watchdog+0x7d/0x5b0 [e1000] [<ffffffff8108b972>] process_one_work+0x1d2/0x510 [<ffffffff8108ca80>] worker_thread+0x120/0x3a0 [<ffffffff81092c1e>] kthread+0xee/0x110 [<ffffffff816c3d7c>] ret_from_fork+0x7c/0xb0 -> #0 ((&(&adapter->watchdog_task)->work)){+.+...}: [<ffffffff810bd9c0>] __lock_acquire+0x1710/0x1810 [<ffffffff810bdb5d>] lock_acquire+0x9d/0x120 [<ffffffff8108a5eb>] flush_work+0x3b/0x70 [<ffffffff8108b5d8>] __cancel_work_timer+0x98/0x140 [<ffffffff8108b693>] cancel_delayed_work_sync+0x13/0x20 [<ffffffffa0170cec>] e1000_down_and_stop+0x3c/0x60 [e1000] [<ffffffffa01775b1>] e1000_down+0x131/0x220 [e1000] [<ffffffffa0177c12>] e1000_reset_task+0x52/0xa0 [e1000] [<ffffffff8108b972>] process_one_work+0x1d2/0x510 [<ffffffff8108ca80>] worker_thread+0x120/0x3a0 [<ffffffff81092c1e>] kthread+0xee/0x110 [<ffffffff816c3d7c>] ret_from_fork+0x7c/0xb0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&adapter->mutex); lock((&(&adapter->watchdog_task)->work)); lock(&adapter->mutex); lock((&(&adapter->watchdog_task)->work)); *** DEADLOCK *** 3 locks held by kworker/1:1/27: #0: (events){.+.+.+}, at: [<ffffffff8108b906>] process_one_work+0x166/0x510 #1: ((&adapter->reset_task)){+.+...}, at: [<ffffffff8108b906>] process_one_work+0x166/0x510 #2: (&adapter->mutex){+.+...}, at: [<ffffffffa0177c0a>] e1000_reset_task+0x4a/0xa0 [e1000] stack backtrace: CPU: 1 PID: 27 Comm: kworker/1:1 Tainted: GF 3.12.0+ torvalds#47 Hardware name: System manufacturer System Product Name/P5B-VM SE, BIOS 0501 05/31/2007 Workqueue: events e1000_reset_task [e1000] ffffffff820f6000 ffff88007b9dba98 ffffffff816b54a2 0000000000000002 ffffffff820f5e50 ffff88007b9dbae8 ffffffff810ba936 ffff88007b9dbac8 ffff88007b9dbb48 ffff88007b9d8f00 ffff88007b9d8780 ffff88007b9d8f00 Call Trace: [<ffffffff816b54a2>] dump_stack+0x49/0x5f [<ffffffff810ba936>] print_circular_bug+0x216/0x310 [<ffffffff810bd9c0>] __lock_acquire+0x1710/0x1810 [<ffffffff8108a5b0>] ? __flush_work+0x250/0x250 [<ffffffff810bdb5d>] lock_acquire+0x9d/0x120 [<ffffffff8108a5b0>] ? __flush_work+0x250/0x250 [<ffffffff8108a5eb>] flush_work+0x3b/0x70 [<ffffffff8108a5b0>] ? __flush_work+0x250/0x250 [<ffffffff8108b5d8>] __cancel_work_timer+0x98/0x140 [<ffffffff8108b693>] cancel_delayed_work_sync+0x13/0x20 [<ffffffffa0170cec>] e1000_down_and_stop+0x3c/0x60 [e1000] [<ffffffffa01775b1>] e1000_down+0x131/0x220 [e1000] [<ffffffffa0177c12>] e1000_reset_task+0x52/0xa0 [e1000] [<ffffffff8108b972>] process_one_work+0x1d2/0x510 [<ffffffff8108b906>] ? process_one_work+0x166/0x510 [<ffffffff8108ca80>] worker_thread+0x120/0x3a0 [<ffffffff8108c960>] ? manage_workers+0x2c0/0x2c0 [<ffffffff81092c1e>] kthread+0xee/0x110 [<ffffffff81092b30>] ? __init_kthread_worker+0x70/0x70 [<ffffffff816c3d7c>] ret_from_fork+0x7c/0xb0 [<ffffffff81092b30>] ? __init_kthread_worker+0x70/0x70 == The issue background == The problem occurs, because e1000_down(), which is called under adapter->mutex by e1000_reset_task(), tries to synchronously cancel e1000 auxiliary works (reset_task, watchdog_task, phy_info_task, fifo_stall_task), which take adapter->mutex in their handlers. So the question is what does adapter->mutex protect there? The adapter->mutex was introduced by commit 0ef4ee ("e1000: convert to private mutex from rtnl") as a replacement for rtnl_lock() taken in the asynchronous handlers. It targeted on fixing a similar lockdep warning issued when e1000_down() was called under rtnl_lock(), and it fixed it, but unfortunately it introduced the lockdep warning described above. Anyway, that said the source of this bug is that the asynchronous works were made to take rtnl_lock() some time ago, so let's look deeper and find why it was added there. The rtnl_lock() was added to asynchronous handlers by commit 338c15 ("e1000: fix occasional panic on unload") in order to prevent asynchronous handlers from execution after the module is unloaded (e1000_down() is called) as it follows from the comment to the commit: > Net drivers in general have an issue where timers fired > by mod_timer or work threads with schedule_work are running > outside of the rtnl_lock. > > With no other lock protection these routines are vulnerable > to races with driver unload or reset paths. > > The longer term solution to this might be a redesign with > safer locks being taken in the driver to guarantee no > reentrance, but for now a safe and effective fix is > to take the rtnl_lock in these routines. I'm not sure if this locking scheme fixed the problem or just made it unlikely, although I incline to the latter. Anyway, this was long time ago when e1000 auxiliary works were implemented as timers scheduling real work handlers in their routines. The e1000_down() function only canceled the timers, but left the real handlers running if they were running, which could result in work execution after module unload. Today, the e1000 driver uses sane delayed works instead of the pair timer+work to implement its delayed asynchronous handlers, and the e1000_down() synchronously cancels all the works so that the problem that commit 338c15 tried to cope with disappeared, and we don't need any locks in the handlers any more. Moreover, any locking there can potentially result in a deadlock. So, this patch reverts commits 0ef4ee and 338c15. Fixes: 0ef4eed ("e1000: convert to private mutex from rtnl") Fixes: 338c15e ("e1000: fix occasional panic on unload") Cc: Tushar Dave <tushar.n.dave@intel.com> Cc: Patrick McHardy <kaber@trash.net> Signed-off-by: Vladimir Davydov <vdavydov@parallels.com> Tested-by: Aaron Brown <aaron.f.brown@intel.com> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
1 parent 6a7d64e commit b2f963b

File tree

2 files changed

+3
-35
lines changed

2 files changed

+3
-35
lines changed

drivers/net/ethernet/intel/e1000/e1000.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,6 @@ struct e1000_adapter {
317317
struct delayed_work watchdog_task;
318318
struct delayed_work fifo_stall_task;
319319
struct delayed_work phy_info_task;
320-
321-
struct mutex mutex;
322320
};
323321

324322
enum e1000_state_t {

drivers/net/ethernet/intel/e1000/e1000_main.c

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -544,21 +544,8 @@ void e1000_down(struct e1000_adapter *adapter)
544544
e1000_clean_all_rx_rings(adapter);
545545
}
546546

547-
static void e1000_reinit_safe(struct e1000_adapter *adapter)
548-
{
549-
while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
550-
msleep(1);
551-
mutex_lock(&adapter->mutex);
552-
e1000_down(adapter);
553-
e1000_up(adapter);
554-
mutex_unlock(&adapter->mutex);
555-
clear_bit(__E1000_RESETTING, &adapter->flags);
556-
}
557-
558547
void e1000_reinit_locked(struct e1000_adapter *adapter)
559548
{
560-
/* if rtnl_lock is not held the call path is bogus */
561-
ASSERT_RTNL();
562549
WARN_ON(in_interrupt());
563550
while (test_and_set_bit(__E1000_RESETTING, &adapter->flags))
564551
msleep(1);
@@ -1316,7 +1303,6 @@ static int e1000_sw_init(struct e1000_adapter *adapter)
13161303
e1000_irq_disable(adapter);
13171304

13181305
spin_lock_init(&adapter->stats_lock);
1319-
mutex_init(&adapter->mutex);
13201306

13211307
set_bit(__E1000_DOWN, &adapter->flags);
13221308

@@ -2329,11 +2315,8 @@ static void e1000_update_phy_info_task(struct work_struct *work)
23292315
struct e1000_adapter *adapter = container_of(work,
23302316
struct e1000_adapter,
23312317
phy_info_task.work);
2332-
if (test_bit(__E1000_DOWN, &adapter->flags))
2333-
return;
2334-
mutex_lock(&adapter->mutex);
2318+
23352319
e1000_phy_get_info(&adapter->hw, &adapter->phy_info);
2336-
mutex_unlock(&adapter->mutex);
23372320
}
23382321

23392322
/**
@@ -2349,9 +2332,6 @@ static void e1000_82547_tx_fifo_stall_task(struct work_struct *work)
23492332
struct net_device *netdev = adapter->netdev;
23502333
u32 tctl;
23512334

2352-
if (test_bit(__E1000_DOWN, &adapter->flags))
2353-
return;
2354-
mutex_lock(&adapter->mutex);
23552335
if (atomic_read(&adapter->tx_fifo_stall)) {
23562336
if ((er32(TDT) == er32(TDH)) &&
23572337
(er32(TDFT) == er32(TDFH)) &&
@@ -2372,7 +2352,6 @@ static void e1000_82547_tx_fifo_stall_task(struct work_struct *work)
23722352
schedule_delayed_work(&adapter->fifo_stall_task, 1);
23732353
}
23742354
}
2375-
mutex_unlock(&adapter->mutex);
23762355
}
23772356

23782357
bool e1000_has_link(struct e1000_adapter *adapter)
@@ -2426,10 +2405,6 @@ static void e1000_watchdog(struct work_struct *work)
24262405
struct e1000_tx_ring *txdr = adapter->tx_ring;
24272406
u32 link, tctl;
24282407

2429-
if (test_bit(__E1000_DOWN, &adapter->flags))
2430-
return;
2431-
2432-
mutex_lock(&adapter->mutex);
24332408
link = e1000_has_link(adapter);
24342409
if ((netif_carrier_ok(netdev)) && link)
24352410
goto link_up;
@@ -2520,7 +2495,7 @@ static void e1000_watchdog(struct work_struct *work)
25202495
adapter->tx_timeout_count++;
25212496
schedule_work(&adapter->reset_task);
25222497
/* exit immediately since reset is imminent */
2523-
goto unlock;
2498+
return;
25242499
}
25252500
}
25262501

@@ -2548,9 +2523,6 @@ static void e1000_watchdog(struct work_struct *work)
25482523
/* Reschedule the task */
25492524
if (!test_bit(__E1000_DOWN, &adapter->flags))
25502525
schedule_delayed_work(&adapter->watchdog_task, 2 * HZ);
2551-
2552-
unlock:
2553-
mutex_unlock(&adapter->mutex);
25542526
}
25552527

25562528
enum latency_range {
@@ -3499,10 +3471,8 @@ static void e1000_reset_task(struct work_struct *work)
34993471
struct e1000_adapter *adapter =
35003472
container_of(work, struct e1000_adapter, reset_task);
35013473

3502-
if (test_bit(__E1000_DOWN, &adapter->flags))
3503-
return;
35043474
e_err(drv, "Reset adapter\n");
3505-
e1000_reinit_safe(adapter);
3475+
e1000_reinit_locked(adapter);
35063476
}
35073477

35083478
/**

0 commit comments

Comments
 (0)