Skip to content

Commit 8678b10

Browse files
hnazalexdeucher
authored andcommitted
drm/amdgpu: fix deadlock while reading mqd from debugfs
An errant disk backup on my desktop got into debugfs and triggered the following deadlock scenario in the amdgpu debugfs files. The machine also hard-resets immediately after those lines are printed (although I wasn't able to reproduce that part when reading by hand): [ 1318.016074][ T1082] ====================================================== [ 1318.016607][ T1082] WARNING: possible circular locking dependency detected [ 1318.017107][ T1082] 6.8.0-rc7-00015-ge0c8221b72c0 #17 Not tainted [ 1318.017598][ T1082] ------------------------------------------------------ [ 1318.018096][ T1082] tar/1082 is trying to acquire lock: [ 1318.018585][ T1082] ffff98c44175d6a0 (&mm->mmap_lock){++++}-{3:3}, at: __might_fault+0x40/0x80 [ 1318.019084][ T1082] [ 1318.019084][ T1082] but task is already holding lock: [ 1318.020052][ T1082] ffff98c4c13f55f8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: amdgpu_debugfs_mqd_read+0x6a/0x250 [amdgpu] [ 1318.020607][ T1082] [ 1318.020607][ T1082] which lock already depends on the new lock. [ 1318.020607][ T1082] [ 1318.022081][ T1082] [ 1318.022081][ T1082] the existing dependency chain (in reverse order) is: [ 1318.023083][ T1082] [ 1318.023083][ T1082] -> #2 (reservation_ww_class_mutex){+.+.}-{3:3}: [ 1318.024114][ T1082] __ww_mutex_lock.constprop.0+0xe0/0x12f0 [ 1318.024639][ T1082] ww_mutex_lock+0x32/0x90 [ 1318.025161][ T1082] dma_resv_lockdep+0x18a/0x330 [ 1318.025683][ T1082] do_one_initcall+0x6a/0x350 [ 1318.026210][ T1082] kernel_init_freeable+0x1a3/0x310 [ 1318.026728][ T1082] kernel_init+0x15/0x1a0 [ 1318.027242][ T1082] ret_from_fork+0x2c/0x40 [ 1318.027759][ T1082] ret_from_fork_asm+0x11/0x20 [ 1318.028281][ T1082] [ 1318.028281][ T1082] -> #1 (reservation_ww_class_acquire){+.+.}-{0:0}: [ 1318.029297][ T1082] dma_resv_lockdep+0x16c/0x330 [ 1318.029790][ T1082] do_one_initcall+0x6a/0x350 [ 1318.030263][ T1082] kernel_init_freeable+0x1a3/0x310 [ 1318.030722][ T1082] kernel_init+0x15/0x1a0 [ 1318.031168][ T1082] ret_from_fork+0x2c/0x40 [ 1318.031598][ T1082] ret_from_fork_asm+0x11/0x20 [ 1318.032011][ T1082] [ 1318.032011][ T1082] -> #0 (&mm->mmap_lock){++++}-{3:3}: [ 1318.032778][ T1082] __lock_acquire+0x14bf/0x2680 [ 1318.033141][ T1082] lock_acquire+0xcd/0x2c0 [ 1318.033487][ T1082] __might_fault+0x58/0x80 [ 1318.033814][ T1082] amdgpu_debugfs_mqd_read+0x103/0x250 [amdgpu] [ 1318.034181][ T1082] full_proxy_read+0x55/0x80 [ 1318.034487][ T1082] vfs_read+0xa7/0x360 [ 1318.034788][ T1082] ksys_read+0x70/0xf0 [ 1318.035085][ T1082] do_syscall_64+0x94/0x180 [ 1318.035375][ T1082] entry_SYSCALL_64_after_hwframe+0x46/0x4e [ 1318.035664][ T1082] [ 1318.035664][ T1082] other info that might help us debug this: [ 1318.035664][ T1082] [ 1318.036487][ T1082] Chain exists of: [ 1318.036487][ T1082] &mm->mmap_lock --> reservation_ww_class_acquire --> reservation_ww_class_mutex [ 1318.036487][ T1082] [ 1318.037310][ T1082] Possible unsafe locking scenario: [ 1318.037310][ T1082] [ 1318.037838][ T1082] CPU0 CPU1 [ 1318.038101][ T1082] ---- ---- [ 1318.038350][ T1082] lock(reservation_ww_class_mutex); [ 1318.038590][ T1082] lock(reservation_ww_class_acquire); [ 1318.038839][ T1082] lock(reservation_ww_class_mutex); [ 1318.039083][ T1082] rlock(&mm->mmap_lock); [ 1318.039328][ T1082] [ 1318.039328][ T1082] *** DEADLOCK *** [ 1318.039328][ T1082] [ 1318.040029][ T1082] 1 lock held by tar/1082: [ 1318.040259][ T1082] #0: ffff98c4c13f55f8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: amdgpu_debugfs_mqd_read+0x6a/0x250 [amdgpu] [ 1318.040560][ T1082] [ 1318.040560][ T1082] stack backtrace: [ 1318.041053][ T1082] CPU: 22 PID: 1082 Comm: tar Not tainted 6.8.0-rc7-00015-ge0c8221b72c0 #17 3316c85d50e282c5643b075d1f01a4f6365e39c2 [ 1318.041329][ T1082] Hardware name: Gigabyte Technology Co., Ltd. B650 AORUS PRO AX/B650 AORUS PRO AX, BIOS F20 12/14/2023 [ 1318.041614][ T1082] Call Trace: [ 1318.041895][ T1082] <TASK> [ 1318.042175][ T1082] dump_stack_lvl+0x4a/0x80 [ 1318.042460][ T1082] check_noncircular+0x145/0x160 [ 1318.042743][ T1082] __lock_acquire+0x14bf/0x2680 [ 1318.043022][ T1082] lock_acquire+0xcd/0x2c0 [ 1318.043301][ T1082] ? __might_fault+0x40/0x80 [ 1318.043580][ T1082] ? __might_fault+0x40/0x80 [ 1318.043856][ T1082] __might_fault+0x58/0x80 [ 1318.044131][ T1082] ? __might_fault+0x40/0x80 [ 1318.044408][ T1082] amdgpu_debugfs_mqd_read+0x103/0x250 [amdgpu 8fe2afaa910cbd7654c8cab23563a94d6caebaab] [ 1318.044749][ T1082] full_proxy_read+0x55/0x80 [ 1318.045042][ T1082] vfs_read+0xa7/0x360 [ 1318.045333][ T1082] ksys_read+0x70/0xf0 [ 1318.045623][ T1082] do_syscall_64+0x94/0x180 [ 1318.045913][ T1082] ? do_syscall_64+0xa0/0x180 [ 1318.046201][ T1082] ? lockdep_hardirqs_on+0x7d/0x100 [ 1318.046487][ T1082] ? do_syscall_64+0xa0/0x180 [ 1318.046773][ T1082] ? do_syscall_64+0xa0/0x180 [ 1318.047057][ T1082] ? do_syscall_64+0xa0/0x180 [ 1318.047337][ T1082] ? do_syscall_64+0xa0/0x180 [ 1318.047611][ T1082] entry_SYSCALL_64_after_hwframe+0x46/0x4e [ 1318.047887][ T1082] RIP: 0033:0x7f480b70a39d [ 1318.048162][ T1082] Code: 91 ba 0d 00 f7 d8 64 89 02 b8 ff ff ff ff eb b2 e8 18 a3 01 00 0f 1f 84 00 00 00 00 00 80 3d a9 3c 0e 00 00 74 17 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 5b c3 66 2e 0f 1f 84 00 00 00 00 00 53 48 83 [ 1318.048769][ T1082] RSP: 002b:00007ffde77f5c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 1318.049083][ T1082] RAX: ffffffffffffffda RBX: 0000000000000800 RCX: 00007f480b70a39d [ 1318.049392][ T1082] RDX: 0000000000000800 RSI: 000055c9f2120c00 RDI: 0000000000000008 [ 1318.049703][ T1082] RBP: 0000000000000800 R08: 000055c9f2120a94 R09: 0000000000000007 [ 1318.050011][ T1082] R10: 0000000000000000 R11: 0000000000000246 R12: 000055c9f2120c00 [ 1318.050324][ T1082] R13: 0000000000000008 R14: 0000000000000008 R15: 0000000000000800 [ 1318.050638][ T1082] </TASK> amdgpu_debugfs_mqd_read() holds a reservation when it calls put_user(), which may fault and acquire the mmap_sem. This violates the established locking order. Bounce the mqd data through a kernel buffer to get put_user() out of the illegal section. Fixes: 445d85e ("drm/amdgpu: add debugfs interface for reading MQDs") Cc: stable@vger.kernel.org # v6.5+ Reviewed-by: Shashank Sharma <shashank.sharma@amd.com> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 68a2afb commit 8678b10

File tree

1 file changed

+29
-17
lines changed

1 file changed

+29
-17
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

+29-17
Original file line numberDiff line numberDiff line change
@@ -524,46 +524,58 @@ static ssize_t amdgpu_debugfs_mqd_read(struct file *f, char __user *buf,
524524
{
525525
struct amdgpu_ring *ring = file_inode(f)->i_private;
526526
volatile u32 *mqd;
527-
int r;
527+
u32 *kbuf;
528+
int r, i;
528529
uint32_t value, result;
529530

530531
if (*pos & 3 || size & 3)
531532
return -EINVAL;
532533

533-
result = 0;
534+
kbuf = kmalloc(ring->mqd_size, GFP_KERNEL);
535+
if (!kbuf)
536+
return -ENOMEM;
534537

535538
r = amdgpu_bo_reserve(ring->mqd_obj, false);
536539
if (unlikely(r != 0))
537-
return r;
540+
goto err_free;
538541

539542
r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&mqd);
540-
if (r) {
541-
amdgpu_bo_unreserve(ring->mqd_obj);
542-
return r;
543-
}
543+
if (r)
544+
goto err_unreserve;
544545

546+
/*
547+
* Copy to local buffer to avoid put_user(), which might fault
548+
* and acquire mmap_sem, under reservation_ww_class_mutex.
549+
*/
550+
for (i = 0; i < ring->mqd_size/sizeof(u32); i++)
551+
kbuf[i] = mqd[i];
552+
553+
amdgpu_bo_kunmap(ring->mqd_obj);
554+
amdgpu_bo_unreserve(ring->mqd_obj);
555+
556+
result = 0;
545557
while (size) {
546558
if (*pos >= ring->mqd_size)
547-
goto done;
559+
break;
548560

549-
value = mqd[*pos/4];
561+
value = kbuf[*pos/4];
550562
r = put_user(value, (uint32_t *)buf);
551563
if (r)
552-
goto done;
564+
goto err_free;
553565
buf += 4;
554566
result += 4;
555567
size -= 4;
556568
*pos += 4;
557569
}
558570

559-
done:
560-
amdgpu_bo_kunmap(ring->mqd_obj);
561-
mqd = NULL;
562-
amdgpu_bo_unreserve(ring->mqd_obj);
563-
if (r)
564-
return r;
565-
571+
kfree(kbuf);
566572
return result;
573+
574+
err_unreserve:
575+
amdgpu_bo_unreserve(ring->mqd_obj);
576+
err_free:
577+
kfree(kbuf);
578+
return r;
567579
}
568580

569581
static const struct file_operations amdgpu_debugfs_mqd_fops = {

0 commit comments

Comments
 (0)