Skip to content

Commit e9c1299

Browse files
mrpre authored and gregkh committed
bpf, sockmap: Fix panic when calling skb_linearize
[ Upstream commit 5ca2e29 ]

The panic can be reproduced by executing the command:
./bench sockmap -c 2 -p 1 -a --rx-verdict-ingress --rx-strp 100000

Then a kernel panic was captured:
'''
[ 657.460555] kernel BUG at net/core/skbuff.c:2178!
[ 657.462680] Tainted: [W]=WARN
[ 657.463287] Workqueue: events sk_psock_backlog
...
[ 657.469610]  <TASK>
[ 657.469738]  ? die+0x36/0x90
[ 657.469916]  ? do_trap+0x1d0/0x270
[ 657.470118]  ? pskb_expand_head+0x612/0xf40
[ 657.470376]  ? pskb_expand_head+0x612/0xf40
[ 657.470620]  ? do_error_trap+0xa3/0x170
[ 657.470846]  ? pskb_expand_head+0x612/0xf40
[ 657.471092]  ? handle_invalid_op+0x2c/0x40
[ 657.471335]  ? pskb_expand_head+0x612/0xf40
[ 657.471579]  ? exc_invalid_op+0x2d/0x40
[ 657.471805]  ? asm_exc_invalid_op+0x1a/0x20
[ 657.472052]  ? pskb_expand_head+0xd1/0xf40
[ 657.472292]  ? pskb_expand_head+0x612/0xf40
[ 657.472540]  ? lock_acquire+0x18f/0x4e0
[ 657.472766]  ? find_held_lock+0x2d/0x110
[ 657.472999]  ? __pfx_pskb_expand_head+0x10/0x10
[ 657.473263]  ? __kmalloc_cache_noprof+0x5b/0x470
[ 657.473537]  ? __pfx___lock_release.isra.0+0x10/0x10
[ 657.473826]  __pskb_pull_tail+0xfd/0x1d20
[ 657.474062]  ? __kasan_slab_alloc+0x4e/0x90
[ 657.474707]  sk_psock_skb_ingress_enqueue+0x3bf/0x510
[ 657.475392]  ? __kasan_kmalloc+0xaa/0xb0
[ 657.476010]  sk_psock_backlog+0x5cf/0xd70
[ 657.476637]  process_one_work+0x858/0x1a20
'''

The panic originates from the assertion BUG_ON(skb_shared(skb)) in
skb_linearize().

A previous commit (see Fixes tag) introduced skb_get() to avoid race
conditions between skb operations in the backlog and skb release in the
recvmsg path. However, this caused the panic to always occur when
skb_linearize is executed.

The "--rx-strp 100000" parameter forces the RX path to use the strparser
module which aggregates data until it reaches 100KB before calling sockmap
logic. The 100KB payload exceeds MAX_MSG_FRAGS, triggering skb_linearize.

To fix this issue, just move skb_get into sk_psock_skb_ingress_enqueue.
'''
sk_psock_backlog:
    sk_psock_handle_skb
       skb_get(skb) <== we move it into 'sk_psock_skb_ingress_enqueue'
       sk_psock_skb_ingress____________
                    ↓                  |
                    |                  → sk_psock_skb_ingress_self
                    |                  |
       sk_psock_skb_ingress_enqueue
sk_psock_verdict_apply_________________↑
          skb_linearize
'''

Note that for the verdict_apply path, the skb_get operation is unnecessary,
so we add a 'take_ref' param to control its behavior.

Fixes: a454d84 ("bpf, sockmap: Fix skb refcnt race after locking changes")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://lore.kernel.org/r/20250407142234.47591-4-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent f585747 commit e9c1299

File tree

1 file changed

+16
-15
lines changed

1 file changed

+16
-15
lines changed

net/core/skmsg.c

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -530,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
530530
u32 off, u32 len,
531531
struct sk_psock *psock,
532532
struct sock *sk,
533-
struct sk_msg *msg)
533+
struct sk_msg *msg,
534+
bool take_ref)
534535
{
535536
int num_sge, copied;
536537

538+
/* skb_to_sgvec will fail when the total number of fragments in
539+
* frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
540+
* caller may aggregate multiple skbs.
541+
*/
537542
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
538543
if (num_sge < 0) {
539544
/* skb linearize may fail with ENOMEM, but lets simply try again
540545
* later if this happens. Under memory pressure we don't want to
541546
* drop the skb. We need to linearize the skb so that the mapping
542547
* in skb_to_sgvec can not error.
548+
* Note that skb_linearize requires the skb not to be shared.
543549
*/
544550
if (skb_linearize(skb))
545551
return -EAGAIN;
@@ -556,15 +562,15 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
556562
msg->sg.start = 0;
557563
msg->sg.size = copied;
558564
msg->sg.end = num_sge;
559-
msg->skb = skb;
565+
msg->skb = take_ref ? skb_get(skb) : skb;
560566

561567
sk_psock_queue_msg(psock, msg);
562568
sk_psock_data_ready(sk, psock);
563569
return copied;
564570
}
565571

566572
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
567-
u32 off, u32 len);
573+
u32 off, u32 len, bool take_ref);
568574

569575
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
570576
u32 off, u32 len)
@@ -578,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
578584
* correctly.
579585
*/
580586
if (unlikely(skb->sk == sk))
581-
return sk_psock_skb_ingress_self(psock, skb, off, len);
587+
return sk_psock_skb_ingress_self(psock, skb, off, len, true);
582588
msg = sk_psock_create_ingress_msg(sk, skb);
583589
if (!msg)
584590
return -EAGAIN;
@@ -590,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
590596
* into user buffers.
591597
*/
592598
skb_set_owner_r(skb, sk);
593-
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
599+
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
594600
if (err < 0)
595601
kfree(msg);
596602
return err;
@@ -601,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
601607
* because the skb is already accounted for here.
602608
*/
603609
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
604-
u32 off, u32 len)
610+
u32 off, u32 len, bool take_ref)
605611
{
606612
struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
607613
struct sock *sk = psock->sk;
@@ -610,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
610616
if (unlikely(!msg))
611617
return -EAGAIN;
612618
skb_set_owner_r(skb, sk);
613-
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
619+
err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
614620
if (err < 0)
615621
kfree(msg);
616622
return err;
@@ -619,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
619625
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
620626
u32 off, u32 len, bool ingress)
621627
{
622-
int err = 0;
623-
624628
if (!ingress) {
625629
if (!sock_writeable(psock->sk))
626630
return -EAGAIN;
627631
return skb_send_sock(psock->sk, skb, off, len);
628632
}
629-
skb_get(skb);
630-
err = sk_psock_skb_ingress(psock, skb, off, len);
631-
if (err < 0)
632-
kfree_skb(skb);
633-
return err;
633+
634+
return sk_psock_skb_ingress(psock, skb, off, len);
634635
}
635636

636637
static void sk_psock_skb_state(struct sk_psock *psock,
@@ -1018,7 +1019,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
10181019
off = stm->offset;
10191020
len = stm->full_len;
10201021
}
1021-
err = sk_psock_skb_ingress_self(psock, skb, off, len);
1022+
err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
10221023
}
10231024
if (err < 0) {
10241025
spin_lock_bh(&psock->ingress_lock);

Comments (0)