Skip to content

Commit 04c9c7f

Browse files
foxdrodd authored and NipaLocal committed
net: ps3_gelic_net: handle skb allocation failures
Handle skb allocation failures in RX path, to avoid NULL pointer dereference and RX stalls under memory pressure. If the refill fails with -ENOMEM, complete napi polling and wake up later to retry via timer. Also explicitly re-enable RX DMA after oom, so the dmac doesn't remain stopped in this situation.

Previously, memory pressure could lead to skb allocation failures and subsequent Oops like:

  Oops: Kernel access of bad area, sig: 11 [#2]
  Hardware name: SonyPS3 Cell Broadband Engine 0x701000 PS3
  NIP [c0003d0000065900] gelic_net_poll+0x6c/0x2d0 [ps3_gelic] (unreliable)
  LR [c0003d00000659c4] gelic_net_poll+0x130/0x2d0 [ps3_gelic]
  Call Trace:
    gelic_net_poll+0x130/0x2d0 [ps3_gelic] (unreliable)
    __napi_poll+0x44/0x168
    net_rx_action+0x178/0x290

Steps to reproduce the issue:
  1. Start continuous network traffic, like scp of a 20GB file
  2. Inject failslab errors using the kernel fault injection:
       echo -1 > /sys/kernel/debug/failslab/times
       echo 30 > /sys/kernel/debug/failslab/interval
       echo 100 > /sys/kernel/debug/failslab/probability
  3. After some time, traces start to appear, kernel Oopses and the system stops

Step 2 is not always necessary, as it is usually already triggered by the transfer of a big enough file.

Fixes: 02c1889 ("ps3: gigabit ethernet driver for PS3, take3")
Signed-off-by: Florian Fuchs <fuchsfl@gmail.com>
Signed-off-by: NipaLocal <nipa@local>
1 parent 86bbed8 commit 04c9c7f

File tree

2 files changed

+42
-13
lines changed

2 files changed

+42
-13
lines changed

drivers/net/ethernet/toshiba/ps3_gelic_net.c

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ void gelic_card_down(struct gelic_card *card)
259259
mutex_lock(&card->updown_lock);
260260
if (atomic_dec_if_positive(&card->users) == 0) {
261261
pr_debug("%s: real do\n", __func__);
262+
timer_delete_sync(&card->rx_oom_timer);
262263
napi_disable(&card->napi);
263264
/*
264265
* Disable irq. Wireless interrupts will
@@ -970,7 +971,8 @@ static void gelic_net_pass_skb_up(struct gelic_descr *descr,
970971
* gelic_card_decode_one_descr - processes an rx descriptor
971972
* @card: card structure
972973
*
973-
* returns 1 if a packet has been sent to the stack, otherwise 0
974+
* returns 1 if a packet has been sent to the stack, -ENOMEM on skb alloc
975+
* failure, otherwise 0
974976
*
975977
* processes an rx descriptor by iommu-unmapping the data buffer and passing
976978
* the packet up to the stack
@@ -981,16 +983,17 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
981983
struct gelic_descr_chain *chain = &card->rx_chain;
982984
struct gelic_descr *descr = chain->head;
983985
struct net_device *netdev = NULL;
984-
int dmac_chain_ended;
986+
int dmac_chain_ended = 0;
985987

986988
status = gelic_descr_get_status(descr);
987989

988990
if (status == GELIC_DESCR_DMA_CARDOWNED)
989991
return 0;
990992

991-
if (status == GELIC_DESCR_DMA_NOT_IN_USE) {
993+
if (status == GELIC_DESCR_DMA_NOT_IN_USE || !descr->skb) {
992994
dev_dbg(ctodev(card), "dormant descr? %p\n", descr);
993-
return 0;
995+
dmac_chain_ended = 1;
996+
goto refill;
994997
}
995998

996999
/* netdevice select */
@@ -1048,9 +1051,10 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
10481051
refill:
10491052

10501053
/* is the current descriptor terminated with next_descr == NULL? */
1051-
dmac_chain_ended =
1052-
be32_to_cpu(descr->hw_regs.dmac_cmd_status) &
1053-
GELIC_DESCR_RX_DMA_CHAIN_END;
1054+
if (!dmac_chain_ended)
1055+
dmac_chain_ended =
1056+
be32_to_cpu(descr->hw_regs.dmac_cmd_status) &
1057+
GELIC_DESCR_RX_DMA_CHAIN_END;
10541058
/*
10551059
* So that always DMAC can see the end
10561060
* of the descriptor chain to avoid
@@ -1062,10 +1066,12 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
10621066
gelic_descr_set_status(descr, GELIC_DESCR_DMA_NOT_IN_USE);
10631067

10641068
/*
1065-
* this call can fail, but for now, just leave this
1066-
* descriptor without skb
1069+
* this call can fail, propagate the error
10671070
*/
1068-
gelic_descr_prepare_rx(card, descr);
1071+
int ret = gelic_descr_prepare_rx(card, descr);
1072+
1073+
if (ret)
1074+
return ret;
10691075

10701076
chain->tail = descr;
10711077
chain->head = descr->next;
@@ -1087,6 +1093,17 @@ static int gelic_card_decode_one_descr(struct gelic_card *card)
10871093
return 1;
10881094
}
10891095

1096+
/**
1097+
* gelic_rx_oom_timer - reschedule NAPI polling when the RX OOM retry timer fires
1098+
* @t: the rx_oom_timer member embedded in the owning struct gelic_card
1099+
*/
1100+
static void gelic_rx_oom_timer(struct timer_list *t)
1101+
{
1102+
struct gelic_card *card = timer_container_of(card, t, rx_oom_timer);
1103+
1104+
napi_schedule(&card->napi);
1105+
}
1106+
10901107
/**
10911108
* gelic_net_poll - NAPI poll function called by the stack to return packets
10921109
* @napi: napi structure
@@ -1099,12 +1116,21 @@ static int gelic_net_poll(struct napi_struct *napi, int budget)
10991116
{
11001117
struct gelic_card *card = container_of(napi, struct gelic_card, napi);
11011118
int packets_done = 0;
1119+
int work_result = 0;
11021120

11031121
while (packets_done < budget) {
1104-
if (!gelic_card_decode_one_descr(card))
1105-
break;
1122+
work_result = gelic_card_decode_one_descr(card);
1123+
if (work_result == 1) {
1124+
packets_done++;
1125+
continue;
1126+
}
1127+
break;
1128+
}
11061129

1107-
packets_done++;
1130+
if (work_result == -ENOMEM) {
1131+
napi_complete_done(napi, packets_done);
1132+
mod_timer(&card->rx_oom_timer, jiffies + 1);
1133+
return packets_done;
11081134
}
11091135

11101136
if (packets_done < budget) {
@@ -1576,6 +1602,8 @@ static struct gelic_card *gelic_alloc_card_net(struct net_device **netdev)
15761602
mutex_init(&card->updown_lock);
15771603
atomic_set(&card->users, 0);
15781604

1605+
timer_setup(&card->rx_oom_timer, gelic_rx_oom_timer, 0);
1606+
15791607
return card;
15801608
}
15811609

drivers/net/ethernet/toshiba/ps3_gelic_net.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ struct gelic_vlan_id {
268268
struct gelic_card {
269269
struct napi_struct napi;
270270
struct net_device *netdev[GELIC_PORT_MAX];
271+
struct timer_list rx_oom_timer;
271272
/*
272273
* hypervisor requires irq_status should be
273274
* 8 bytes aligned, but u64 member is

0 commit comments

Comments
 (0)