Skip to content

Commit 6771bfd

Browse files
Florian Westphal authored and davem330 committed
mptcp: update mptcp ack sequence from work queue
If userspace is not reading data, all the mptcp-level acks contain the ack_seq from the last time userspace read data rather than the most recent in-sequence value. This causes pointless retransmissions for data that is already queued. The reason for this is that all the mptcp protocol level processing happens at mptcp_recv time. This adds a work queue to move skbs from the subflow sockets' receive queues to the mptcp socket receive queue (which was not used so far). This allows us to announce the correct mptcp ack sequence in a timely fashion, even when the application does not call recv() on the mptcp socket for some time. We still wake userspace tasks waiting for POLLIN immediately: if the mptcp-level receive queue is empty (because the work queue is still pending) it can be filled from in-sequence subflow sockets at recv time without a need to wait for the worker. The skb_orphan when moving skbs from subflow to mptcp level is needed because the destructor (sock_rfree) relies on the skb->sk (ssk!) lock being taken. A followup patch will add the needed rmem accounting for the moved skbs. Another problem: in case the application behaves as expected and calls recv() as soon as the mptcp socket becomes readable, the work queue will only waste cpu cycles. This will also be addressed in followup patches. Signed-off-by: Florian Westphal <fw@strlen.de> Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 8099201 commit 6771bfd

File tree

1 file changed

+165
-69
lines changed

1 file changed

+165
-69
lines changed

net/mptcp/protocol.c

Lines changed: 165 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ struct mptcp6_sock {
3131
};
3232
#endif
3333

34+
struct mptcp_skb_cb {
35+
u32 offset;
36+
};
37+
38+
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
39+
3440
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
3541
* completed yet or has failed, return the subflow socket.
3642
* Otherwise return NULL.
@@ -111,11 +117,88 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
111117
return NULL;
112118
}
113119

120+
/* Move one skb from the subflow (TCP) receive queue onto the MPTCP-level
 * receive queue and advance the MPTCP ack sequence by @copy_len.
 *
 * @offset: start of not-yet-consumed payload within @skb, recorded in the
 *          skb's control block for later consumption by recvmsg.
 *
 * The skb is orphaned because its destructor (sock_rfree) relies on the
 * skb->sk (i.e. ssk) lock being taken, which is no longer guaranteed once
 * the skb lives on the msk queue.
 *
 * NOTE(review): callers visible here (__mptcp_move_skbs_from_subflow via
 * __mptcp_move_skbs) hold both the msk and the ssk socket locks — confirm
 * this locking contract for any new caller.
 */
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
			     struct sk_buff *skb,
			     unsigned int offset, size_t copy_len)
{
	struct sock *sk = (struct sock *)msk;

	__skb_unlink(skb, &ssk->sk_receive_queue);
	skb_orphan(skb);
	__skb_queue_tail(&sk->sk_receive_queue, skb);

	/* account the moved payload at the MPTCP (data sequence) level */
	msk->ack_seq += copy_len;
	MPTCP_SKB_CB(skb)->offset = offset;
}
133+
134+
/* Drain in-sequence data from subflow @ssk into the msk receive queue.
 *
 * Advances the subflow's copied_seq as data is moved, so the TCP layer
 * considers it consumed.  Sets *bytes to the number of payload bytes
 * moved.  Returns true when draining should stop for this subflow:
 * a FIN was seen, urgent data is pending, or a mapping inconsistency
 * was detected.
 */
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
					   struct sock *ssk,
					   unsigned int *bytes)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int moved = 0;
	bool more_data_avail;
	struct tcp_sock *tp;
	bool done = false;

	tp = tcp_sk(ssk);
	do {
		u32 map_remaining, offset;
		u32 seq = tp->copied_seq;
		struct sk_buff *skb;
		bool fin;

		/* try to move as much data as available */
		map_remaining = subflow->map_data_len -
				mptcp_subflow_get_map_offset(subflow);

		skb = skb_peek(&ssk->sk_receive_queue);
		if (!skb)
			break;

		/* part of the skb below copied_seq has been consumed already */
		offset = seq - TCP_SKB_CB(skb)->seq;
		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
		if (fin) {
			done = true;
			seq++;	/* FIN occupies one sequence number */
		}

		if (offset < skb->len) {
			size_t len = skb->len - offset;

			if (tp->urg_data)
				done = true;

			__mptcp_move_skb(msk, ssk, skb, offset, len);
			seq += len;
			moved += len;

			/* moving past the current DSS mapping would corrupt
			 * the MPTCP-level stream
			 */
			if (WARN_ON_ONCE(map_remaining < len))
				break;
		} else {
			/* no payload left below copied_seq: only a pure FIN
			 * skb should be in this state
			 */
			WARN_ON_ONCE(!fin);
			sk_eat_skb(ssk, skb);
			done = true;
		}

		WRITE_ONCE(tp->copied_seq, seq);
		more_data_avail = mptcp_subflow_data_available(ssk);
	} while (more_data_avail);

	*bytes = moved;

	return done;
}
192+
114193
/* Data-ready callback for the MPTCP socket: flag readable state, kick the
 * worker that moves skbs from subflows to the msk queue, and wake readers.
 */
void mptcp_data_ready(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	set_bit(MPTCP_DATA_READY, &msk->flags);

	/* schedule the worker so the MPTCP-level ack_seq is updated in a
	 * timely fashion even if userspace does not call recv(); take a
	 * socket reference for the pending work (the worker drops it)
	 */
	if (schedule_work(&msk->work))
		sock_hold((struct sock *)msk);

	/* wake POLLIN waiters immediately — recvmsg can pull data straight
	 * from the subflows without waiting for the worker
	 */
	sk->sk_data_ready(sk);
}
121204

@@ -373,19 +456,68 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
373456
remove_wait_queue(sk_sleep(sk), &wait);
374457
}
375458

459+
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
460+
struct msghdr *msg,
461+
size_t len)
462+
{
463+
struct sock *sk = (struct sock *)msk;
464+
struct sk_buff *skb;
465+
int copied = 0;
466+
467+
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
468+
u32 offset = MPTCP_SKB_CB(skb)->offset;
469+
u32 data_len = skb->len - offset;
470+
u32 count = min_t(size_t, len - copied, data_len);
471+
int err;
472+
473+
err = skb_copy_datagram_msg(skb, offset, msg, count);
474+
if (unlikely(err < 0)) {
475+
if (!copied)
476+
return err;
477+
break;
478+
}
479+
480+
copied += count;
481+
482+
if (count < data_len) {
483+
MPTCP_SKB_CB(skb)->offset += count;
484+
break;
485+
}
486+
487+
__skb_unlink(skb, &sk->sk_receive_queue);
488+
__kfree_skb(skb);
489+
490+
if (copied >= len)
491+
break;
492+
}
493+
494+
return copied;
495+
}
496+
497+
/* Drain all currently-readable subflows into the msk receive queue.
 *
 * Loops until either no subflow has data or a subflow reports it is done
 * (FIN/urgent data).  Returns true when data was moved.
 *
 * NOTE(review): @moved is overwritten — not accumulated — on each call to
 * __mptcp_move_skbs_from_subflow() (the callee does *bytes = moved), so
 * the return value reflects only the last drained subflow; confirm this
 * is intended.
 */
static bool __mptcp_move_skbs(struct mptcp_sock *msk)
{
	unsigned int moved = 0;
	bool done;

	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

		if (!ssk)
			break;

		lock_sock(ssk);
		done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
		release_sock(ssk);
	} while (!done);

	return moved > 0;
}
515+
376516
static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
377517
int nonblock, int flags, int *addr_len)
378518
{
379519
struct mptcp_sock *msk = mptcp_sk(sk);
380-
struct mptcp_subflow_context *subflow;
381-
bool more_data_avail = false;
382-
struct mptcp_read_arg arg;
383-
read_descriptor_t desc;
384-
bool wait_data = false;
385520
struct socket *ssock;
386-
struct tcp_sock *tp;
387-
bool done = false;
388-
struct sock *ssk;
389521
int copied = 0;
390522
int target;
391523
long timeo;
@@ -403,65 +535,26 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
403535
return copied;
404536
}
405537

406-
arg.msg = msg;
407-
desc.arg.data = &arg;
408-
desc.error = 0;
409-
410538
timeo = sock_rcvtimeo(sk, nonblock);
411539

412540
len = min_t(size_t, len, INT_MAX);
413541
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
414542

415-
while (!done) {
416-
u32 map_remaining;
543+
while (len > (size_t)copied) {
417544
int bytes_read;
418545

419-
ssk = mptcp_subflow_recv_lookup(msk);
420-
pr_debug("msk=%p ssk=%p", msk, ssk);
421-
if (!ssk)
422-
goto wait_for_data;
546+
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
547+
if (unlikely(bytes_read < 0)) {
548+
if (!copied)
549+
copied = bytes_read;
550+
goto out_err;
551+
}
423552

424-
subflow = mptcp_subflow_ctx(ssk);
425-
tp = tcp_sk(ssk);
553+
copied += bytes_read;
426554

427-
lock_sock(ssk);
428-
do {
429-
/* try to read as much data as available */
430-
map_remaining = subflow->map_data_len -
431-
mptcp_subflow_get_map_offset(subflow);
432-
desc.count = min_t(size_t, len - copied, map_remaining);
433-
pr_debug("reading %zu bytes, copied %d", desc.count,
434-
copied);
435-
bytes_read = tcp_read_sock(ssk, &desc,
436-
mptcp_read_actor);
437-
if (bytes_read < 0) {
438-
if (!copied)
439-
copied = bytes_read;
440-
done = true;
441-
goto next;
442-
}
443-
444-
pr_debug("msk ack_seq=%llx -> %llx", msk->ack_seq,
445-
msk->ack_seq + bytes_read);
446-
msk->ack_seq += bytes_read;
447-
copied += bytes_read;
448-
if (copied >= len) {
449-
done = true;
450-
goto next;
451-
}
452-
if (tp->urg_data && tp->urg_seq == tp->copied_seq) {
453-
pr_err("Urgent data present, cannot proceed");
454-
done = true;
455-
goto next;
456-
}
457-
next:
458-
more_data_avail = mptcp_subflow_data_available(ssk);
459-
} while (more_data_avail && !done);
460-
release_sock(ssk);
461-
continue;
462-
463-
wait_for_data:
464-
more_data_avail = false;
555+
if (skb_queue_empty(&sk->sk_receive_queue) &&
556+
__mptcp_move_skbs(msk))
557+
continue;
465558

466559
/* only the master socket status is relevant here. The exit
467560
* conditions mirror closely tcp_recvmsg()
@@ -502,26 +595,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
502595
}
503596

504597
pr_debug("block timeout %ld", timeo);
505-
wait_data = true;
506598
mptcp_wait_data(sk, &timeo);
507599
if (unlikely(__mptcp_tcp_fallback(msk)))
508600
goto fallback;
509601
}
510602

511-
if (more_data_avail) {
512-
if (!test_bit(MPTCP_DATA_READY, &msk->flags))
513-
set_bit(MPTCP_DATA_READY, &msk->flags);
514-
} else if (!wait_data) {
603+
if (skb_queue_empty(&sk->sk_receive_queue)) {
604+
/* entire backlog drained, clear DATA_READY. */
515605
clear_bit(MPTCP_DATA_READY, &msk->flags);
516606

517-
/* .. race-breaker: ssk might get new data after last
518-
* data_available() returns false.
607+
/* .. race-breaker: ssk might have gotten new data
608+
* after last __mptcp_move_skbs() returned false.
519609
*/
520-
ssk = mptcp_subflow_recv_lookup(msk);
521-
if (unlikely(ssk))
610+
if (unlikely(__mptcp_move_skbs(msk)))
522611
set_bit(MPTCP_DATA_READY, &msk->flags);
612+
} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
613+
/* data to read but mptcp_wait_data() cleared DATA_READY */
614+
set_bit(MPTCP_DATA_READY, &msk->flags);
523615
}
524-
616+
out_err:
525617
release_sock(sk);
526618
return copied;
527619
}
@@ -557,7 +649,7 @@ static void mptcp_worker(struct work_struct *work)
557649
struct sock *sk = &msk->sk.icsk_inet.sk;
558650

559651
lock_sock(sk);
560-
652+
__mptcp_move_skbs(msk);
561653
release_sock(sk);
562654
sock_put(sk);
563655
}
@@ -638,6 +730,8 @@ static void mptcp_close(struct sock *sk, long timeout)
638730

639731
mptcp_cancel_work(sk);
640732

733+
__skb_queue_purge(&sk->sk_receive_queue);
734+
641735
sk_common_release(sk);
642736
}
643737

@@ -1204,6 +1298,8 @@ void mptcp_proto_init(void)
12041298
panic("Failed to register MPTCP proto.\n");
12051299

12061300
inet_register_protosw(&mptcp_protosw);
1301+
1302+
BUILD_BUG_ON(sizeof(struct mptcp_skb_cb) > sizeof_field(struct sk_buff, cb));
12071303
}
12081304

12091305
#if IS_ENABLED(CONFIG_MPTCP_IPV6)

0 commit comments

Comments
 (0)