@@ -31,6 +31,12 @@ struct mptcp6_sock {
3131};
3232#endif
3333
34+ struct mptcp_skb_cb {
35+ u32 offset ;
36+ };
37+
38+ #define MPTCP_SKB_CB (__skb ) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
39+
3440/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
3541 * completed yet or has failed, return the subflow socket.
3642 * Otherwise return NULL.
@@ -111,11 +117,88 @@ static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
111117 return NULL ;
112118}
113119
120+ static void __mptcp_move_skb (struct mptcp_sock * msk , struct sock * ssk ,
121+ struct sk_buff * skb ,
122+ unsigned int offset , size_t copy_len )
123+ {
124+ struct sock * sk = (struct sock * )msk ;
125+
126+ __skb_unlink (skb , & ssk -> sk_receive_queue );
127+ skb_orphan (skb );
128+ __skb_queue_tail (& sk -> sk_receive_queue , skb );
129+
130+ msk -> ack_seq += copy_len ;
131+ MPTCP_SKB_CB (skb )-> offset = offset ;
132+ }
133+
134+ static bool __mptcp_move_skbs_from_subflow (struct mptcp_sock * msk ,
135+ struct sock * ssk ,
136+ unsigned int * bytes )
137+ {
138+ struct mptcp_subflow_context * subflow = mptcp_subflow_ctx (ssk );
139+ unsigned int moved = 0 ;
140+ bool more_data_avail ;
141+ struct tcp_sock * tp ;
142+ bool done = false;
143+
144+ tp = tcp_sk (ssk );
145+ do {
146+ u32 map_remaining , offset ;
147+ u32 seq = tp -> copied_seq ;
148+ struct sk_buff * skb ;
149+ bool fin ;
150+
151+ /* try to move as much data as available */
152+ map_remaining = subflow -> map_data_len -
153+ mptcp_subflow_get_map_offset (subflow );
154+
155+ skb = skb_peek (& ssk -> sk_receive_queue );
156+ if (!skb )
157+ break ;
158+
159+ offset = seq - TCP_SKB_CB (skb )-> seq ;
160+ fin = TCP_SKB_CB (skb )-> tcp_flags & TCPHDR_FIN ;
161+ if (fin ) {
162+ done = true;
163+ seq ++ ;
164+ }
165+
166+ if (offset < skb -> len ) {
167+ size_t len = skb -> len - offset ;
168+
169+ if (tp -> urg_data )
170+ done = true;
171+
172+ __mptcp_move_skb (msk , ssk , skb , offset , len );
173+ seq += len ;
174+ moved += len ;
175+
176+ if (WARN_ON_ONCE (map_remaining < len ))
177+ break ;
178+ } else {
179+ WARN_ON_ONCE (!fin );
180+ sk_eat_skb (ssk , skb );
181+ done = true;
182+ }
183+
184+ WRITE_ONCE (tp -> copied_seq , seq );
185+ more_data_avail = mptcp_subflow_data_available (ssk );
186+ } while (more_data_avail );
187+
188+ * bytes = moved ;
189+
190+ return done ;
191+ }
192+
114193void mptcp_data_ready (struct sock * sk )
115194{
116195 struct mptcp_sock * msk = mptcp_sk (sk );
117196
118197 set_bit (MPTCP_DATA_READY , & msk -> flags );
198+
199+ if (schedule_work (& msk -> work ))
200+ sock_hold ((struct sock * )msk );
201+
119202 sk -> sk_data_ready (sk );
120203}
121204
@@ -373,19 +456,68 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
373456 remove_wait_queue (sk_sleep (sk ), & wait );
374457}
375458
459+ static int __mptcp_recvmsg_mskq (struct mptcp_sock * msk ,
460+ struct msghdr * msg ,
461+ size_t len )
462+ {
463+ struct sock * sk = (struct sock * )msk ;
464+ struct sk_buff * skb ;
465+ int copied = 0 ;
466+
467+ while ((skb = skb_peek (& sk -> sk_receive_queue )) != NULL ) {
468+ u32 offset = MPTCP_SKB_CB (skb )-> offset ;
469+ u32 data_len = skb -> len - offset ;
470+ u32 count = min_t (size_t , len - copied , data_len );
471+ int err ;
472+
473+ err = skb_copy_datagram_msg (skb , offset , msg , count );
474+ if (unlikely (err < 0 )) {
475+ if (!copied )
476+ return err ;
477+ break ;
478+ }
479+
480+ copied += count ;
481+
482+ if (count < data_len ) {
483+ MPTCP_SKB_CB (skb )-> offset += count ;
484+ break ;
485+ }
486+
487+ __skb_unlink (skb , & sk -> sk_receive_queue );
488+ __kfree_skb (skb );
489+
490+ if (copied >= len )
491+ break ;
492+ }
493+
494+ return copied ;
495+ }
496+
497+ static bool __mptcp_move_skbs (struct mptcp_sock * msk )
498+ {
499+ unsigned int moved = 0 ;
500+ bool done ;
501+
502+ do {
503+ struct sock * ssk = mptcp_subflow_recv_lookup (msk );
504+
505+ if (!ssk )
506+ break ;
507+
508+ lock_sock (ssk );
509+ done = __mptcp_move_skbs_from_subflow (msk , ssk , & moved );
510+ release_sock (ssk );
511+ } while (!done );
512+
513+ return moved > 0 ;
514+ }
515+
376516static int mptcp_recvmsg (struct sock * sk , struct msghdr * msg , size_t len ,
377517 int nonblock , int flags , int * addr_len )
378518{
379519 struct mptcp_sock * msk = mptcp_sk (sk );
380- struct mptcp_subflow_context * subflow ;
381- bool more_data_avail = false;
382- struct mptcp_read_arg arg ;
383- read_descriptor_t desc ;
384- bool wait_data = false;
385520 struct socket * ssock ;
386- struct tcp_sock * tp ;
387- bool done = false;
388- struct sock * ssk ;
389521 int copied = 0 ;
390522 int target ;
391523 long timeo ;
@@ -403,65 +535,26 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
403535 return copied ;
404536 }
405537
406- arg .msg = msg ;
407- desc .arg .data = & arg ;
408- desc .error = 0 ;
409-
410538 timeo = sock_rcvtimeo (sk , nonblock );
411539
412540 len = min_t (size_t , len , INT_MAX );
413541 target = sock_rcvlowat (sk , flags & MSG_WAITALL , len );
414542
415- while (!done ) {
416- u32 map_remaining ;
543+ while (len > (size_t )copied ) {
417544 int bytes_read ;
418545
419- ssk = mptcp_subflow_recv_lookup (msk );
420- pr_debug ("msk=%p ssk=%p" , msk , ssk );
421- if (!ssk )
422- goto wait_for_data ;
546+ bytes_read = __mptcp_recvmsg_mskq (msk , msg , len - copied );
547+ if (unlikely (bytes_read < 0 )) {
548+ if (!copied )
549+ copied = bytes_read ;
550+ goto out_err ;
551+ }
423552
424- subflow = mptcp_subflow_ctx (ssk );
425- tp = tcp_sk (ssk );
553+ copied += bytes_read ;
426554
427- lock_sock (ssk );
428- do {
429- /* try to read as much data as available */
430- map_remaining = subflow -> map_data_len -
431- mptcp_subflow_get_map_offset (subflow );
432- desc .count = min_t (size_t , len - copied , map_remaining );
433- pr_debug ("reading %zu bytes, copied %d" , desc .count ,
434- copied );
435- bytes_read = tcp_read_sock (ssk , & desc ,
436- mptcp_read_actor );
437- if (bytes_read < 0 ) {
438- if (!copied )
439- copied = bytes_read ;
440- done = true;
441- goto next ;
442- }
443-
444- pr_debug ("msk ack_seq=%llx -> %llx" , msk -> ack_seq ,
445- msk -> ack_seq + bytes_read );
446- msk -> ack_seq += bytes_read ;
447- copied += bytes_read ;
448- if (copied >= len ) {
449- done = true;
450- goto next ;
451- }
452- if (tp -> urg_data && tp -> urg_seq == tp -> copied_seq ) {
453- pr_err ("Urgent data present, cannot proceed" );
454- done = true;
455- goto next ;
456- }
457- next :
458- more_data_avail = mptcp_subflow_data_available (ssk );
459- } while (more_data_avail && !done );
460- release_sock (ssk );
461- continue ;
462-
463- wait_for_data :
464- more_data_avail = false;
555+ if (skb_queue_empty (& sk -> sk_receive_queue ) &&
556+ __mptcp_move_skbs (msk ))
557+ continue ;
465558
466559 /* only the master socket status is relevant here. The exit
467560 * conditions mirror closely tcp_recvmsg()
@@ -502,26 +595,25 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
502595 }
503596
504597 pr_debug ("block timeout %ld" , timeo );
505- wait_data = true;
506598 mptcp_wait_data (sk , & timeo );
507599 if (unlikely (__mptcp_tcp_fallback (msk )))
508600 goto fallback ;
509601 }
510602
511- if (more_data_avail ) {
512- if (!test_bit (MPTCP_DATA_READY , & msk -> flags ))
513- set_bit (MPTCP_DATA_READY , & msk -> flags );
514- } else if (!wait_data ) {
603+ if (skb_queue_empty (& sk -> sk_receive_queue )) {
604+ /* entire backlog drained, clear DATA_READY. */
515605 clear_bit (MPTCP_DATA_READY , & msk -> flags );
516606
517- /* .. race-breaker: ssk might get new data after last
518- * data_available () returns false.
607+ /* .. race-breaker: ssk might have gotten new data
608+ * after last __mptcp_move_skbs () returned false.
519609 */
520- ssk = mptcp_subflow_recv_lookup (msk );
521- if (unlikely (ssk ))
610+ if (unlikely (__mptcp_move_skbs (msk )))
522611 set_bit (MPTCP_DATA_READY , & msk -> flags );
612+ } else if (unlikely (!test_bit (MPTCP_DATA_READY , & msk -> flags ))) {
613+ /* data to read but mptcp_wait_data() cleared DATA_READY */
614+ set_bit (MPTCP_DATA_READY , & msk -> flags );
523615 }
524-
616+ out_err :
525617 release_sock (sk );
526618 return copied ;
527619}
@@ -557,7 +649,7 @@ static void mptcp_worker(struct work_struct *work)
557649 struct sock * sk = & msk -> sk .icsk_inet .sk ;
558650
559651 lock_sock (sk );
560-
652+ __mptcp_move_skbs ( msk );
561653 release_sock (sk );
562654 sock_put (sk );
563655}
@@ -638,6 +730,8 @@ static void mptcp_close(struct sock *sk, long timeout)
638730
639731 mptcp_cancel_work (sk );
640732
733+ __skb_queue_purge (& sk -> sk_receive_queue );
734+
641735 sk_common_release (sk );
642736}
643737
@@ -1204,6 +1298,8 @@ void mptcp_proto_init(void)
12041298 panic ("Failed to register MPTCP proto.\n" );
12051299
12061300 inet_register_protosw (& mptcp_protosw );
1301+
1302+ BUILD_BUG_ON (sizeof (struct mptcp_skb_cb ) > sizeof_field (struct sk_buff , cb ));
12071303}
12081304
12091305#if IS_ENABLED (CONFIG_MPTCP_IPV6 )
0 commit comments