From 53c731ff238b80e1516ed3c18700898ea5ba1dd0 Mon Sep 17 00:00:00 2001
From: Alexander K
Date: Sun, 6 Jan 2019 20:16:57 +0300
Subject: [PATCH 01/14] TLS: fix TCP socket write memory accounting - we may
 actually allocate fewer bytes than the TLS overhead, as well as allocate an
 extra skb.

---
 tempesta_fw/tls.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/tempesta_fw/tls.c b/tempesta_fw/tls.c
index 5943d4f44a..e25bed22a5 100644
--- a/tempesta_fw/tls.c
+++ b/tempesta_fw/tls.c
@@ -183,7 +183,6 @@ tfw_tls_tcp_add_overhead(struct sock *sk, unsigned int overhead)
 {
 	sk->sk_wmem_queued += overhead;
 	sk_mem_charge(sk, overhead);
-	tcp_sk(sk)->write_seq += overhead;
 }
 
 /**
@@ -231,6 +230,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 
 	int r = -ENOMEM;
 	unsigned int head_sz, tag_sz, len, frags;
+	unsigned int t_sz_curr, t_sz_next;
 	unsigned char type;
 	struct sk_buff *next = skb, *skb_tail = skb;
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -297,6 +297,8 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 * if there is no free frag slot in skb_tail, a new skb is allocated.
 	 */
 	next = skb_tail->next;
+	t_sz_curr = skb_tail->truesize;
+	t_sz_next = next != skb ? next->truesize : 0;
 	if (skb_tail == skb) {
 		r = ss_skb_expand_head_tail(skb->next, skb, head_sz, tag_sz);
 		if (r < 0)
@@ -321,9 +323,26 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 */
 	if (likely(skb_tail->next == next)) {
 		TCP_SKB_CB(skb_tail)->end_seq += tag_sz;
-	} else {
+
+		/*
+		 * A new frag is added to the end of the current skb or
+		 * begin of the next skb.
+		 */
+		WARN_ON_ONCE(t_sz_curr > skb_tail->truesize);
+		WARN_ON_ONCE(t_sz_next > next->truesize);
+		t_sz_curr = skb_tail->truesize - t_sz_curr;
+		t_sz_next = next->truesize - t_sz_next;
+	}
+	else {
 		WARN_ON_ONCE(skb_tail->next->len != tag_sz);
+		WARN_ON_ONCE(skb_tail->truesize != t_sz_curr);
+
 		tfw_tls_tcp_propagate_dseq(sk, skb_tail);
+
+		/* A new skb is added to the socket wmem. */
+		t_sz_curr = 0;
+		t_sz_next = skb_tail->next->truesize;
+
 		skb_tail = skb_tail->next;
 	}
 
@@ -336,8 +355,17 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 * consistent state.
 	 */
 	tfw_tls_tcp_propagate_dseq(sk, skb_tail);
+	tcp_sk(sk)->write_seq += head_sz + tag_sz;
 
-	tfw_tls_tcp_add_overhead(sk, head_sz + tag_sz);
+	/*
+	 * TLS record header is always allocated form the reserved skb headroom.
+	 * The room for the tag may also be allocated from the reserved tailroom
+	 * or in a new page frament in slb_tail or next, probably new, skb.
+	 * So to adjust the socket write memory we have to check the both skbs
+	 * and only for tag_sz.
+ */ + WARN_ON_ONCE(t_sz_curr + t_sz_next < tag_sz); + tfw_tls_tcp_add_overhead(sk, t_sz_curr + t_sz_next); if (likely(sgt.nents <= AUTO_SEGS_N)) { sgt.sgl = sg; From 07fbe5c6063a1393b87f038215cb35a0f386792b Mon Sep 17 00:00:00 2001 From: Alexander K Date: Sun, 6 Jan 2019 20:40:25 +0300 Subject: [PATCH 02/14] Remove unnecessary kernel comment - now we clearly reset TCP connections --- linux-4.14.32.patch | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/linux-4.14.32.patch b/linux-4.14.32.patch index a25e2255d3..c8864d67b1 100644 --- a/linux-4.14.32.patch +++ b/linux-4.14.32.patch @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644 void tcp_twsk_destructor(struct sock *sk) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 83d11cd2..4e79cb5e 100644 +index 83d11cd2..b676c6a5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,9 @@ @@ -1892,7 +1892,7 @@ index 83d11cd2..4e79cb5e 100644 if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2336,7 +2355,34 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2336,7 +2355,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -1916,10 +1916,6 @@ index 83d11cd2..4e79cb5e 100644 + net_warn_ratelimited( + "Tempesta: cannot encrypt data (%d)," + " reset a TLS connection.\n", result); -+ /* -+ * FIXME #984 WARNING: at net/core/stream.c:205 -+ * sk_stream_kill_queues+0x106/0x120 -+ */ + tcp_reset(sk); + break; + } @@ -1928,7 +1924,7 @@ index 83d11cd2..4e79cb5e 100644 if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2518,6 +2564,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2518,6 +2560,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1936,7 +1932,7 @@ index 83d11cd2..4e79cb5e 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -2839,9 +2886,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +@@ -2839,9 +2882,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { @@ -1956,7 +1952,7 @@ index 83d11cd2..4e79cb5e 100644 diff = tcp_skb_pcount(skb); tcp_set_skb_tso_segs(skb, cur_mss); diff -= tcp_skb_pcount(skb); -@@ -3129,6 +3186,7 @@ int tcp_send_synack(struct sock *sk) +@@ -3129,6 +3182,7 @@ int tcp_send_synack(struct sock *sk) } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } From a0f59f21403cbd18418fb113ae39d0cf952388ed Mon Sep 17 00:00:00 2001 From: Alexander K Date: Fri, 11 Jan 2019 02:59:05 +0300 Subject: [PATCH 03/14] Several fixes for ss_skb_split(): 1. @nsize was copy&pasted from tcp_fragment(), but the last one uses it only for fast path with skb w/o frags. 2. reserved_tailroom is in union with mark which we process separately, so the field isn't compatible with current Tempesta code. Also it's used for egress path only and we don't need it on ingress path where ss_skb_split() is called. 3. GSO segementation for skb wasn't accounted: make couple of comments in TLS code and initialize it for split skb. (Later kernel patch will bring small logic on it as well.) Some cleanups. 
-jN builds sometimes still fail on the libtdb/tdbq dependency (see commit
1fc007dcdb78ffdb0c243433a8bcac727ac4a4a2).
---
 tempesta_db/Makefile |  2 --
 tempesta_fw/sock.c   | 12 ++++++++----
 tempesta_fw/ss_skb.c | 28 +++++++++++++++-------------
 tempesta_fw/tls.c    | 12 ++++++++++--
 4 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/tempesta_db/Makefile b/tempesta_db/Makefile
index 4cc09b3d8f..b1e9a58049 100644
--- a/tempesta_db/Makefile
+++ b/tempesta_db/Makefile
@@ -19,11 +19,9 @@
 
 all: libtdb tdbq
 
-.PHONY: libtdb
 libtdb:
 	$(MAKE) -C libtdb
 
-.PHONY: tdbq
 tdbq: libtdb
 	$(MAKE) -C tdbq
 
diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c
index 74d03dba74..1a4e0427e2 100644
--- a/tempesta_fw/sock.c
+++ b/tempesta_fw/sock.c
@@ -350,7 +350,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 	int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT);
 	unsigned int mark = (*skb_head)->mark;
 
-	TFW_DBG3("[%d]: %s: sk=%p queue_empty=%d send_head=%p"
+	TFW_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK"
 		 " sk_state=%d mss=%d size=%d\n",
 		 smp_processor_id(), __func__,
 		 sk, tcp_write_queue_empty(sk), tcp_send_head(sk),
@@ -369,7 +369,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 		 * these SKBs.
 		 */
 		if (!skb->len) {
-			TFW_DBG3("[%d]: %s: drop skb=%p data_len=%u len=%u\n",
+			TFW_DBG3("[%d]: %s: drop skb=%pK data_len=%u len=%u\n",
 				 smp_processor_id(), __func__,
 				 skb, skb->data_len, skb->len);
 			kfree_skb(skb);
@@ -382,7 +382,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 		/* Propagate mark of message head skb.*/
 		skb->mark = mark;
 
-		TFW_DBG3("[%d]: %s: entail skb=%p data_len=%u len=%u mark=%u"
+		TFW_DBG3("[%d]: %s: entail skb=%pK data_len=%u len=%u mark=%u"
 			 " tls_type=%x\n", smp_processor_id(), __func__,
 			 skb, skb->data_len, skb->len, skb->mark,
 			 tempesta_tls_skb_type(skb));
@@ -449,7 +449,11 @@ ss_send(struct sock *sk, struct sk_buff **skb_head, int flags)
 	 * or copy them if they're going to be used by Tempesta during
 	 * and after the transmission.
 	 */
-	if (flags & SS_F_KEEP_SKB) {
+	/*
+	 * FIXME #984 the `true ||` statement at the below fixes the issue
+	 * (at least basic tests are passed now).
+	 */
+	if (/*true ||*/ flags & SS_F_KEEP_SKB) {
 		skb = *skb_head;
 		do {
 			/* tcp_transmit_skb() will clone the skb. */
diff --git a/tempesta_fw/ss_skb.c b/tempesta_fw/ss_skb.c
index 732ace516c..2756939b53 100644
--- a/tempesta_fw/ss_skb.c
+++ b/tempesta_fw/ss_skb.c
@@ -7,7 +7,7 @@
  * on top on native Linux socket buffers. The helpers provide common and
  * convenient wrappers for skb processing.
  *
- * Copyright (C) 2015-2018 Tempesta Technologies, Inc.
+ * Copyright (C) 2015-2019 Tempesta Technologies, Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
@@ -1102,27 +1102,29 @@ struct sk_buff *
 ss_skb_split(struct sk_buff *skb, int len)
 {
 	struct sk_buff *buff;
-	int nsize, asize, nlen;
+	int n = 0;
 
 	/* Assert that the SKB is orphaned. */
 	WARN_ON_ONCE(skb->destructor);
 
-	nsize = skb_headlen(skb) - len;
-	if (nsize < 0)
-		nsize = 0;
-	asize = ALIGN(nsize, 4);
+	if (len < skb_headlen(skb))
+		n = skb_headlen(skb) - len;
 
-	buff = alloc_skb_fclone(asize + MAX_TCP_HEADER, GFP_ATOMIC);
-	if (buff == NULL)
+	buff = alloc_skb_fclone(ALIGN(n, 4) + MAX_TCP_HEADER, GFP_ATOMIC);
+	if (!buff)
 		return NULL;
 
 	skb_reserve(buff, MAX_TCP_HEADER);
 
-	/* Make sure there's exactly asize bytes available. */
-	buff->reserved_tailroom = buff->end - buff->tail - asize;
-	nlen = skb->len - len - nsize;
-	buff->truesize += nlen;
-	skb->truesize -= nlen;
+	n = skb->len - len;
+	buff->truesize += n;
+	skb->truesize -= n;
+
+	/*
+	 * Initialize GSO segments counter to let TCP set it accoring to
+	 * the current MSS on egress path.
+	 */
+	tcp_skb_pcount_set(skb, 0);
 
 	/*
 	 * These are orphaned SKBs that are taken out of the TCP/IP
diff --git a/tempesta_fw/tls.c b/tempesta_fw/tls.c
index e25bed22a5..c42bcb8efc 100644
--- a/tempesta_fw/tls.c
+++ b/tempesta_fw/tls.c
@@ -3,7 +3,7 @@
 *
 * Transport Layer Security (TLS) interfaces to Tempesta TLS.
 *
- * Copyright (C) 2015-2018 Tempesta Technologies, Inc.
+ * Copyright (C) 2015-2019 Tempesta Technologies, Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
@@ -320,6 +320,9 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 	 * The last skb in our list will bring TLS tag - add it to end_seqno.
 	 * Otherwise (in worst case), a new skb was inserted to fit TLS tag
 	 * - fix end_seqno's for @skb_tail and this new skb.
+	 *
+	 * @limit = mss_now - tls_overhead, so {tso,tcp}_fragment() called from
+	 * tcp_write_xmit() should set proper skb->tcp_gso_segs.
 	 */
 	if (likely(skb_tail->next == next)) {
 		TCP_SKB_CB(skb_tail)->end_seq += tag_sz;
@@ -339,7 +342,12 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit)
 
 	tfw_tls_tcp_propagate_dseq(sk, skb_tail);
 
-	/* A new skb is added to the socket wmem. */
+	/*
+	 * A new skb is added to the socket wmem.
+	 *
+	 * pcount for a new skb is zero, to tcp_write_xmit() will
+	 * set TSO segs to proper value on next iteration.
+	 */
 	t_sz_curr = 0;
 	t_sz_next = skb_tail->next->truesize;
 

From 4f6d208af7f2af54e92124e8f5eee1efacc8121e Mon Sep 17 00:00:00 2001
From: Alexander K
Date: Fri, 11 Jan 2019 12:50:30 +0300
Subject: [PATCH 04/14] Declare the target `all` as phony so that a clean
 build with `make -j4 clean all` works.

---
 tempesta_db/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tempesta_db/Makefile b/tempesta_db/Makefile
index b1e9a58049..15dd9e2754 100644
--- a/tempesta_db/Makefile
+++ b/tempesta_db/Makefile
@@ -18,6 +18,7 @@
 # Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
 all: libtdb tdbq
+.PHONY: all
 
 libtdb:
 	$(MAKE) -C libtdb

From 74556b18bc50a1a18a612b0bd9ae2f1fac1160bd Mon Sep 17 00:00:00 2001
From: Alexander K
Date: Mon, 21 Jan 2019 00:59:55 +0300
Subject: [PATCH 05/14] Fix #984:

1. accurately fix skb->truesize and TCP write memory in the kernel by
   tcp_skb_unclone();
2. __split_pgfrag_del(): if we just move pointers, then we do not free TCP
   write memory, so do not change skb->truesize;
3. ss_skb_unroll(): truesize and data_len/len are completely different
   counters, so do not mix them in ss_skb_adjust_data_len(). By the way,
   during the tests I saw crazy skb overheads - truesize can be larger than
   len by tens of kilobytes. The explanation for such overheads is various
   fragment stealing (e.g. our __split_pgfrag_del) and cloning;
4. cleanup: move the ss_skb coalescing functions closer to their calls.
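To illustrate point 3, a minimal sketch of the accounting rule that the
ss_skb_unroll() hunk below implements (f_skb is an skb being detached from
the parent skb's frag_list) - the payload counters and the memory counter
must be adjusted independently:

	/* Payload counters take payload bytes... */
	skb->len      -= f_skb->len;
	skb->data_len -= f_skb->len;
	/* ...while the memory counter takes the allocation size. */
	skb->truesize -= f_skb->truesize;

ss_skb_adjust_data_len() applies a single delta to all three counters at
once, which is exactly the mixing that point 3 removes.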
--- linux-4.14.32.patch | 83 +++++++++++++++++++++++++--------- tempesta_fw/sock.c | 13 +++--- tempesta_fw/ss_skb.c | 105 ++++++++++++++++++++++--------------------- 3 files changed, 123 insertions(+), 78 deletions(-) diff --git a/linux-4.14.32.patch b/linux-4.14.32.patch index c8864d67b1..75c6f6eec8 100644 --- a/linux-4.14.32.patch +++ b/linux-4.14.32.patch @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644 void tcp_twsk_destructor(struct sock *sk) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 83d11cd2..b676c6a5 100644 +index 83d11cd2..14918c14 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,9 @@ @@ -1862,7 +1862,58 @@ index 83d11cd2..b676c6a5 100644 /* Initialize TSO segments for a packet. */ static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now) -@@ -1560,6 +1565,7 @@ unsigned int tcp_current_mss(struct sock *sk) +@@ -1241,6 +1246,32 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) + TCP_SKB_CB(skb)->eor = 0; + } + ++/** ++ * Tempesta uses page fragments for all skb allocations, so if an skb was ++ * allocated in standard Linux way, then pskb_expand_head( , 0, 0, ) may ++ * return larger skb and we have to adjust skb->truesize and memory accounting ++ * for TCP write queue. ++ */ ++static int ++tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri) ++{ ++ int r, delta_truesize = skb->truesize; ++ ++ if ((r = skb_unclone(skb, pri))) ++ return r; ++ ++ delta_truesize -= skb->truesize; ++ sk->sk_wmem_queued -= delta_truesize; ++ if (delta_truesize > 0) { ++ sk_mem_uncharge(sk, delta_truesize); ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); ++ } else { ++ sk_mem_charge(sk, -delta_truesize); ++ } ++ ++ return 0; ++} ++ + /* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. +@@ -1262,7 +1293,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + if (nsize < 0) + nsize = 0; + +- if (skb_unclone(skb, gfp)) ++ if (tcp_skb_unclone(sk, skb, gfp)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ +@@ -1380,7 +1411,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) + { + u32 delta_truesize; + +- if (skb_unclone(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) + return -ENOMEM; + + delta_truesize = __pskb_trim_head(skb, len); +@@ -1560,6 +1591,7 @@ unsigned int tcp_current_mss(struct sock *sk) return mss_now; } @@ -1870,7 +1921,7 @@ index 83d11cd2..b676c6a5 100644 /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. 
* As additional protections, we do not touch cwnd in retransmission phases, -@@ -2327,7 +2333,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2327,7 +2359,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota, max_segs), nonagle); @@ -1892,7 +1943,7 @@ index 83d11cd2..b676c6a5 100644 if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2336,7 +2355,30 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2336,7 +2381,33 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -1919,12 +1970,15 @@ index 83d11cd2..b676c6a5 100644 + tcp_reset(sk); + break; + } ++ /* We must not break TSO. */ ++ WARN_ON_ONCE(tcp_skb_pcount(skb) ++ != DIV_ROUND_UP(skb->len, mss_now)); + } +#endif if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2518,6 +2560,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2518,6 +2589,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1932,27 +1986,16 @@ index 83d11cd2..b676c6a5 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. -@@ -2839,9 +2882,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) +@@ -2839,7 +2911,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { -+ int delta_truesize = skb->truesize; -+ - if (skb_unclone(skb, GFP_ATOMIC)) +- if (skb_unclone(skb, GFP_ATOMIC)) ++ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC)) return -ENOMEM; -+ delta_truesize -= skb->truesize; -+ sk->sk_wmem_queued -= delta_truesize; -+ if (delta_truesize > 0) { -+ sk_mem_uncharge(sk, delta_truesize); -+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK); -+ } else { -+ sk_mem_charge(sk, -delta_truesize); -+ } diff = tcp_skb_pcount(skb); - tcp_set_skb_tso_segs(skb, cur_mss); - diff -= tcp_skb_pcount(skb); -@@ -3129,6 +3182,7 @@ int tcp_send_synack(struct sock *sk) +@@ -3129,6 +3201,7 @@ int tcp_send_synack(struct sock *sk) } return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } diff --git a/tempesta_fw/sock.c b/tempesta_fw/sock.c index 1a4e0427e2..9e3ad5581e 100644 --- a/tempesta_fw/sock.c +++ b/tempesta_fw/sock.c @@ -382,9 +382,10 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags) /* Propagate mark of message head skb.*/ skb->mark = mark; - TFW_DBG3("[%d]: %s: entail skb=%pK data_len=%u len=%u mark=%u" - " tls_type=%x\n", smp_processor_id(), __func__, - skb, skb->data_len, skb->len, skb->mark, + TFW_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u" + " truesize=%u mark=%u tls_type=%x\n", + smp_processor_id(), __func__, sk, + skb, skb->data_len, skb->len, skb->truesize, skb->mark, tempesta_tls_skb_type(skb)); skb_entail(sk, skb); @@ -449,11 +450,7 @@ ss_send(struct sock *sk, struct sk_buff **skb_head, int flags) * or copy them if they're going to be used by Tempesta during * and after the transmission. */ - /* - * FIXME #984 the `true ||` statement at the below fixes the issue - * (at least basic tests are passed now). 
- */ - if (/*true ||*/ flags & SS_F_KEEP_SKB) { + if (flags & SS_F_KEEP_SKB) { skb = *skb_head; do { /* tcp_transmit_skb() will clone the skb. */ diff --git a/tempesta_fw/ss_skb.c b/tempesta_fw/ss_skb.c index 2756939b53..f94aac5209 100644 --- a/tempesta_fw/ss_skb.c +++ b/tempesta_fw/ss_skb.c @@ -634,7 +634,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off if (likely(!off)) { frag->page_offset += len; skb_frag_size_sub(frag, len); - ss_skb_adjust_data_len(skb, -len); + skb->len -= len; + skb->data_len -= len; it->ptr = skb_frag_address(frag); it->skb = skb; return 0; @@ -642,7 +643,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off /* Fast path (e.g. TLS tag): delete the tail part of a fragment. */ if (likely(off + len == skb_frag_size(frag))) { skb_frag_size_sub(frag, len); - ss_skb_adjust_data_len(skb, -len); + skb->len -= len; + skb->data_len -= len; __it_next_data(skb, i + 1, it); return 0; } @@ -679,7 +681,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off ss_skb_adjust_data_len(skb, -tail_len); ss_skb_adjust_data_len(skb_dst, tail_len); } - ss_skb_adjust_data_len(skb, -len); + skb->len -= len; + skb->data_len -= len; /* Get the SKB and the address for data after the deleted data. */ it->ptr = skb_frag_address(&skb_shinfo(skb_dst)->frags[i]); @@ -1138,52 +1141,6 @@ ss_skb_split(struct sk_buff *skb, int len) return buff; } -static inline int -__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag, - const struct sk_buff *orig_skb) -{ - struct sk_buff *skb = ss_skb_peek_tail(skb_head); - - if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { - skb = ss_skb_alloc(0); - if (!skb) - return -ENOMEM; - ss_skb_queue_tail(skb_head, skb); - skb->mark = orig_skb->mark; - } - - skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag; - ss_skb_adjust_data_len(skb, frag->size); - __skb_frag_ref(frag); - - return 0; -} - -static int -ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb) -{ - int i; - skb_frag_t head_frag; - unsigned int headlen = skb_headlen(skb); - - if (headlen) { - BUG_ON(!skb->head_frag); - head_frag.size = headlen; - head_frag.page.p = virt_to_page(skb->head); - head_frag.page_offset = skb->data - - (unsigned char *)page_address(head_frag.page.p); - if (__coalesce_frag(skb_head, &head_frag, skb)) - return -ENOMEM; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb)) - return -ENOMEM; - } - - return 0; -} - /** * Tempesta FW forwards skbs with application and transport payload as is, * so initialize such skbs such that TCP/IP stack won't stumble on dirty @@ -1236,6 +1193,52 @@ ss_skb_init_for_xmit(struct sk_buff *skb) skb->ip_summed = CHECKSUM_PARTIAL; } +static inline int +__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag, + const struct sk_buff *orig_skb) +{ + struct sk_buff *skb = ss_skb_peek_tail(skb_head); + + if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) { + skb = ss_skb_alloc(0); + if (!skb) + return -ENOMEM; + ss_skb_queue_tail(skb_head, skb); + skb->mark = orig_skb->mark; + } + + skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag; + ss_skb_adjust_data_len(skb, frag->size); + __skb_frag_ref(frag); + + return 0; +} + +static int +ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb) +{ + int i; + skb_frag_t head_frag; + unsigned int headlen = skb_headlen(skb); + + if (headlen) { + 
BUG_ON(!skb->head_frag); + head_frag.size = headlen; + head_frag.page.p = virt_to_page(skb->head); + head_frag.page_offset = skb->data - + (unsigned char *)page_address(head_frag.page.p); + if (__coalesce_frag(skb_head, &head_frag, skb)) + return -ENOMEM; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb)) + return -ENOMEM; + } + + return 0; +} + /* * When the original SKB is a clone then its shinfo and payload cannot be * modified as they are shared with other SKB users. As the SKB is unrolled, @@ -1328,7 +1331,9 @@ ss_skb_unroll(struct sk_buff **skb_head, struct sk_buff *skb) * when we track whitelist requests during HTTP processing. */ f_skb->mark = skb->mark; - ss_skb_adjust_data_len(skb, -f_skb->len); + skb->len -= f_skb->len; + skb->data_len -= f_skb->len; + skb->truesize -= f_skb->truesize; f_skb->prev = prev_skb; prev_skb = f_skb; } From baf7961377e86df399e6b1a53281c427368e7094 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Mon, 28 Jan 2019 01:33:35 +0300 Subject: [PATCH 06/14] Fix code review comments --- tempesta_fw/ss_skb.c | 8 ++++---- tempesta_fw/tls.c | 31 ++++++++++++------------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/tempesta_fw/ss_skb.c b/tempesta_fw/ss_skb.c index f94aac5209..1b96fc239b 100644 --- a/tempesta_fw/ss_skb.c +++ b/tempesta_fw/ss_skb.c @@ -1119,12 +1119,12 @@ ss_skb_split(struct sk_buff *skb, int len) skb_reserve(buff, MAX_TCP_HEADER); - n = skb->len - len; - buff->truesize += n; - skb->truesize -= n; + /* @buff already accounts @n in truesize. */ + buff->truesize += skb->len - len - n; + skb->truesize -= skb->len - len; /* - * Initialize GSO segments counter to let TCP set it accoring to + * Initialize GSO segments counter to let TCP set it according to * the current MSS on egress path. */ tcp_skb_pcount_set(skb, 0); diff --git a/tempesta_fw/tls.c b/tempesta_fw/tls.c index c42bcb8efc..84670be302 100644 --- a/tempesta_fw/tls.c +++ b/tempesta_fw/tls.c @@ -229,8 +229,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) #define AUTO_SEGS_N 8 int r = -ENOMEM; - unsigned int head_sz, tag_sz, len, frags; - unsigned int t_sz_curr, t_sz_next; + unsigned int head_sz, tag_sz, len, frags, t_sz; unsigned char type; struct sk_buff *next = skb, *skb_tail = skb; struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -297,8 +296,8 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) * if there is no free frag slot in skb_tail, a new skb is allocated. */ next = skb_tail->next; - t_sz_curr = skb_tail->truesize; - t_sz_next = next != skb ? next->truesize : 0; + t_sz = skb_tail->truesize; + WARN_ON_ONCE(next == skb); if (skb_tail == skb) { r = ss_skb_expand_head_tail(skb->next, skb, head_sz, tag_sz); if (r < 0) @@ -327,18 +326,13 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) if (likely(skb_tail->next == next)) { TCP_SKB_CB(skb_tail)->end_seq += tag_sz; - /* - * A new frag is added to the end of the current skb or - * begin of the next skb. - */ - WARN_ON_ONCE(t_sz_curr > skb_tail->truesize); - WARN_ON_ONCE(t_sz_next > next->truesize); - t_sz_curr = skb_tail->truesize - t_sz_curr; - t_sz_next = next->truesize - t_sz_next; + /* A new frag is added to the end of the current skb. 
*/ + WARN_ON_ONCE(t_sz >= skb_tail->truesize); + t_sz = skb_tail->truesize - t_sz; } else { WARN_ON_ONCE(skb_tail->next->len != tag_sz); - WARN_ON_ONCE(skb_tail->truesize != t_sz_curr); + WARN_ON_ONCE(skb_tail->truesize != t_sz); tfw_tls_tcp_propagate_dseq(sk, skb_tail); @@ -348,8 +342,7 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) * pcount for a new skb is zero, to tcp_write_xmit() will * set TSO segs to proper value on next iteration. */ - t_sz_curr = 0; - t_sz_next = skb_tail->next->truesize; + t_sz = skb_tail->next->truesize; skb_tail = skb_tail->next; } @@ -366,14 +359,14 @@ tfw_tls_encrypt(struct sock *sk, struct sk_buff *skb, unsigned int limit) tcp_sk(sk)->write_seq += head_sz + tag_sz; /* - * TLS record header is always allocated form the reserved skb headroom. + * TLS record header is always allocated from the reserved skb headroom. * The room for the tag may also be allocated from the reserved tailroom - * or in a new page frament in slb_tail or next, probably new, skb. + * or in a new page fragment in skb_tail or next, probably new, skb. * So to adjust the socket write memory we have to check the both skbs * and only for tag_sz. */ - WARN_ON_ONCE(t_sz_curr + t_sz_next < tag_sz); - tfw_tls_tcp_add_overhead(sk, t_sz_curr + t_sz_next); + WARN_ON_ONCE(t_sz < tag_sz); + tfw_tls_tcp_add_overhead(sk, t_sz); if (likely(sgt.nents <= AUTO_SEGS_N)) { sgt.sgl = sg; From 49871b61db9ca19d8eda1e8b238183dabda34426 Mon Sep 17 00:00:00 2001 From: Alexander K Date: Mon, 28 Jan 2019 02:19:28 +0300 Subject: [PATCH 07/14] Fix up TSO segments after TLS overhead; the previous warning may fail. --- linux-4.14.32.patch | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/linux-4.14.32.patch b/linux-4.14.32.patch index 75c6f6eec8..8c77a0e5e3 100644 --- a/linux-4.14.32.patch +++ b/linux-4.14.32.patch @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644 void tcp_twsk_destructor(struct sock *sk) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 83d11cd2..14918c14 100644 +index 83d11cd2..3066c3ac 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,9 @@ @@ -1943,7 +1943,7 @@ index 83d11cd2..14918c14 100644 if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) break; -@@ -2336,7 +2381,33 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2336,7 +2381,32 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags); if (tcp_small_queue_check(sk, skb, 0)) break; @@ -1970,15 +1970,14 @@ index 83d11cd2..14918c14 100644 + tcp_reset(sk); + break; + } -+ /* We must not break TSO. */ -+ WARN_ON_ONCE(tcp_skb_pcount(skb) -+ != DIV_ROUND_UP(skb->len, mss_now)); ++ /* Fix up TSO segments after TLS overhead. */ ++ tcp_set_skb_tso_segs(skb, mss_now); + } +#endif if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; -@@ -2518,6 +2589,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, +@@ -2518,6 +2588,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, sk_gfp_mask(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -1986,7 +1985,7 @@ index 83d11cd2..14918c14 100644 /* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. 
-@@ -2839,7 +2911,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
+@@ -2839,7 +2910,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
  		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
  			return -ENOMEM; /* We'll try again later. */
  	} else {
 -	if (skb_unclone(skb, GFP_ATOMIC))
 +	if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
  		return -ENOMEM;
 
  	diff = tcp_skb_pcount(skb);
-@@ -3129,6 +3201,7 @@ int tcp_send_synack(struct sock *sk)
+@@ -3129,6 +3200,7 @@ int tcp_send_synack(struct sock *sk)
  	}
  	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
  }