Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #984 #1161

Merged
merged 15 commits into from
Jan 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 62 additions & 24 deletions linux-4.14.32.patch
Original file line number Diff line number Diff line change
Expand Up @@ -1815,7 +1815,7 @@ index 420fecbb..67e0513a 100644
void tcp_twsk_destructor(struct sock *sk)
{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 83d11cd2..4e79cb5e 100644
index 83d11cd2..3066c3ac 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,9 @@
Expand Down Expand Up @@ -1862,15 +1862,66 @@ index 83d11cd2..4e79cb5e 100644

/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
@@ -1560,6 +1565,7 @@ unsigned int tcp_current_mss(struct sock *sk)
@@ -1241,6 +1246,32 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
TCP_SKB_CB(skb)->eor = 0;
}

+/**
+ * Tempesta uses page fragments for all skb allocations, so if an skb was
+ * allocated in the standard Linux way, then pskb_expand_head(skb, 0, 0, pri)
+ * may return a larger skb, and we have to adjust skb->truesize and the
+ * memory accounting for the TCP write queue.
+ */
+static int
+tcp_skb_unclone(struct sock *sk, struct sk_buff *skb, gfp_t pri)
+{
+ int r, delta_truesize = skb->truesize;
+
+ if ((r = skb_unclone(skb, pri)))
+ return r;
+
+ delta_truesize -= skb->truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ if (delta_truesize > 0) {
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ } else {
+ sk_mem_charge(sk, -delta_truesize);
+ }
+
+ return 0;
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1262,7 +1293,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
if (nsize < 0)
nsize = 0;

- if (skb_unclone(skb, gfp))
+ if (tcp_skb_unclone(sk, skb, gfp))
return -ENOMEM;

/* Get a new skb... force flag on. */
@@ -1380,7 +1411,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;

- if (skb_unclone(skb, GFP_ATOMIC))
+ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
return -ENOMEM;

delta_truesize = __pskb_trim_head(skb, len);
@@ -1560,6 +1591,7 @@ unsigned int tcp_current_mss(struct sock *sk)

return mss_now;
}
+EXPORT_SYMBOL(tcp_current_mss);

/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
* As additional protections, we do not touch cwnd in retransmission phases,
@@ -2327,7 +2333,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
@@ -2327,7 +2359,20 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
cwnd_quota,
max_segs),
nonagle);
Expand All @@ -1892,7 +1943,7 @@ index 83d11cd2..4e79cb5e 100644
if (skb->len > limit &&
unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
@@ -2336,7 +2355,34 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
@@ -2336,7 +2381,32 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
if (tcp_small_queue_check(sk, skb, 0))
break;
Expand All @@ -1916,47 +1967,34 @@ index 83d11cd2..4e79cb5e 100644
+ net_warn_ratelimited(
+ "Tempesta: cannot encrypt data (%d),"
+ " reset a TLS connection.\n", result);
+ /*
+ * FIXME #984 WARNING: at net/core/stream.c:205
+ * sk_stream_kill_queues+0x106/0x120
+ */
+ tcp_reset(sk);
+ break;
+ }
+ /* Fix up TSO segments after TLS overhead. */
+ tcp_set_skb_tso_segs(skb, mss_now);
+ }
+#endif
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;

@@ -2518,6 +2564,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
@@ -2518,6 +2588,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
+EXPORT_SYMBOL(__tcp_push_pending_frames);

/* Send _single_ skb sitting at the send head. This function requires
* true push pending frames to setup probe timer etc.
@@ -2839,9 +2886,19 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
@@ -2839,7 +2910,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
+ int delta_truesize = skb->truesize;
+
if (skb_unclone(skb, GFP_ATOMIC))
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (tcp_skb_unclone(sk, skb, GFP_ATOMIC))
return -ENOMEM;

+ delta_truesize -= skb->truesize;
+ sk->sk_wmem_queued -= delta_truesize;
+ if (delta_truesize > 0) {
+ sk_mem_uncharge(sk, delta_truesize);
+ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+ } else {
+ sk_mem_charge(sk, -delta_truesize);
+ }
diff = tcp_skb_pcount(skb);
tcp_set_skb_tso_segs(skb, cur_mss);
diff -= tcp_skb_pcount(skb);
@@ -3129,6 +3186,7 @@ int tcp_send_synack(struct sock *sk)
@@ -3129,6 +3200,7 @@ int tcp_send_synack(struct sock *sk)
}
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
Expand Down
3 changes: 1 addition & 2 deletions tempesta_db/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@
# Temple Place - Suite 330, Boston, MA 02111-1307, USA.

all: libtdb tdbq
.PHONY: all

.PHONY: libtdb
libtdb:
$(MAKE) -C libtdb

.PHONY: tdbq
tdbq: libtdb
$(MAKE) -C tdbq

Expand Down
11 changes: 6 additions & 5 deletions tempesta_fw/sock.c
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
int size, mss = tcp_send_mss(sk, &size, MSG_DONTWAIT);
unsigned int mark = (*skb_head)->mark;

TFW_DBG3("[%d]: %s: sk=%p queue_empty=%d send_head=%p"
TFW_DBG3("[%d]: %s: sk=%pK queue_empty=%d send_head=%pK"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The year in the copyright notice isn't updated in the file. Can we just bump the year in all the sources just to forget about the problem?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just didn't update copyright for the very small changes. I think at the end it's not a problem at all if you just forget to update a copyright - if you or a reviewer recall this, then just update the copyright, otherwise if you forget - not a big deal.

" sk_state=%d mss=%d size=%d\n",
smp_processor_id(), __func__,
sk, tcp_write_queue_empty(sk), tcp_send_head(sk),
Expand All @@ -369,7 +369,7 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
* these SKBs.
*/
if (!skb->len) {
TFW_DBG3("[%d]: %s: drop skb=%p data_len=%u len=%u\n",
TFW_DBG3("[%d]: %s: drop skb=%pK data_len=%u len=%u\n",
smp_processor_id(), __func__,
skb, skb->data_len, skb->len);
kfree_skb(skb);
Expand All @@ -382,9 +382,10 @@ ss_do_send(struct sock *sk, struct sk_buff **skb_head, int flags)
/* Propagate mark of message head skb.*/
skb->mark = mark;

TFW_DBG3("[%d]: %s: entail skb=%p data_len=%u len=%u mark=%u"
" tls_type=%x\n", smp_processor_id(), __func__,
skb, skb->data_len, skb->len, skb->mark,
TFW_DBG3("[%d]: %s: entail sk=%pK skb=%pK data_len=%u len=%u"
" truesize=%u mark=%u tls_type=%x\n",
smp_processor_id(), __func__, sk,
skb, skb->data_len, skb->len, skb->truesize, skb->mark,
tempesta_tls_skb_type(skb));

skb_entail(sk, skb);
Expand Down
133 changes: 70 additions & 63 deletions tempesta_fw/ss_skb.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* on top on native Linux socket buffers. The helpers provide common and
* convenient wrappers for skb processing.
*
* Copyright (C) 2015-2018 Tempesta Technologies, Inc.
* Copyright (C) 2015-2019 Tempesta Technologies, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -634,15 +634,17 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
if (likely(!off)) {
frag->page_offset += len;
skb_frag_size_sub(frag, len);
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;
it->data = skb_frag_address(frag);
it->skb = skb;
return 0;
}
/* Fast path (e.g. TLS tag): delete the tail part of a fragment. */
if (likely(off + len == skb_frag_size(frag))) {
skb_frag_size_sub(frag, len);
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;
__it_next_data(skb, i + 1, it);
return 0;
}
Expand Down Expand Up @@ -679,7 +681,8 @@ __split_pgfrag_del(struct sk_buff *skb_head, struct sk_buff *skb, int i, int off
ss_skb_adjust_data_len(skb, -tail_len);
ss_skb_adjust_data_len(skb_dst, tail_len);
}
ss_skb_adjust_data_len(skb, -len);
skb->len -= len;
skb->data_len -= len;

/* Get the SKB and the address for data after the deleted data. */
it->data = skb_frag_address(&skb_shinfo(skb_dst)->frags[i]);
Expand Down Expand Up @@ -1102,27 +1105,29 @@ struct sk_buff *
ss_skb_split(struct sk_buff *skb, int len)
{
struct sk_buff *buff;
int nsize, asize, nlen;
int n = 0;

/* Assert that the SKB is orphaned. */
WARN_ON_ONCE(skb->destructor);

nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
asize = ALIGN(nsize, 4);
if (len < skb_headlen(skb))
n = skb_headlen(skb) - len;

buff = alloc_skb_fclone(asize + MAX_TCP_HEADER, GFP_ATOMIC);
if (buff == NULL)
buff = alloc_skb_fclone(ALIGN(n, 4) + MAX_TCP_HEADER, GFP_ATOMIC);
if (!buff)
return NULL;

skb_reserve(buff, MAX_TCP_HEADER);
/* Make sure there's exactly asize bytes available. */
buff->reserved_tailroom = buff->end - buff->tail - asize;

nlen = skb->len - len - nsize;
buff->truesize += nlen;
skb->truesize -= nlen;
/* @buff already accounts @n in truesize. */
buff->truesize += skb->len - len - n;
skb->truesize -= skb->len - len;

/*
* Initialize GSO segments counter to let TCP set it according to
* the current MSS on egress path.
*/
tcp_skb_pcount_set(skb, 0);

/*
* These are orphaned SKBs that are taken out of the TCP/IP
Expand All @@ -1136,52 +1141,6 @@ ss_skb_split(struct sk_buff *skb, int len)
return buff;
}

/**
 * Add page fragment @frag to the tail skb of the @skb_head list.
 *
 * If the list is empty, or the tail skb has no room left in its frags[]
 * array (nr_frags == MAX_SKB_FRAGS), a new skb is allocated with
 * ss_skb_alloc(0), queued at the tail, and inherits @orig_skb->mark.
 *
 * The fragment descriptor is copied by value into the destination skb and
 * an extra page reference is taken with __skb_frag_ref(), so the page is
 * shared between @orig_skb and the destination skb. The destination skb's
 * length accounting is grown by frag->size via ss_skb_adjust_data_len().
 *
 * Return: 0 on success, -ENOMEM if a new skb could not be allocated.
 */
static inline int
__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
const struct sk_buff *orig_skb)
{
struct sk_buff *skb = ss_skb_peek_tail(skb_head);

/* Start a new tail skb when the list is empty or frags[] is full. */
if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
skb = ss_skb_alloc(0);
if (!skb)
return -ENOMEM;
ss_skb_queue_tail(skb_head, skb);
skb->mark = orig_skb->mark;
}

skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
ss_skb_adjust_data_len(skb, frag->size);
/* The page is now referenced by one more skb. */
__skb_frag_ref(frag);

return 0;
}

/**
 * Append all payload of @skb to the tail of the @skb_head list as page
 * fragments only (no linear data is copied).
 *
 * The linear head area, if any, is wrapped in a synthetic page fragment
 * pointing into skb->head; this requires the skb head to be page-backed
 * (BUG_ON(!skb->head_frag)). Then each page fragment of @skb is appended
 * in order via __coalesce_frag().
 *
 * Return: 0 on success, -ENOMEM if an allocation inside __coalesce_frag()
 * fails; the destination list may be partially filled in that case.
 */
static int
ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
{
int i;
skb_frag_t head_frag;
unsigned int headlen = skb_headlen(skb);

if (headlen) {
/* Only a page-backed head can be referenced as a fragment. */
BUG_ON(!skb->head_frag);
head_frag.size = headlen;
head_frag.page.p = virt_to_page(skb->head);
head_frag.page_offset = skb->data -
(unsigned char *)page_address(head_frag.page.p);
if (__coalesce_frag(skb_head, &head_frag, skb))
return -ENOMEM;
}

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
return -ENOMEM;
}

return 0;
}

/**
* Tempesta FW forwards skbs with application and transport payload as is,
* so initialize such skbs such that TCP/IP stack won't stumble on dirty
Expand Down Expand Up @@ -1234,6 +1193,52 @@ ss_skb_init_for_xmit(struct sk_buff *skb)
skb->ip_summed = CHECKSUM_PARTIAL;
}

/**
 * Add page fragment @frag to the tail skb of the @skb_head list.
 *
 * If the list is empty, or the tail skb has no room left in its frags[]
 * array (nr_frags == MAX_SKB_FRAGS), a new skb is allocated with
 * ss_skb_alloc(0), queued at the tail, and inherits @orig_skb->mark.
 *
 * The fragment descriptor is copied by value into the destination skb and
 * an extra page reference is taken with __skb_frag_ref(), so the page is
 * shared between @orig_skb and the destination skb. The destination skb's
 * length accounting is grown by frag->size via ss_skb_adjust_data_len().
 *
 * Return: 0 on success, -ENOMEM if a new skb could not be allocated.
 */
static inline int
__coalesce_frag(struct sk_buff **skb_head, skb_frag_t *frag,
const struct sk_buff *orig_skb)
{
struct sk_buff *skb = ss_skb_peek_tail(skb_head);

/* Start a new tail skb when the list is empty or frags[] is full. */
if (!skb || skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
skb = ss_skb_alloc(0);
if (!skb)
return -ENOMEM;
ss_skb_queue_tail(skb_head, skb);
skb->mark = orig_skb->mark;
}

skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags++] = *frag;
ss_skb_adjust_data_len(skb, frag->size);
/* The page is now referenced by one more skb. */
__skb_frag_ref(frag);

return 0;
}

/**
 * Append all payload of @skb to the tail of the @skb_head list as page
 * fragments only (no linear data is copied).
 *
 * The linear head area, if any, is wrapped in a synthetic page fragment
 * pointing into skb->head; this requires the skb head to be page-backed
 * (BUG_ON(!skb->head_frag)). Then each page fragment of @skb is appended
 * in order via __coalesce_frag().
 *
 * Return: 0 on success, -ENOMEM if an allocation inside __coalesce_frag()
 * fails; the destination list may be partially filled in that case.
 */
static int
ss_skb_queue_coalesce_tail(struct sk_buff **skb_head, const struct sk_buff *skb)
{
int i;
skb_frag_t head_frag;
unsigned int headlen = skb_headlen(skb);

if (headlen) {
/* Only a page-backed head can be referenced as a fragment. */
BUG_ON(!skb->head_frag);
head_frag.size = headlen;
head_frag.page.p = virt_to_page(skb->head);
head_frag.page_offset = skb->data -
(unsigned char *)page_address(head_frag.page.p);
if (__coalesce_frag(skb_head, &head_frag, skb))
return -ENOMEM;
}

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
if (__coalesce_frag(skb_head, &skb_shinfo(skb)->frags[i], skb))
return -ENOMEM;
}

return 0;
}

/*
* When the original SKB is a clone then its shinfo and payload cannot be
* modified as they are shared with other SKB users. As the SKB is unrolled,
Expand Down Expand Up @@ -1326,7 +1331,9 @@ ss_skb_unroll(struct sk_buff **skb_head, struct sk_buff *skb)
* when we track whitelist requests during HTTP processing.
*/
f_skb->mark = skb->mark;
ss_skb_adjust_data_len(skb, -f_skb->len);
skb->len -= f_skb->len;
skb->data_len -= f_skb->len;
skb->truesize -= f_skb->truesize;
f_skb->prev = prev_skb;
prev_skb = f_skb;
}
Expand Down
Loading