From e69d2eb86175e6d70cea68b159cea549e1038b60 Mon Sep 17 00:00:00 2001
From: dormando
Date: Tue, 12 Feb 2013 20:41:57 -0800
Subject: [PATCH 01/10] initcwnd from userspace tunable

---
 include/net/tcp.h          |  1 +
 include/uapi/linux/tcp.h   |  1 +
 net/ipv4/sysctl_net_ipv4.c |  7 +++++++
 net/ipv4/tcp.c             | 18 ++++++++++++++++++
 net/ipv4/tcp_output.c      |  2 ++
 5 files changed, 29 insertions(+)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5bba80fbd1d9d9..90a780ef4b1b7a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -287,6 +287,7 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
+extern int sysctl_tcp_user_cwnd_max;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8d776ebc4829df..20ec42135a4075 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -111,6 +111,7 @@ enum {
 #define TCP_REPAIR_OPTIONS	22
 #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 #define TCP_TIMESTAMP		24
+#define TCP_CWND		24	/* Set congestion window */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3f25e75ae692e9..38a470f86887c7 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -722,6 +722,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_allowed_congestion_control,
 	},
+	{
+		.procname	= "tcp_user_cwnd_max",
+		.data		= &sysctl_tcp_user_cwnd_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
 	{
 		.procname	= "tcp_max_ssthresh",
 		.data		= &sysctl_tcp_max_ssthresh,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2005561861ad03..e5598716798bc0 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2602,6 +2602,24 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		}
 		break;
 
+	case TCP_CWND:
+		if (sysctl_tcp_user_cwnd_max <= 0)
+			err = -EPERM;
+		else if (val > 0 && sk->sk_state == TCP_ESTABLISHED &&
+			 icsk->icsk_ca_state == TCP_CA_Open) {
+			u32 cwnd = val;
+			cwnd = min(cwnd, (u32)sysctl_tcp_user_cwnd_max);
+			cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+			if (tp->snd_cwnd != cwnd) {
+				tp->snd_cwnd = cwnd;
+				tp->snd_cwnd_stamp = tcp_time_stamp;
+				tp->snd_cwnd_cnt = 0;
+			}
+		} else
+			err = -EINVAL;
+		break;
+
 #ifdef CONFIG_TCP_MD5SIG
 	case TCP_MD5SIG:
 		/* Read the IP->Key mappings from userspace */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0145ce7e609881..7f6a9c7f17988f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,8 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+int sysctl_tcp_user_cwnd_max __read_mostly;
+
 static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
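
Patch 01 only honours TCP_CWND when net.ipv4.tcp_user_cwnd_max is greater
than zero and the socket is ESTABLISHED with the congestion state Open; the
requested value is clamped to both the sysctl and snd_cwnd_clamp.  A userspace
sketch of driving it follows.  It is illustrative only: the helper name is
invented, and the numeric value of TCP_CWND must match the patched uapi
header (24 in this patch, renumbered to 19 by patch 03 below).

/* Hypothetical userspace helper for a kernel carrying this series. */
#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCP_CWND
#define TCP_CWND 24	/* value from this patch; patch 03 changes it to 19 */
#endif

/*
 * Ask the kernel to set snd_cwnd on an established connection.  setsockopt()
 * fails with EPERM if tcp_user_cwnd_max is 0 (the default) and with EINVAL
 * if the socket is not ESTABLISHED in the Open congestion state.
 */
static int set_cwnd(int fd, int cwnd)
{
	if (setsockopt(fd, IPPROTO_TCP, TCP_CWND, &cwnd, sizeof(cwnd)) < 0) {
		perror("setsockopt(TCP_CWND)");
		return -1;
	}
	return 0;
}

The sysctl thus acts both as an enable switch and as an upper bound, e.g.
sysctl -w net.ipv4.tcp_user_cwnd_max=100 allows applications to raise cwnd
up to 100 segments.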
From b8590e0cf5abe6edbb8dbb209f8c7c5331d0c4b0 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Tue, 12 Feb 2013 18:46:32 -0800
Subject: [PATCH 02/10] add extra free kbytes tunable

Add a userspace visible knob to tell the VM to keep an extra amount of
memory free, by increasing the gap between each zone's min and low
watermarks.

This is useful for realtime applications that call system calls and have
a bound on the number of allocations that happen in any short time
period.  In this application, extra_free_kbytes would be left at an
amount equal to or larger than the maximum number of allocations that
happen in any burst.

It may also be useful to reduce the memory use of virtual machines
(temporarily?), in a way that does not cause memory fragmentation like
ballooning does.
---
 Documentation/sysctl/vm.txt | 16 +++++++++++++++
 include/linux/mmzone.h      |  2 +-
 include/linux/swap.h        |  2 ++
 kernel/sysctl.c             | 10 +++++++++-
 mm/page_alloc.c             | 39 +++++++++++++++++++++++++++----------
 5 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index dcc75a9ed91961..b81fca90f7fe4a 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
+- extra_free_kbytes
 - hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
@@ -198,6 +199,21 @@ fragmentation index is <= extfrag_threshold. The default value is 500.
 
 ==============================================================
 
+extra_free_kbytes
+
+This parameter tells the VM to keep extra free memory between the threshold
+where background reclaim (kswapd) kicks in, and the threshold where direct
+reclaim (by allocating processes) kicks in.
+
+This is useful for workloads that require low latency memory allocations
+and have a bounded burstiness in memory allocations, for example a
+realtime application that receives and transmits network traffic
+(causing in-kernel memory allocations) with a maximum total message burst
+size of 200MB may need 200MB of extra free memory to avoid direct reclaim
+related latencies.
+
+==============================================================
+
 hugepages_treat_as_movable
 
 This parameter is only useful when kernelcore= is specified at boot time to
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5c76737d836b1e..cf75a9d2e04513 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -891,7 +891,7 @@ static inline int is_dma(struct zone *zone)
 
 /* These two functions are used to setup the per zone pages min values */
 struct ctl_table;
-int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
+int free_kbytes_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1701ce4be74650..3b3da79c3d2c4b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -224,6 +224,8 @@ struct swap_list_t {
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
+extern int min_free_kbytes;
+extern int extra_free_kbytes;
 extern unsigned long dirty_balance_reserve;
 extern unsigned long nr_free_buffer_pages(void);
 extern unsigned long nr_free_pagecache_pages(void);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fcaa..e7ba63c8eda80c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1262,9 +1262,17 @@ static struct ctl_table vm_table[] = {
 		.data		= &min_free_kbytes,
 		.maxlen		= sizeof(min_free_kbytes),
 		.mode		= 0644,
-		.proc_handler	= min_free_kbytes_sysctl_handler,
+		.proc_handler	= free_kbytes_sysctl_handler,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "extra_free_kbytes",
+		.data		= &extra_free_kbytes,
+		.maxlen		= sizeof(extra_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= free_kbytes_sysctl_handler,
+		.extra1		= &zero,
+	},
 	{
 		.procname	= "percpu_pagelist_fraction",
 		.data		= &percpu_pagelist_fraction,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ee0fd313f036e..a6becb8adb111c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -196,8 +196,21 @@ static char * const zone_names[MAX_NR_ZONES] = {
 	 "Movable",
 };
 
+/*
+ * Try to keep at least this much lowmem free.  Do not allow normal
+ * allocations below this point, only high priority ones. Automatically
+ * tuned according to the amount of memory in the system.
+ */
 int min_free_kbytes = 1024;
 
+/*
+ * Extra memory for the system to try freeing between the min and
+ * low watermarks.  Useful for workloads that require low latency
+ * memory allocations in bursts larger than the normal gap between
+ * low and min.
+ */
+int extra_free_kbytes;
+
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
@@ -5320,6 +5333,7 @@ static void setup_per_zone_lowmem_reserve(void)
 static void __setup_per_zone_wmarks(void)
 {
 	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -5331,11 +5345,14 @@ static void __setup_per_zone_wmarks(void)
 	}
 
 	for_each_zone(zone) {
-		u64 tmp;
+		u64 min, low;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->managed_pages;
-		do_div(tmp, lowmem_pages);
+		min = (u64)pages_min * zone->managed_pages;
+		do_div(min, lowmem_pages);
+		low = (u64)pages_low * zone->managed_pages;
+		do_div(low, vm_total_pages);
+
 		if (is_highmem(zone)) {
 			/*
 			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -5356,11 +5373,13 @@ static void __setup_per_zone_wmarks(void)
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->watermark[WMARK_MIN] = min;
 		}
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) +
+					low + (min >> 2);
+		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+					low + (min >> 1);
 
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -5471,11 +5490,11 @@ int __meminit init_per_zone_wmark_min(void)
 module_init(init_per_zone_wmark_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
- *	that we can call two helper functions whenever min_free_kbytes
- *	changes.
+ * free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
+ *	that we can call two helper functions whenever min_free_kbytes
+ *	or extra_free_kbytes changes.
  */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+int free_kbytes_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);
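
The page_alloc.c change spreads min_free_kbytes across lowmem zones in
proportion to zone size, but spreads extra_free_kbytes across all pages
(note the do_div by vm_total_pages rather than lowmem_pages), and then adds
that share into both the low and high watermarks.  A simplified model of the
resulting per-zone arithmetic for a lowmem zone follows; the standalone
framing and names are illustrative, not kernel code, and highmem zones
compute WMARK_MIN differently.

#include <stdint.h>

struct zone_marks {
	uint64_t wmark_min;
	uint64_t wmark_low;
	uint64_t wmark_high;
};

static struct zone_marks zone_watermarks(uint64_t zone_managed_pages,
					 uint64_t lowmem_pages,
					 uint64_t vm_total_pages,
					 uint64_t min_free_pages,   /* min_free_kbytes >> (PAGE_SHIFT - 10) */
					 uint64_t extra_free_pages) /* extra_free_kbytes >> (PAGE_SHIFT - 10) */
{
	struct zone_marks m;

	/* min_free_kbytes is divided among lowmem zones by zone size... */
	uint64_t min = min_free_pages * zone_managed_pages / lowmem_pages;
	/* ...while extra_free_kbytes is divided by total pages. */
	uint64_t low = extra_free_pages * zone_managed_pages / vm_total_pages;

	m.wmark_min  = min;                     /* below this: direct reclaim */
	m.wmark_low  = min + low + (min >> 2);  /* below this: kswapd wakes up */
	m.wmark_high = min + low + (min >> 1);  /* kswapd stops reclaiming here */
	return m;
}

For the 200MB burst example in the documentation hunk, writing 204800 to
/proc/sys/vm/extra_free_kbytes widens the gap between the min and low
watermarks by roughly that amount, system-wide.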
From b6fa0cd0646a6e241479a816e07961de8c2d597e Mon Sep 17 00:00:00 2001
From: dormando
Date: Wed, 13 Feb 2013 11:27:55 -0800
Subject: [PATCH 03/10] Don't change initcwnd's magic number

Changing it breaks us badly.
---
 include/uapi/linux/tcp.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 20ec42135a4075..588390cf442380 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -105,13 +105,13 @@ enum {
 #define TCP_THIN_LINEAR_TIMEOUTS 16	/* Use linear timeouts for thin streams*/
 #define TCP_THIN_DUPACK		17	/* Fast retrans. after 1 dupack */
 #define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
-#define TCP_REPAIR		19	/* TCP sock is under repair right now */
+#define TCP_REPAIR		24	/* TCP sock is under repair right now */
 #define TCP_REPAIR_QUEUE	20
 #define TCP_QUEUE_SEQ		21
 #define TCP_REPAIR_OPTIONS	22
 #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
-#define TCP_TIMESTAMP		24
-#define TCP_CWND		24 /* Set congestion window */
+#define TCP_TIMESTAMP		25
+#define TCP_CWND		19 /* Set congestion window */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
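
Because patch 03 moves TCP_REPAIR and TCP_TIMESTAMP away from their mainline
values and reuses 19 for TCP_CWND, userspace built against stock uapi headers
will pass the wrong option numbers to this kernel.  A hypothetical
compatibility header (the guard name is invented) for programs known to run
only on kernels carrying this series:

/*
 * Values match this series after patch 03.  They are NOT the mainline
 * numbers (mainline has TCP_REPAIR 19, TCP_TIMESTAMP 24, and no TCP_CWND),
 * so only use this against a kernel known to carry these patches.
 */
#ifndef TCP_CWND_COMPAT_H
#define TCP_CWND_COMPAT_H

#undef  TCP_REPAIR
#define TCP_REPAIR	24	/* moved from 19 */
#undef  TCP_TIMESTAMP
#define TCP_TIMESTAMP	25	/* moved from 24 */
#define TCP_CWND	19	/* new in this series: set congestion window */

#endif /* TCP_CWND_COMPAT_H */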
From 63901e8a5dfc5602204995dac81b450ec45fc181 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Wed, 31 Jul 2013 09:22:35 -0700
Subject: [PATCH 04/10] tcp: debug fastretrans warning

printk a ton of states to debug fastretrans warning

Signed-off-by: Yuchung Cheng
---
 net/ipv4/tcp_input.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4b75aad14b04a0..28f538a934e5e3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2773,7 +2773,20 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
 	/* D. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
-		WARN_ON(tp->retrans_out != 0);
+		if (WARN_ON(tp->retrans_out != 0)) {
+			printk(KERN_DEBUG "%pI4:%u F0x%x S%u s%d IF%u+%u-%u-%u"
+			       "f%u ur%u rr%u rt%u um%u hs%u nxt%u\n",
+			       &inet_sk(sk)->inet_daddr,
+			       ntohs(inet_sk(sk)->inet_dport),
+			       flag, sk->sk_state, tp->rx_opt.sack_ok,
+			       tp->packets_out, tp->retrans_out,
+			       tp->sacked_out, tp->lost_out,
+			       tp->frto, tp->undo_retrans,
+			       tp->reordering, icsk->icsk_retransmits,
+			       tp->undo_marker ? tp->undo_marker-tp->snd_una:0,
+			       tp->high_seq - tp->snd_una,
+			       tp->snd_nxt - tp->snd_una);
+		}
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
 		switch (icsk->icsk_ca_state) {

From f438ef95b7a52c1f5847614b9b9c3508ab70a9af Mon Sep 17 00:00:00 2001
From: Yuchung Cheng
Date: Wed, 9 Oct 2013 10:08:52 -0700
Subject: [PATCH 05/10] tcp: fix incorrect ca_state in tail loss probe

On receiving an ACK that covers the loss probe sequence, TLP immediately
sets the congestion state to Open, even though some packets are not
recovered and retransmissions are on the way.  The later ACKs may trigger
a WARN_ON check of step D in tcp_fastretrans_alert().

The fix is to follow a similar procedure to recovery by calling
tcp_try_keep_open().  The sender switches to Open state if no packets are
retransmitted.  Otherwise it goes to Disorder and lets subsequent ACKs
move the state to Recovery or Open.

Signed-off-by: Yuchung Cheng
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 28f538a934e5e3..28bdc144cab642 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3327,7 +3327,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 		tcp_init_cwnd_reduction(sk, true);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 		tcp_end_cwnd_reduction(sk);
-		tcp_set_ca_state(sk, TCP_CA_Open);
+		tcp_try_keep_open(sk);
 		NET_INC_STATS_BH(sock_net(sk),
 				 LINUX_MIB_TCPLOSSPROBERECOVERY);
 	}
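
The reasoning in patch 05 hinges on tcp_try_keep_open() choosing between Open
and Disorder.  A rough model of that decision (not the kernel source, just a
restatement of the commit message under the assumption that "packets still
accounted as SACKed, lost, or retransmitted" is what keeps recovery
bookkeeping alive):

enum ca_state_model { MODEL_CA_OPEN, MODEL_CA_DISORDER };

/* Stay in Open only when nothing from the loss episode is still
 * outstanding; otherwise drop to Disorder and let later ACKs move the
 * state machine to Recovery or back to Open. */
static enum ca_state_model tlp_ack_next_state(unsigned int retrans_out,
					      unsigned int sacked_out,
					      unsigned int lost_out)
{
	if (retrans_out || sacked_out || lost_out)
		return MODEL_CA_DISORDER;
	return MODEL_CA_OPEN;
}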
From 74097628a581d24d9953a4813d7c2070c7cba163 Mon Sep 17 00:00:00 2001
From: Artur Bergman
Date: Mon, 21 Oct 2013 23:43:37 +0000
Subject: [PATCH 06/10] match the reuseport sk based on the smp_processor_id
 of the kernel thread dealing with the irq

---
 net/ipv4/inet_hashtables.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 6af375afeeef1e..721ac3388f4a4b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -198,15 +198,26 @@ struct sock *__inet_lookup_listener(struct net *net,
 			hiscore = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-				phash = inet_ehashfn(net, daddr, hnum,
+				/* phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
-				matches = 1;
+				matches = 1; */
 			}
 		} else if (score == hiscore && reuseport) {
-			matches++;
+
+			/* goes through the sks and find the one corresponding to our cpu
+			   it is critical that a RSS queue is bound to a specific cpu
+			*/
+
+			if (matches++ == smp_processor_id()) {
+				result = sk;
+			}
+
+			/*
 			if (((u64)phash * matches) >> 32 == 0)
 				result = sk;
 			phash = next_pseudo_random32(phash);
+			*/
+			/* this has not been assigned to a cpu yet */
 		}
 	}
 	/*

From 3c77c764d4b8b26aeaafb77e8e1a2cba9b52bcfe Mon Sep 17 00:00:00 2001
From: Artur Bergman
Date: Mon, 21 Oct 2013 23:49:52 +0000
Subject: [PATCH 07/10] save the cpu id so we don't have to go over it again

---
 net/ipv4/inet_hashtables.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 721ac3388f4a4b..da794715add10a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -185,7 +185,8 @@ struct sock *__inet_lookup_listener(struct net *net,
 	unsigned int hash = inet_lhashfn(net, hnum);
 	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
 	int score, hiscore, matches = 0, reuseport = 0;
-	u32 phash = 0;
+	// u32 phash = 0;
+	int curr_cpu = smp_processor_id();
 
 	rcu_read_lock();
 begin:
@@ -208,7 +209,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 			   it is critical that a RSS queue is bound to a specific cpu
 			*/
 
-			if (matches++ == smp_processor_id()) {
+			if (matches++ == curr_cpu) {
 				result = sk;
 			}
 
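
Patches 06-10 make __inet_lookup_listener() pick among equal-scoring
SO_REUSEPORT listeners based on the CPU running the lookup, so the scheme
only works when each RSS queue is pinned to one CPU and there is a listener
per CPU.  A hypothetical userspace companion is sketched below: worker n pins
itself to CPU n and opens its own listener, while IRQ/RSS affinity is assumed
to be configured separately (for example via /proc/irq/*/smp_affinity).

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_REUSEPORT
#define SO_REUSEPORT 15		/* not present in older libc headers */
#endif

/* One call per worker process/thread: pin to the CPU, then open a
 * SO_REUSEPORT listener that the patched lookup can map back to it. */
static int listener_for_cpu(int cpu, uint16_t port)
{
	cpu_set_t set;
	struct sockaddr_in addr;
	int one = 1;
	int fd;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set) < 0)
		return -1;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 1024) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}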
From 653c7635e777df460107cdd97f2842a8930c6769 Mon Sep 17 00:00:00 2001
From: Artur Bergman
Date: Tue, 22 Oct 2013 06:33:14 +0000
Subject: [PATCH 08/10] add debug message

---
 net/ipv4/inet_hashtables.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index da794715add10a..3aadb2af5e9116 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -199,6 +199,9 @@ struct sock *__inet_lookup_listener(struct net *net,
 			hiscore = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
+
+				matches++;
+
 				/* phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
 				matches = 1; */
 			}
 		} else if (score == hiscore && reuseport) {
@@ -207,8 +210,8 @@ struct sock *__inet_lookup_listener(struct net *net,
 
 			/* goes through the sks and find the one corresponding to our cpu
 			   it is critical that a RSS queue is bound to a specific cpu
-			*/
-
+			*/
+			pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu);
 			if (matches++ == curr_cpu) {
 				result = sk;
 			}
@@ -218,7 +221,6 @@ struct sock *__inet_lookup_listener(struct net *net,
 				result = sk;
 			phash = next_pseudo_random32(phash);
 			*/
-			/* this has not been assigned to a cpu yet */
 		}
 	}
 	/*

From 2bc29502e61729396cf931ccd236eedb392d31d5 Mon Sep 17 00:00:00 2001
From: Artur Bergman
Date: Tue, 22 Oct 2013 07:06:55 +0000
Subject: [PATCH 09/10] break out of the loop, remove debug

---
 net/ipv4/inet_hashtables.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 3aadb2af5e9116..5edfe2823cb704 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -199,7 +199,8 @@ struct sock *__inet_lookup_listener(struct net *net,
 			hiscore = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-
+				if (curr_cpu == 0)
+					break;
 				matches++;
 
 				/* phash = inet_ehashfn(net, daddr, hnum,
@@ -211,9 +212,10 @@ struct sock *__inet_lookup_listener(struct net *net,
 			/* goes through the sks and find the one corresponding to our cpu
 			   it is critical that a RSS queue is bound to a specific cpu
 			*/
-			pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu);
+			// pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu);
 			if (matches++ == curr_cpu) {
 				result = sk;
+				break;
 			}
 
 			/*

From 4005ce4666327843e4d95fe4b1dccc8bc18413ff Mon Sep 17 00:00:00 2001
From: Artur Bergman
Date: Tue, 22 Oct 2013 07:24:28 +0000
Subject: [PATCH 10/10] get rid of the breaks

---
 net/ipv4/inet_hashtables.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5edfe2823cb704..970c2180132a33 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -199,9 +199,7 @@ struct sock *__inet_lookup_listener(struct net *net,
 			hiscore = score;
 			reuseport = sk->sk_reuseport;
 			if (reuseport) {
-				if (curr_cpu == 0)
-					break;
-				matches++;
+				// matches++;
 
 				/* phash = inet_ehashfn(net, daddr, hnum,
 						     saddr, sport);
@@ -215,7 +213,6 @@ struct sock *__inet_lookup_listener(struct net *net,
 			// pr_info("Matching sk %p match %d to cpu %d\n", sk, matches, curr_cpu);
 			if (matches++ == curr_cpu) {
 				result = sk;
-				break;
 			}
 
 			/*
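
The net effect of patches 06-10 on listener selection, restated as a compact
model for readability (illustrative only, ignoring the sk_reuseport checks
and RCU details): the first listener with the highest score is the default
result, each later listener with the same score is indexed 0, 1, 2, ..., and
the one whose index equals the current CPU wins.  CPUs numbered beyond the
count of extra equal-scoring listeners therefore keep the first match.

struct model_sock { int score; };

/* `socks` is the hash-chain order of candidate listeners. */
static const struct model_sock *
pick_listener(const struct model_sock *socks, int nsocks, int curr_cpu)
{
	const struct model_sock *result = NULL;
	int hiscore = -1, matches = 0;

	for (int i = 0; i < nsocks; i++) {
		if (socks[i].score > hiscore) {
			result = &socks[i];
			hiscore = socks[i].score;
			/* first winner: matches stays 0 (patch 10 drops matches++) */
		} else if (socks[i].score == hiscore) {
			/* later equal-score listeners are counted 0, 1, 2, ...;
			 * the one whose count equals the current CPU wins. */
			if (matches++ == curr_cpu)
				result = &socks[i];
		}
	}
	return result;
}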