From 6648bd7e0e62c0c8c03b15e00c9e7015e232feff Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 21 Jun 2012 13:58:31 +0000 Subject: ipv4: Add sysctl knob to control early socket demux This change is meant to add a control for disabling early socket demux. The main motivation behind this patch is to provide an option to disable the feature as it adds an additional cost to routing that reduces overall throughput by up to 5%. For example one of my systems went from 12.1Mpps to 11.6 after the early socket demux was added. It looks like the reason for the regression is that we are now having to perform two lookups, first the one for an established socket, and then the one for the routing table. By adding this patch and toggling the value for ip_early_demux to 0 I am able to get back to the 12.1Mpps I was previously seeing. [ Move local variables in ip_rcv_finish() down into the basic block in which they are actually used. -DaveM ] Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef32956ed655..12aa0c5867c4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -300,6 +300,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ip_early_demux", + .data = &sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, -- cgit v1.2.3 From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Jul 2012 05:50:31 +0000 Subject: tcp: TCP Small Queues This introduce TSQ (TCP Small Queues) TSQ goal is to reduce number of TCP packets in xmit queues (qdisc & device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem. sk->sk_wmem_alloc not allowed to grow above a given limit, allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a given time. TSO packets are sized/capped to half the limit, so that we have two TSO packets in flight, allowing better bandwidth use. As a side effect, setting the limit to 40000 automatically reduces the standard gso max limit (65536) to 40000/2 : It can help to reduce latencies of high prio packets, having smaller TSO packets. This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send following frames when skb_orphan() [2] is called for the already queued skbs. Results on my dev machines (tg3/ixgbe nics) are really impressive, using standard pfifo_fast, and with or without TSO/GSO. Without reduction of nominal bandwidth, we have reduction of buffering per bulk sender : < 1ms on Gbit (instead of 50ms with TSO) < 8ms on 100Mbit (instead of 132 ms) I no longer have 4 MBytes backlogged in qdisc by a single netperf session, and both side socket autotuning no longer use 4 Mbytes. As skb destructor cannot restart xmit itself ( as qdisc lock might be taken at this point ), we delegate the work to a tasklet. We use one tasklest per cpu for performance reasons. If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag. This flag is tested in a new protocol method called from release_sock(), to eventually send new segments. [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable [2] skb_orphan() is usually called at TX completion time, but some drivers call it in their start_xmit() handler. These drivers should at least use BQL, or else a single TCP session can still fill the whole NIC TX ring, since TSQ will have no effect. Signed-off-by: Eric Dumazet Cc: Dave Taht Cc: Tom Herbert Cc: Matt Mathis Cc: Yuchung Cheng Cc: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 12aa0c5867c4..70730f7aeafe 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -598,6 +598,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_limit_output_bytes", + .data = &sysctl_tcp_limit_output_bytes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", -- cgit v1.2.3 From 282f23c6ee343126156dd41218b22ece96d747e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Jul 2012 10:13:05 +0200 Subject: tcp: implement RFC 5961 3.2 Implement the RFC 5691 mitigation against Blind Reset attack using RST bit. Idea is to validate incoming RST sequence, to match RCV.NXT value, instead of previouly accepted window : (RCV.NXT <= SEG.SEQ < RCV.NXT+RCV.WND) If sequence is in window but not an exact match, send a "challenge ACK", so that the other part can resend an RST with the appropriate sequence. Add a new sysctl, tcp_challenge_ack_limit, to limit number of challenge ACK sent per second. Add a new SNMP counter to count number of challenge acks sent. (netstat -s | grep TCPChallengeACK) Signed-off-by: Eric Dumazet Cc: Kiran Kumar Kella Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 70730f7aeafe..3f6a1e762e9c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -605,6 +605,13 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "tcp_challenge_ack_limit", + .data = &sysctl_tcp_challenge_ack_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, #ifdef CONFIG_NET_DMA { .procname = "tcp_dma_copybreak", -- cgit v1.2.3 From 2100c8d2d9db23c0a09901a782bb4e3b21bee298 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 19 Jul 2012 06:43:05 +0000 Subject: net-tcp: Fast Open base This patch impelements the common code for both the client and server. 1. TCP Fast Open option processing. Since Fast Open does not have an option number assigned by IANA yet, it shares the experiment option code 254 by implementing draft-ietf-tcpm-experimental-options with a 16 bits magic number 0xF989. This enables global experiments without clashing the scarce(2) experimental options available for TCP. When the draft status becomes standard (maybe), the client should switch to the new option number assigned while the server supports both numbers for transistion. 2. The new sysctl tcp_fastopen 3. A place holder init function Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3f6a1e762e9c..5840c3255721 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -366,6 +366,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, #endif + { + .procname = "tcp_fastopen", + .data = &sysctl_tcp_fastopen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, -- cgit v1.2.3 From 0c7462a2351b4cc502f326aad7fedd04909928be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jul 2012 07:14:29 +0000 Subject: ipv4: remove rt_cache_rebuild_count After IP route cache removal, rt_cache_rebuild_count is no longer used. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5840c3255721..4b6487a68279 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -783,13 +783,6 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "rt_cache_rebuild_count", - .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "ping_group_range", .data = &init_net.ipv4.sysctl_ping_group_range, @@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) table[5].data = &net->ipv4.sysctl_icmp_ratemask; table[6].data = - &net->ipv4.sysctl_rt_cache_rebuild_count; - table[7].data = &net->ipv4.sysctl_ping_group_range; } @@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_ping_group_range[0] = 1; net->ipv4.sysctl_ping_group_range[1] = 0; - net->ipv4.sysctl_rt_cache_rebuild_count = 4; - tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); -- cgit v1.2.3 From c255a458055e459f65eb7b7f51dc5dbdd0caf1d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jul 2012 16:43:02 -0700 Subject: memcg: rename config variables Sanity: CONFIG_CGROUP_MEM_RES_CTLR -> CONFIG_MEMCG CONFIG_CGROUP_MEM_RES_CTLR_SWAP -> CONFIG_MEMCG_SWAP CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED -> CONFIG_MEMCG_SWAP_ENABLED CONFIG_CGROUP_MEM_RES_CTLR_KMEM -> CONFIG_MEMCG_KMEM [mhocko@suse.cz: fix missed bits] Cc: Glauber Costa Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Aneesh Kumar K.V Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5840c3255721..ed7db3f1b6f2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, int ret; unsigned long vec[3]; struct net *net = current->nsproxy->net_ns; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif @@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, if (ret) return ret; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM rcu_read_lock(); memcg = mem_cgroup_from_task(current); -- cgit v1.2.3