From 7a9546ee354ec6f23af403992b8c07baa50a23d2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 12 Nov 2008 00:54:20 -0800 Subject: net: ib_net pointer should depends on CONFIG_NET_NS We can shrink size of "struct inet_bind_bucket" by 50%, using read_pnet() and write_pnet() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net/inet_hashtables.h') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5cc182f9ecae..cb31fbf8ae2a 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -77,13 +77,20 @@ struct inet_ehash_bucket { * ports are created in O(1) time? I thought so. ;-) -DaveM */ struct inet_bind_bucket { +#ifdef CONFIG_NET_NS struct net *ib_net; +#endif unsigned short port; signed short fastreuse; struct hlist_node node; struct hlist_head owners; }; +static inline struct net *ib_net(struct inet_bind_bucket *ib) +{ + return read_pnet(&ib->ib_net); +} + #define inet_bind_bucket_for_each(tb, node, head) \ hlist_for_each_entry(tb, node, head, node) -- cgit v1.2.3 From 3ab5aee7fe840b5b1b35a8d1ac11c3de5281e611 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Nov 2008 19:40:17 -0800 Subject: net: Convert TCP & DCCP hash tables to use RCU / hlist_nulls RCU was added to UDP lookups, using a fast infrastructure : - sockets kmem_cache use SLAB_DESTROY_BY_RCU and dont pay the price of call_rcu() at freeing time. - hlist_nulls permits to use few memory barriers. This patch uses same infrastructure for TCP/DCCP established and timewait sockets. Thanks to SLAB_DESTROY_BY_RCU, no slowdown for applications using short lived TCP connections. A followup patch, converting rwlocks to spinlocks will even speedup this case. __inet_lookup_established() is pretty fast now we dont have to dirty a contended cache line (read_lock/read_unlock) Only established and timewait hashtable are converted to RCU (bind table and listen table are still using traditional locking) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net/inet_hashtables.h') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index cb31fbf8ae2a..481896045111 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -41,8 +41,8 @@ * I'll experiment with dynamic table growth later. */ struct inet_ehash_bucket { - struct hlist_head chain; - struct hlist_head twchain; + struct hlist_nulls_head chain; + struct hlist_nulls_head twchain; }; /* There are a few simple rules, which allow for local port reuse by -- cgit v1.2.3 From 5caea4ea7088e80ac5410d04660346094608b909 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Nov 2008 00:40:07 -0800 Subject: net: listening_hash get a spinlock per bucket This patch prepares RCU migration of listening_hash table for TCP/DCCP protocols. listening_hash table being small (32 slots per protocol), we add a spinlock for each slot, instead of a single rwlock for whole table. This should reduce hold time of readers, and writers concurrency. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 45 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'include/net/inet_hashtables.h') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 481896045111..62d2dd0d7860 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -99,6 +99,11 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; +struct inet_listen_hashbucket { + spinlock_t lock; + struct hlist_head head; +}; + /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ @@ -123,22 +128,21 @@ struct inet_hashinfo { unsigned int bhash_size; /* Note : 4 bytes padding on 64 bit arches */ - /* All sockets in TCP_LISTEN state will be in here. This is the only - * table where wildcard'd TCP sockets can exist. Hash function here - * is just local port number. - */ - struct hlist_head listening_hash[INET_LHTABLE_SIZE]; + struct kmem_cache *bind_bucket_cachep; /* All the above members are written once at bootup and * never written again _or_ are predominantly read-access. * * Now align to a new cache line as all the following members - * are often dirty. + * might be often dirty. + */ + /* All sockets in TCP_LISTEN state will be in here. This is the only + * table where wildcard'd TCP sockets can exist. Hash function here + * is just local port number. */ - rwlock_t lhash_lock ____cacheline_aligned; - atomic_t lhash_users; - wait_queue_head_t lhash_wait; - struct kmem_cache *bind_bucket_cachep; + struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE] + ____cacheline_aligned_in_smp; + }; static inline struct inet_ehash_bucket *inet_ehash_bucket( @@ -236,26 +240,7 @@ extern void __inet_inherit_port(struct sock *sk, struct sock *child); extern void inet_put_port(struct sock *sk); -extern void inet_listen_wlock(struct inet_hashinfo *hashinfo); - -/* - * - We may sleep inside this lock. - * - If sleeping is not required (or called from BH), - * use plain read_(un)lock(&inet_hashinfo.lhash_lock). - */ -static inline void inet_listen_lock(struct inet_hashinfo *hashinfo) -{ - /* read_lock synchronizes to candidates to writers */ - read_lock(&hashinfo->lhash_lock); - atomic_inc(&hashinfo->lhash_users); - read_unlock(&hashinfo->lhash_lock); -} - -static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo) -{ - if (atomic_dec_and_test(&hashinfo->lhash_users)) - wake_up(&hashinfo->lhash_wait); -} +void inet_hashinfo_init(struct inet_hashinfo *h); extern void __inet_hash_nolisten(struct sock *sk); extern void inet_hash(struct sock *sk); -- cgit v1.2.3 From 9db66bdcc83749affe61c61eb8ff3cf08f42afec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Nov 2008 20:39:09 -0800 Subject: net: convert TCP/DCCP ehash rwlocks to spinlocks Now TCP & DCCP use RCU lookups, we can convert ehash rwlocks to spinlocks. /proc/net/tcp and other seq_file 'readers' can safely be converted to 'writers'. This should speedup writers, since spin_lock()/spin_unlock() only use one atomic operation instead of two for write_lock()/write_unlock() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net/inet_hashtables.h') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 62d2dd0d7860..28b3ee3e8d6d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -116,7 +116,7 @@ struct inet_hashinfo { * TIME_WAIT sockets use a separate chain (twchain). */ struct inet_ehash_bucket *ehash; - rwlock_t *ehash_locks; + spinlock_t *ehash_locks; unsigned int ehash_size; unsigned int ehash_locks_mask; @@ -152,7 +152,7 @@ static inline struct inet_ehash_bucket *inet_ehash_bucket( return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)]; } -static inline rwlock_t *inet_ehash_lockp( +static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { @@ -177,16 +177,16 @@ static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) size = 4096; if (sizeof(rwlock_t) != 0) { #ifdef CONFIG_NUMA - if (size * sizeof(rwlock_t) > PAGE_SIZE) - hashinfo->ehash_locks = vmalloc(size * sizeof(rwlock_t)); + if (size * sizeof(spinlock_t) > PAGE_SIZE) + hashinfo->ehash_locks = vmalloc(size * sizeof(spinlock_t)); else #endif - hashinfo->ehash_locks = kmalloc(size * sizeof(rwlock_t), + hashinfo->ehash_locks = kmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!hashinfo->ehash_locks) return ENOMEM; for (i = 0; i < size; i++) - rwlock_init(&hashinfo->ehash_locks[i]); + spin_lock_init(&hashinfo->ehash_locks[i]); } hashinfo->ehash_locks_mask = size - 1; return 0; @@ -197,7 +197,7 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) if (hashinfo->ehash_locks) { #ifdef CONFIG_NUMA unsigned int size = (hashinfo->ehash_locks_mask + 1) * - sizeof(rwlock_t); + sizeof(spinlock_t); if (size > PAGE_SIZE) vfree(hashinfo->ehash_locks); else -- cgit v1.2.3 From f757fec4b0d45dfcb52f9a914a12225a6a0a3e05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2008 15:49:19 -0800 Subject: net: use net_eq() in INET_MATCH and INET_TW_MATCH We can avoid some useless instructions if !CONFIG_NET_NS Because of RCU, we use INET_MATCH or INET_TW_MATCH twice for the found socket, so thats six instructions less per incoming TCP packet. Yet another tbench speedup :) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/net/inet_hashtables.h') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 28b3ee3e8d6d..ec7ee2e46d8c 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -291,25 +291,25 @@ typedef __u64 __bitwise __addrpair; ((__force __u64)(__be32)(__saddr))); #endif /* __BIG_ENDIAN */ #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ - (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ + (((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) && \ ((*((__addrpair *)&(inet_sk(__sk)->daddr))) == (__cookie)) && \ ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #define INET_TW_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\ - (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ + (((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) && \ ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) && \ ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #else /* 32-bit arch */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) #define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif) \ - (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ + (((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) && \ (inet_sk(__sk)->daddr == (__saddr)) && \ (inet_sk(__sk)->rcv_saddr == (__daddr)) && \ ((*((__portpair *)&(inet_sk(__sk)->dport))) == (__ports)) && \ (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif)))) #define INET_TW_MATCH(__sk, __net, __hash,__cookie, __saddr, __daddr, __ports, __dif) \ - (((__sk)->sk_hash == (__hash)) && sock_net((__sk)) == (__net) && \ + (((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) && \ (inet_twsk(__sk)->tw_daddr == (__saddr)) && \ (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) && \ ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) && \ -- cgit v1.2.3 From c25eb3bfb97294d0543a81230fbc237046b4b84c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Nov 2008 17:22:55 -0800 Subject: net: Convert TCP/DCCP listening hash tables to use RCU This is the last step to be able to perform full RCU lookups in __inet_lookup() : After established/timewait tables, we add RCU lookups to listening hash table. The only trick here is that a socket of a given type (TCP ipv4, TCP ipv6, ...) can now flight between two different tables (established and listening) during a RCU grace period, so we must use different 'nulls' end-of-chain values for two tables. We define a large value : #define LISTENING_NULLS_BASE (1U << 29) So that slots in listening table are guaranteed to have different end-of-chain values than slots in established table. A reader can still detect it finished its lookup in the right chain. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/net/inet_hashtables.h') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ec7ee2e46d8c..f44bb5c77a70 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -99,9 +99,16 @@ struct inet_bind_hashbucket { struct hlist_head chain; }; +/* + * Sockets can be hashed in established or listening table + * We must use different 'nulls' end-of-chain value for listening + * hash table, or we might find a socket that was closed and + * reallocated/inserted into established hash table + */ +#define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; - struct hlist_head head; + struct hlist_nulls_head head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ -- cgit v1.2.3