From 7123aaa3a1416529ce461e98108e6b343b294643 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 8 Jun 2012 05:03:21 +0000
Subject: af_unix: speedup /proc/net/unix

/proc/net/unix has quadratic behavior, and can hold unix_table_lock for
a while if high number of unix sockets are alive. (90 ms for 200k
sockets...)

We already have a hash table, so its quite easy to use it.

Problem is unbound sockets are still hashed in a single hash slot
(unix_socket_table[UNIX_HASH_TABLE])

This patch also spreads unbound sockets to 256 hash slots, to speedup
both /proc/net/unix and unix_diag.

Time to read /proc/net/unix with 200k unix sockets :
(time dd if=/proc/net/unix of=/dev/null bs=4k)

before : 520 secs
after : 2 secs

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 110 +++++++++++++++++++++++++++++++----------------------
 net/unix/diag.c    |   6 ++-
 2 files changed, 68 insertions(+), 48 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 641f2e47f165..cf83f6b5ac91 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -115,15 +115,24 @@
 #include <net/checksum.h>
 #include <linux/security.h>
 
-struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 EXPORT_SYMBOL_GPL(unix_socket_table);
 DEFINE_SPINLOCK(unix_table_lock);
 EXPORT_SYMBOL_GPL(unix_table_lock);
 static atomic_long_t unix_nr_socks;
 
-#define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
-#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
+static struct hlist_head *unix_sockets_unbound(void *addr)
+{
+	unsigned long hash = (unsigned long)addr;
+
+	hash ^= hash >> 16;
+	hash ^= hash >> 8;
+	hash %= UNIX_HASH_SIZE;
+	return &unix_socket_table[UNIX_HASH_SIZE + hash];
+}
+
+#define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 
 #ifdef CONFIG_SECURITY_NETWORK
 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
@@ -645,7 +654,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
-	unix_insert_socket(unix_sockets_unbound, sk);
+	unix_insert_socket(unix_sockets_unbound(sk), sk);
 out:
 	if (sk == NULL)
 		atomic_long_dec(&unix_nr_socks);
@@ -2239,47 +2248,58 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 }
 
 #ifdef CONFIG_PROC_FS
-static struct sock *first_unix_socket(int *i)
-{
-	for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) {
-		if (!hlist_empty(&unix_socket_table[*i]))
-			return __sk_head(&unix_socket_table[*i]);
-	}
-	return NULL;
-}
 
-static struct sock *next_unix_socket(int *i, struct sock *s)
-{
-	struct sock *next = sk_next(s);
-	/* More in this chain? */
-	if (next)
-		return next;
-	/* Look for next non-empty chain. */
-	for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) {
-		if (!hlist_empty(&unix_socket_table[*i]))
-			return __sk_head(&unix_socket_table[*i]);
-	}
-	return NULL;
-}
+#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE)
+#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
 
 struct unix_iter_state {
 	struct seq_net_private p;
-	int i;
 };
 
-static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos)
+static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
 {
-	struct unix_iter_state *iter = seq->private;
-	loff_t off = 0;
-	struct sock *s;
+	unsigned long offset = get_offset(*pos);
+	unsigned long bucket = get_bucket(*pos);
+	struct sock *sk;
+	unsigned long count = 0;
 
-	for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) {
-		if (sock_net(s) != seq_file_net(seq))
+	for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
+		if (sock_net(sk) != seq_file_net(seq))
 			continue;
-		if (off == pos)
-			return s;
-		++off;
+		if (++count == offset)
+			break;
 	}
+
+	return sk;
+}
+
+static struct sock *unix_next_socket(struct seq_file *seq,
+				     struct sock *sk,
+				     loff_t *pos)
+{
+	unsigned long bucket;
+
+	while (sk > (struct sock *)SEQ_START_TOKEN) {
+		sk = sk_next(sk);
+		if (!sk)
+			goto next_bucket;
+		if (sock_net(sk) == seq_file_net(seq))
+			return sk;
+	}
+
+	do {
+		sk = unix_from_bucket(seq, pos);
+		if (sk)
+			return sk;
+
+next_bucket:
+		bucket = get_bucket(*pos) + 1;
+		*pos = set_bucket_offset(bucket, 1);
+	} while (bucket < ARRAY_SIZE(unix_socket_table));
+
 	return NULL;
 }
 
@@ -2287,22 +2307,20 @@ static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(unix_table_lock)
 {
 	spin_lock(&unix_table_lock);
-	return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+
+	if (!*pos)
+		return SEQ_START_TOKEN;
+
+	if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
+		return NULL;
+
+	return unix_next_socket(seq, NULL, pos);
 }
 
 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct unix_iter_state *iter = seq->private;
-	struct sock *sk = v;
 	++*pos;
-
-	if (v == SEQ_START_TOKEN)
-		sk = first_unix_socket(&iter->i);
-	else
-		sk = next_unix_socket(&iter->i, sk);
-	while (sk && (sock_net(sk) != seq_file_net(seq)))
-		sk = next_unix_socket(&iter->i, sk);
-	return sk;
+	return unix_next_socket(seq, v, pos);
 }
 
 static void unix_seq_stop(struct seq_file *seq, void *v)
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 47d3002737f5..7e8a24bff34a 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -195,7 +195,9 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	num = s_num = cb->args[1];
 
 	spin_lock(&unix_table_lock);
-	for (slot = s_slot; slot <= UNIX_HASH_SIZE; s_num = 0, slot++) {
+	for (slot = s_slot;
+	     slot < ARRAY_SIZE(unix_socket_table);
+	     s_num = 0, slot++) {
 		struct sock *sk;
 		struct hlist_node *node;
 
@@ -228,7 +230,7 @@ static struct sock *unix_lookup_by_ino(int ino)
 	struct sock *sk;
 
 	spin_lock(&unix_table_lock);
-	for (i = 0; i <= UNIX_HASH_SIZE; i++) {
+	for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
 		struct hlist_node *node;
 
 		sk_for_each(sk, node, &unix_socket_table[i])
-- 
cgit v1.2.3


From 8b51b064a6da90c68af5385a874968829a2a0ed7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 8 Jun 2012 22:10:20 +0000
Subject: af_unix: remove unix_iter_state

As pointed out by Michael Tokarev , struct unix_iter_state is no longer
needed.

Suggested-by: Michael Tokarev <mjt@tls.msk.ru>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cf83f6b5ac91..79981d97bc9c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2255,10 +2255,6 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
 
-struct unix_iter_state {
-	struct seq_net_private p;
-};
-
 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
 {
 	unsigned long offset = get_offset(*pos);
@@ -2383,7 +2379,7 @@ static const struct seq_operations unix_seq_ops = {
 static int unix_seq_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &unix_seq_ops,
-			    sizeof(struct unix_iter_state));
+			    sizeof(struct seq_net_private));
 }
 
 static const struct file_operations unix_seq_fops = {
-- 
cgit v1.2.3


From b61bb01974730e2fd7d36ab4cc848ca6f44cffd4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 26 Jun 2012 21:41:00 -0700
Subject: unix_diag: Move away from NLMSG_PUT().

And use nlmsg_data() while we're here too and remove useless
casts.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/diag.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/diag.c b/net/unix/diag.c
index 7e8a24bff34a..977ca317550d 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -126,10 +126,12 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	struct nlmsghdr *nlh;
 	struct unix_diag_msg *rep;
 
-	nlh = NLMSG_PUT(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep));
+	nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), 0);
+	if (!nlh)
+		goto out_nlmsg_trim;
 	nlh->nlmsg_flags = flags;
 
-	rep = NLMSG_DATA(nlh);
+	rep = nlmsg_data(nlh);
 
 	rep->udiag_family = AF_UNIX;
 	rep->udiag_type = sk->sk_type;
@@ -139,32 +141,32 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 
 	if ((req->udiag_show & UDIAG_SHOW_NAME) &&
 	    sk_diag_dump_name(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_VFS) &&
 	    sk_diag_dump_vfs(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_PEER) &&
 	    sk_diag_dump_peer(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_ICONS) &&
 	    sk_diag_dump_icons(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_RQLEN) &&
 	    sk_diag_show_rqlen(sk, skb))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	if ((req->udiag_show & UDIAG_SHOW_MEMINFO) &&
 	    sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO))
-		goto nlmsg_failure;
+		goto out_nlmsg_trim;
 
 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
 	return skb->len;
 
-nlmsg_failure:
+out_nlmsg_trim:
 	nlmsg_trim(skb, b);
 	return -EMSGSIZE;
 }
@@ -189,7 +191,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct unix_diag_req *req;
 	int num, s_num, slot, s_slot;
 
-	req = NLMSG_DATA(cb->nlh);
+	req = nlmsg_data(cb->nlh);
 
 	s_slot = cb->args[0];
 	num = s_num = cb->args[1];
@@ -309,7 +311,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		};
 		return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
 	} else
-		return unix_diag_get_exact(skb, h, (struct unix_diag_req *)NLMSG_DATA(h));
+		return unix_diag_get_exact(skb, h, nlmsg_data(h));
 }
 
 static const struct sock_diag_handler unix_diag_handler = {
-- 
cgit v1.2.3


From 4245375db87767aacaad16f07040b5d89a9056c8 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 26 Jun 2012 23:36:10 +0000
Subject: unix_diag: Do not use RTA_PUT() macros

Also, no need to trim on nlmsg_put() failure, nothing has been added
yet.  We also want to use nlmsg_end(), nlmsg_new() and nlmsg_free().

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/diag.c | 80 ++++++++++++++++++++++++---------------------------------
 1 file changed, 33 insertions(+), 47 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/diag.c b/net/unix/diag.c
index 977ca317550d..a74864eedfcd 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -8,40 +8,31 @@
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
 
-#define UNIX_DIAG_PUT(skb, attrtype, attrlen) \
-	RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
-
 static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct unix_address *addr = unix_sk(sk)->addr;
-	char *s;
-
-	if (addr) {
-		s = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short));
-		memcpy(s, addr->name->sun_path, addr->len - sizeof(short));
-	}
 
-	return 0;
+	if (!addr)
+		return 0;
 
-rtattr_failure:
-	return -EMSGSIZE;
+	return nla_put(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short),
+		       addr->name->sun_path);
 }
 
 static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct dentry *dentry = unix_sk(sk)->path.dentry;
-	struct unix_diag_vfs *uv;
 
 	if (dentry) {
-		uv = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_VFS, sizeof(*uv));
-		uv->udiag_vfs_ino = dentry->d_inode->i_ino;
-		uv->udiag_vfs_dev = dentry->d_sb->s_dev;
+		struct unix_diag_vfs uv = {
+			.udiag_vfs_ino = dentry->d_inode->i_ino,
+			.udiag_vfs_dev = dentry->d_sb->s_dev,
+		};
+
+		return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv);
 	}
 
 	return 0;
-
-rtattr_failure:
-	return -EMSGSIZE;
 }
 
 static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb)
@@ -56,24 +47,28 @@ static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb)
 		unix_state_unlock(peer);
 		sock_put(peer);
 
-		RTA_PUT_U32(nlskb, UNIX_DIAG_PEER, ino);
+		return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino);
 	}
 
 	return 0;
-rtattr_failure:
-	return -EMSGSIZE;
 }
 
 static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct sk_buff *skb;
+	struct nlattr *attr;
 	u32 *buf;
 	int i;
 
 	if (sk->sk_state == TCP_LISTEN) {
 		spin_lock(&sk->sk_receive_queue.lock);
-		buf = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_ICONS,
-				sk->sk_receive_queue.qlen * sizeof(u32));
+
+		attr = nla_reserve(nlskb, UNIX_DIAG_ICONS,
+				   sk->sk_receive_queue.qlen * sizeof(u32));
+		if (!attr)
+			goto errout;
+
+		buf = nla_data(attr);
 		i = 0;
 		skb_queue_walk(&sk->sk_receive_queue, skb) {
 			struct sock *req, *peer;
@@ -94,45 +89,38 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb)
 
 	return 0;
 
-rtattr_failure:
+errout:
 	spin_unlock(&sk->sk_receive_queue.lock);
 	return -EMSGSIZE;
 }
 
 static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb)
 {
-	struct unix_diag_rqlen *rql;
-
-	rql = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_RQLEN, sizeof(*rql));
+	struct unix_diag_rqlen rql;
 
 	if (sk->sk_state == TCP_LISTEN) {
-		rql->udiag_rqueue = sk->sk_receive_queue.qlen;
-		rql->udiag_wqueue = sk->sk_max_ack_backlog;
+		rql.udiag_rqueue = sk->sk_receive_queue.qlen;
+		rql.udiag_wqueue = sk->sk_max_ack_backlog;
 	} else {
-		rql->udiag_rqueue = (__u32)unix_inq_len(sk);
-		rql->udiag_wqueue = (__u32)unix_outq_len(sk);
+		rql.udiag_rqueue = (u32) unix_inq_len(sk);
+		rql.udiag_wqueue = (u32) unix_outq_len(sk);
 	}
 
-	return 0;
-
-rtattr_failure:
-	return -EMSGSIZE;
+	return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql);
 }
 
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req,
 		u32 pid, u32 seq, u32 flags, int sk_ino)
 {
-	unsigned char *b = skb_tail_pointer(skb);
 	struct nlmsghdr *nlh;
 	struct unix_diag_msg *rep;
 
-	nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), 0);
+	nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
+			flags);
 	if (!nlh)
-		goto out_nlmsg_trim;
-	nlh->nlmsg_flags = flags;
+		return -EMSGSIZE;
 
 	rep = nlmsg_data(nlh);
-
 	rep->udiag_family = AF_UNIX;
 	rep->udiag_type = sk->sk_type;
 	rep->udiag_state = sk->sk_state;
@@ -163,11 +151,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
 	    sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
-	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
-	return skb->len;
+	return nlmsg_end(skb, nlh);
 
 out_nlmsg_trim:
-	nlmsg_trim(skb, b);
+	nlmsg_cancel(skb, nlh);
 	return -EMSGSIZE;
 }
 
@@ -272,15 +259,14 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
 	extra_len = 256;
 again:
 	err = -ENOMEM;
-	rep = alloc_skb(NLMSG_SPACE((sizeof(struct unix_diag_msg) + extra_len)),
-			GFP_KERNEL);
+	rep = nlmsg_new(sizeof(struct unix_diag_msg) + extra_len, GFP_KERNEL);
 	if (!rep)
 		goto out;
 
 	err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).pid,
 			   nlh->nlmsg_seq, 0, req->udiag_ino);
 	if (err < 0) {
-		kfree_skb(rep);
+		nlmsg_free(rep);
 		extra_len += 256;
 		if (extra_len >= PAGE_SIZE)
 			goto out;
-- 
cgit v1.2.3


From 51d7cccf07238f5236c5b9269231a30dd5f8e714 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Mon, 16 Jul 2012 04:28:49 +0000
Subject: net: make sock diag per-namespace

Before this patch sock_diag works for init_net only and dumps
information about sockets from all namespaces.

This patch expands sock_diag for all name-spaces.
It creates a netlink kernel socket for each netns and filters
data during dumping.

v2: filter accoding with netns in all places
    remove an unused variable.

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: James Morris <jmorris@namei.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Pavel Emelyanov <xemul@parallels.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Andrew Vagin <avagin@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/diag.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/diag.c b/net/unix/diag.c
index a74864eedfcd..750b13408449 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -177,6 +177,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct unix_diag_req *req;
 	int num, s_num, slot, s_slot;
+	struct net *net = sock_net(skb->sk);
 
 	req = nlmsg_data(cb->nlh);
 
@@ -192,6 +193,8 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 		num = 0;
 		sk_for_each(sk, node, &unix_socket_table[slot]) {
+			if (!net_eq(sock_net(sk), net))
+				continue;
 			if (num < s_num)
 				goto next;
 			if (!(req->udiag_states & (1 << sk->sk_state)))
@@ -243,6 +246,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
 	struct sock *sk;
 	struct sk_buff *rep;
 	unsigned int extra_len;
+	struct net *net = sock_net(in_skb->sk);
 
 	if (req->udiag_ino == 0)
 		goto out_nosk;
@@ -273,7 +277,7 @@ again:
 
 		goto again;
 	}
-	err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid,
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid,
 			      MSG_DONTWAIT);
 	if (err > 0)
 		err = 0;
@@ -287,6 +291,7 @@ out_nosk:
 static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 {
 	int hdrlen = sizeof(struct unix_diag_req);
+	struct net *net = sock_net(skb->sk);
 
 	if (nlmsg_len(h) < hdrlen)
 		return -EINVAL;
@@ -295,7 +300,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
 		struct netlink_dump_control c = {
 			.dump = unix_diag_dump,
 		};
-		return netlink_dump_start(sock_diag_nlsk, skb, h, &c);
+		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
 	} else
 		return unix_diag_get_exact(skb, h, nlmsg_data(h));
 }
-- 
cgit v1.2.3


From 921a1650de9eed40dd64d681aba4a4d98856f289 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Jul 2012 01:15:31 +0400
Subject: new helper: done_path_create()

releases what needs to be released after {kern,user}_path_create()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/unix/af_unix.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 641f2e47f165..e8239540683a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -887,8 +887,9 @@ out_mknod_drop_write:
 		mnt_drop_write(path.mnt);
 		if (err)
 			goto out_mknod_dput;
-		mutex_unlock(&path.dentry->d_inode->i_mutex);
-		dput(path.dentry);
+		mntget(path.mnt);
+		dget(dentry);
+		done_path_create(&path, dentry);
 		path.dentry = dentry;
 
 		addr->hash = UNIX_HASH_SIZE;
@@ -923,9 +924,7 @@ out:
 	return err;
 
 out_mknod_dput:
-	dput(dentry);
-	mutex_unlock(&path.dentry->d_inode->i_mutex);
-	path_put(&path);
+	done_path_create(&path, dentry);
 out_mknod_parent:
 	if (err == -EEXIST)
 		err = -EADDRINUSE;
-- 
cgit v1.2.3


From a8104a9fcdeb82e22d7acd55fca20746581067d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Jul 2012 02:25:00 +0400
Subject: pull mnt_want_write()/mnt_drop_write() into
 kern_path_create()/done_path_create() resp.

One side effect - attempt to create a cross-device link on a read-only fs fails
with EROFS instead of EXDEV now.  Makes more sense, POSIX allows, etc.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/unix/af_unix.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e8239540683a..88ab72820b9f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -876,15 +876,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		 */
 		mode = S_IFSOCK |
 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
-		err = mnt_want_write(path.mnt);
-		if (err)
-			goto out_mknod_dput;
 		err = security_path_mknod(&path, dentry, mode, 0);
 		if (err)
 			goto out_mknod_drop_write;
 		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 out_mknod_drop_write:
-		mnt_drop_write(path.mnt);
 		if (err)
 			goto out_mknod_dput;
 		mntget(path.mnt);
-- 
cgit v1.2.3


From faf02010290e202e275c1bf94ca9dd808bf85607 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Jul 2012 02:37:29 +0400
Subject: clean unix_bind() up a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 net/unix/af_unix.c | 88 ++++++++++++++++++++++++++----------------------------
 1 file changed, 43 insertions(+), 45 deletions(-)

(limited to 'net/unix')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 88ab72820b9f..b7667f81c49c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -814,6 +814,34 @@ fail:
 	return NULL;
 }
 
+static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+{
+	struct dentry *dentry;
+	struct path path;
+	int err = 0;
+	/*
+	 * Get the parent directory, calculate the hash for last
+	 * component.
+	 */
+	dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+	err = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		return err;
+
+	/*
+	 * All right, let's create it.
+	 */
+	err = security_path_mknod(&path, dentry, mode, 0);
+	if (!err) {
+		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+		if (!err) {
+			res->mnt = mntget(path.mnt);
+			res->dentry = dget(dentry);
+		}
+	}
+	done_path_create(&path, dentry);
+	return err;
+}
 
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
@@ -822,8 +850,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 	char *sun_path = sunaddr->sun_path;
-	struct dentry *dentry = NULL;
-	struct path path;
 	int err;
 	unsigned int hash;
 	struct unix_address *addr;
@@ -860,40 +886,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	atomic_set(&addr->refcnt, 1);
 
 	if (sun_path[0]) {
-		umode_t mode;
-		err = 0;
-		/*
-		 * Get the parent directory, calculate the hash for last
-		 * component.
-		 */
-		dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
-		err = PTR_ERR(dentry);
-		if (IS_ERR(dentry))
-			goto out_mknod_parent;
-
-		/*
-		 * All right, let's create it.
-		 */
-		mode = S_IFSOCK |
+		struct path path;
+		umode_t mode = S_IFSOCK |
 		       (SOCK_INODE(sock)->i_mode & ~current_umask());
-		err = security_path_mknod(&path, dentry, mode, 0);
-		if (err)
-			goto out_mknod_drop_write;
-		err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
-out_mknod_drop_write:
-		if (err)
-			goto out_mknod_dput;
-		mntget(path.mnt);
-		dget(dentry);
-		done_path_create(&path, dentry);
-		path.dentry = dentry;
-
+		err = unix_mknod(sun_path, mode, &path);
+		if (err) {
+			if (err == -EEXIST)
+				err = -EADDRINUSE;
+			unix_release_addr(addr);
+			goto out_up;
+		}
 		addr->hash = UNIX_HASH_SIZE;
-	}
-
-	spin_lock(&unix_table_lock);
-
-	if (!sun_path[0]) {
+		hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+		spin_lock(&unix_table_lock);
+		u->path = path;
+		list = &unix_socket_table[hash];
+	} else {
+		spin_lock(&unix_table_lock);
 		err = -EADDRINUSE;
 		if (__unix_find_socket_byname(net, sunaddr, addr_len,
 					      sk->sk_type, hash)) {
@@ -902,9 +911,6 @@ out_mknod_drop_write:
 		}
 
 		list = &unix_socket_table[addr->hash];
-	} else {
-		list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
-		u->path = path;
 	}
 
 	err = 0;
@@ -918,14 +924,6 @@ out_up:
 	mutex_unlock(&u->readlock);
 out:
 	return err;
-
-out_mknod_dput:
-	done_path_create(&path, dentry);
-out_mknod_parent:
-	if (err == -EEXIST)
-		err = -EADDRINUSE;
-	unix_release_addr(addr);
-	goto out_up;
 }
 
 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
-- 
cgit v1.2.3