From 7123aaa3a1416529ce461e98108e6b343b294643 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jun 2012 05:03:21 +0000 Subject: af_unix: speedup /proc/net/unix /proc/net/unix has quadratic behavior, and can hold unix_table_lock for a while if high number of unix sockets are alive. (90 ms for 200k sockets...) We already have a hash table, so its quite easy to use it. Problem is unbound sockets are still hashed in a single hash slot (unix_socket_table[UNIX_HASH_TABLE]) This patch also spreads unbound sockets to 256 hash slots, to speedup both /proc/net/unix and unix_diag. Time to read /proc/net/unix with 200k unix sockets : (time dd if=/proc/net/unix of=/dev/null bs=4k) before : 520 secs after : 2 secs Signed-off-by: Eric Dumazet Cc: Steven Whitehouse Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 110 +++++++++++++++++++++++++++++++---------------------- net/unix/diag.c | 6 ++- 2 files changed, 68 insertions(+), 48 deletions(-) (limited to 'net/unix') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 641f2e47f165..cf83f6b5ac91 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -115,15 +115,24 @@ #include #include -struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; +struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); DEFINE_SPINLOCK(unix_table_lock); EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; -#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) -#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE) +static struct hlist_head *unix_sockets_unbound(void *addr) +{ + unsigned long hash = (unsigned long)addr; + + hash ^= hash >> 16; + hash ^= hash >> 8; + hash %= UNIX_HASH_SIZE; + return &unix_socket_table[UNIX_HASH_SIZE + hash]; +} + +#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE) #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) @@ -645,7 +654,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) INIT_LIST_HEAD(&u->link); mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); - unix_insert_socket(unix_sockets_unbound, sk); + unix_insert_socket(unix_sockets_unbound(sk), sk); out: if (sk == NULL) atomic_long_dec(&unix_nr_socks); @@ -2239,47 +2248,58 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, } #ifdef CONFIG_PROC_FS -static struct sock *first_unix_socket(int *i) -{ - for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); - } - return NULL; -} -static struct sock *next_unix_socket(int *i, struct sock *s) -{ - struct sock *next = sk_next(s); - /* More in this chain? */ - if (next) - return next; - /* Look for next non-empty chain. */ - for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { - if (!hlist_empty(&unix_socket_table[*i])) - return __sk_head(&unix_socket_table[*i]); - } - return NULL; -} +#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) + +#define get_bucket(x) ((x) >> BUCKET_SPACE) +#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) +#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) struct unix_iter_state { struct seq_net_private p; - int i; }; -static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos) +static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { - struct unix_iter_state *iter = seq->private; - loff_t off = 0; - struct sock *s; + unsigned long offset = get_offset(*pos); + unsigned long bucket = get_bucket(*pos); + struct sock *sk; + unsigned long count = 0; - for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { - if (sock_net(s) != seq_file_net(seq)) + for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) { + if (sock_net(sk) != seq_file_net(seq)) continue; - if (off == pos) - return s; - ++off; + if (++count == offset) + break; } + + return sk; +} + +static struct sock *unix_next_socket(struct seq_file *seq, + struct sock *sk, + loff_t *pos) +{ + unsigned long bucket; + + while (sk > (struct sock *)SEQ_START_TOKEN) { + sk = sk_next(sk); + if (!sk) + goto next_bucket; + if (sock_net(sk) == seq_file_net(seq)) + return sk; + } + + do { + sk = unix_from_bucket(seq, pos); + if (sk) + return sk; + +next_bucket: + bucket = get_bucket(*pos) + 1; + *pos = set_bucket_offset(bucket, 1); + } while (bucket < ARRAY_SIZE(unix_socket_table)); + return NULL; } @@ -2287,22 +2307,20 @@ static void *unix_seq_start(struct seq_file *seq, loff_t *pos) __acquires(unix_table_lock) { spin_lock(&unix_table_lock); - return *pos ? unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN; + + if (!*pos) + return SEQ_START_TOKEN; + + if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table)) + return NULL; + + return unix_next_socket(seq, NULL, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct unix_iter_state *iter = seq->private; - struct sock *sk = v; ++*pos; - - if (v == SEQ_START_TOKEN) - sk = first_unix_socket(&iter->i); - else - sk = next_unix_socket(&iter->i, sk); - while (sk && (sock_net(sk) != seq_file_net(seq))) - sk = next_unix_socket(&iter->i, sk); - return sk; + return unix_next_socket(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) diff --git a/net/unix/diag.c b/net/unix/diag.c index 47d3002737f5..7e8a24bff34a 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -195,7 +195,9 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = s_num = cb->args[1]; spin_lock(&unix_table_lock); - for (slot = s_slot; slot <= UNIX_HASH_SIZE; s_num = 0, slot++) { + for (slot = s_slot; + slot < ARRAY_SIZE(unix_socket_table); + s_num = 0, slot++) { struct sock *sk; struct hlist_node *node; @@ -228,7 +230,7 @@ static struct sock *unix_lookup_by_ino(int ino) struct sock *sk; spin_lock(&unix_table_lock); - for (i = 0; i <= UNIX_HASH_SIZE; i++) { + for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { struct hlist_node *node; sk_for_each(sk, node, &unix_socket_table[i]) -- cgit v1.2.3 From 8b51b064a6da90c68af5385a874968829a2a0ed7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Jun 2012 22:10:20 +0000 Subject: af_unix: remove unix_iter_state As pointed out by Michael Tokarev , struct unix_iter_state is no longer needed. Suggested-by: Michael Tokarev Signed-off-by: Eric Dumazet Cc: Steven Whitehouse Cc: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/unix') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cf83f6b5ac91..79981d97bc9c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2255,10 +2255,6 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) -struct unix_iter_state { - struct seq_net_private p; -}; - static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); @@ -2383,7 +2379,7 @@ static const struct seq_operations unix_seq_ops = { static int unix_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &unix_seq_ops, - sizeof(struct unix_iter_state)); + sizeof(struct seq_net_private)); } static const struct file_operations unix_seq_fops = { -- cgit v1.2.3 From b61bb01974730e2fd7d36ab4cc848ca6f44cffd4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 Jun 2012 21:41:00 -0700 Subject: unix_diag: Move away from NLMSG_PUT(). And use nlmsg_data() while we're here too and remove useless casts. Signed-off-by: David S. Miller --- net/unix/diag.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net/unix') diff --git a/net/unix/diag.c b/net/unix/diag.c index 7e8a24bff34a..977ca317550d 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -126,10 +126,12 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r struct nlmsghdr *nlh; struct unix_diag_msg *rep; - nlh = NLMSG_PUT(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep)); + nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), 0); + if (!nlh) + goto out_nlmsg_trim; nlh->nlmsg_flags = flags; - rep = NLMSG_DATA(nlh); + rep = nlmsg_data(nlh); rep->udiag_family = AF_UNIX; rep->udiag_type = sk->sk_type; @@ -139,32 +141,32 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r if ((req->udiag_show & UDIAG_SHOW_NAME) && sk_diag_dump_name(sk, skb)) - goto nlmsg_failure; + goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_VFS) && sk_diag_dump_vfs(sk, skb)) - goto nlmsg_failure; + goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_PEER) && sk_diag_dump_peer(sk, skb)) - goto nlmsg_failure; + goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_ICONS) && sk_diag_dump_icons(sk, skb)) - goto nlmsg_failure; + goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_RQLEN) && sk_diag_show_rqlen(sk, skb)) - goto nlmsg_failure; + goto out_nlmsg_trim; if ((req->udiag_show & UDIAG_SHOW_MEMINFO) && sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO)) - goto nlmsg_failure; + goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; -nlmsg_failure: +out_nlmsg_trim: nlmsg_trim(skb, b); return -EMSGSIZE; } @@ -189,7 +191,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct unix_diag_req *req; int num, s_num, slot, s_slot; - req = NLMSG_DATA(cb->nlh); + req = nlmsg_data(cb->nlh); s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -309,7 +311,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) }; return netlink_dump_start(sock_diag_nlsk, skb, h, &c); } else - return unix_diag_get_exact(skb, h, (struct unix_diag_req *)NLMSG_DATA(h)); + return unix_diag_get_exact(skb, h, nlmsg_data(h)); } static const struct sock_diag_handler unix_diag_handler = { -- cgit v1.2.3 From 4245375db87767aacaad16f07040b5d89a9056c8 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 26 Jun 2012 23:36:10 +0000 Subject: unix_diag: Do not use RTA_PUT() macros Also, no need to trim on nlmsg_put() failure, nothing has been added yet. We also want to use nlmsg_end(), nlmsg_new() and nlmsg_free(). Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/unix/diag.c | 80 ++++++++++++++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) (limited to 'net/unix') diff --git a/net/unix/diag.c b/net/unix/diag.c index 977ca317550d..a74864eedfcd 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -8,40 +8,31 @@ #include #include -#define UNIX_DIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) - static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { struct unix_address *addr = unix_sk(sk)->addr; - char *s; - - if (addr) { - s = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short)); - memcpy(s, addr->name->sun_path, addr->len - sizeof(short)); - } - return 0; + if (!addr) + return 0; -rtattr_failure: - return -EMSGSIZE; + return nla_put(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short), + addr->name->sun_path); } static int sk_diag_dump_vfs(struct sock *sk, struct sk_buff *nlskb) { struct dentry *dentry = unix_sk(sk)->path.dentry; - struct unix_diag_vfs *uv; if (dentry) { - uv = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_VFS, sizeof(*uv)); - uv->udiag_vfs_ino = dentry->d_inode->i_ino; - uv->udiag_vfs_dev = dentry->d_sb->s_dev; + struct unix_diag_vfs uv = { + .udiag_vfs_ino = dentry->d_inode->i_ino, + .udiag_vfs_dev = dentry->d_sb->s_dev, + }; + + return nla_put(nlskb, UNIX_DIAG_VFS, sizeof(uv), &uv); } return 0; - -rtattr_failure: - return -EMSGSIZE; } static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb) @@ -56,24 +47,28 @@ static int sk_diag_dump_peer(struct sock *sk, struct sk_buff *nlskb) unix_state_unlock(peer); sock_put(peer); - RTA_PUT_U32(nlskb, UNIX_DIAG_PEER, ino); + return nla_put_u32(nlskb, UNIX_DIAG_PEER, ino); } return 0; -rtattr_failure: - return -EMSGSIZE; } static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb) { struct sk_buff *skb; + struct nlattr *attr; u32 *buf; int i; if (sk->sk_state == TCP_LISTEN) { spin_lock(&sk->sk_receive_queue.lock); - buf = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_ICONS, - sk->sk_receive_queue.qlen * sizeof(u32)); + + attr = nla_reserve(nlskb, UNIX_DIAG_ICONS, + sk->sk_receive_queue.qlen * sizeof(u32)); + if (!attr) + goto errout; + + buf = nla_data(attr); i = 0; skb_queue_walk(&sk->sk_receive_queue, skb) { struct sock *req, *peer; @@ -94,45 +89,38 @@ static int sk_diag_dump_icons(struct sock *sk, struct sk_buff *nlskb) return 0; -rtattr_failure: +errout: spin_unlock(&sk->sk_receive_queue.lock); return -EMSGSIZE; } static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) { - struct unix_diag_rqlen *rql; - - rql = UNIX_DIAG_PUT(nlskb, UNIX_DIAG_RQLEN, sizeof(*rql)); + struct unix_diag_rqlen rql; if (sk->sk_state == TCP_LISTEN) { - rql->udiag_rqueue = sk->sk_receive_queue.qlen; - rql->udiag_wqueue = sk->sk_max_ack_backlog; + rql.udiag_rqueue = sk->sk_receive_queue.qlen; + rql.udiag_wqueue = sk->sk_max_ack_backlog; } else { - rql->udiag_rqueue = (__u32)unix_inq_len(sk); - rql->udiag_wqueue = (__u32)unix_outq_len(sk); + rql.udiag_rqueue = (u32) unix_inq_len(sk); + rql.udiag_wqueue = (u32) unix_outq_len(sk); } - return 0; - -rtattr_failure: - return -EMSGSIZE; + return nla_put(nlskb, UNIX_DIAG_RQLEN, sizeof(rql), &rql); } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_req *req, u32 pid, u32 seq, u32 flags, int sk_ino) { - unsigned char *b = skb_tail_pointer(skb); struct nlmsghdr *nlh; struct unix_diag_msg *rep; - nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), 0); + nlh = nlmsg_put(skb, pid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep), + flags); if (!nlh) - goto out_nlmsg_trim; - nlh->nlmsg_flags = flags; + return -EMSGSIZE; rep = nlmsg_data(nlh); - rep->udiag_family = AF_UNIX; rep->udiag_type = sk->sk_type; rep->udiag_state = sk->sk_state; @@ -163,11 +151,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct unix_diag_r sock_diag_put_meminfo(sk, skb, UNIX_DIAG_MEMINFO)) goto out_nlmsg_trim; - nlh->nlmsg_len = skb_tail_pointer(skb) - b; - return skb->len; + return nlmsg_end(skb, nlh); out_nlmsg_trim: - nlmsg_trim(skb, b); + nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -272,15 +259,14 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, extra_len = 256; again: err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct unix_diag_msg) + extra_len)), - GFP_KERNEL); + rep = nlmsg_new(sizeof(struct unix_diag_msg) + extra_len, GFP_KERNEL); if (!rep) goto out; err = sk_diag_fill(sk, rep, req, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 0, req->udiag_ino); if (err < 0) { - kfree_skb(rep); + nlmsg_free(rep); extra_len += 256; if (extra_len >= PAGE_SIZE) goto out; -- cgit v1.2.3 From 51d7cccf07238f5236c5b9269231a30dd5f8e714 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 16 Jul 2012 04:28:49 +0000 Subject: net: make sock diag per-namespace Before this patch sock_diag works for init_net only and dumps information about sockets from all namespaces. This patch expands sock_diag for all name-spaces. It creates a netlink kernel socket for each netns and filters data during dumping. v2: filter accoding with netns in all places remove an unused variable. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Pavel Emelyanov CC: Eric Dumazet Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Andrew Vagin Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/diag.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/unix') diff --git a/net/unix/diag.c b/net/unix/diag.c index a74864eedfcd..750b13408449 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -177,6 +177,7 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct unix_diag_req *req; int num, s_num, slot, s_slot; + struct net *net = sock_net(skb->sk); req = nlmsg_data(cb->nlh); @@ -192,6 +193,8 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) num = 0; sk_for_each(sk, node, &unix_socket_table[slot]) { + if (!net_eq(sock_net(sk), net)) + continue; if (num < s_num) goto next; if (!(req->udiag_states & (1 << sk->sk_state))) @@ -243,6 +246,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, struct sock *sk; struct sk_buff *rep; unsigned int extra_len; + struct net *net = sock_net(in_skb->sk); if (req->udiag_ino == 0) goto out_nosk; @@ -273,7 +277,7 @@ again: goto again; } - err = netlink_unicast(sock_diag_nlsk, rep, NETLINK_CB(in_skb).pid, + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); if (err > 0) err = 0; @@ -287,6 +291,7 @@ out_nosk: static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct unix_diag_req); + struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; @@ -295,7 +300,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) struct netlink_dump_control c = { .dump = unix_diag_dump, }; - return netlink_dump_start(sock_diag_nlsk, skb, h, &c); + return netlink_dump_start(net->diag_nlsk, skb, h, &c); } else return unix_diag_get_exact(skb, h, nlmsg_data(h)); } -- cgit v1.2.3 From 921a1650de9eed40dd64d681aba4a4d98856f289 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 01:15:31 +0400 Subject: new helper: done_path_create() releases what needs to be released after {kern,user}_path_create() Signed-off-by: Al Viro --- net/unix/af_unix.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/unix') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 641f2e47f165..e8239540683a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -887,8 +887,9 @@ out_mknod_drop_write: mnt_drop_write(path.mnt); if (err) goto out_mknod_dput; - mutex_unlock(&path.dentry->d_inode->i_mutex); - dput(path.dentry); + mntget(path.mnt); + dget(dentry); + done_path_create(&path, dentry); path.dentry = dentry; addr->hash = UNIX_HASH_SIZE; @@ -923,9 +924,7 @@ out: return err; out_mknod_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); out_mknod_parent: if (err == -EEXIST) err = -EADDRINUSE; -- cgit v1.2.3 From a8104a9fcdeb82e22d7acd55fca20746581067d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 02:25:00 +0400 Subject: pull mnt_want_write()/mnt_drop_write() into kern_path_create()/done_path_create() resp. One side effect - attempt to create a cross-device link on a read-only fs fails with EROFS instead of EXDEV now. Makes more sense, POSIX allows, etc. Signed-off-by: Al Viro --- net/unix/af_unix.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net/unix') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e8239540683a..88ab72820b9f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -876,15 +876,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = mnt_want_write(path.mnt); - if (err) - goto out_mknod_dput; err = security_path_mknod(&path, dentry, mode, 0); if (err) goto out_mknod_drop_write; err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); out_mknod_drop_write: - mnt_drop_write(path.mnt); if (err) goto out_mknod_dput; mntget(path.mnt); -- cgit v1.2.3 From faf02010290e202e275c1bf94ca9dd808bf85607 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Jul 2012 02:37:29 +0400 Subject: clean unix_bind() up a bit Signed-off-by: Al Viro --- net/unix/af_unix.c | 88 ++++++++++++++++++++++++++---------------------------- 1 file changed, 43 insertions(+), 45 deletions(-) (limited to 'net/unix') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 88ab72820b9f..b7667f81c49c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -814,6 +814,34 @@ fail: return NULL; } +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +{ + struct dentry *dentry; + struct path path; + int err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + return err; + + /* + * All right, let's create it. + */ + err = security_path_mknod(&path, dentry, mode, 0); + if (!err) { + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + if (!err) { + res->mnt = mntget(path.mnt); + res->dentry = dget(dentry); + } + } + done_path_create(&path, dentry); + return err; +} static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { @@ -822,8 +850,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - struct dentry *dentry = NULL; - struct path path; int err; unsigned int hash; struct unix_address *addr; @@ -860,40 +886,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) atomic_set(&addr->refcnt, 1); if (sun_path[0]) { - umode_t mode; - err = 0; - /* - * Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_mknod_parent; - - /* - * All right, let's create it. - */ - mode = S_IFSOCK | + struct path path; + umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = security_path_mknod(&path, dentry, mode, 0); - if (err) - goto out_mknod_drop_write; - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); -out_mknod_drop_write: - if (err) - goto out_mknod_dput; - mntget(path.mnt); - dget(dentry); - done_path_create(&path, dentry); - path.dentry = dentry; - + err = unix_mknod(sun_path, mode, &path); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + unix_release_addr(addr); + goto out_up; + } addr->hash = UNIX_HASH_SIZE; - } - - spin_lock(&unix_table_lock); - - if (!sun_path[0]) { + hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + spin_lock(&unix_table_lock); + u->path = path; + list = &unix_socket_table[hash]; + } else { + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { @@ -902,9 +911,6 @@ out_mknod_drop_write: } list = &unix_socket_table[addr->hash]; - } else { - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->path = path; } err = 0; @@ -918,14 +924,6 @@ out_up: mutex_unlock(&u->readlock); out: return err; - -out_mknod_dput: - done_path_create(&path, dentry); -out_mknod_parent: - if (err == -EEXIST) - err = -EADDRINUSE; - unix_release_addr(addr); - goto out_up; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) -- cgit v1.2.3