From 7e767aae0ed129f6e67f5fec09fa870be452788c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 21 Nov 2013 15:41:06 +1100
Subject: xfs: growfs overruns AGFL buffer on V4 filesystems

commit f94c44573e7c22860e2c3dfe349c45f72ba35ad3 upstream.

This loop in xfs_growfs_data_private() is incorrect for V4
superblocks filesystems:

		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);

For V4 filesystems, we don't have a agfl header structure, and so
XFS_AGFL_SIZE() returns an entire sector's worth of entries, which
we then index from an offset into the sector. Hence: buffer overrun.

This problem was introduced in 3.10 by commit 77c95bba ("xfs: add
CRC checks to the AGFL") which changed the AGFL structure but failed
to update the growfs code to handle the different structures.

Fix it by using the correct offset into the buffer for both V4 and
V5 filesystems.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/xfs_fsops.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3c3644ea825b..2288db4e1784 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -216,6 +216,8 @@ xfs_growfs_data_private(
 	 */
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+		__be32	*agfl_bno;
+
 		/*
 		 * AG freespace header block
 		 */
@@ -275,8 +277,10 @@ xfs_growfs_data_private(
 			agfl->agfl_seqno = cpu_to_be32(agno);
 			uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
 		}
+
+		agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
 		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
-			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+			agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
 
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
-- 
cgit v1.2.3


From f75eb9d4085192dc58c30a9384cf4496194be851 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 31 Oct 2013 21:00:10 +0300
Subject: xfs: underflow bug in xfs_attrlist_by_handle()

commit 31978b5cc66b8ba8a7e8eef60b12395d41b7b890 upstream.

If we allocate less than sizeof(struct attrlist) then we end up
corrupting memory or doing a ZERO_PTR_SIZE dereference.

This can only be triggered with CAP_SYS_ADMIN.

Reported-by: Nico Golde <nico@ngolde.de>
Reported-by: Fabian Yamaguchi <fabs@goesec.de>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/xfs_ioctl.c   | 3 ++-
 fs/xfs/xfs_ioctl32.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ca01d830e989..83dfe6e73235 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -409,7 +409,8 @@ xfs_attrlist_by_handle(
 		return -XFS_ERROR(EPERM);
 	if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
-	if (al_hreq.buflen > XATTR_LIST_MAX)
+	if (al_hreq.buflen < sizeof(struct attrlist) ||
+	    al_hreq.buflen > XATTR_LIST_MAX)
 		return -XFS_ERROR(EINVAL);
 
 	/*
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c0c66259cc91..68799d7f02cc 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -359,7 +359,8 @@ xfs_compat_attrlist_by_handle(
 	if (copy_from_user(&al_hreq, arg,
 			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
-	if (al_hreq.buflen > XATTR_LIST_MAX)
+	if (al_hreq.buflen < sizeof(struct attrlist) ||
+	    al_hreq.buflen > XATTR_LIST_MAX)
 		return -XFS_ERROR(EINVAL);
 
 	/*
-- 
cgit v1.2.3


From fbaa929d862503b59110081efb57a40213193a6d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 2 Dec 2013 15:26:19 -0500
Subject: nfsd: when reusing an existing repcache entry, unhash it first

commit 781c2a5a5f75eacc04663aced0f0f1a648d4f308 upstream.

The DRC code will attempt to reuse an existing, expired cache entry in
preference to allocating a new one. It'll then search the cache, and if
it gets a hit it'll then free the cache entry that it was going to
reuse.

The cache code doesn't unhash the entry that it's going to reuse
however, so it's possible for it end up designating an entry for reuse
and then subsequently freeing the same entry after it finds it.  This
leads it to a later use-after-free situation and usually some list
corruption warnings or an oops.

Fix this by simply unhashing the entry that we intend to reuse. That
will mean that it's not findable via a search and should prevent this
situation from occurring.

Reported-by: Christoph Hellwig <hch@infradead.org>
Reported-by: g. artim <gartim@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfscache.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index e76244edd748..ec8d97ddc635 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -128,6 +128,13 @@ nfsd_reply_cache_alloc(void)
 	return rp;
 }
 
+static void
+nfsd_reply_cache_unhash(struct svc_cacherep *rp)
+{
+	hlist_del_init(&rp->c_hash);
+	list_del_init(&rp->c_lru);
+}
+
 static void
 nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
 {
@@ -403,7 +410,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 		rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
 		if (nfsd_cache_entry_expired(rp) ||
 		    num_drc_entries >= max_drc_entries) {
-			lru_put_end(rp);
+			nfsd_reply_cache_unhash(rp);
 			prune_cache_entries();
 			goto search_cache;
 		}
-- 
cgit v1.2.3


From 6b047827d4cdd57ac7c4f9da2d779d811649566f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 10 Jan 2013 03:57:25 -0500
Subject: Btrfs: fix access_ok() check in btrfs_ioctl_send()

commit 700ff4f095d78af0998953e922e041d75254518b upstream.

The closing parenthesis is in the wrong place.  We want to check
"sizeof(*arg->clone_sources) * arg->clone_sources_count" instead of
"sizeof(*arg->clone_sources * arg->clone_sources_count)".

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Jie Liu <jeff.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/send.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 09ea0bdde65f..256a9a46d544 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4623,8 +4623,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	}
 
 	if (!access_ok(VERIFY_READ, arg->clone_sources,
-			sizeof(*arg->clone_sources *
-			arg->clone_sources_count))) {
+			sizeof(*arg->clone_sources) *
+			arg->clone_sources_count)) {
 		ret = -EFAULT;
 		goto out;
 	}
-- 
cgit v1.2.3


From 89ec75229a591cd7a3f57c330dc0791b8646b112 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Fri, 6 Dec 2013 17:51:32 +0100
Subject: btrfs: call mnt_drop_write after interrupted subvol deletion

commit e43f998e47bae27e37e159915625e8d4b130153b upstream.

If btrfs_ioctl_snap_destroy blocks on the mutex and the process is
killed, mnt_write count is unbalanced and leads to unmountable
filesystem.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8dedf4019672..145b2c75ab83 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2093,7 +2093,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
 	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
 	if (err == -EINTR)
-		goto out;
+		goto out_drop_write;
 	dentry = lookup_one_len(vol_args->name, parent, namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -2235,6 +2235,7 @@ out_dput:
 	dput(dentry);
 out_unlock_dir:
 	mutex_unlock(&dir->i_mutex);
+out_drop_write:
 	mnt_drop_write_file(file);
 out:
 	kfree(vol_args);
-- 
cgit v1.2.3


From 0f8285ad902ab61b7631f059f0ffdf7a23b0601f Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Mon, 2 Dec 2013 19:59:31 +0100
Subject: nfs: fix do_div() warning by instead using sector_div()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 3873d064b8538686bbbd4b858dc8a07db1f7f43a upstream.

When compiling a 32bit kernel with CONFIG_LBDAF=n the compiler complains like
shown below.  Fix this warning by instead using sector_div() which is provided
by the kernel.h header file.

fs/nfs/blocklayout/extents.c: In function ‘normalize’:
include/asm-generic/div64.h:43:28: warning: comparison of distinct pointer types lacks a cast [enabled by default]
fs/nfs/blocklayout/extents.c:47:13: note: in expansion of macro ‘do_div’
nfs/blocklayout/extents.c:47:2: warning: right shift count >= width of type [enabled by default]
fs/nfs/blocklayout/extents.c:47:2: warning: passing argument 1 of ‘__div64_32’ from incompatible pointer type [enabled by default]
include/asm-generic/div64.h:35:17: note: expected ‘uint64_t *’ but argument is of type ‘sector_t *’
 extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);

Signed-off-by: Helge Deller <deller@gmx.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/blocklayout/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 9c3e117c3ed1..4d0161442565 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -44,7 +44,7 @@
 static inline sector_t normalize(sector_t s, int base)
 {
 	sector_t tmp = s; /* Since do_div modifies its argument */
-	return s - do_div(tmp, base);
+	return s - sector_div(tmp, base);
 }
 
 static inline sector_t normalize_up(sector_t s, int base)
-- 
cgit v1.2.3


From 39be1c3dd03ee291a006189fe1cf43a5322a6eaf Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Fri, 15 Nov 2013 16:36:16 -0500
Subject: NFSv4 wait on recovery for async session errors

commit 4a82fd7c4e78a1b7a224f9ae8bb7e1fd95f670e0 upstream.

When the state manager is processing the NFS4CLNT_DELEGRETURN flag, session
draining is off, but DELEGRETURN can still get a session error.
The async handler calls nfs4_schedule_session_recovery returns -EAGAIN, and
the DELEGRETURN done then restarts the RPC task in the prepare state.
With the state manager still processing the NFS4CLNT_DELEGRETURN flag with
session draining off, these DELEGRETURNs will cycle with errors filling up the
session slots.

This prevents OPEN reclaims (from nfs_delegation_claim_opens) required by the
NFS4CLNT_DELEGRETURN state manager processing from completing, hanging the
state manager in the __rpc_wait_for_completion_task in nfs4_run_open_task
as seen in this kernel thread dump:

kernel: 4.12.32.53-ma D 0000000000000000     0  3393      2 0x00000000
kernel: ffff88013995fb60 0000000000000046 ffff880138cc5400 ffff88013a9df140
kernel: ffff8800000265c0 ffffffff8116eef0 ffff88013fc10080 0000000300000001
kernel: ffff88013a4ad058 ffff88013995ffd8 000000000000fbc8 ffff88013a4ad058
kernel: Call Trace:
kernel: [<ffffffff8116eef0>] ? cache_alloc_refill+0x1c0/0x240
kernel: [<ffffffffa0358110>] ? rpc_wait_bit_killable+0x0/0xa0 [sunrpc]
kernel: [<ffffffffa0358152>] rpc_wait_bit_killable+0x42/0xa0 [sunrpc]
kernel: [<ffffffff8152914f>] __wait_on_bit+0x5f/0x90
kernel: [<ffffffffa0358110>] ? rpc_wait_bit_killable+0x0/0xa0 [sunrpc]
kernel: [<ffffffff815291f8>] out_of_line_wait_on_bit+0x78/0x90
kernel: [<ffffffff8109b520>] ? wake_bit_function+0x0/0x50
kernel: [<ffffffffa035810d>] __rpc_wait_for_completion_task+0x2d/0x30 [sunrpc]
kernel: [<ffffffffa040d44c>] nfs4_run_open_task+0x11c/0x160 [nfs]
kernel: [<ffffffffa04114e7>] nfs4_open_recover_helper+0x87/0x120 [nfs]
kernel: [<ffffffffa0411646>] nfs4_open_recover+0xc6/0x150 [nfs]
kernel: [<ffffffffa040cc6f>] ? nfs4_open_recoverdata_alloc+0x2f/0x60 [nfs]
kernel: [<ffffffffa0414e1a>] nfs4_open_delegation_recall+0x6a/0xa0 [nfs]
kernel: [<ffffffffa0424020>] nfs_end_delegation_return+0x120/0x2e0 [nfs]
kernel: [<ffffffff8109580f>] ? queue_work+0x1f/0x30
kernel: [<ffffffffa0424347>] nfs_client_return_marked_delegations+0xd7/0x110 [nfs]
kernel: [<ffffffffa04225d8>] nfs4_run_state_manager+0x548/0x620 [nfs]
kernel: [<ffffffffa0422090>] ? nfs4_run_state_manager+0x0/0x620 [nfs]
kernel: [<ffffffff8109b0f6>] kthread+0x96/0xa0
kernel: [<ffffffff8100c20a>] child_rip+0xa/0x20
kernel: [<ffffffff8109b060>] ? kthread+0x0/0xa0
kernel: [<ffffffff8100c200>] ? child_rip+0x0/0x20

The state manager can not therefore process the DELEGRETURN session errors.
Change the async handler to wait for recovery on session errors.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 23881380dc6f..75e49d42a7fb 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4222,8 +4222,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
 			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
-			task->tk_status = 0;
-			return -EAGAIN;
+			goto wait_on_recovery;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
-- 
cgit v1.2.3


From fb5834ff2b2823e1eaecd920e2ade3e07f4a42f9 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Sun, 29 Sep 2013 10:33:16 +0800
Subject: Btrfs: fix memory leak of chunks' extent map

commit 7d3d1744f8a7d62e4875bd69cc2192a939813880 upstream.

As we're hold a ref on looking up the extent map, we need to drop the ref
before returning to callers.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bffb9174afb..b6c23c4abae2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4248,6 +4248,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got "
 			    "%Lu-%Lu\n", logical, logical+len, em->start,
 			    em->start + em->len);
+		free_extent_map(em);
 		return 1;
 	}
 
@@ -4429,6 +4430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
 			   "found %Lu-%Lu\n", logical, em->start,
 			   em->start + em->len);
+		free_extent_map(em);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From b395193ecf087a36e98b97c396f839b3fe9a4d19 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fusionio.com>
Date: Mon, 14 Oct 2013 17:23:08 -0400
Subject: Btrfs: fix hole check in log_one_extent

commit ed9e8af88e2551aaa6bf51d8063a2493e2d71597 upstream.

I added an assert to make sure we were looking up aligned offsets for csums and
I tripped it when running xfstests.  This is because log_one_extent was checking
if block_start == 0 for a hole instead of EXTENT_MAP_HOLE.  This worked out fine
in practice it seems, but it adds a lot of extra work that is uneeded.  With
this fix I'm no longer tripping my assert.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf68596b51fb..bca436330681 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3314,7 +3314,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		btrfs_set_token_file_extent_type(leaf, fi,
 						 BTRFS_FILE_EXTENT_REG,
 						 &token);
-		if (em->block_start == 0)
+		if (em->block_start == EXTENT_MAP_HOLE)
 			skip_csum = true;
 	}
 
-- 
cgit v1.2.3


From 9436cf971ed1aa296e6ca36489f5240d3868ff84 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana <fdmanana@gmail.com>
Date: Tue, 15 Oct 2013 18:44:00 +0100
Subject: Btrfs: fix incorrect inode acl reset

commit 8185554d3eb09d23a805456b6fa98dcbb34aa518 upstream.

When a directory has a default ACL and a subdirectory is created
under that directory, btrfs_init_acl() is called when the
subdirectory's inode is created to initialize the inode's ACL
(inherited from the parent directory) but it was clearing the ACL
from the inode after setting it if posix_acl_create() returned
success, instead of clearing it only if it returned an error.

To reproduce this issue:

$ mkfs.btrfs -f /dev/loop0
$ mount /dev/loop0 /mnt
$ mkdir /mnt/acl
$ setfacl -d --set u::rwx,g::rwx,o::- /mnt/acl
$ getfacl /mnt/acl
user::rwx
group::rwx
other::r-x
default:user::rwx
default:group::rwx
default:other::---

$ mkdir /mnt/acl/dir1
$ getfacl /mnt/acl/dir1
user::rwx
group::rwx
other::---

After unmounting and mounting again the filesystem, fgetacl returned the
expected ACL:

$ umount /mnt/acl
$ mount /dev/loop0 /mnt
$ getfacl /mnt/acl/dir1
user::rwx
group::rwx
other::---
default:user::rwx
default:group::rwx
default:other::---

Meaning that the underlying xattr was persisted.

Reported-by: Giuseppe Fierro <giuseppe@fierro.org>
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/acl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index e15d2b0d8d3b..0890c83643e9 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -229,7 +229,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
-		} else {
+		} else if (ret < 0) {
 			cache_no_acl(inode);
 		}
 	} else {
-- 
cgit v1.2.3


From f5749e3720c038fe92dc9ffb5e6fe7b4d8809ea1 Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.li.liu@oracle.com>
Date: Tue, 29 Oct 2013 10:45:05 +0800
Subject: Btrfs: do not run snapshot-aware defragment on error

commit 6f519564d7d978c00351d9ab6abac3deeac31621 upstream.

If something wrong happens in write endio, running snapshot-aware defragment
can end up with undefined results, maybe a crash, so we should avoid it.

In order to share similar code, this also adds a helper to free the struct for
snapshot-aware defrag.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e2288dc5346..0bcee78cde16 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2419,10 +2419,23 @@ out_unlock:
 	return ret;
 }
 
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+	struct old_sa_defrag_extent *old, *tmp;
+
+	if (!new)
+		return;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+	kfree(new);
+}
+
 static void relink_file_extents(struct new_sa_defrag_extent *new)
 {
 	struct btrfs_path *path;
-	struct old_sa_defrag_extent *old, *tmp;
 	struct sa_defrag_extent_backref *backref;
 	struct sa_defrag_extent_backref *prev = NULL;
 	struct inode *inode;
@@ -2465,16 +2478,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
 	kfree(prev);
 
 	btrfs_free_path(path);
-
-	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
-		kfree(old);
-	}
 out:
+	free_sa_defrag_extent(new);
+
 	atomic_dec(&root->fs_info->defrag_running);
 	wake_up(&root->fs_info->transaction_wait);
-
-	kfree(new);
 }
 
 static struct new_sa_defrag_extent *
@@ -2484,7 +2492,7 @@ record_old_file_extents(struct inode *inode,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct old_sa_defrag_extent *old, *tmp;
+	struct old_sa_defrag_extent *old;
 	struct new_sa_defrag_extent *new;
 	int ret;
 
@@ -2532,7 +2540,7 @@ record_old_file_extents(struct inode *inode,
 		if (slot >= btrfs_header_nritems(l)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
-				goto out_free_list;
+				goto out_free_path;
 			else if (ret > 0)
 				break;
 			continue;
@@ -2561,7 +2569,7 @@ record_old_file_extents(struct inode *inode,
 
 		old = kmalloc(sizeof(*old), GFP_NOFS);
 		if (!old)
-			goto out_free_list;
+			goto out_free_path;
 
 		offset = max(new->file_pos, key.offset);
 		end = min(new->file_pos + new->len, key.offset + num_bytes);
@@ -2583,15 +2591,10 @@ next:
 
 	return new;
 
-out_free_list:
-	list_for_each_entry_safe(old, tmp, &new->head, list) {
-		list_del(&old->list);
-		kfree(old);
-	}
 out_free_path:
 	btrfs_free_path(path);
 out_kfree:
-	kfree(new);
+	free_sa_defrag_extent(new);
 	return NULL;
 }
 
@@ -2743,8 +2746,14 @@ out:
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
 	/* for snapshot-aware defrag */
-	if (new)
-		relink_file_extents(new);
+	if (new) {
+		if (ret) {
+			free_sa_defrag_extent(new);
+			atomic_dec(&root->fs_info->defrag_running);
+		} else {
+			relink_file_extents(new);
+		}
+	}
 
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
-- 
cgit v1.2.3


From 2c460e835ceecda4b327aacb6423b54f9a89042f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Thu, 26 Sep 2013 14:25:36 +0800
Subject: ceph: cleanup aborted requests when re-sending requests.

commit eb1b8af33c2e42a9a57fc0a7588f4a7b255d2e79 upstream.

Aborted requests usually get cleared when the reply is received.
If MDS crashes, no reply will be received. So we need to cleanup
aborted requests when re-sending requests.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mds_client.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4d2920304be8..0af8e298d5b9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1840,8 +1840,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
 	int mds = -1;
 	int err = -EAGAIN;
 
-	if (req->r_err || req->r_got_result)
+	if (req->r_err || req->r_got_result) {
+		if (req->r_aborted)
+			__unregister_request(mdsc, req);
 		goto out;
+	}
 
 	if (req->r_timeout &&
 	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
-- 
cgit v1.2.3


From d58dd25632be5ec0250630ccddfea7e9bd913ecb Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Thu, 31 Oct 2013 09:10:47 +0800
Subject: ceph: wake up 'safe' waiters when unregistering request

commit fc55d2c9448b34218ca58733a6f51fbede09575b upstream.

We also need to wake up 'safe' waiters if error occurs or request
aborted. Otherwise sync(2)/fsync(2) may hang forever.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mds_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0af8e298d5b9..1ce14c18c468 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -639,6 +639,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		req->r_unsafe_dir = NULL;
 	}
 
+	complete_all(&req->r_safe_completion);
+
 	ceph_mdsc_put_request(req);
 }
 
@@ -2154,7 +2156,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	if (head->safe) {
 		req->r_got_safe = true;
 		__unregister_request(mdsc, req);
-		complete_all(&req->r_safe_completion);
 
 		if (req->r_got_unsafe) {
 			/*
-- 
cgit v1.2.3


From 6b8588219a59ae34f99e20f17bc9756f831a9b2e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 2 Dec 2013 09:31:36 -0500
Subject: ext4: call ext4_error_inode() if jbd2_journal_dirty_metadata() fails

commit ae1495b12df1897d4f42842a7aa7276d920f6290 upstream.

While it's true that errors can only happen if there is a bug in
jbd2_journal_dirty_metadata(), if a bug does happen, we need to halt
the kernel or remount the file system read-only in order to avoid
further data loss.  The ext4_journal_abort_handle() function doesn't
do any of this, and while it's likely that this call (since it doesn't
adjust refcounts) will likely result in the file system eventually
deadlocking since the current transaction will never be able to close,
it's much cleaner to call let ext4's error handling system deal with
this situation.

There's a separate bug here which is that if certain jbd2 errors
errors occur and file system is mounted errors=continue, the file
system will probably eventually end grind to a halt as described
above.  But things have been this way in a long time, and usually when
we have these sorts of errors it's pretty much a disaster --- and
that's why the jbd2 layer aggressively retries memory allocations,
which is the most likely cause of these jbd2 errors.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4_jbd2.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 1c88061da526..1be3996b5942 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -223,6 +223,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 		if (WARN_ON_ONCE(err)) {
 			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
+			ext4_error_inode(inode, where, line,
+					 bh->b_blocknr,
+					 "journal_dirty_metadata failed: "
+					 "handle type %u started at line %u, "
+					 "credits %u/%u, errcode %d",
+					 handle->h_type,
+					 handle->h_line_no,
+					 handle->h_requested_credits,
+					 handle->h_buffer_credits, err);
 		}
 	} else {
 		if (inode)
-- 
cgit v1.2.3


From e696abfcc89a087afef75ebca0848f37fb9877ae Mon Sep 17 00:00:00 2001
From: Junho Ryu <jayr@google.com>
Date: Tue, 3 Dec 2013 18:10:28 -0500
Subject: ext4: fix use-after-free in ext4_mb_new_blocks

commit 4e8d2139802ce4f41936a687f06c560b12115247 upstream.

ext4_mb_put_pa should hold pa->pa_lock before accessing pa->pa_count.
While ext4_mb_use_preallocated checks pa->pa_deleted first and then
increments pa->count later, ext4_mb_put_pa decrements pa->pa_count
before holding pa->pa_lock and then sets pa->pa_deleted.

* Free sequence
ext4_mb_put_pa (1):		atomic_dec_and_test pa->pa_count
ext4_mb_put_pa (2):		lock pa->pa_lock
ext4_mb_put_pa (3):			check pa->pa_deleted
ext4_mb_put_pa (4):			set pa->pa_deleted=1
ext4_mb_put_pa (5):		unlock pa->pa_lock
ext4_mb_put_pa (6):		remove pa from a list
ext4_mb_pa_callback:		free pa

* Use sequence
ext4_mb_use_preallocated (1):	iterate over preallocation
ext4_mb_use_preallocated (2):	lock pa->pa_lock
ext4_mb_use_preallocated (3):		check pa->pa_deleted
ext4_mb_use_preallocated (4):		increase pa->pa_count
ext4_mb_use_preallocated (5):	unlock pa->pa_lock
ext4_mb_release_context:	access pa

* Use-after-free sequence
[initial status]		<pa->pa_deleted = 0, pa_count = 1>
ext4_mb_use_preallocated (1):	iterate over preallocation
ext4_mb_use_preallocated (2):	lock pa->pa_lock
ext4_mb_use_preallocated (3):		check pa->pa_deleted
ext4_mb_put_pa (1):		atomic_dec_and_test pa->pa_count
[pa_count decremented]		<pa->pa_deleted = 0, pa_count = 0>
ext4_mb_use_preallocated (4):		increase pa->pa_count
[pa_count incremented]		<pa->pa_deleted = 0, pa_count = 1>
ext4_mb_use_preallocated (5):	unlock pa->pa_lock
ext4_mb_put_pa (2):		lock pa->pa_lock
ext4_mb_put_pa (3):			check pa->pa_deleted
ext4_mb_put_pa (4):			set pa->pa_deleted=1
[race condition!]		<pa->pa_deleted = 1, pa_count = 1>
ext4_mb_put_pa (5):		unlock pa->pa_lock
ext4_mb_put_pa (6):		remove pa from a list
ext4_mb_pa_callback:		free pa
ext4_mb_release_context:	access pa

AddressSanitizer has detected use-after-free in ext4_mb_new_blocks
Bug report: http://goo.gl/rG1On3

Signed-off-by: Junho Ryu <jayr@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/mballoc.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 59c6750b894f..c1f58e0f26c3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3423,6 +3423,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 {
 	struct ext4_prealloc_space *pa;
 	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+	BUG_ON(atomic_read(&pa->pa_count));
+	BUG_ON(pa->pa_deleted == 0);
 	kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
@@ -3436,11 +3439,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	ext4_group_t grp;
 	ext4_fsblk_t grp_blk;
 
-	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
-		return;
-
 	/* in this short window concurrent discard can set pa_deleted */
 	spin_lock(&pa->pa_lock);
+	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+
 	if (pa->pa_deleted == 1) {
 		spin_unlock(&pa->pa_lock);
 		return;
-- 
cgit v1.2.3


From ae21dda05193c441bde106a4bbf88c185a68fbed Mon Sep 17 00:00:00 2001
From: Eryu Guan <guaneryu@gmail.com>
Date: Tue, 3 Dec 2013 21:22:21 -0500
Subject: ext4: check for overlapping extents in ext4_valid_extent_entries()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 5946d089379a35dda0e531710b48fca05446a196 upstream.

A corrupted ext4 may have out of order leaf extents, i.e.

extent: lblk 0--1023, len 1024, pblk 9217, flags: LEAF UNINIT
extent: lblk 1000--2047, len 1024, pblk 10241, flags: LEAF UNINIT
             ^^^^ overlap with previous extent

Reading such extent could hit BUG_ON() in ext4_es_cache_extent().

	BUG_ON(end < lblk);

The problem is that __read_extent_tree_block() tries to cache holes as
well but assumes 'lblk' is greater than 'prev' and passes underflowed
length to ext4_es_cache_extent(). Fix it by checking for overlapping
extents in ext4_valid_extent_entries().

I hit this when fuzz testing ext4, and am able to reproduce it by
modifying the on-disk extent by hand.

Also add the check for (ee_block + len - 1) in ext4_valid_extent() to
make sure the value is not overflow.

Ran xfstests on patched ext4 and no regression.

Cc: Lukáš Czerner <lczerner@redhat.com>
Signed-off-by: Eryu Guan <guaneryu@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/extents.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index dc1e03047226..d480d49d2696 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
+	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+	ext4_lblk_t last = lblock + len - 1;
 
-	if (len == 0)
+	if (lblock > last)
 		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
@@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+		ext4_fsblk_t pblock = 0;
+		ext4_lblk_t lblock = 0;
+		ext4_lblk_t prev = 0;
+		int len = 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
+
+			/* Check for overlapping extents */
+			lblock = le32_to_cpu(ext->ee_block);
+			len = ext4_ext_get_actual_len(ext);
+			if ((lblock <= prev) && prev) {
+				pblock = ext4_ext_pblock(ext);
+				es->s_last_error_block = cpu_to_le64(pblock);
+				return 0;
+			}
 			ext++;
 			entries--;
+			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
-- 
cgit v1.2.3


From b35e443037c2e162a015a5f46009e8c21e673b63 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 8 Dec 2013 21:11:59 -0500
Subject: ext4: Do not reserve clusters when fs doesn't support extents

commit 30fac0f75da24dd5bb43c9e911d2039a984ac815 upstream.

When the filesystem doesn't support extents (like in ext2/3
compatibility modes), there is no need to reserve any clusters. Space
estimates for writing are exact, hole punching doesn't need new
metadata, and there are no unwritten extents to convert.

This fixes a problem when filesystem still having some free space when
accessed with a native ext2/3 driver suddently reports ENOSPC when
accessed with ext4 driver.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3f7c39e6d097..e4923b6a9e39 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3213,10 +3213,18 @@ int ext4_calculate_overhead(struct super_block *sb)
 }
 
 
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
 
+	/*
+	 * There's no need to reserve anything when we aren't using extents.
+	 * The space estimates are exact, there are no unwritten extents,
+	 * hole punching doesn't need new metadata... This is needed especially
+	 * to keep ext2/3 backward compatibility.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		return 0;
 	/*
 	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
 	 * This should cover the situations where we can not afford to run
@@ -3225,7 +3233,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
 	 * allocation would require 1, or 2 blocks, higher numbers are
 	 * very rare.
 	 */
-	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+			EXT4_SB(sb)->s_cluster_bits;
 
 	do_div(resv_clusters, 50);
 	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
@@ -3969,10 +3978,10 @@ no_journal:
 			 "available");
 	}
 
-	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
-			 "reserved pool", ext4_calculate_resv_clusters(sbi));
+			 "reserved pool", ext4_calculate_resv_clusters(sb));
 		goto failed_mount4a;
 	}
 
-- 
cgit v1.2.3


From ad403b48be4d1777fdfb65966cefee9e2d8a8bcd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 18 Dec 2013 00:44:44 -0500
Subject: ext4: fix deadlock when writing in ENOSPC conditions

commit 34cf865d54813aab3497838132fb1bbd293f4054 upstream.

Akira-san has been reporting rare deadlocks of his machine when running
xfstests test 269 on ext4 filesystem. The problem turned out to be in
ext4_da_reserve_metadata() and ext4_da_reserve_space() which called
ext4_should_retry_alloc() while holding i_data_sem. Since
ext4_should_retry_alloc() can force a transaction commit, this is a
lock ordering violation and leads to deadlocks.

Fix the problem by just removing the retry loops. These functions should
just report ENOSPC to the caller (e.g. ext4_da_write_begin()) and that
function must take care of retrying after dropping all necessary locks.

Reported-and-tested-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Reviewed-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 904ca1a21dce..cb2bdc7ccb05 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1263,7 +1263,6 @@ static int ext4_journalled_write_end(struct file *file,
  */
 static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1275,7 +1274,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1295,10 +1293,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		return -ENOSPC;
 	}
 	ei->i_reserved_meta_blocks += md_needed;
@@ -1312,7 +1306,6 @@ repeat:
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1334,7 +1327,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1354,10 +1346,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		return -ENOSPC;
 	}
-- 
cgit v1.2.3


From cd7442730bdf3c8ecdd9a0ae8d77750b359fe60e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 20 Dec 2013 09:29:35 -0500
Subject: ext4: add explicit casts when masking cluster sizes

commit f5a44db5d2d677dfbf12deee461f85e9ec633961 upstream.

The missing casts can cause the high 64-bits of the physical blocks to
be lost.  Set up new macros which allows us to make sure the right
thing happen, even if at some point we end up supporting larger
logical block numbers.

Thanks to the Emese Revfy and the PaX security team for reporting this
issue.

Reported-by: PaX Team <pageexec@freemail.hu>
Reported-by: Emese Revfy <re.emese@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4.h    | 10 ++++++++++
 fs/ext4/extents.c | 24 +++++++++++-------------
 fs/ext4/mballoc.c |  6 +++---
 3 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5aae3d12d400..7bb2e2e55123 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -280,6 +280,16 @@ struct ext4_io_submit {
 /* Translate # of blks to # of clusters */
 #define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
 				 (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &				\
+				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\
+				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\
+				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) &				\
+				 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
 
 /*
  * Structure of a blocks group descriptor
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d480d49d2696..ecabf00829e2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1772,8 +1772,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 	depth = ext_depth(inode);
 	if (!path[depth].p_ext)
 		goto out;
-	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
-	b2 &= ~(sbi->s_cluster_ratio - 1);
+	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
 
 	/*
 	 * get the next allocated block if the extent in the path
@@ -1783,7 +1782,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 		b2 = ext4_ext_next_allocated_block(path);
 		if (b2 == EXT_MAX_BLOCKS)
 			goto out;
-		b2 &= ~(sbi->s_cluster_ratio - 1);
+		b2 = EXT4_LBLK_CMASK(sbi, b2);
 	}
 
 	/* check for wrap through zero on extent logical start block*/
@@ -2444,7 +2443,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		 * truncate operation has removed all of the blocks in
 		 * the cluster.
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
+		if (EXT4_PBLK_COFF(sbi, pblk) &&
 		    (ee_len == num))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
 		else
@@ -3675,7 +3674,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_lblk_t lblk_start, lblk_end;
-	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
 	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 
 	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
@@ -3734,9 +3733,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
 
 	/* Check towards left side */
-	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
 	if (c_offset) {
-		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
 		lblk_to = lblk_from + c_offset - 1;
 
 		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
@@ -3744,7 +3743,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	}
 
 	/* Now check towards right. */
-	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
 	if (allocated_clusters && c_offset) {
 		lblk_from = lblk_start + num_blks;
 		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
@@ -3952,7 +3951,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 				     struct ext4_ext_path *path)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
 	ext4_lblk_t rr_cluster_start;
 	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
@@ -3970,8 +3969,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 	    (rr_cluster_start == ex_cluster_start)) {
 		if (rr_cluster_start == ex_cluster_end)
 			ee_start += ee_len - 1;
-		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
-			c_offset;
+		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
 		map->m_len = min(map->m_len,
 				 (unsigned) sbi->s_cluster_ratio - c_offset);
 		/*
@@ -4125,7 +4123,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	cluster_offset = EXT4_LBLK_CMASK(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
@@ -4193,7 +4191,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * needed so that future calls to get_implied_cluster_alloc()
 	 * work correctly.
 	 */
-	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
 	ar.goal -= offset;
 	ar.logical -= offset;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c1f58e0f26c3..4c0e3e023fc8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4107,7 +4107,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
 
 	/* set up allocation goals */
-	ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
 	ac->ac_status = AC_STATUS_CONTINUE;
 	ac->ac_sb = sb;
 	ac->ac_inode = ar->inode;
@@ -4644,7 +4644,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	 * blocks at the beginning or the end unless we are explicitly
 	 * requested to avoid doing so.
 	 */
-	overflow = block & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_PBLK_COFF(sbi, block);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
 			overflow = sbi->s_cluster_ratio - overflow;
@@ -4658,7 +4658,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			count += overflow;
 		}
 	}
-	overflow = count & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_LBLK_COFF(sbi, count);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
 			if (count > overflow)
-- 
cgit v1.2.3


From 8164ddf1f6f9fca6020143a259a9f9667d53c1e4 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Wed, 30 Oct 2013 11:10:52 -0400
Subject: ext4: fix FITRIM in no journal mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 8f9ff189205a6817aee5a1f996f876541f86e07c upstream.

When using FITRIM ioctl on a file system without journal it will
only trim the block group once, no matter how many times you invoke
FITRIM ioctl and how many block you release from the block group.

It is because we only clear EXT4_GROUP_INFO_WAS_TRIMMED_BIT in journal
callback. Fix this by clearing the bit in no journal mode as well.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reported-by: Jorge Fábregas <jorge.fabregas@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/mballoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4c0e3e023fc8..fba960ee26de 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4771,8 +4771,8 @@ do_more:
 					 " group:%d block:%d count:%lu failed"
 					 " with %d", block_group, bit, count,
 					 err);
-		}
-
+		} else
+			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
-- 
cgit v1.2.3


From 88285f766c96205ccf30afd3d0621401701ff26c Mon Sep 17 00:00:00 2001
From: Li Wang <liwang@ubuntukylin.com>
Date: Wed, 13 Nov 2013 15:22:14 +0800
Subject: ceph: Avoid data inconsistency due to d-cache aliasing in readpage()

commit 56f91aad69444d650237295f68c195b74d888d95 upstream.

If the length of data to be read in readpage() is exactly
PAGE_CACHE_SIZE, the original code does not flush d-cache
for data consistency after finishing reading. This patches fixes
this.

Signed-off-by: Li Wang <liwang@ubuntukylin.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/addr.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 3e68ac101040..5da06f020986 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -213,9 +213,13 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 	if (err < 0) {
 		SetPageError(page);
 		goto out;
-	} else if (err < PAGE_CACHE_SIZE) {
+	} else {
+		if (err < PAGE_CACHE_SIZE) {
 		/* zero fill remainder of page */
-		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+			zero_user_segment(page, err, PAGE_CACHE_SIZE);
+		} else {
+			flush_dcache_page(page);
+		}
 	}
 	SetPageUptodate(page);
 
-- 
cgit v1.2.3


From 21b6291b88c534d8c6018ad483e232abab63f644 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 6 Dec 2013 11:52:34 +0000
Subject: GFS2: don't hold s_umount over blkdev_put

commit dfe5b9ad83a63180f358b27d1018649a27b394a9 upstream.

This is a GFS2 version of Tejun's patch:
4f331f01b9c43bf001d3ffee578a97a1e0633eac
vfs: don't hold s_umount over close_bdev_exclusive() call

In this case its blkdev_put itself that is the issue and this
patch uses the same solution of dropping and retaking s_umount.

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/ops_fstype.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 60ede2a0f43f..f7dd3b4f8ab0 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1317,8 +1317,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(s))
 		goto error_bdev;
 
-	if (s->s_root)
+	if (s->s_root) {
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
+		 */
+		up_write(&s->s_umount);
 		blkdev_put(bdev, mode);
+		down_write(&s->s_umount);
+	}
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
-- 
cgit v1.2.3


From 8553459e73c6da3e5b9da9239dd8ef017181252a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 18 Dec 2013 14:14:52 +0000
Subject: GFS2: Fix incorrect invalidation for DIO/buffered I/O

commit dfd11184d894cd0a92397b25cac18831a1a6a5bc upstream.

In patch 209806aba9d540dde3db0a5ce72307f85f33468f we allowed
local deferred locks to be granted against a cached exclusive
lock. That opened up a corner case which this patch now
fixes.

The solution to the problem is to check whether we have cached
pages each time we do direct I/O and if so to unmap, flush
and invalidate those pages. Since the glock state machine
normally does that for us, mostly the code will be a no-op.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/aops.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0bad69ed6336..76251600cbea 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -999,6 +999,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int rv;
@@ -1019,6 +1020,35 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	if (rv != 1)
 		goto out; /* dio not valid, fall back to buffered i/o */
 
+	/*
+	 * Now since we are holding a deferred (CW) lock at this point, you
+	 * might be wondering why this is ever needed. There is a case however
+	 * where we've granted a deferred local lock against a cached exclusive
+	 * glock. That is ok provided all granted local locks are deferred, but
+	 * it also means that it is possible to encounter pages which are
+	 * cached and possibly also mapped. So here we check for that and sort
+	 * them out ahead of the dio. The glock state machine will take care of
+	 * everything else.
+	 *
+	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
+	 * the first place, mapping->nr_pages will always be zero.
+	 */
+	if (mapping->nrpages) {
+		loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+		loff_t len = iov_length(iov, nr_segs);
+		loff_t end = PAGE_ALIGN(offset + len) - 1;
+
+		rv = 0;
+		if (len == 0)
+			goto out;
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
+		rv = filemap_write_and_wait_range(mapping, lstart, end);
+		if (rv)
+			return rv;
+		truncate_inode_pages_range(mapping, lstart, end);
+	}
+
 	rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, gfs2_get_block_direct,
 				  NULL, NULL, 0);
-- 
cgit v1.2.3


From ceed0859339443bc4fa22b9e0a825028ba8cd330 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 8 Dec 2013 21:12:59 -0500
Subject: jbd2: don't BUG but return ENOSPC if a handle runs out of space

commit f6c07cad081ba222d63623d913aafba5586c1d2c upstream.

If a handle runs out of space, we currently stop the kernel with a BUG
in jbd2_journal_dirty_metadata().  This makes it hard to figure out
what might be going on.  So return an error of ENOSPC, so we can let
the file system layer figure out what is going on, to make it more
likely we can get useful debugging information).  This should make it
easier to debug problems such as the one which was reported by:

    https://bugzilla.kernel.org/show_bug.cgi?id=44731

The only two callers of this function are ext4_handle_dirty_metadata()
and ocfs2_journal_dirty().  The ocfs2 function will trigger a
BUG_ON(), which means there will be no change in behavior.  The ext4
function will call ext4_error_inode() which will print the useful
debugging information and then handle the situation using ext4's error
handling mechanisms (i.e., which might mean halting the kernel or
remounting the file system read-only).

Also, since both file systems already call WARN_ON(), drop the WARN_ON
from jbd2_journal_dirty_metadata() to avoid two stack traces from
being displayed.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: ocfs2-devel@oss.oracle.com
Acked-by: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/jbd2/transaction.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e0c0bc275924..a6917125f215 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1151,7 +1151,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 		 * once a transaction -bzzz
 		 */
 		jh->b_modified = 1;
-		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+		if (handle->h_buffer_credits <= 0) {
+			ret = -ENOSPC;
+			goto out_unlock_bh;
+		}
 		handle->h_buffer_credits--;
 	}
 
@@ -1234,7 +1237,6 @@ out_unlock_bh:
 	jbd2_journal_put_journal_head(jh);
 out:
 	JBUFFER_TRACE(jh, "exit");
-	WARN_ON(ret);	/* All errors are bugs, so dump the stack */
 	return ret;
 }
 
-- 
cgit v1.2.3


From f1b6559cfa8c280249b52112223459b00dcdb6e9 Mon Sep 17 00:00:00 2001
From: Emil Goode <emilgoode@gmail.com>
Date: Tue, 28 May 2013 16:59:00 +0200
Subject: ceph: improve error handling in ceph_mdsmap_decode

commit c213b50b7dcbf06abcfbf1e4eee5b76586718bd9 upstream.

This patch makes the following improvements to the error handling
in the ceph_mdsmap_decode function:

- Add a NULL check for return value from kcalloc
- Make use of the variable err

Signed-off-by: Emil Goode <emilgoode@gmail.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mdsmap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 9278dec9e940..d4d38977dcbb 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -138,6 +138,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
 				m->m_info[mds].export_targets =
 					kcalloc(num_export_targets, sizeof(u32),
 						GFP_NOFS);
+				if (m->m_info[mds].export_targets == NULL)
+					goto badmem;
 				for (j = 0; j < num_export_targets; j++)
 					m->m_info[mds].export_targets[j] =
 					       ceph_decode_32(&pexport_targets);
@@ -170,7 +172,7 @@ bad:
 		       DUMP_PREFIX_OFFSET, 16, 1,
 		       start, end - start, true);
 	ceph_mdsmap_destroy(m);
-	return ERR_PTR(-EINVAL);
+	return ERR_PTR(err);
 }
 
 void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
-- 
cgit v1.2.3


From d5f9a684f9f21324fa58e97be127589db6c06a56 Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Tue, 25 Jun 2013 14:48:19 +0800
Subject: ceph: Free mdsc if alloc mdsc->mdsmap failed.

commit fb3101b6f0db9ae3f35dc8e6ec908d0af8cdf12e upstream.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mds_client.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1ce14c18c468..63c789eb1e3e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3044,8 +3044,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	fsc->mdsc = mdsc;
 	mutex_init(&mdsc->mutex);
 	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
-	if (mdsc->mdsmap == NULL)
+	if (mdsc->mdsmap == NULL) {
+		kfree(mdsc);
 		return -ENOMEM;
+	}
 
 	init_completion(&mdsc->safe_umount_waiters);
 	init_waitqueue_head(&mdsc->session_close_wq);
-- 
cgit v1.2.3


From 724184c7a1fc44955f8900c382f18545423bc486 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Mon, 1 Jul 2013 18:33:39 -0400
Subject: ceph: avoid accessing invalid memory

commit 5446429630257f4723829409337a26c076907d5d upstream.

when mounting ceph with a dev name that starts with a slash, ceph
would attempt to access the character before that slash. Since we
don't actually own that byte of memory, we would trigger an
invalid access:

[   43.499934] BUG: unable to handle kernel paging request at ffff880fa3a97fff
[   43.500984] IP: [<ffffffff818f3884>] parse_mount_options+0x1a4/0x300
[   43.501491] PGD 743b067 PUD 10283c4067 PMD 10282a6067 PTE 8000000fa3a97060
[   43.502301] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[   43.503006] Dumping ftrace buffer:
[   43.503596]    (ftrace buffer empty)
[   43.504046] CPU: 0 PID: 10879 Comm: mount Tainted: G        W    3.10.0-sasha #1129
[   43.504851] task: ffff880fa625b000 ti: ffff880fa3412000 task.ti: ffff880fa3412000
[   43.505608] RIP: 0010:[<ffffffff818f3884>]  [<ffffffff818f3884>] parse_mount_options$
[   43.506552] RSP: 0018:ffff880fa3413d08  EFLAGS: 00010286
[   43.507133] RAX: ffff880fa3a98000 RBX: ffff880fa3a98000 RCX: 0000000000000000
[   43.507893] RDX: ffff880fa3a98001 RSI: 000000000000002f RDI: ffff880fa3a98000
[   43.508610] RBP: ffff880fa3413d58 R08: 0000000000001f99 R09: ffff880fa3fe64c0
[   43.509426] R10: ffff880fa3413d98 R11: ffff880fa38710d8 R12: ffff880fa3413da0
[   43.509792] R13: ffff880fa3a97fff R14: 0000000000000000 R15: ffff880fa3413d90
[   43.509792] FS:  00007fa9c48757e0(0000) GS:ffff880fd2600000(0000) knlGS:000000000000$
[   43.509792] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   43.509792] CR2: ffff880fa3a97fff CR3: 0000000fa3bb9000 CR4: 00000000000006b0
[   43.509792] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   43.509792] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   43.509792] Stack:
[   43.509792]  0000e5180000000e ffffffff85ca1900 ffff880fa38710d8 ffff880fa3413d98
[   43.509792]  0000000000000120 0000000000000000 ffff880fa3a98000 0000000000000000
[   43.509792]  ffffffff85cf32a0 0000000000000000 ffff880fa3413dc8 ffffffff818f3c72
[   43.509792] Call Trace:
[   43.509792]  [<ffffffff818f3c72>] ceph_mount+0xa2/0x390
[   43.509792]  [<ffffffff81226314>] ? pcpu_alloc+0x334/0x3c0
[   43.509792]  [<ffffffff81282f8d>] mount_fs+0x8d/0x1a0
[   43.509792]  [<ffffffff812263d0>] ? __alloc_percpu+0x10/0x20
[   43.509792]  [<ffffffff8129f799>] vfs_kern_mount+0x79/0x100
[   43.509792]  [<ffffffff812a224d>] do_new_mount+0xcd/0x1c0
[   43.509792]  [<ffffffff812a2e8d>] do_mount+0x15d/0x210
[   43.509792]  [<ffffffff81220e55>] ? strndup_user+0x45/0x60
[   43.509792]  [<ffffffff812a2fdd>] SyS_mount+0x9d/0xe0
[   43.509792]  [<ffffffff83fd816c>] tracesys+0xdd/0xe2
[   43.509792] Code: 4c 8b 5d c0 74 0a 48 8d 50 01 49 89 14 24 eb 17 31 c0 48 83 c9 ff $
[   43.509792] RIP  [<ffffffff818f3884>] parse_mount_options+0x1a4/0x300
[   43.509792]  RSP <ffff880fa3413d08>
[   43.509792] CR2: ffff880fa3a97fff
[   43.509792] ---[ end trace 22469cd81e93af51 ]---

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Reviewed-by: Sage Weil <sage@inktan.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7d377c9a5e35..6627b26a800c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
 	}
 	err = -EINVAL;
 	dev_name_end--;		/* back up to ':' separator */
-	if (*dev_name_end != ':') {
+	if (dev_name_end < dev_name || *dev_name_end != ':') {
 		pr_err("device name is missing path (no : separator in %s)\n",
 				dev_name);
 		goto out;
-- 
cgit v1.2.3


From acb2e930c72a1d38504d6d9c1f338562bbb9aa55 Mon Sep 17 00:00:00 2001
From: Nathaniel Yazdani <n1ght.4nd.d4y@gmail.com>
Date: Sun, 4 Aug 2013 21:04:30 -0700
Subject: ceph: fix null pointer dereference

commit c338c07c51e3106711fad5eb599e375eadb6855d upstream.

When register_session() is given an out-of-range argument for mds,
ceph_mdsmap_get_addr() will return a null pointer, which would be given to
ceph_con_open() & be dereferenced, causing a kernel oops. This fixes bug #4685
in the Ceph bug tracker <http://tracker.ceph.com/issues/4685>.

Signed-off-by: Nathaniel Yazdani <n1ght.4nd.d4y@gmail.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mds_client.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 63c789eb1e3e..d6a536886472 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -414,6 +414,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 {
 	struct ceph_mds_session *s;
 
+	if (mds >= mdsc->mdsmap->m_max_mds)
+		return ERR_PTR(-EINVAL);
+
 	s = kzalloc(sizeof(*s), GFP_NOFS);
 	if (!s)
 		return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 10debbda102fe52eb75ba531b8ee79e98d3b8774 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 23 Jul 2013 16:48:01 +0300
Subject: ceph: cleanup types in striped_read()

commit 688bac461ba3e9d221a879ab40b687f5d7b5b19c upstream.

We pass in a u64 value for "len" and then immediately truncate away the
upper 32 bits.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <alex.elder@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/file.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e16907430..5b9baf5a867f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -313,9 +313,9 @@ static int striped_read(struct inode *inode,
 {
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	u64 pos, this_len;
+	u64 pos, this_len, left;
 	int io_align, page_align;
-	int left, pages_left;
+	int pages_left;
 	int read;
 	struct page **page_pos;
 	int ret;
@@ -346,7 +346,7 @@ more:
 		ret = 0;
 	hit_stripe = this_len < left;
 	was_short = ret >= 0 && ret < this_len;
-	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
+	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
 	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
 
 	if (ret > 0) {
@@ -378,7 +378,7 @@ more:
 			if (pos + left > inode->i_size)
 				left = inode->i_size - pos;
 
-			dout("zero tail %d\n", left);
+			dout("zero tail %llu\n", left);
 			ceph_zero_page_vector_range(page_align + read, left,
 						    pages);
 			read += left;
-- 
cgit v1.2.3


From 01bb1b04a1dd3736d20b859a9bcf3d7509566c1f Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Fri, 2 Aug 2013 18:14:48 +0800
Subject: ceph: Add check returned value on func ceph_calc_ceph_pg.

commit 2fbcbff1d6b9243ef71c64a8ab993bc3c7bb7af1 upstream.

Func ceph_calc_ceph_pg maybe failed.So add check for returned value.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/ioctl.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index a5ce62eb7806..669622fd1ae3 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -211,8 +211,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
 		 ceph_ino(inode), dl.object_no);
 
-	ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
-		ceph_file_layout_pg_pool(ci->i_layout));
+	r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
+				ceph_file_layout_pg_pool(ci->i_layout));
+	if (r < 0) {
+		up_read(&osdc->map_sem);
+		return r;
+	}
 
 	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
 	if (dl.osd >= 0) {
-- 
cgit v1.2.3


From 533bc2950e085ee99225655f60276760c00af31f Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Tue, 6 Aug 2013 16:20:38 +0800
Subject: ceph: fix bugs about handling short-read for sync read mode.

commit 02ae66d8b229708fd94b764f6c17ead1c7741fcf upstream.

cephfs . show_layout
>layyout.data_pool:     0
>layout.object_size:   4194304
>layout.stripe_unit:   4194304
>layout.stripe_count:  1

TestA:
>dd if=/dev/urandom of=test bs=1M count=2 oflag=direct
>dd if=/dev/urandom of=test bs=1M count=2 seek=4  oflag=direct
>dd if=test of=/dev/null bs=6M count=1 iflag=direct
The messages from func striped_read are:
ceph:           file.c:350  : striped_read 0~6291456 (read 0) got 2097152 HITSTRIPE SHORT
ceph:           file.c:350  : striped_read 2097152~4194304 (read 2097152) got 0 HITSTRIPE SHORT
ceph:           file.c:381  : zero tail 4194304
ceph:           file.c:390  : striped_read returns 6291456
The hole of file is from 2M--4M.But actualy it zero the last 4M include
the last 2M area which isn't a hole.
Using this patch, the messages are:
ceph:           file.c:350  : striped_read 0~6291456 (read 0) got 2097152 HITSTRIPE SHORT
ceph:           file.c:358  :  zero gap 2097152 to 4194304
ceph:           file.c:350  : striped_read 4194304~2097152 (read 4194304) got 2097152
ceph:           file.c:384  : striped_read returns 6291456

TestB:
>echo majianpeng > test
>dd if=test of=/dev/null bs=2M count=1 iflag=direct
The messages are:
ceph:           file.c:350  : striped_read 0~6291456 (read 0) got 11 HITSTRIPE SHORT
ceph:           file.c:350  : striped_read 11~6291445 (read 11) got 0 HITSTRIPE SHORT
ceph:           file.c:390  : striped_read returns 11
For this case,it did once more striped_read.It's no meaningless.
Using this patch, the message are:
ceph:           file.c:350  : striped_read 0~6291456 (read 0) got 11 HITSTRIPE SHORT
ceph:           file.c:384  : striped_read returns 11

Big thanks to Yan Zheng for the patch.

Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/file.c | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5b9baf5a867f..1ac0a59867dc 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -349,44 +349,37 @@ more:
 	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
 	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
 
-	if (ret > 0) {
-		int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
-
-		if (read < pos - off) {
-			dout(" zero gap %llu to %llu\n", off + read, pos);
-			ceph_zero_page_vector_range(page_align + read,
-						    pos - off - read, pages);
+	if (ret >= 0) {
+		int didpages;
+		if (was_short && (pos + ret < inode->i_size)) {
+			u64 tmp = min(this_len - ret,
+					inode->i_size - pos - ret);
+			dout(" zero gap %llu to %llu\n",
+				pos + ret, pos + ret + tmp);
+			ceph_zero_page_vector_range(page_align + read + ret,
+							tmp, pages);
+			ret += tmp;
 		}
+
+		didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
 		pos += ret;
 		read = pos - off;
 		left -= ret;
 		page_pos += didpages;
 		pages_left -= didpages;
 
-		/* hit stripe? */
-		if (left && hit_stripe)
+		/* hit stripe and need continue*/
+		if (left && hit_stripe && pos < inode->i_size)
 			goto more;
 	}
 
-	if (was_short) {
+	if (ret >= 0) {
+		ret = read;
 		/* did we bounce off eof? */
 		if (pos + left > inode->i_size)
 			*checkeof = 1;
-
-		/* zero trailing bytes (inside i_size) */
-		if (left > 0 && pos < inode->i_size) {
-			if (pos + left > inode->i_size)
-				left = inode->i_size - pos;
-
-			dout("zero tail %llu\n", left);
-			ceph_zero_page_vector_range(page_align + read, left,
-						    pages);
-			read += left;
-		}
 	}
 
-	if (ret >= 0)
-		ret = read;
 	dout("striped_read returns %d\n", ret);
 	return ret;
 }
-- 
cgit v1.2.3


From 30379f4553bae684a928c6bdd9dc606fd0ef6093 Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Wed, 21 Aug 2013 15:02:51 +0800
Subject: ceph: allow sync_read/write return partial successed size of
 read/write.

commit ee7289bfadda5f4ef60884547ebc9989c8fb314a upstream.

For sync_read/write, it may do multi stripe operations.If one of those
met erro, we return the former successed size rather than a error value.
There is a exception for write-operation met -EOLDSNAPC.If this occur,we
retry the whole write again.

Signed-off-by: Jianpeng Ma <majianpeng@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1ac0a59867dc..5de16f5ac7e9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -373,7 +373,7 @@ more:
 			goto more;
 	}
 
-	if (ret >= 0) {
+	if (read > 0) {
 		ret = read;
 		/* did we bounce off eof? */
 		if (pos + left > inode->i_size)
@@ -611,6 +611,8 @@ out:
 		if (check_caps)
 			ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
 					NULL);
+	} else if (ret != -EOLDSNAPC && written > 0) {
+		ret = written;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 70139b6643db61809db76dd642af87979421d41d Mon Sep 17 00:00:00 2001
From: Eric Whitney <enwlinux@gmail.com>
Date: Mon, 6 Jan 2014 14:00:23 -0500
Subject: ext4: fix bigalloc regression

commit d0abafac8c9162f39c4f6b2f8141b772a09b3770 upstream.

Commit f5a44db5d2 introduced a regression on filesystems created with
the bigalloc feature (cluster size > blocksize).  It causes xfstests
generic/006 and /013 to fail with an unexpected JBD2 failure and
transaction abort that leaves the test file system in a read only state.
Other xfstests run on bigalloc file systems are likely to fail as well.

The cause is the accidental use of a cluster mask where a cluster
offset was needed in ext4_ext_map_blocks().

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ecabf00829e2..a2b625e279db 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4123,7 +4123,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
-- 
cgit v1.2.3


From 350f737ed039d2b36329ddcbcdf78e12c6d8c758 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Mon, 6 Jan 2014 17:16:01 -0500
Subject: GFS2: Increase i_writecount during gfs2_setattr_chown

commit 62e96cf81988101fe9e086b2877307b6adda5197 upstream.

This patch calls get_write_access in function gfs2_setattr_chown,
which merely increases inode->i_writecount for the duration of the
function. That will ensure that any file closes won't delete the
inode's multi-block reservation while the function is running.
It also ensures that a multi-block reservation exists when needed
for quota change operations during the chown.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/inode.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 62b484e4a9e4..bc5dac400125 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1536,10 +1536,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
 		ogid = ngid = NO_GID_QUOTA_CHANGE;
 
-	error = gfs2_quota_lock(ip, nuid, ngid);
+	error = get_write_access(inode);
 	if (error)
 		return error;
 
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		goto out;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_lock(ip, nuid, ngid);
+	if (error)
+		goto out;
+
 	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
 	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
 		error = gfs2_quota_check(ip, nuid, ngid);
@@ -1566,6 +1578,8 @@ out_end_trans:
 	gfs2_trans_end(sdp);
 out_gunlock_q:
 	gfs2_quota_unlock(ip);
+out:
+	put_write_access(inode);
 	return error;
 }
 
-- 
cgit v1.2.3


From 3717eee3c4d1d44ded3bd84bf5f6437ec2aa5496 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 Nov 2013 16:31:29 -0800
Subject: vfs: In d_path don't call d_dname on a mount point

commit f48cfddc6729ef133933062320039808bafa6f45 upstream.

Aditya Kali (adityakali@google.com) wrote:
> Commit bf056bfa80596a5d14b26b17276a56a0dcb080e5:
> "proc: Fix the namespace inode permission checks." converted
> the namespace files into symlinks. The same commit changed
> the way namespace bind mounts appear in /proc/mounts:
>   $ mount --bind /proc/self/ns/ipc /mnt/ipc
> Originally:
>   $ cat /proc/mounts | grep ipc
>   proc /mnt/ipc proc rw,nosuid,nodev,noexec 0 0
>
> After commit bf056bfa80596a5d14b26b17276a56a0dcb080e5:
>   $ cat /proc/mounts | grep ipc
>   proc ipc:[4026531839] proc rw,nosuid,nodev,noexec 0 0
>
> This breaks userspace which expects the 2nd field in
> /proc/mounts to be a valid path.

The symlink /proc/<pid>/ns/{ipc,mnt,net,pid,user,uts} point to
dentries allocated with d_alloc_pseudo that we can mount, and
that have interesting names printed out with d_dname.

When these files are bind mounted /proc/mounts is not currently
displaying the mount point correctly because d_dname is called instead
of just displaying the path where the file is mounted.

Solve this by adding an explicit check to distinguish mounted pseudo
inodes and unmounted pseudo inodes.  Unmounted pseudo inodes always
use mount of their filesstem as the mnt_root  in their path making
these two cases easy to distinguish.

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Reported-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dcache.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index da89cdfb21ab..9a59653d3449 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2686,8 +2686,13 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	 * thus don't need to be hashed.  They also don't need a name until a
 	 * user wants to identify the object in /proc/pid/fd/.  The little hack
 	 * below allows us to generate a name for these objects on demand:
+	 *
+	 * Some pseudo inodes are mountable.  When they are mounted
+	 * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
+	 * and instead have d_path return the mounted path.
 	 */
-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+	if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+	    (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	get_fs_root(current->fs, &root);
-- 
cgit v1.2.3


From cc46cb337cf614b652b0068e6e388102bf8a9b54 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 14 Dec 2013 04:21:26 +0800
Subject: writeback: Fix data corruption on NFS

commit f9b0e058cbd04ada76b13afffa7e1df830543c24 upstream.

Commit 4f8ad655dbc8 "writeback: Refactor writeback_single_inode()" added
a condition to skip clean inode. However this is wrong in WB_SYNC_ALL
mode because there we also want to wait for outstanding writeback on
possibly clean inode. This was causing occasional data corruption issues
on NFS because it uses sync_inode() to make sure all outstanding writes
are flushed to the server before truncating the inode and with
sync_inode() returning prematurely file was sometimes extended back
by an outstanding write after it was truncated.

So modify the test to also check for pages under writeback in
WB_SYNC_ALL mode.

Fixes: 4f8ad655dbc82cf05d2edc11e66b78a42d38bf93
Reported-and-tested-by: Dan Duval <dan.duval@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fs-writeback.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3be57189efd5..e3ab1e4dc442 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -505,13 +505,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	}
 	WARN_ON(inode->i_state & I_SYNC);
 	/*
-	 * Skip inode if it is clean. We don't want to mess with writeback
-	 * lists in this function since flusher thread may be doing for example
-	 * sync in parallel and if we move the inode, it could get skipped. So
-	 * here we make sure inode is on some writeback list and leave it there
-	 * unless we have completely cleaned the inode.
+	 * Skip inode if it is clean and we have no outstanding writeback in
+	 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
+	 * function since flusher thread may be doing for example sync in
+	 * parallel and if we move the inode, it could get skipped. So here we
+	 * make sure inode is on some writeback list and leave it there unless
+	 * we have completely cleaned the inode.
 	 */
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY) &&
+	    (wbc->sync_mode != WB_SYNC_ALL ||
+	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
 	inode->i_state |= I_SYNC;
 	spin_unlock(&inode->i_lock);
-- 
cgit v1.2.3


From 4cb1e59ffcbee368f32c37ca5872fd73163c7783 Mon Sep 17 00:00:00 2001
From: Andreas Rohner <andreas.rohner@gmx.net>
Date: Tue, 14 Jan 2014 17:56:36 -0800
Subject: nilfs2: fix segctor bug that causes file system corruption

commit 70f2fe3a26248724d8a5019681a869abdaf3e89a upstream.

There is a bug in the function nilfs_segctor_collect, which results in
active data being written to a segment, that is marked as clean.  It is
possible, that this segment is selected for a later segment
construction, whereby the old data is overwritten.

The problem shows itself with the following kernel log message:

  nilfs_sufile_do_cancel_free: segment 6533 must be clean

Usually a few hours later the file system gets corrupted:

  NILFS: bad btree node (blocknr=8748107): level = 0, flags = 0x0, nchildren = 0
  NILFS error (device sdc1): nilfs_bmap_last_key: broken bmap (inode number=114660)

The issue can be reproduced with a file system that is nearly full and
with the cleaner running, while some IO intensive task is running.
Although it is quite hard to reproduce.

This is what happens:

 1. The cleaner starts the segment construction
 2. nilfs_segctor_collect is called
 3. sc_stage is on NILFS_ST_SUFILE and segments are freed
 4. sc_stage is on NILFS_ST_DAT current segment is full
 5. nilfs_segctor_extend_segments is called, which
    allocates a new segment
 6. The new segment is one of the segments freed in step 3
 7. nilfs_sufile_cancel_freev is called and produces an error message
 8. Loop around and the collection starts again
 9. sc_stage is on NILFS_ST_SUFILE and segments are freed
    including the newly allocated segment, which will contain active
    data and can be allocated at a later time
10. A few hours later another segment construction allocates the
    segment and causes file system corruption

This can be prevented by simply reordering the statements.  If
nilfs_sufile_cancel_freev is called before nilfs_segctor_extend_segments
the freed segments are marked as dirty and cannot be allocated any more.

Signed-off-by: Andreas Rohner <andreas.rohner@gmx.net>
Reviewed-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Andreas Rohner <andreas.rohner@gmx.net>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nilfs2/segment.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index cbd66188a28b..958a5b57ed4a 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 		nilfs_clear_logs(&sci->sc_segbufs);
 
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
 							sci->sc_nfreesegs,
 							NULL);
 			WARN_ON(err); /* do not happen */
+			sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
 		}
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
-- 
cgit v1.2.3


From 209bd086e423d98ddf2fd52a6f1afda15b5758b1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 7 Jan 2014 12:58:19 -0500
Subject: ext4: avoid clearing beyond i_blocks when truncating an inline data
 file

commit 09c455aaa8f47a94d5bafaa23d58365768210507 upstream.

A missing cast means that when we are truncating a file which is less
than 60 bytes, we don't clear the correct area of memory, and in fact
we can end up truncating the next inode in the inode table, or worse
yet, some other kernel data structure.

Addresses-Coverity-Id: #751987

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inline.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 33331b4c2178..e350be6c7ac6 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1957,9 +1957,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 		}
 
 		/* Clear the content within i_blocks. */
-		if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
-			memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
-					EXT4_MIN_INLINE_DATA_SIZE - i_size);
+		if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+			void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+			memset(p + i_size, 0,
+			       EXT4_MIN_INLINE_DATA_SIZE - i_size);
+		}
 
 		EXT4_I(inode)->i_inline_size = i_size <
 					EXT4_MIN_INLINE_DATA_SIZE ?
-- 
cgit v1.2.3


From 9a8bd082503f4f91176d8fe0b91a23c5b768008e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Jan 2014 15:26:15 -0800
Subject: vfs: Is mounted should be testing mnt_ns for NULL or error.

commit 260a459d2e39761fbd39803497205ce1690bc7b1 upstream.

A bug was introduced with the is_mounted helper function in
commit f7a99c5b7c8bd3d3f533c8b38274e33f3da9096e
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Sat Jun 9 00:59:08 2012 -0400

    get rid of ->mnt_longterm

    it's enough to set ->mnt_ns of internal vfsmounts to something
    distinct from all struct mnt_namespace out there; then we can
    just use the check for ->mnt_ns != NULL in the fast path of
    mntput_no_expire()

    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

The intent was to test if the real_mount(vfsmount)->mnt_ns was
NULL_OR_ERR but the code is actually testing real_mount(vfsmount)
and always returning true.

The result is d_absolute_path returning paths it should be hiding.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/mount.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/mount.h b/fs/mount.h
index 64a858143ff9..68d80bdcd081 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -73,7 +73,7 @@ static inline int mnt_has_parent(struct mount *mnt)
 static inline int is_mounted(struct vfsmount *mnt)
 {
 	/* neither detached nor internal? */
-	return !IS_ERR_OR_NULL(real_mount(mnt));
+	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
 }
 
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
-- 
cgit v1.2.3


From 5c61a3d3ff81ddb53006bbc84aca8363b4f2841b Mon Sep 17 00:00:00 2001
From: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Date: Tue, 7 Jan 2014 17:26:58 +0800
Subject: Btrfs: handle EAGAIN case properly in btrfs_drop_snapshot()

commit 90515e7f5d7d24cbb2a4038a3f1b5cfa2921aa17 upstream.

We may return early in btrfs_drop_snapshot(), we shouldn't
call btrfs_std_err() for this case, fix it.

Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3b6d20bc2388..bbafa05519da 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7491,7 +7491,7 @@ out:
 	 */
 	if (root_dropped == false)
 		btrfs_add_dead_root(root);
-	if (err)
+	if (err && err != -EAGAIN)
 		btrfs_std_error(root->fs_info, err);
 	return err;
 }
-- 
cgit v1.2.3


From f0cea52a481c749ac0ed4cf0fd8a00f90306ebd5 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Wed, 15 Jan 2014 18:15:52 +0100
Subject: btrfs: restrict snapshotting to own subvolumes

commit d024206133ce21936b3d5780359afc00247655b7 upstream.

Currently, any user can snapshot any subvolume if the path is accessible and
thus indirectly create and keep files he does not own under his direcotries.
This is not possible with traditional directories.

In security context, a user can snapshot root filesystem and pin any
potentially buggy binaries, even if the updates are applied.

All the snapshots are visible to the administrator, so it's possible to
verify if there are suspicious snapshots.

Another more practical problem is that any user can pin the space used
by eg. root and cause ENOSPC.

Original report:
https://bugs.launchpad.net/ubuntu/+source/apparmor/+bug/484786

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ioctl.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 145b2c75ab83..783906c687b5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1528,6 +1528,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 			printk(KERN_INFO "btrfs: Snapshot src from "
 			       "another FS\n");
 			ret = -EINVAL;
+		} else if (!inode_owner_or_capable(src_inode)) {
+			/*
+			 * Subvolume creation is not restricted, but snapshots
+			 * are limited to own subvolumes only
+			 */
+			ret = -EPERM;
 		} else {
 			ret = btrfs_mksubvol(&file->f_path, name, namelen,
 					     BTRFS_I(src_inode)->root,
-- 
cgit v1.2.3


From d840f9899a6bc8bd88b3c87cd067ddf39a6ada45 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 22 Jan 2014 19:36:57 +0100
Subject: fuse: fix pipe_buf_operations

commit 28a625cbc2a14f17b83e47ef907b2658576a32aa upstream.

Having this struct in module memory could Oops when if the module is
unloaded while the buffer still persists in a pipe.

Since sock_pipe_buf_ops is essentially the same as fuse_dev_pipe_buf_steal
merge them into nosteal_pipe_buf_ops (this is the same as
default_pipe_buf_ops except stealing the page from the buffer is not
allowed).

Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/dev.c | 22 +++++-----------------
 fs/splice.c   | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1d55f9465400..23bf1a52a5da 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
 }
 
-static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
-				   struct pipe_buffer *buf)
-{
-	return 1;
-}
-
-static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
-	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
-	.confirm = generic_pipe_buf_confirm,
-	.release = generic_pipe_buf_release,
-	.steal = fuse_dev_pipe_buf_steal,
-	.get = generic_pipe_buf_get,
-};
-
 static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 				    struct pipe_inode_info *pipe,
 				    size_t len, unsigned int flags)
@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 		buf->page = bufs[page_nr].page;
 		buf->offset = bufs[page_nr].offset;
 		buf->len = bufs[page_nr].len;
-		buf->ops = &fuse_dev_pipe_buf_ops;
+		/*
+		 * Need to be careful about this.  Having buf->ops in module
+		 * code can Oops if the buffer persists after module unload.
+		 */
+		buf->ops = &nosteal_pipe_buf_ops;
 
 		pipe->nrbufs++;
 		page_nr++;
diff --git a/fs/splice.c b/fs/splice.c
index d37431dd60a1..4b5a5fac3383 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+/* Pipe buffer operations for a socket and similar. */
+const struct pipe_buf_operations nosteal_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_nosteal,
+	.get = generic_pipe_buf_get,
+};
+EXPORT_SYMBOL(nosteal_pipe_buf_ops);
+
 static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 			    unsigned long vlen, loff_t offset)
 {
-- 
cgit v1.2.3


From 060168af7cbbade31561f3a1efbc1bac91ee2166 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 27 Jan 2014 17:07:19 -0800
Subject: compat: fix sys_fanotify_mark

commit 592f6b842f64e416c7598a1b97c649b34241e22d upstream.

Commit 91c2e0bcae72 ("unify compat fanotify_mark(2), switch to
COMPAT_SYSCALL_DEFINE") added a new unified compat fanotify_mark syscall
to be used by all architectures.

Unfortunately the unified version merges the split mask parameter in a
wrong way: the lower and higher word got swapped.

This was discovered with glibc's tst-fanotify test case.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reported-by: Andreas Krebbel <krebbel@linux.vnet.ibm.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Acked-by: "David S. Miller" <davem@davemloft.net>
Acked-by: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/notify/fanotify/fanotify_user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 77cc85dd0db0..f1680cdbd88b 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -867,9 +867,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
 {
 	return sys_fanotify_mark(fanotify_fd, flags,
 #ifdef __BIG_ENDIAN
-				((__u64)mask1 << 32) | mask0,
-#else
 				((__u64)mask0 << 32) | mask1,
+#else
+				((__u64)mask1 << 32) | mask0,
 #endif
 				 dfd, pathname);
 }
-- 
cgit v1.2.3


From 9580348ce827b7026c12f2fde1da7d064c338dc6 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 29 Jan 2014 14:05:44 -0800
Subject: fs/compat: fix parameter handling for compat readv/writev syscalls

commit dfd948e32af2e7b28bcd7a490c0a30d4b8df2a36 upstream.

We got a report that the pwritev syscall does not work correctly in
compat mode on s390.

It turned out that with commit 72ec35163f9f ("switch compat readv/writev
variants to COMPAT_SYSCALL_DEFINE") we lost the zero extension of a
couple of syscall parameters because the some parameter types haven't
been converted from unsigned long to compat_ulong_t.

This is needed for architectures where the ABI requires that the caller
of a function performed zero and/or sign extension to 64 bit of all
parameters.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/read_write.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 2cefa417be34..f6b7c600eb7f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -947,9 +947,9 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen)
+		compat_ulong_t, vlen)
 {
 	struct fd f = fdget(fd);
 	ssize_t ret;
@@ -983,9 +983,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, u32, pos_low, u32, pos_high)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	return compat_sys_preadv64(fd, vec, vlen, pos);
@@ -1013,9 +1013,9 @@ out:
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
 		const struct compat_iovec __user *, vec,
-		unsigned long, vlen)
+		compat_ulong_t, vlen)
 {
 	struct fd f = fdget(fd);
 	ssize_t ret;
@@ -1049,9 +1049,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
 	return ret;
 }
 
-COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
 		const struct compat_iovec __user *,vec,
-		unsigned long, vlen, u32, pos_low, u32, pos_high)
+		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
 {
 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
 	return compat_sys_pwritev64(fd, vec, vlen, pos);
-- 
cgit v1.2.3


From 7462402eae491665f58f434dbf45bd964cbf8b79 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 29 Jan 2014 14:05:46 -0800
Subject: fs/compat: fix lookup_dcookie() parameter handling

commit d8d14bd09cddbaf0168d61af638455a26bd027ff upstream.

Commit d5dc77bfeeab ("consolidate compat lookup_dcookie()") coverted all
architectures to the new compat_sys_lookup_dcookie() syscall.

The "len" paramater of the new compat syscall must have the type
compat_size_t in order to enforce zero extension for architectures where
the ABI requires that the caller of a function performed zero and/or
sign extension to 64 bit of all parameters.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dcookies.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcookies.c b/fs/dcookies.c
index ab5954b50267..ac44a69fbea9 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -204,7 +204,7 @@ out:
 }
 
 #ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len)
+COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
 {
 #ifdef __BIG_ENDIAN
 	return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
-- 
cgit v1.2.3


From ed6161470c6c8c45d97211d2426eb050dc382a4a Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Thu, 21 Nov 2013 17:58:08 +0200
Subject: ore: Fix wrong math in allocation of per device BIO

commit aad560b7f63b495f48a7232fd086c5913a676e6f upstream.

At IO preparation we calculate the max pages at each device and
allocate a BIO per device of that size. The calculation was wrong
on some unaligned corner cases offset/length combination and would
make prepare return with -ENOMEM. This would be bad for pnfs-objects
that would in that case IO through MDS. And fatal for exofs were it
would fail writes with EIO.

Fix it by doing the proper math, that will work in all cases. (I
ran a test with all possible offset/length combinations this time
round).

Also when reading we do not need to allocate for the parity units
since we jump over them.

Also lower the max_io_length to take into account the parity pages
so not to allocate BIOs bigger than PAGE_SIZE

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/exofs/ore.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index b74422888604..85cde3e76290 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
 
 	layout->max_io_length =
 		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
-							layout->group_width;
+					(layout->group_width - layout->parity);
 	if (layout->parity) {
 		unsigned stripe_length =
 				(layout->group_width - layout->parity) *
@@ -286,7 +286,8 @@ int  ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
 	if (length) {
 		ore_calc_stripe_info(layout, offset, length, &ios->si);
 		ios->length = ios->si.length;
-		ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+		ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
+				 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
 		if (layout->parity)
 			_ore_post_alloc_raid_stuff(ios);
 	}
@@ -536,6 +537,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
 	u64	H = LmodS - G * T;
 
 	u32	N = div_u64(H, U);
+	u32	Nlast;
 
 	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
 	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
@@ -568,6 +570,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
 	si->length = T - H;
 	if (si->length > length)
 		si->length = length;
+
+	Nlast = div_u64(H + si->length + U - 1, U);
+	si->maxdevUnits = Nlast - N;
+
 	si->M = M;
 }
 EXPORT_SYMBOL(ore_calc_stripe_info);
@@ -583,13 +589,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
 	int ret;
 
 	if (per_dev->bio == NULL) {
-		unsigned pages_in_stripe = ios->layout->group_width *
-					(ios->layout->stripe_unit / PAGE_SIZE);
-		unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
-					(ios->layout->group_width -
-					 ios->layout->parity);
-		unsigned bio_size = (nr_pages + pages_in_stripe) /
-					ios->layout->group_width;
+		unsigned bio_size;
+
+		if (!ios->reading) {
+			bio_size = ios->si.maxdevUnits;
+		} else {
+			bio_size = (ios->si.maxdevUnits + 1) *
+			     (ios->layout->group_width - ios->layout->parity) /
+			     ios->layout->group_width;
+		}
+		bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
 
 		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
 		if (unlikely(!per_dev->bio)) {
@@ -609,8 +618,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios,  unsigned *cur_pg,
 		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
 					    pglen, pgbase);
 		if (unlikely(pglen != added_len)) {
-			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
-				   per_dev->bio->bi_vcnt);
+			/* If bi_vcnt == bi_max then this is a SW BUG */
+			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
+				   "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
+				   per_dev->bio->bi_vcnt,
+				   per_dev->bio->bi_max_vecs,
+				   BIO_MAX_PAGES_KMALLOC, cur_len);
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -1098,7 +1111,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
 		size_attr->attr = g_attr_logical_length;
 		size_attr->attr.val_ptr = &size_attr->newsize;
 
-		ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
+		ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
 			     _LLU(oc->comps->obj.id), _LLU(obj_size), i);
 		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
 					&size_attr->attr);
-- 
cgit v1.2.3


From afce3609db524d801a8fc4096810fa6844062f65 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 4 Dec 2013 17:39:23 -0500
Subject: NFSv4: OPEN must handle the NFS4ERR_IO return code correctly

commit c7848f69ec4a8c03732cde5c949bd2aa711a9f4b upstream.

decode_op_hdr() cannot distinguish between an XDR decoding error and
the perfectly valid errorcode NFS4ERR_IO. This is normally not a
problem, but for the particular case of OPEN, we need to be able
to increment the NFSv4 open sequence id when the server returns
a valid response.

Reported-by: J Bruce Fields <bfields@fieldses.org>
Link: http://lkml.kernel.org/r/20131204210356.GA19452@fieldses.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4xdr.c | 47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4be8d135ed61..988efb4caac0 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -3002,7 +3002,8 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+		int *nfs_retval)
 {
 	__be32 *p;
 	uint32_t opnum;
@@ -3012,19 +3013,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 	if (unlikely(!p))
 		goto out_overflow;
 	opnum = be32_to_cpup(p++);
-	if (opnum != expected) {
-		dprintk("nfs: Server returned operation"
-			" %d but we issued a request for %d\n",
-				opnum, expected);
-		return -EIO;
-	}
+	if (unlikely(opnum != expected))
+		goto out_bad_operation;
 	nfserr = be32_to_cpup(p);
-	if (nfserr != NFS_OK)
-		return nfs4_stat_to_errno(nfserr);
-	return 0;
+	if (nfserr == NFS_OK)
+		*nfs_retval = 0;
+	else
+		*nfs_retval = nfs4_stat_to_errno(nfserr);
+	return true;
+out_bad_operation:
+	dprintk("nfs: Server returned operation"
+		" %d but we issued a request for %d\n",
+			opnum, expected);
+	*nfs_retval = -EREMOTEIO;
+	return false;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
-	return -EIO;
+	*nfs_retval = -EIO;
+	return false;
+}
+
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+	int retval;
+
+	__decode_op_hdr(xdr, expected, &retval);
+	return retval;
 }
 
 /* Dummy routine */
@@ -4842,11 +4856,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 	uint32_t savewords, bmlen, i;
 	int status;
 
-	status = decode_op_hdr(xdr, OP_OPEN);
-	if (status != -EIO)
-		nfs_increment_open_seqid(status, res->seqid);
-	if (!status)
-		status = decode_stateid(xdr, &res->stateid);
+	if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+		return status;
+	nfs_increment_open_seqid(status, res->seqid);
+	if (status)
+		return status;
+	status = decode_stateid(xdr, &res->stateid);
 	if (unlikely(status))
 		return status;
 
-- 
cgit v1.2.3


From c19422192bfb29a3c9ed00961f317b6f46f14923 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Mon, 13 Jan 2014 16:54:45 -0500
Subject: nfs4.1: properly handle ENOTSUP in SECINFO_NO_NAME

commit 78b19bae0813bd6f921ca58490196abd101297bd upstream.

Don't check for -NFS4ERR_NOTSUPP, it's already been mapped to -ENOTSUPP
by nfs4_stat_to_errno.

This allows the client to mount v4.1 servers that don't support
SECINFO_NO_NAME by falling back to the "guess and check" method of
nfs4_find_root_sec.

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75e49d42a7fb..5aca7b400ad9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6683,7 +6683,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 		switch (err) {
 		case 0:
 		case -NFS4ERR_WRONGSEC:
-		case -NFS4ERR_NOTSUPP:
+		case -ENOTSUPP:
 			goto out;
 		default:
 			err = nfs4_handle_exception(server, err, &exception);
@@ -6715,7 +6715,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
 	 * Fall back on "guess and check" method if
 	 * the server doesn't support SECINFO_NO_NAME
 	 */
-	if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
+	if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
 		err = nfs4_find_root_sec(server, fhandle, info);
 		goto out_freepage;
 	}
-- 
cgit v1.2.3


From 14ff66ce27e91666bf55e415104fdec9f9b18668 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 17 Jan 2014 17:03:41 -0500
Subject: NFSv4.1: Handle errors correctly in nfs41_walk_client_list

commit 64590daa9e0dfb3aad89e3ab9230683b76211d5b upstream.

Both nfs41_walk_client_list and nfs40_walk_client_list expect the
'status' variable to be set to the value -NFS4ERR_STALE_CLIENTID
if the loop fails to find a match.
The problem is that the 'pos->cl_cons_state > NFS_CS_READY' changes
the value of 'status', and sets it either to the value '0' (which
indicates success), or to the value EINTR.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4client.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 4cbad5d6b276..3afae9059222 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -324,9 +324,10 @@ int nfs40_walk_client_list(struct nfs_client *new,
 			prev = pos;
 
 			status = nfs_wait_client_init_complete(pos);
-			spin_lock(&nn->nfs_client_lock);
 			if (status < 0)
-				continue;
+				goto out;
+			status = -NFS4ERR_STALE_CLIENTID;
+			spin_lock(&nn->nfs_client_lock);
 		}
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
@@ -464,7 +465,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
 			}
 			spin_lock(&nn->nfs_client_lock);
 			if (status < 0)
-				continue;
+				break;
+			status = -NFS4ERR_STALE_CLIENTID;
 		}
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
-- 
cgit v1.2.3


From daab6e7df44ba7d4281379c6ac5780e10d133286 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Sun, 19 Jan 2014 22:45:36 -0500
Subject: nfs4: fix discover_server_trunking use after free

commit abad2fa5ba67725a3f9c376c8cfe76fbe94a3041 upstream.

If clp is new (cl_count = 1) and it matches another client in
nfs4_discover_server_trunking, the nfs_put_client will free clp before
->cl_preserve_clid is set.

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4client.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3afae9059222..02773aab43c5 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -240,13 +240,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 	error = nfs4_discover_server_trunking(clp, &old);
 	if (error < 0)
 		goto error;
-	nfs_put_client(clp);
-	if (clp != old) {
-		clp->cl_preserve_clid = true;
-		clp = old;
-	}
 
-	return clp;
+	if (clp != old)
+		clp->cl_preserve_clid = true;
+	nfs_put_client(clp);
+	return old;
 
 error:
 	nfs_mark_client_ready(clp, error);
-- 
cgit v1.2.3


From 305a7c624df026dbdde23eb880549db7df0ad7b9 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Wed, 22 Jan 2014 20:34:54 +0200
Subject: pnfs: Proper delay for NFS4ERR_RECALLCONFLICT in layout_get_done

commit ed7e5423014ad89720fcf315c0b73f2c5d0c7bd2 upstream.

An NFS4ERR_RECALLCONFLICT is returned by server from a GET_LAYOUT
only when a Server Sent a RECALL do to that GET_LAYOUT, or
the RECALL and GET_LAYOUT crossed on the wire.
In any way this means we want to wait at most until in-flight IO
is finished and the RECALL can be satisfied.

So a proper wait here is more like 1/10 of a second, not 15 seconds
like we have now. In case of a server bug we delay exponentially
longer on each retry.

Current code totally craps out performance of very large files on
most pnfs-objects layouts, because of how the map changes when the
file has grown into the next raid group.

[Stable: This will patch back to 3.9. If there are earlier still
 maintained trees, please tell me I'll send a patch]

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 5aca7b400ad9..26e71bdb5b33 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6232,9 +6232,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct pnfs_layout_hdr *lo;
 	struct nfs4_state *state = NULL;
-	unsigned long timeo, giveup;
+	unsigned long timeo, now, giveup;
 
-	dprintk("--> %s\n", __func__);
+	dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
 
 	if (!nfs41_sequence_done(task, &lgp->res.seq_res))
 		goto out;
@@ -6242,12 +6242,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case 0:
 		goto out;
+	/*
+	 * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+	 * (or clients) writing to the same RAID stripe
+	 */
 	case -NFS4ERR_LAYOUTTRYLATER:
+	/*
+	 * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
+	 * existing layout before getting a new one).
+	 */
 	case -NFS4ERR_RECALLCONFLICT:
 		timeo = rpc_get_timeout(task->tk_client);
 		giveup = lgp->args.timestamp + timeo;
-		if (time_after(giveup, jiffies))
-			task->tk_status = -NFS4ERR_DELAY;
+		now = jiffies;
+		if (time_after(giveup, now)) {
+			unsigned long delay;
+
+			/* Delay for:
+			 * - Not less then NFS4_POLL_RETRY_MIN.
+			 * - One last time a jiffie before we give up
+			 * - exponential backoff (time_now minus start_attempt)
+			 */
+			delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
+				    min((giveup - now - 1),
+					now - lgp->args.timestamp));
+
+			dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
+				__func__, delay);
+			rpc_delay(task, delay);
+			task->tk_status = 0;
+			rpc_restart_call_prepare(task);
+			goto out; /* Do not call nfs4_async_handle_error() */
+		}
 		break;
 	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
-- 
cgit v1.2.3


From a082f872743e09fe2cdf870b2d8f1b5dccd36cf7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 29 Jan 2014 16:05:30 -0500
Subject: Btrfs: disable snapshot aware defrag for now

commit 8101c8dbf6243ba517aab58d69bf1bc37d8b7b9c upstream.

It's just broken and it's taking a lot of effort to fix it, so for now just
disable it so people can defrag in peace.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0bcee78cde16..25e6a8e1014e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2655,7 +2655,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 			EXTENT_DEFRAG, 1, cached_state);
 	if (ret) {
 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
-		if (last_snapshot >= BTRFS_I(inode)->generation)
+		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
 			/* the inode is shared */
 			new = record_old_file_extents(inode, ordered_extent);
 
-- 
cgit v1.2.3


From 7e405afc255142ceb5d8cc3d97a7cffac46f0e4e Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 6 Feb 2014 12:04:28 -0800
Subject: mm: __set_page_dirty uses spin_lock_irqsave instead of spin_lock_irq

commit 227d53b397a32a7614667b3ecaf1d89902fb6c12 upstream.

To use spin_{un}lock_irq is dangerous if caller disabled interrupt.
During aio buffer migration, we have a possibility to see the following
call stack.

aio_migratepage  [disable interrupt]
  migrate_page_copy
    clear_page_dirty_for_io
      set_page_dirty
        __set_page_dirty_buffers
          __set_page_dirty
            spin_lock_irq

This mean, current aio migration is a deadlockable.  spin_lock_irqsave
is a safer alternative and we should use it.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: David Rientjes rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/buffer.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index d2a4d1bb2d57..75964d734444 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -620,14 +620,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
 static void __set_page_dirty(struct page *page,
 		struct address_space *mapping, int warn)
 {
-	spin_lock_irq(&mapping->tree_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
 
-- 
cgit v1.2.3


From 434672c0c50a3ef988c9d582c0148c9067e6961c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 10 Feb 2014 14:25:41 -0800
Subject: fs/file.c:fdtable: avoid triggering OOMs from alloc_fdmem

commit 96c7a2ff21501691587e1ae969b83cbec8b78e08 upstream.

Recently due to a spike in connections per second memcached on 3
separate boxes triggered the OOM killer from accept.  At the time the
OOM killer was triggered there was 4GB out of 36GB free in zone 1.  The
problem was that alloc_fdtable was allocating an order 3 page (32KiB) to
hold a bitmap, and there was sufficient fragmentation that the largest
page available was 8KiB.

I find the logic that PAGE_ALLOC_COSTLY_ORDER can't fail pretty dubious
but I do agree that order 3 allocations are very likely to succeed.

There are always pathologies where order > 0 allocations can fail when
there are copious amounts of free memory available.  Using the pigeon
hole principle it is easy to show that it requires 1 page more than 50%
of the pages being free to guarantee an order 1 (8KiB) allocation will
succeed, 1 page more than 75% of the pages being free to guarantee an
order 2 (16KiB) allocation will succeed and 1 page more than 87.5% of
the pages being free to guarantee an order 3 allocate will succeed.

A server churning memory with a lot of small requests and replies like
memcached is a common case that if anything can will skew the odds
against large pages being available.

Therefore let's not give external applications a practical way to kill
linux server applications, and specify __GFP_NORETRY to the kmalloc in
alloc_fdmem.  Unless I am misreading the code and by the time the code
reaches should_alloc_retry in __alloc_pages_slowpath (where
__GFP_NORETRY becomes signification).  We have already tried everything
reasonable to allocate a page and the only thing left to do is wait.  So
not waiting and falling back to vmalloc immediately seems like the
reasonable thing to do even if there wasn't a chance of triggering the
OOM killer.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Cong Wang <cwang@twopensource.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 4a78f981557a..9de20265a78c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size)
 	 * vmalloc() if the allocation size will be considered "large" by the VM.
 	 */
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
 		if (data != NULL)
 			return data;
 	}
-- 
cgit v1.2.3


From 53fecc53969681726e39275576fc75af39e7e7af Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sun, 26 Jan 2014 23:53:43 -0600
Subject: CIFS: Fix SMB2 mounts so they don't try to set or get xattrs via cifs

commit 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 upstream.

When mounting with smb2 (or smb2.1 or smb3) we need to check to make
sure that attempts to query or set extended attributes do not
attempt to send the request with the older cifs protocol instead
(eventually we also need to add the support in SMB2
to query/set extended attributes but this patch prevents us from
using the wrong protocol for extended attribute operations).

Signed-off-by: Steve French <smfrench@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/cifsglob.h |  6 ++++++
 fs/cifs/xattr.c    | 49 ++++++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ea3a0b3018a5..56ac89fe957e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -370,6 +370,12 @@ struct smb_version_operations {
 	void (*new_lease_key)(struct cifs_fid *fid);
 	int (*calc_signature)(struct smb_rqst *rqst,
 				   struct TCP_Server_Info *server);
+	ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
+			const unsigned char *, const unsigned char *, char *,
+			size_t, const struct nls_table *, int);
+	int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
+			const char *, const void *, const __u16,
+			const struct nls_table *, int);
 };
 
 struct smb_version_values {
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 09afda4cc58e..95c43bb20335 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 			goto remove_ea_exit;
 
 		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
-		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
-			(__u16)0, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (pTcon->ses->server->ops->set_EA)
+			rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+				full_path, ea_name, NULL, (__u16)0,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	}
 remove_ea_exit:
 	kfree(full_path);
@@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 			cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
 
 		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
-		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
-			(__u16)value_size, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (pTcon->ses->server->ops->set_EA)
+			rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+				full_path, ea_name, ea_value, (__u16)value_size,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
 		   == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto set_ea_exit;
 
 		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
-		rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
-			(__u16)value_size, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (pTcon->ses->server->ops->set_EA)
+			rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+				full_path, ea_name, ea_value, (__u16)value_size,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
 			strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
 #ifdef CONFIG_CIFS_ACL
@@ -272,17 +278,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			/* revalidate/getattr then populate from inode */
 		} /* BB add else when above is implemented */
 		ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
-		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
-			buf_size, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (pTcon->ses->server->ops->query_all_EAs)
+			rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+				full_path, ea_name, ea_value, buf_size,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
 			goto get_ea_exit;
 
 		ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
-		rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
-			buf_size, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (pTcon->ses->server->ops->query_all_EAs)
+			rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+				full_path, ea_name, ea_value, buf_size,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
 			  strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
 #ifdef CONFIG_CIFS_POSIX
@@ -400,11 +410,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 	/* if proc/fs/cifs/streamstoxattr is set then
 		search server for EAs or streams to
 		returns as xattrs */
-	rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
-				buf_size, cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
 
+	if (pTcon->ses->server->ops->query_all_EAs)
+		rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+				full_path, NULL, data, buf_size,
+				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
 list_ea_exit:
 	kfree(full_path);
 	free_xid(xid);
-- 
cgit v1.2.3


From 367d96446cc94044d4bb7858d9cadabb69cf3030 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sat, 1 Feb 2014 23:27:18 -0600
Subject: Add protocol specific operation for CIFS xattrs

commit d979f3b0a1f0b5499ab85e68cdf02b56852918b6 upstream.

Changeset 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 added protocol
operations for get/setxattr to avoid calling cifs operations
on smb2/smb3 mounts for xattr operations and this changeset
adds the calls to cifs specific protocol operations for xattrs
(in order to reenable cifs support for xattrs which was
temporarily disabled by the previous changeset.  We do not
have SMB2/SMB3 worker function for setting xattrs yet so
this only enables it for cifs.

CCing stable since without these two small changsets (its
small coreq 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 is
also needed) calling getfattr/setfattr on smb2/smb3 mounts
causes problems.

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Shirish Pargaonkar <spargaonkar@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/inode.c   | 13 +++++++++----
 fs/cifs/smb1ops.c |  4 ++++
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 449b6cf09b09..9d463501348f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -490,10 +490,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
 		return PTR_ERR(tlink);
 	tcon = tlink_tcon(tlink);
 
-	rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
-			    ea_value, 4 /* size of buf */, cifs_sb->local_nls,
-			    cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (tcon->ses->server->ops->query_all_EAs == NULL) {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+
+	rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
+			"SETFILEBITS", ea_value, 4 /* size of buf */,
+			cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 	cifs_put_tlink(tlink);
 	if (rc < 0)
 		return (int)rc;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 3efdb9d5c0b8..4a61ba7f5e0d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -948,6 +948,10 @@ struct smb_version_operations smb1_operations = {
 	.mand_lock = cifs_mand_lock,
 	.mand_unlock_range = cifs_unlock_range,
 	.push_mand_locks = cifs_push_mandatory_locks,
+#ifdef CONFIG_CIFS_XATTR
+	.query_all_EAs = CIFSSMBQAllEAs,
+	.set_EA = CIFSSMBSetEA,
+#endif /* CIFS_XATTR */
 };
 
 struct smb_version_values smb1_values = {
-- 
cgit v1.2.3


From 6bd8c8508282b9db05b9bb5a9f76e051f247a618 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sun, 2 Feb 2014 23:31:47 -0600
Subject: retrieving CIFS ACLs when mounted with SMB2 fails dropping session

commit 83e3bc23ef9ce7c03b7b4e5d3d790246ea59db3e upstream.

The get/set ACL xattr support for CIFS ACLs attempts to send old
cifs dialect protocol requests even when mounted with SMB2 or later
dialects. Sending cifs requests on an smb2 session causes problems -
the server drops the session due to the illegal request.

This patch makes CIFS ACL operations protocol specific to fix that.

Attempting to query/set CIFS ACLs for SMB2 will now return
EOPNOTSUPP (until we add worker routines for sending query
ACL requests via SMB2) instead of sending invalid (cifs)
requests.

A separate followon patch will be needed to fix cifs_acl_to_fattr
(which takes a cifs specific u16 fid so can't be abstracted
to work with SMB2 until that is changed) and will be needed
to fix mount problems when "cifsacl" is specified on mount
with e.g. vers=2.1

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Shirish Pargaonkar <spargaonkar@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/cifsacl.c  | 28 ++++++++++++++++++++++++----
 fs/cifs/cifsglob.h |  4 ++++
 fs/cifs/smb1ops.c  |  4 ++++
 fs/cifs/xattr.c    | 15 +++++++++++----
 4 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 51f5e0ee7237..494b68349667 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1027,15 +1027,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	__u32 secdesclen = 0;
 	struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
 	struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+	struct cifs_tcon *tcon;
+
+	if (IS_ERR(tlink))
+		return PTR_ERR(tlink);
+	tcon = tlink_tcon(tlink);
 
 	cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
 
 	/* Get the security descriptor */
-	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
+
+	if (tcon->ses->server->ops->get_acl == NULL) {
+		cifs_put_tlink(tlink);
+		return -EOPNOTSUPP;
+	}
+
+	pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+						&secdesclen);
 	if (IS_ERR(pntsd)) {
 		rc = PTR_ERR(pntsd);
 		cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
-		goto out;
+		cifs_put_tlink(tlink);
+		return rc;
 	}
 
 	/*
@@ -1048,6 +1063,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 	pnntsd = kmalloc(secdesclen, GFP_KERNEL);
 	if (!pnntsd) {
 		kfree(pntsd);
+		cifs_put_tlink(tlink);
 		return -ENOMEM;
 	}
 
@@ -1056,14 +1072,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
 
 	cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
 
+	if (tcon->ses->server->ops->set_acl == NULL)
+		rc = -EOPNOTSUPP;
+
 	if (!rc) {
 		/* Set the security descriptor */
-		rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
+		rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
+						     path, aclflag);
 		cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
 	}
+	cifs_put_tlink(tlink);
 
 	kfree(pnntsd);
 	kfree(pntsd);
-out:
 	return rc;
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 56ac89fe957e..e2c2d96491fa 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -376,6 +376,10 @@ struct smb_version_operations {
 	int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
 			const char *, const void *, const __u16,
 			const struct nls_table *, int);
+	struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
+			const char *, u32 *);
+	int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+			int);
 };
 
 struct smb_version_values {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 4a61ba7f5e0d..4885a40f3210 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -952,6 +952,10 @@ struct smb_version_operations smb1_operations = {
 	.query_all_EAs = CIFSSMBQAllEAs,
 	.set_EA = CIFSSMBSetEA,
 #endif /* CIFS_XATTR */
+#ifdef CONFIG_CIFS_ACL
+	.get_acl = get_cifs_acl,
+	.set_acl = set_cifs_acl,
+#endif /* CIFS_ACL */
 };
 
 struct smb_version_values smb1_values = {
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 95c43bb20335..5ac836a86b18 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -176,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 			rc = -ENOMEM;
 		} else {
 			memcpy(pacl, ea_value, value_size);
-			rc = set_cifs_acl(pacl, value_size,
-				direntry->d_inode, full_path, CIFS_ACL_DACL);
+			if (pTcon->ses->server->ops->set_acl)
+				rc = pTcon->ses->server->ops->set_acl(pacl,
+						value_size, direntry->d_inode,
+						full_path, CIFS_ACL_DACL);
+			else
+				rc = -EOPNOTSUPP;
 			if (rc == 0) /* force revalidate of the inode */
 				CIFS_I(direntry->d_inode)->time = 0;
 			kfree(pacl);
@@ -323,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			u32 acllen;
 			struct cifs_ntsd *pacl;
 
-			pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
-						full_path, &acllen);
+			if (pTcon->ses->server->ops->get_acl == NULL)
+				goto get_ea_exit; /* rc already EOPNOTSUPP */
+
+			pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
+					direntry->d_inode, full_path, &acllen);
 			if (IS_ERR(pacl)) {
 				rc = PTR_ERR(pacl);
 				cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
-- 
cgit v1.2.3


From 5de64260830aafce062b99e17cb3963ecc4291bf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 7 Feb 2014 17:10:26 +1100
Subject: lockd: send correct lock when granting a delayed lock.

commit 2ec197db1a56c9269d75e965f14c344b58b2a4f6 upstream.

If an NFS client attempts to get a lock (using NLM) and the lock is
not available, the server will remember the request and when the lock
becomes available it will send a GRANT request to the client to
provide the lock.

If the client already held an adjacent lock, the GRANT callback will
report the union of the existing and new locks, which can confuse the
client.

This happens because __posix_lock_file (called by vfs_lock_file)
updates the passed-in file_lock structure when adjacent or
over-lapping locks are found.

To avoid this problem we take a copy of the two fields that can
be changed (fl_start and fl_end) before the call and restore them
afterwards.
An alternate would be to allocate a 'struct file_lock', initialise it,
use locks_copy_lock() to take a copy, then locks_release_private()
after the vfs_lock_file() call.  But that is a lot more work.

Reported-by: Olaf Kirch <okir@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

--
v1 had a couple of issues (large on-stack struct and didn't really work properly).
This version is much better tested.
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svclock.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 8ebd3f551e0c..ffc4045fc62e 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -767,6 +767,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 	struct nlm_file		*file = block->b_file;
 	struct nlm_lock		*lock = &block->b_call->a_args.lock;
 	int			error;
+	loff_t			fl_start, fl_end;
 
 	dprintk("lockd: grant blocked lock %p\n", block);
 
@@ -784,9 +785,16 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 	}
 
 	/* Try the lock operation again */
+	/* vfs_lock_file() can mangle fl_start and fl_end, but we need
+	 * them unchanged for the GRANT_MSG
+	 */
 	lock->fl.fl_flags |= FL_SLEEP;
+	fl_start = lock->fl.fl_start;
+	fl_end = lock->fl.fl_end;
 	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
 	lock->fl.fl_flags &= ~FL_SLEEP;
+	lock->fl.fl_start = fl_start;
+	lock->fl.fl_end = fl_end;
 
 	switch (error) {
 	case 0:
-- 
cgit v1.2.3


From caaeac355477b49d6b50791fb6f0ced3d2b4e93b Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 6 Feb 2014 15:14:13 -0500
Subject: block: Fix nr_vecs for inline integrity vectors

commit 087787959ce851d7bbb19f10f6e9241b7f85a3ca upstream.

Commit 9f060e2231ca changed the way we handle allocations for the
integrity vectors. When the vectors are inline there is no associated
slab and consequently bvec_nr_vecs() returns 0. Ensure that we check
against BIP_INLINE_VECS in that case.

Reported-by: David Milburn <dmilburn@redhat.com>
Tested-by: David Milburn <dmilburn@redhat.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/bio-integrity.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 45e944fe52a6..8dccf73025b3 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_integrity_free);
 
+static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
+{
+	if (bip->bip_slab == BIO_POOL_NONE)
+		return BIP_INLINE_VECS;
+
+	return bvec_nr_vecs(bip->bip_slab);
+}
+
 /**
  * bio_integrity_add_page - Attach integrity metadata
  * @bio:	bio to update
@@ -129,7 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 	struct bio_vec *iv;
 
-	if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
+	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
 		printk(KERN_ERR "%s: bip_vec full\n", __func__);
 		return 0;
 	}
-- 
cgit v1.2.3


From 4687eb1ebf2129cf46e636ec197173b99a4959e2 Mon Sep 17 00:00:00 2001
From: Zheng Liu <wenqing.lz@taobao.com>
Date: Wed, 12 Feb 2014 11:48:31 -0500
Subject: ext4: fix error paths in swap_inode_boot_loader()

commit 30d29b119ef01776e0a301444ab24defe8d8bef3 upstream.

In swap_inode_boot_loader() we forgot to release ->i_mutex and resume
unlocked dio for inode and inode_bl if there is an error starting the
journal handle.  This commit fixes this issue.

Reported-by: Ahmed Tamrawi <ahmedtamrawi@gmail.com>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Dr. Tilmann Bubeck <t.bubeck@reinform.de>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c0427e2f6648..42624a995b00 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,7 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
 	if (IS_ERR(handle)) {
 		err = -EINVAL;
-		goto swap_boot_out;
+		goto journal_err_out;
 	}
 
 	/* Protect extent tree against block allocations via delalloc */
@@ -203,6 +203,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
+journal_err_out:
 	ext4_inode_resume_unlocked_dio(inode);
 	ext4_inode_resume_unlocked_dio(inode_bl);
 
-- 
cgit v1.2.3


From f13ea132887a8649cc6d45942e2be27f4362351f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 12 Feb 2014 12:16:04 -0500
Subject: ext4: don't try to modify s_flags if the the file system is read-only

commit 23301410972330c0ae9a8afc379ba2005e249cc6 upstream.

If an ext4 file system is created by some tool other than mke2fs
(perhaps by someone who has a pathalogical fear of the GPL) that
doesn't set one or the other of the EXT2_FLAGS_{UN}SIGNED_HASH flags,
and that file system is then mounted read-only, don't try to modify
the s_flags field.  Otherwise, if dm_verity is in use, the superblock
will change, causing an dm_verity failure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4923b6a9e39..a7a5f7ea74db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3592,16 +3592,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
-	i = le32_to_cpu(es->s_flags);
-	if (i & EXT2_FLAGS_UNSIGNED_HASH)
-		sbi->s_hash_unsigned = 3;
-	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+		i = le32_to_cpu(es->s_flags);
+		if (i & EXT2_FLAGS_UNSIGNED_HASH)
+			sbi->s_hash_unsigned = 3;
+		else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
 #ifdef __CHAR_UNSIGNED__
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
-		sbi->s_hash_unsigned = 3;
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+			sbi->s_hash_unsigned = 3;
 #else
-		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+			if (!(sb->s_flags & MS_RDONLY))
+				es->s_flags |=
+					cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
+		}
 	}
 
 	/* Handle clustersize */
-- 
cgit v1.2.3


From 94a197c6bc4976fdec2ea37a0612df226839c541 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 15 Feb 2014 21:33:13 -0500
Subject: ext4: fix online resize with very large inode tables

commit b93c95353413041a8cebad915a8109619f66bcc6 upstream.

If a file system has a large number of inodes per block group, all of
the metadata blocks in a flex_bg may be larger than what can fit in a
single block group.  Unfortunately, ext4_alloc_group_tables() in
resize.c was never tested to see if it would handle this case
correctly, and there were a large number of bugs which caused the
following sequence to result in a BUG_ON:

kernel bug at fs/ext4/resize.c:409!
   ...
call trace:
 [<ffffffff81256768>] ext4_flex_group_add+0x1448/0x1830
 [<ffffffff81257de2>] ext4_resize_fs+0x7b2/0xe80
 [<ffffffff8123ac50>] ext4_ioctl+0xbf0/0xf00
 [<ffffffff811c111d>] do_vfs_ioctl+0x2dd/0x4b0
 [<ffffffff811b9df2>] ? final_putname+0x22/0x50
 [<ffffffff811c1371>] sys_ioctl+0x81/0xa0
 [<ffffffff81676aa9>] system_call_fastpath+0x16/0x1b
code: c8 4c 89 df e8 41 96 f8 ff 44 89 e8 49 01 c4 44 29 6d d4 0
rip  [<ffffffff81254fa1>] set_flexbg_block_bitmap+0x171/0x180


This can be reproduced with the following command sequence:

   mke2fs -t ext4 -i 4096 /dev/vdd 1G
   mount -t ext4 /dev/vdd /vdd
   resize2fs /dev/vdd 8G

To fix this, we need to make sure the right thing happens when a block
group's inode table straddles two block groups, which means the
following bugs had to be fixed:

1) Not clearing the BLOCK_UNINIT flag in the second block group in
   ext4_alloc_group_tables --- the was proximate cause of the BUG_ON.

2) Incorrectly determining how many block groups contained contiguous
   free blocks in ext4_alloc_group_tables().

3) Incorrectly setting the start of the next block range to be marked
   in use after a discontinuity in setup_new_flex_group_blocks().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/resize.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 49d3c01eabf8..6bb79434861f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -238,6 +238,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
 	ext4_group_t group;
 	ext4_group_t last_group;
 	unsigned overhead;
+	__u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
 
 	BUG_ON(flex_gd->count == 0 || group_data == NULL);
 
@@ -261,7 +262,7 @@ next_group:
 	src_group++;
 	for (; src_group <= last_group; src_group++) {
 		overhead = ext4_group_overhead_blocks(sb, src_group);
-		if (overhead != 0)
+		if (overhead == 0)
 			last_blk += group_data[src_group - group].blocks_count;
 		else
 			break;
@@ -275,8 +276,7 @@ next_group:
 		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+		flex_gd->bg_flags[group] &= uninit_mask;
 	}
 
 	/* Allocate inode bitmaps */
@@ -287,22 +287,30 @@ next_group:
 		group = ext4_get_group_number(sb, start_blk - 1);
 		group -= group_data[0].group;
 		group_data[group].free_blocks_count--;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+		flex_gd->bg_flags[group] &= uninit_mask;
 	}
 
 	/* Allocate inode tables */
 	for (; it_index < flex_gd->count; it_index++) {
-		if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+		unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+		ext4_fsblk_t next_group_start;
+
+		if (start_blk + itb > last_blk)
 			goto next_group;
 		group_data[it_index].inode_table = start_blk;
-		group = ext4_get_group_number(sb, start_blk - 1);
+		group = ext4_get_group_number(sb, start_blk);
+		next_group_start = ext4_group_first_block_no(sb, group + 1);
 		group -= group_data[0].group;
-		group_data[group].free_blocks_count -=
-					EXT4_SB(sb)->s_itb_per_group;
-		if (flexbg_size > 1)
-			flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
 
+		if (start_blk + itb > next_group_start) {
+			flex_gd->bg_flags[group + 1] &= uninit_mask;
+			overhead = start_blk + itb - next_group_start;
+			group_data[group + 1].free_blocks_count -= overhead;
+			itb -= overhead;
+		}
+
+		group_data[group].free_blocks_count -= itb;
+		flex_gd->bg_flags[group] &= uninit_mask;
 		start_blk += EXT4_SB(sb)->s_itb_per_group;
 	}
 
@@ -615,7 +623,7 @@ handle_ib:
 			if (err)
 				goto out;
 			count = group_table_count[j];
-			start = group_data[i].block_bitmap;
+			start = (&group_data[i].block_bitmap)[j];
 			block = start;
 		}
 
-- 
cgit v1.2.3


From d6b55805ac4ffbcd63bf74ce572cc7a94f15be9e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 15 Feb 2014 22:42:25 -0500
Subject: ext4: fix online resize with a non-standard blocks per group setting

commit 3d2660d0c9c2f296837078c189b68a47f6b2e3b5 upstream.

The set_flexbg_block_bitmap() function assumed that the number of
blocks in a blockgroup was sb->blocksize * 8, which is normally true,
but not always!  Use EXT4_BLOCKS_PER_GROUP(sb) instead, to fix block
bitmap corruption after:

mke2fs -t ext4 -g 3072 -i 4096 /dev/vdd 1G
mount -t ext4 /dev/vdd /vdd
resize2fs /dev/vdd 8G

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reported-by: Jon Bernard <jbernard@tuxion.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/resize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6bb79434861f..c503850a61a8 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -404,7 +404,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 		start = ext4_group_first_block_no(sb, group);
 		group -= flex_gd->groups[0].group;
 
-		count2 = sb->s_blocksize * 8 - (block - start);
+		count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
 		if (count2 > count)
 			count2 = count;
 
-- 
cgit v1.2.3


From 0c9f21983490feb5637990d719e729bf77b99bd1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 Feb 2014 19:29:32 -0500
Subject: ext4: don't leave i_crtime.tv_sec uninitialized

commit 19ea80603715d473600cd993b9987bc97d042e02 upstream.

If the i_crtime field is not present in the inode, don't leave the
field uninitialized.

Fixes: ef7f38359 ("ext4: Add nanosecond timestamps")
Reported-by: Vegard Nossum <vegard.nossum@oracle.com>
Tested-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7bb2e2e55123..790b14c5f262 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -774,6 +774,8 @@ do {									       \
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))		       \
 		(einode)->xtime.tv_sec = 				       \
 			(signed)le32_to_cpu((raw_inode)->xtime);	       \
+	else								       \
+		(einode)->xtime.tv_sec = 0;				       \
 	if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra))	       \
 		ext4_decode_extra_time(&(einode)->xtime,		       \
 				       raw_inode->xtime ## _extra);	       \
-- 
cgit v1.2.3


From 9f0afafeb7bdd6c2c2b27a93c803fff975a232be Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 14 Feb 2014 07:20:35 -0500
Subject: cifs: ensure that uncached writes handle unmapped areas correctly

commit 5d81de8e8667da7135d3a32a964087c0faf5483f upstream.

It's possible for userland to pass down an iovec via writev() that has a
bogus user pointer in it. If that happens and we're doing an uncached
write, then we can end up getting less bytes than we expect from the
call to iov_iter_copy_from_user. This is CVE-2014-0069

cifs_iovec_write isn't set up to handle that situation however. It'll
blindly keep chugging through the page array and not filling those pages
with anything useful. Worse yet, we'll later end up with a negative
number in wdata->tailsz, which will confuse the sending routines and
cause an oops at the very least.

Fix this by having the copy phase of cifs_iovec_write stop copying data
in this situation and send the last write as a short one. At the same
time, we want to avoid sending a zero-length write to the server, so
break out of the loop and set rc to -EFAULT if that happens. This also
allows us to handle the case where no address in the iovec is valid.

[Note: Marking this for stable on v3.4+ kernels, but kernels as old as
       v2.6.38 may have a similar problem and may need similar fix]

Reviewed-by: Pavel Shilovsky <piastry@etersoft.ru>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/file.c | 37 ++++++++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c2934f8701da..8b0c656f2ab2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2353,7 +2353,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 		 unsigned long nr_segs, loff_t *poffset)
 {
 	unsigned long nr_pages, i;
-	size_t copied, len, cur_len;
+	size_t bytes, copied, len, cur_len;
 	ssize_t total_written = 0;
 	loff_t offset;
 	struct iov_iter it;
@@ -2408,14 +2408,45 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 
 		save_len = cur_len;
 		for (i = 0; i < nr_pages; i++) {
-			copied = min_t(const size_t, cur_len, PAGE_SIZE);
+			bytes = min_t(const size_t, cur_len, PAGE_SIZE);
 			copied = iov_iter_copy_from_user(wdata->pages[i], &it,
-							 0, copied);
+							 0, bytes);
 			cur_len -= copied;
 			iov_iter_advance(&it, copied);
+			/*
+			 * If we didn't copy as much as we expected, then that
+			 * may mean we trod into an unmapped area. Stop copying
+			 * at that point. On the next pass through the big
+			 * loop, we'll likely end up getting a zero-length
+			 * write and bailing out of it.
+			 */
+			if (copied < bytes)
+				break;
 		}
 		cur_len = save_len - cur_len;
 
+		/*
+		 * If we have no data to send, then that probably means that
+		 * the copy above failed altogether. That's most likely because
+		 * the address in the iovec was bogus. Set the rc to -EFAULT,
+		 * free anything we allocated and bail out.
+		 */
+		if (!cur_len) {
+			for (i = 0; i < nr_pages; i++)
+				put_page(wdata->pages[i]);
+			kfree(wdata);
+			rc = -EFAULT;
+			break;
+		}
+
+		/*
+		 * i + 1 now represents the number of pages we actually used in
+		 * the copy phase above. Bring nr_pages down to that, and free
+		 * any pages that we didn't use.
+		 */
+		for ( ; nr_pages > i + 1; nr_pages--)
+			put_page(wdata->pages[nr_pages - 1]);
+
 		wdata->sync_mode = WB_SYNC_ALL;
 		wdata->nr_pages = nr_pages;
 		wdata->offset = (__u64)offset;
-- 
cgit v1.2.3


From 32c36d8888008fb30c8215d0fc189a288285bc12 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastry@etersoft.ru>
Date: Fri, 14 Feb 2014 13:31:02 +0400
Subject: CIFS: Fix too big maxBuf size for SMB3 mounts

commit 2365c4eaf077c48574ab6f143960048fc0f31518 upstream.

SMB3 servers can respond with MaxTransactSize of more than 4M
that can cause a memory allocation error returned from kmalloc
in a lock codepath. Also the client doesn't support multicredit
requests now and allows buffer sizes of 65536 bytes only. Set
MaxTransactSize to this maximum supported value.

Signed-off-by: Pavel Shilovsky <piastry@etersoft.ru>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <smfrench@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2glob.h |  3 +++
 fs/cifs/smb2ops.c  | 14 ++++----------
 fs/cifs/smb2pdu.c  |  3 +++
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 7c0e2143e775..cc592ef6584a 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -55,4 +55,7 @@
 #define SMB2_NTLMV2_SESSKEY_SIZE (16)
 #define SMB2_HMACSHA256_SIZE (32)
 
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
 #endif	/* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f2e76f3b0c61..e2756bb40b4d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -181,11 +181,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	/* start with specified wsize, or default */
 	wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
 	wsize = min_t(unsigned int, wsize, server->max_write);
-	/*
-	 * limit write size to 2 ** 16, because we don't support multicredit
-	 * requests now.
-	 */
-	wsize = min_t(unsigned int, wsize, 2 << 15);
+	/* set it to the maximum buffer size value we can send with 1 credit */
+	wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
 
 	return wsize;
 }
@@ -199,11 +196,8 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 	/* start with specified rsize, or default */
 	rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
 	rsize = min_t(unsigned int, rsize, server->max_read);
-	/*
-	 * limit write size to 2 ** 16, because we don't support multicredit
-	 * requests now.
-	 */
-	rsize = min_t(unsigned int, rsize, 2 << 15);
+	/* set it to the maximum buffer size value we can send with 1 credit */
+	rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
 
 	return rsize;
 }
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 2b95ce2b54e8..c7a6fd87bb6e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -408,6 +408,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	server->dialect = le16_to_cpu(rsp->DialectRevision);
 
 	server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
+	/* set it to the maximum buffer size value we can send with 1 credit */
+	server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize),
+			       SMB2_MAX_BUFFER_SIZE);
 	server->max_read = le32_to_cpu(rsp->MaxReadSize);
 	server->max_write = le32_to_cpu(rsp->MaxWriteSize);
 	/* BB Do we need to validate the SecurityMode? */
-- 
cgit v1.2.3


From d5685be1332f2dde176de463b0eebea875118a8a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Nov 2013 07:17:07 -0800
Subject: fs: fix iversion handling

commit dff6efc326a4d5f305797d4a6bba14f374fdd633 upstream.

Currently notify_change directly updates i_version for size updates,
which not only is counter to how all other fields are updated through
struct iattr, but also breaks XFS, which need inode updates to happen
under its own lock, and synchronized to the structure that gets written
to the log.

Remove the update in the common code, and it to btrfs and ext4,
XFS already does a proper updaste internally and currently gets a
double update with the existing code.

IMHO this is 3.13 and -stable material and should go in through the XFS
tree.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Acked-by: Jan Kara <jack@suse.cz>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/attr.c        | 5 -----
 fs/btrfs/inode.c | 8 ++++++--
 fs/ext4/inode.c  | 4 ++++
 3 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/attr.c b/fs/attr.c
index 1449adb14ef6..8dd5825ec708 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -182,11 +182,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 			return -EPERM;
 	}
 
-	if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
-		if (attr->ia_size != inode->i_size)
-			inode_inc_iversion(inode);
-	}
-
 	if ((ia_valid & ATTR_MODE)) {
 		umode_t amode = attr->ia_mode;
 		/* Flag setting protected by i_mutex */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 25e6a8e1014e..8fcd2424e7f9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4527,8 +4527,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 	 * these flags set.  For all other operations the VFS set these flags
 	 * explicitly if it wants a timestamp update.
 	 */
-	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
-		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+	if (newsize != oldsize) {
+		inode_inc_iversion(inode);
+		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+			inode->i_ctime = inode->i_mtime =
+				current_fs_time(inode->i_sb);
+	}
 
 	if (newsize > oldsize) {
 		truncate_pagecache(inode, oldsize, newsize);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cb2bdc7ccb05..21dff8f236f6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4704,6 +4704,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			if (attr->ia_size > sbi->s_bitmap_maxbytes)
 				return -EFBIG;
 		}
+
+		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+			inode_inc_iversion(inode);
+
 		if (S_ISREG(inode->i_mode) &&
 		    (attr->ia_size < inode->i_size)) {
 			if (ext4_should_order_data(inode)) {
-- 
cgit v1.2.3


From 4480120a6cf3c834cc73431d615bb716be586f2f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 20 Feb 2014 17:02:27 +0100
Subject: quota: Fix race between dqput() and dquot_scan_active()

commit 1362f4ea20fa63688ba6026e586d9746ff13a846 upstream.

Currently last dqput() can race with dquot_scan_active() causing it to
call callback for an already deactivated dquot. The race is as follows:

CPU1					CPU2
  dqput()
    spin_lock(&dq_list_lock);
    if (atomic_read(&dquot->dq_count) > 1) {
     - not taken
    if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
      spin_unlock(&dq_list_lock);
      ->release_dquot(dquot);
        if (atomic_read(&dquot->dq_count) > 1)
         - not taken
					  dquot_scan_active()
					    spin_lock(&dq_list_lock);
					    if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
					     - not taken
					    atomic_inc(&dquot->dq_count);
					    spin_unlock(&dq_list_lock);
        - proceeds to release dquot
					    ret = fn(dquot, priv);
					     - called for inactive dquot

Fix the problem by making sure possible ->release_dquot() is finished by
the time we call the callback and new calls to it will notice reference
dquot_scan_active() has taken and bail out.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/quota/dquot.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 3e64169ef527..38802d683969 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb,
 		dqstats_inc(DQST_LOOKUPS);
 		dqput(old_dquot);
 		old_dquot = dquot;
-		ret = fn(dquot, priv);
-		if (ret < 0)
-			goto out;
+		/*
+		 * ->release_dquot() can be racing with us. Our reference
+		 * protects us from new calls to it so just wait for any
+		 * outstanding call and recheck the DQ_ACTIVE_B after that.
+		 */
+		wait_on_dquot(dquot);
+		if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+			ret = fn(dquot, priv);
+			if (ret < 0)
+				goto out;
+		}
 		spin_lock(&dq_list_lock);
 		/* We are safe to continue now because our dquot could not
 		 * be moved out of the inuse list while we hold the reference */
-- 
cgit v1.2.3