From a6136922d905d103b4edb88c5f3dd3ccf3ce51fe Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 11 Dec 2018 12:37:49 -0500
Subject: aio: fix spectre gadget in lookup_ioctx

commit a538e3ff9dabcdf6c3f477a373c629213d1c3066 upstream.

Matthew pointed out that the ioctx_table is susceptible to spectre v1,
because the index can be controlled by an attacker.  The below patch
should mitigate the attack for all of the aio system calls.

Cc: stable@vger.kernel.org
Reported-by: Matthew Wilcox <willy@infradead.org>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 04c4d6218978..44551d96eaa4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -45,6 +45,7 @@
 
 #include <asm/kmap_types.h>
 #include <linux/uaccess.h>
+#include <linux/nospec.h>
 
 #include "internal.h"
 
@@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	if (!table || id >= table->nr)
 		goto out;
 
+	id = array_index_nospec(id, table->nr);
 	ctx = rcu_dereference(table->table[id]);
 	if (ctx && ctx->user_id == ctx_id) {
 		if (percpu_ref_tryget_live(&ctx->users))
-- 
cgit v1.2.3


From 498a6e6be0de3cd8ccccce6628f0ea1dc4d8f159 Mon Sep 17 00:00:00 2001
From: Piotr Jaroszynski <pjaroszynski@nvidia.com>
Date: Fri, 14 Dec 2018 14:17:14 -0800
Subject: fs/iomap.c: get/put the page in iomap_page_create/release()

commit 61c6de667263184125d5ca75e894fcad632b0dd3 upstream.

migrate_page_move_mapping() expects pages with private data set to have
a page_count elevated by 1.  This is what used to happen for xfs through
the buffer_heads code before the switch to iomap in commit 82cb14175e7d
("xfs: add support for sub-pagesize writeback without buffer_heads").
Not having the count elevated causes move_pages() to fail on memory
mapped files coming from xfs.

Make iomap compatible with the migrate_page_move_mapping() assumption by
elevating the page count as part of iomap_page_create() and lowering it
in iomap_page_release().

It causes the move_pages() syscall to misbehave on memory mapped files
from xfs.  It does not not move any pages, which I suppose is "just" a
perf issue, but it also ends up returning a positive number which is out
of spec for the syscall.  Talking to Michal Hocko, it sounds like
returning positive numbers might be a necessary update to move_pages()
anyway though
(https://lkml.kernel.org/r/20181116114955.GJ14706@dhcp22.suse.cz).

I only hit this in tests that verify that move_pages() actually moved
the pages.  The test also got confused by the positive return from
move_pages() (it got treated as a success as positive numbers were not
expected and not handled) making it a bit harder to track down what's
going on.

Link: http://lkml.kernel.org/r/20181115184140.1388751-1-pjaroszynski@nvidia.com
Fixes: 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads")
Signed-off-by: Piotr Jaroszynski <pjaroszynski@nvidia.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Brian Foster <bfoster@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/iomap.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/iomap.c b/fs/iomap.c
index ec15cf2ec696..37da7a61a6c5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -117,6 +117,12 @@ iomap_page_create(struct inode *inode, struct page *page)
 	atomic_set(&iop->read_count, 0);
 	atomic_set(&iop->write_count, 0);
 	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+
+	/*
+	 * migrate_page_move_mapping() assumes that pages with private data have
+	 * their count elevated by 1.
+	 */
+	get_page(page);
 	set_page_private(page, (unsigned long)iop);
 	SetPagePrivate(page);
 	return iop;
@@ -133,6 +139,7 @@ iomap_page_release(struct page *page)
 	WARN_ON_ONCE(atomic_read(&iop->write_count));
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
+	put_page(page);
 	kfree(iop);
 }
 
-- 
cgit v1.2.3


From d41c49daf25953490fe3a0832fa6bb0a6739ab0c Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 14 Dec 2018 14:17:17 -0800
Subject: userfaultfd: check VM_MAYWRITE was set after verifying the uffd is
 registered

commit 01e881f5a1fca4677e82733061868c6d6ea05ca7 upstream.

Calling UFFDIO_UNREGISTER on virtual ranges not yet registered in uffd
could trigger an harmless false positive WARN_ON.  Check the vma is
already registered before checking VM_MAYWRITE to shut off the false
positive warning.

Link: http://lkml.kernel.org/r/20181206212028.18726-2-aarcange@redhat.com
Cc: <stable@vger.kernel.org>
Fixes: 29ec90660d68 ("userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas")
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: syzbot+06c7092e7d71218a2c16@syzkaller.appspotmail.com
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Hugh Dickins <hughd@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/userfaultfd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cd58939dc977..7a85e609fc27 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1566,7 +1566,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		cond_resched();
 
 		BUG_ON(!vma_can_userfault(vma));
-		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
 
 		/*
 		 * Nothing to do: this vma is already registered into this
@@ -1575,6 +1574,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		if (!vma->vm_userfaultfd_ctx.ctx)
 			goto skip;
 
+		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+
 		if (vma->vm_start > start)
 			start = vma->vm_start;
 		vma_end = min(end, vma->vm_end);
-- 
cgit v1.2.3


From 3faf68a42f97b5ca3aa78a0e93ec4d3ff9971ce7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 5 Nov 2018 07:50:10 +0200
Subject: ovl: fix decode of dir file handle with multi lower layers

commit 155b8a0492a90a4c6e22f046a3568b92a6bc48da upstream.

When decoding a lower file handle, we first call ovl_check_origin_fh()
with connected=false to get any real lower dentry for overlay inode
cache lookup.

If the real dentry is a disconnected dir dentry, ovl_check_origin_fh()
is called again with connected=true to get a connected real dentry
and find the lower layer the real dentry belongs to.

If the first call returned a connected real dentry, we use it to
lookup an overlay connected dentry, but the first ovl_check_origin_fh()
call with connected=false did not check that the found dentry is under
the root of the layer (see ovl_acceptable()), it only checked that
the found dentry super block matches the uuid of the lower file handle.

In case there are multiple lower layers on the same fs and the found
dentry is not from the top most lower layer, using the layer index
returned from the first ovl_check_origin_fh() is wrong and we end
up failing to decode the file handle.

Fix this by always calling ovl_check_origin_fh() with connected=true
if we got a directory dentry in the first call.

Fixes: 8b58924ad55c ("ovl: lookup in inode cache first when decoding...")
Cc: <stable@vger.kernel.org> # v4.17
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/overlayfs/export.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 8fa37cd7818a..54e5d17d7f3e 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -754,9 +754,8 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
 		goto out;
 	}
 
-	/* Otherwise, get a connected non-upper dir or disconnected non-dir */
-	if (d_is_dir(origin.dentry) &&
-	    (origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
+	/* Find origin.dentry again with ovl_acceptable() layer check */
+	if (d_is_dir(origin.dentry)) {
 		dput(origin.dentry);
 		origin.dentry = NULL;
 		err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
@@ -769,6 +768,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
 			goto out_err;
 	}
 
+	/* Get a connected non-upper dir or disconnected non-dir */
 	dentry = ovl_get_dentry(sb, NULL, &origin, index);
 
 out:
-- 
cgit v1.2.3


From 2a335229946ec7e04fdd9477e63a08650a7a9999 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Wed, 14 Nov 2018 16:01:34 +0200
Subject: ovl: fix missing override creds in link of a metacopy upper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 91ff20f34e94424e586f57f4f593beae16504f86 upstream.

Theodore Ts'o reported a v4.19 regression with docker-dropbox:
https://marc.info/?l=linux-fsdevel&m=154070089431116&w=2

"I was rebuilding my dropbox Docker container, and it failed in 4.19
 with the following error:
 ...
 dpkg: error: error creating new backup file \
              '/var/lib/dpkg/status-old': Invalid cross-device link"

The problem did not reproduce with metacopy feature disabled.
The error was caused by insufficient credentials to set
"trusted.overlay.redirect" xattr on link of a metacopy file.

Reproducer:

 echo Y > /sys/module/overlay/parameters/redirect_dir
 echo Y > /sys/module/overlay/parameters/metacopy
 cd /tmp
 mkdir l u w m
 chmod 777 l u
 touch l/foo
 ln l/foo l/link
 chmod 666 l/foo
 mount -t overlay none -olowerdir=l,upperdir=u,workdir=w m
 su fsgqa
 ln m/foo m/bar
 [   21.455823] overlayfs: failed to set redirect (-1)
 ln: failed to create hard link 'm/bar' => 'm/foo':\
     Invalid cross-device link

Reported-by: Theodore Y. Ts'o <tytso@mit.edu>
Reported-by: Maciej Zięba <maciekz82@gmail.com>
Fixes: 4120fe64dce4 ("ovl: Set redirect on upper inode when it is linked")
Cc: <stable@vger.kernel.org> # v4.19
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/overlayfs/dir.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 3bbde0a9f48f..b2aadd3e1fec 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -652,6 +652,18 @@ static int ovl_symlink(struct inode *dir, struct dentry *dentry,
 	return ovl_create_object(dentry, S_IFLNK, 0, link);
 }
 
+static int ovl_set_link_redirect(struct dentry *dentry)
+{
+	const struct cred *old_cred;
+	int err;
+
+	old_cred = ovl_override_creds(dentry->d_sb);
+	err = ovl_set_redirect(dentry, false);
+	revert_creds(old_cred);
+
+	return err;
+}
+
 static int ovl_link(struct dentry *old, struct inode *newdir,
 		    struct dentry *new)
 {
@@ -672,7 +684,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
 		goto out_drop_write;
 
 	if (ovl_is_metacopy_dentry(old)) {
-		err = ovl_set_redirect(old, false);
+		err = ovl_set_link_redirect(old);
 		if (err)
 			goto out_drop_write;
 	}
-- 
cgit v1.2.3


From d9267e136044854e72c3196179cac63ba4fa4254 Mon Sep 17 00:00:00 2001
From: Chad Austin <chadaustin@fb.com>
Date: Mon, 10 Dec 2018 10:54:52 -0800
Subject: fuse: continue to send FUSE_RELEASEDIR when FUSE_OPEN returns ENOSYS

commit 2e64ff154ce6ce9a8dc0f9556463916efa6ff460 upstream.

When FUSE_OPEN returns ENOSYS, the no_open bit is set on the connection.

Because the FUSE_RELEASE and FUSE_RELEASEDIR paths share code, this
incorrectly caused the FUSE_RELEASEDIR request to be dropped and never sent
to userspace.

Pass an isdir bool to distinguish between FUSE_RELEASE and FUSE_RELEASEDIR
inside of fuse_file_put.

Fixes: 7678ac50615d ("fuse: support clients that don't implement 'open'")
Cc: <stable@vger.kernel.org> # v3.14
Signed-off-by: Chad Austin <chadaustin@fb.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/dir.c    |  2 +-
 fs/fuse/file.c   | 21 +++++++++++----------
 fs/fuse/fuse_i.h |  2 +-
 3 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0979609d6eba..82a13221775e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1439,7 +1439,7 @@ static int fuse_dir_open(struct inode *inode, struct file *file)
 
 static int fuse_dir_release(struct inode *inode, struct file *file)
 {
-	fuse_release_common(file, FUSE_RELEASEDIR);
+	fuse_release_common(file, true);
 
 	return 0;
 }
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a0ffed34b85d..fbd6978479cb 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -87,12 +87,12 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 	iput(req->misc.release.inode);
 }
 
-static void fuse_file_put(struct fuse_file *ff, bool sync)
+static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
 {
 	if (refcount_dec_and_test(&ff->count)) {
 		struct fuse_req *req = ff->reserved_req;
 
-		if (ff->fc->no_open) {
+		if (ff->fc->no_open && !isdir) {
 			/*
 			 * Drop the release request when client does not
 			 * implement 'open'
@@ -245,10 +245,11 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
 	req->in.args[0].value = inarg;
 }
 
-void fuse_release_common(struct file *file, int opcode)
+void fuse_release_common(struct file *file, bool isdir)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_req *req = ff->reserved_req;
+	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 
 	fuse_prepare_release(ff, file->f_flags, opcode);
 
@@ -270,7 +271,7 @@ void fuse_release_common(struct file *file, int opcode)
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fc->destroy_req != NULL);
+	fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -286,7 +287,7 @@ static int fuse_release(struct inode *inode, struct file *file)
 	if (fc->writeback_cache)
 		write_inode_now(inode, 1);
 
-	fuse_release_common(file, FUSE_RELEASE);
+	fuse_release_common(file, false);
 
 	/* return value is ignored by VFS */
 	return 0;
@@ -300,7 +301,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags)
 	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
 	 * synchronous, we are fine with not doing igrab() here"
 	 */
-	fuse_file_put(ff, true);
+	fuse_file_put(ff, true, false);
 }
 EXPORT_SYMBOL_GPL(fuse_sync_release);
 
@@ -805,7 +806,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		put_page(page);
 	}
 	if (req->ff)
-		fuse_file_put(req->ff, false);
+		fuse_file_put(req->ff, false, false);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1459,7 +1460,7 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 		__free_page(req->pages[i]);
 
 	if (req->ff)
-		fuse_file_put(req->ff, false);
+		fuse_file_put(req->ff, false, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1616,7 +1617,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
 	ff = __fuse_write_file_get(fc, fi);
 	err = fuse_flush_times(inode, ff);
 	if (ff)
-		fuse_file_put(ff, 0);
+		fuse_file_put(ff, false, false);
 
 	return err;
 }
@@ -1930,7 +1931,7 @@ static int fuse_writepages(struct address_space *mapping,
 		err = 0;
 	}
 	if (data.ff)
-		fuse_file_put(data.ff, false);
+		fuse_file_put(data.ff, false, false);
 
 	kfree(data.orig_pages);
 out:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index f78e9614bb5f..cec8b8e74969 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -749,7 +749,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags);
 /**
  * Send RELEASE or RELEASEDIR request
  */
-void fuse_release_common(struct file *file, int opcode);
+void fuse_release_common(struct file *file, bool isdir);
 
 /**
  * Send FSYNC or FSYNCDIR request
-- 
cgit v1.2.3


From de956d40781191903c7eb8e6843b0cb506ab1f43 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <dave.kleikamp@oracle.com>
Date: Tue, 27 Nov 2018 19:31:30 +0000
Subject: nfs: don't dirty kernel pages read by direct-io

[ Upstream commit ad3cba223ac02dc769c3bbe88efe277bbb457566 ]

When we use direct_IO with an NFS backing store, we can trigger a
WARNING in __set_page_dirty(), as below, since we're dirtying the page
unnecessarily in nfs_direct_read_completion().

To fix, replicate the logic in commit 53cbf3b157a0 ("fs: direct-io:
don't dirtying pages for ITER_BVEC/ITER_KVEC direct read").

Other filesystems that implement direct_IO handle this; most use
blockdev_direct_IO(). ceph and cifs have similar logic.

mount 127.0.0.1:/export /nfs
dd if=/dev/zero of=/nfs/image bs=1M count=200
losetup --direct-io=on -f /nfs/image
mkfs.btrfs /dev/loop0
mount -t btrfs /dev/loop0 /mnt/

kernel: WARNING: CPU: 0 PID: 8067 at fs/buffer.c:580 __set_page_dirty+0xaf/0xd0
kernel: Modules linked in: loop(E) nfsv3(E) rpcsec_gss_krb5(E) nfsv4(E) dns_resolver(E) nfs(E) fscache(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) fuse(E) tun(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_
kernel:  snd_seq(E) snd_seq_device(E) snd_pcm(E) video(E) snd_timer(E) snd(E) soundcore(E) ip_tables(E) xfs(E) libcrc32c(E) sd_mod(E) sr_mod(E) cdrom(E) ata_generic(E) pata_acpi(E) crc32c_intel(E) ahci(E) li
kernel: CPU: 0 PID: 8067 Comm: kworker/0:2 Tainted: G            E     4.20.0-rc1.master.20181111.ol7.x86_64 #1
kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
kernel: Workqueue: nfsiod rpc_async_release [sunrpc]
kernel: RIP: 0010:__set_page_dirty+0xaf/0xd0
kernel: Code: c3 48 8b 02 f6 c4 04 74 d4 48 89 df e8 ba 05 f7 ff 48 89 c6 eb cb 48 8b 43 08 a8 01 75 1f 48 89 d8 48 8b 00 a8 04 74 02 eb 87 <0f> 0b eb 83 48 83 e8 01 eb 9f 48 83 ea 01 0f 1f 00 eb 8b 48 83 e8
kernel: RSP: 0000:ffffc1c8825b7d78 EFLAGS: 00013046
kernel: RAX: 000fffffc0020089 RBX: fffff2b603308b80 RCX: 0000000000000001
kernel: RDX: 0000000000000001 RSI: ffff9d11478115c8 RDI: ffff9d11478115d0
kernel: RBP: ffffc1c8825b7da0 R08: 0000646f6973666e R09: 8080808080808080
kernel: R10: 0000000000000001 R11: 0000000000000000 R12: ffff9d11478115d0
kernel: R13: ffff9d11478115c8 R14: 0000000000003246 R15: 0000000000000001
kernel: FS:  0000000000000000(0000) GS:ffff9d115ba00000(0000) knlGS:0000000000000000
kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
kernel: CR2: 00007f408686f640 CR3: 0000000104d8e004 CR4: 00000000000606f0
kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
kernel: Call Trace:
kernel:  __set_page_dirty_buffers+0xb6/0x110
kernel:  set_page_dirty+0x52/0xb0
kernel:  nfs_direct_read_completion+0xc4/0x120 [nfs]
kernel:  nfs_pgio_release+0x10/0x20 [nfs]
kernel:  rpc_free_task+0x30/0x70 [sunrpc]
kernel:  rpc_async_release+0x12/0x20 [sunrpc]
kernel:  process_one_work+0x174/0x390
kernel:  worker_thread+0x4f/0x3e0
kernel:  kthread+0x102/0x140
kernel:  ? drain_workqueue+0x130/0x130
kernel:  ? kthread_stop+0x110/0x110
kernel:  ret_from_fork+0x35/0x40
kernel: ---[ end trace 01341980905412c9 ]---

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>

[forward-ported to v4.20]
Signed-off-by: Calum Mackay <calum.mackay@oracle.com>
Reviewed-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/direct.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index aa12c3063bae..33824a0a57bf 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -98,8 +98,11 @@ struct nfs_direct_req {
 	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
 	struct work_struct	work;
 	int			flags;
+	/* for write */
 #define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
 #define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
+	/* for read */
+#define NFS_ODIRECT_SHOULD_DIRTY	(3)	/* dirty user-space page after read */
 	struct nfs_writeverf	verf;		/* unstable write verifier */
 };
 
@@ -412,7 +415,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 		struct page *page = req->wb_page;
 
-		if (!PageCompound(page) && bytes < hdr->good_bytes)
+		if (!PageCompound(page) && bytes < hdr->good_bytes &&
+		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
 			set_page_dirty(page);
 		bytes += req->wb_bytes;
 		nfs_list_remove_request(req);
@@ -587,6 +591,9 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
+	if (iter_is_iovec(iter))
+		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
+
 	nfs_start_io_direct(inode);
 
 	NFS_I(inode)->read_io += count;
-- 
cgit v1.2.3


From b5a8028c25f3f3c3bbbe09646fe331a570189cbf Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Sat, 3 Nov 2018 15:02:44 -0500
Subject: cifs: In Kconfig CONFIG_CIFS_POSIX needs depends on legacy (insecure
 cifs)

[ Upstream commit 6e785302dad32228819d8066e5376acd15d0e6ba ]

Missing a dependency.  Shouldn't show cifs posix extensions
in Kconfig if CONFIG_CIFS_ALLOW_INSECURE_DIALECTS (ie SMB1
protocol) is disabled.

Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index abcd78e332fe..85dadb93c992 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -133,7 +133,7 @@ config CIFS_XATTR
 
 config CIFS_POSIX
         bool "CIFS POSIX Extensions"
-        depends on CIFS_XATTR
+        depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR
         help
           Enabling this option will cause the cifs client to attempt to
 	  negotiate a newer dialect with servers, such as Samba 3.0.5
-- 
cgit v1.2.3


From b4c7c826709b7d882ec9b264d5032e887e6bd720 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 31 Oct 2018 10:06:08 -0700
Subject: Btrfs: fix missing delayed iputs on unmount

[ Upstream commit d6fd0ae25c6495674dc5a41a8d16bc8e0073276d ]

There's a race between close_ctree() and cleaner_kthread().
close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it
sees it set, but this is racy; the cleaner might have already checked
the bit and could be cleaning stuff. In particular, if it deletes unused
block groups, it will create delayed iputs for the free space cache
inodes. As of "btrfs: don't run delayed_iputs in commit", we're no
longer running delayed iputs after a commit. Therefore, if the cleaner
creates more delayed iputs after delayed iputs are run in
btrfs_commit_super(), we will leak inodes on unmount and get a busy
inode crash from the VFS.

Fix it by parking the cleaner before we actually close anything. Then,
any remaining delayed iputs will always be handled in
btrfs_commit_super(). This also ensures that the commit in close_ctree()
is really the last commit, so we can get rid of the commit in
cleaner_kthread().

The fstest/generic/475 followed by 476 can trigger a crash that
manifests as a slab corruption caused by accessing the freed kthread
structure by a wake up function. Sample trace:

[ 5657.077612] BUG: unable to handle kernel NULL pointer dereference at 00000000000000cc
[ 5657.079432] PGD 1c57a067 P4D 1c57a067 PUD da10067 PMD 0
[ 5657.080661] Oops: 0000 [#1] PREEMPT SMP
[ 5657.081592] CPU: 1 PID: 5157 Comm: fsstress Tainted: G        W         4.19.0-rc8-default+ #323
[ 5657.083703] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
[ 5657.086577] RIP: 0010:shrink_page_list+0x2f9/0xe90
[ 5657.091937] RSP: 0018:ffffb5c745c8f728 EFLAGS: 00010287
[ 5657.092953] RAX: 0000000000000074 RBX: ffffb5c745c8f830 RCX: 0000000000000000
[ 5657.094590] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff9a8747fdf3d0
[ 5657.095987] RBP: ffffb5c745c8f9e0 R08: 0000000000000000 R09: 0000000000000000
[ 5657.097159] R10: ffff9a8747fdf5e8 R11: 0000000000000000 R12: ffffb5c745c8f788
[ 5657.098513] R13: ffff9a877f6ff2c0 R14: ffff9a877f6ff2c8 R15: dead000000000200
[ 5657.099689] FS:  00007f948d853b80(0000) GS:ffff9a877d600000(0000) knlGS:0000000000000000
[ 5657.101032] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5657.101953] CR2: 00000000000000cc CR3: 00000000684bd000 CR4: 00000000000006e0
[ 5657.103159] Call Trace:
[ 5657.103776]  shrink_inactive_list+0x194/0x410
[ 5657.104671]  shrink_node_memcg.constprop.84+0x39a/0x6a0
[ 5657.105750]  shrink_node+0x62/0x1c0
[ 5657.106529]  try_to_free_pages+0x1a4/0x500
[ 5657.107408]  __alloc_pages_slowpath+0x2c9/0xb20
[ 5657.108418]  __alloc_pages_nodemask+0x268/0x2b0
[ 5657.109348]  kmalloc_large_node+0x37/0x90
[ 5657.110205]  __kmalloc_node+0x236/0x310
[ 5657.111014]  kvmalloc_node+0x3e/0x70

Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit")
Signed-off-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ add trace ]
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/disk-io.c | 51 +++++++++++++++------------------------------------
 1 file changed, 15 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 834a3f5ef642..d4a7f7ca4145 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1656,9 +1656,8 @@ static int cleaner_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int again;
-	struct btrfs_trans_handle *trans;
 
-	do {
+	while (1) {
 		again = 0;
 
 		/* Make the cleaner go to sleep early. */
@@ -1707,42 +1706,16 @@ static int cleaner_kthread(void *arg)
 		 */
 		btrfs_delete_unused_bgs(fs_info);
 sleep:
+		if (kthread_should_park())
+			kthread_parkme();
+		if (kthread_should_stop())
+			return 0;
 		if (!again) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			if (!kthread_should_stop())
-				schedule();
+			schedule();
 			__set_current_state(TASK_RUNNING);
 		}
-	} while (!kthread_should_stop());
-
-	/*
-	 * Transaction kthread is stopped before us and wakes us up.
-	 * However we might have started a new transaction and COWed some
-	 * tree blocks when deleting unused block groups for example. So
-	 * make sure we commit the transaction we started to have a clean
-	 * shutdown when evicting the btree inode - if it has dirty pages
-	 * when we do the final iput() on it, eviction will trigger a
-	 * writeback for it which will fail with null pointer dereferences
-	 * since work queues and other resources were already released and
-	 * destroyed by the time the iput/eviction/writeback is made.
-	 */
-	trans = btrfs_attach_transaction(root);
-	if (IS_ERR(trans)) {
-		if (PTR_ERR(trans) != -ENOENT)
-			btrfs_err(fs_info,
-				  "cleaner transaction attach returned %ld",
-				  PTR_ERR(trans));
-	} else {
-		int ret;
-
-		ret = btrfs_commit_transaction(trans);
-		if (ret)
-			btrfs_err(fs_info,
-				  "cleaner open transaction commit returned %d",
-				  ret);
 	}
-
-	return 0;
 }
 
 static int transaction_kthread(void *arg)
@@ -3923,6 +3896,13 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 	int ret;
 
 	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+	/*
+	 * We don't want the cleaner to start new transactions, add more delayed
+	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
+	 * because that frees the task_struct, and the transaction kthread might
+	 * still try to wake up the cleaner.
+	 */
+	kthread_park(fs_info->cleaner_kthread);
 
 	/* wait for the qgroup rescan worker to stop */
 	btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -3950,9 +3930,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
 
 	if (!sb_rdonly(fs_info->sb)) {
 		/*
-		 * If the cleaner thread is stopped and there are
-		 * block groups queued for removal, the deletion will be
-		 * skipped when we quit the cleaner thread.
+		 * The cleaner kthread is stopped, so do one final pass over
+		 * unused block groups.
 		 */
 		btrfs_delete_unused_bgs(fs_info);
 
-- 
cgit v1.2.3


From 38d072a4a71738d95a2b9e47a4bbd091d7e55af3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 20 Dec 2018 23:23:24 +1100
Subject: iomap: Revert "fs/iomap.c: get/put the page in
 iomap_page_create/release()"

[ Upstream commit a837eca2412051628c0529768c9bc4f3580b040e ]

This reverts commit 61c6de667263184125d5ca75e894fcad632b0dd3.

The reverted commit added page reference counting to iomap page
structures that are used to track block size < page size state. This
was supposed to align the code with page migration page accounting
assumptions, but what it has done instead is break XFS filesystems.
Every fstests run I've done on sub-page block size XFS filesystems
has since picking up this commit 2 days ago has failed with bad page
state errors such as:

# ./run_check.sh "-m rmapbt=1,reflink=1 -i sparse=1 -b size=1k" "generic/038"
....
SECTION       -- xfs
FSTYP         -- xfs (debug)
PLATFORM      -- Linux/x86_64 test1 4.20.0-rc6-dgc+
MKFS_OPTIONS  -- -f -m rmapbt=1,reflink=1 -i sparse=1 -b size=1k /dev/sdc
MOUNT_OPTIONS -- /dev/sdc /mnt/scratch

generic/038 454s ...
 run fstests generic/038 at 2018-12-20 18:43:05
 XFS (sdc): Unmounting Filesystem
 XFS (sdc): Mounting V5 Filesystem
 XFS (sdc): Ending clean mount
 BUG: Bad page state in process kswapd0  pfn:3a7fa
 page:ffffea0000ccbeb0 count:0 mapcount:0 mapping:ffff88800d9b6360 index:0x1
 flags: 0xfffffc0000000()
 raw: 000fffffc0000000 dead000000000100 dead000000000200 ffff88800d9b6360
 raw: 0000000000000001 0000000000000000 00000000ffffffff
 page dumped because: non-NULL mapping
 CPU: 0 PID: 676 Comm: kswapd0 Not tainted 4.20.0-rc6-dgc+ #915
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
 Call Trace:
  dump_stack+0x67/0x90
  bad_page.cold.116+0x8a/0xbd
  free_pcppages_bulk+0x4bf/0x6a0
  free_unref_page_list+0x10f/0x1f0
  shrink_page_list+0x49d/0xf50
  shrink_inactive_list+0x19d/0x3b0
  shrink_node_memcg.constprop.77+0x398/0x690
  ? shrink_slab.constprop.81+0x278/0x3f0
  shrink_node+0x7a/0x2f0
  kswapd+0x34b/0x6d0
  ? node_reclaim+0x240/0x240
  kthread+0x11f/0x140
  ? __kthread_bind_mask+0x60/0x60
  ret_from_fork+0x24/0x30
 Disabling lock debugging due to kernel taint
....

The failures are from anyway that frees pages and empties the
per-cpu page magazines, so it's not a predictable failure or an easy
to debug failure.

generic/038 is a reliable reproducer of this problem - it has a 9 in
10 failure rate on one of my test machines. Failure on other
machines have been at random points in fstests runs but every run
has ended up tripping this problem. Hence generic/038 was used to
bisect the failure because it was the most reliable failure.

It is too close to the 4.20 release (not to mention holidays) to
try to diagnose, fix and test the underlying cause of the problem,
so reverting the commit is the only option we have right now. The
revert has been tested against a current tot 4.20-rc7+ kernel across
multiple machines running sub-page block size XFs filesystems and
none of the bad page state failures have been seen.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Cc: Piotr Jaroszynski <pjaroszynski@nvidia.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: William Kucharski <william.kucharski@oracle.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Brian Foster <bfoster@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/iomap.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/iomap.c b/fs/iomap.c
index 37da7a61a6c5..ec15cf2ec696 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -117,12 +117,6 @@ iomap_page_create(struct inode *inode, struct page *page)
 	atomic_set(&iop->read_count, 0);
 	atomic_set(&iop->write_count, 0);
 	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
-
-	/*
-	 * migrate_page_move_mapping() assumes that pages with private data have
-	 * their count elevated by 1.
-	 */
-	get_page(page);
 	set_page_private(page, (unsigned long)iop);
 	SetPagePrivate(page);
 	return iop;
@@ -139,7 +133,6 @@ iomap_page_release(struct page *page)
 	WARN_ON_ONCE(atomic_read(&iop->write_count));
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
-	put_page(page);
 	kfree(iop);
 }
 
-- 
cgit v1.2.3


From 9c5ccadb7b42ee124d4009212a2b737c98590da7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Thu, 5 Jul 2018 17:51:20 +0200
Subject: Revert "vfs: Allow userns root to call mknod on owned filesystems."

commit 94f82008ce30e2624537d240d64ce718255e0b80 upstream.

This reverts commit 55956b59df336f6738da916dbb520b6e37df9fbd.

commit 55956b59df33 ("vfs: Allow userns root to call mknod on owned filesystems.")
enabled mknod() in user namespaces for userns root if CAP_MKNOD is
available. However, these device nodes are useless since any filesystem
mounted from a non-initial user namespace will set the SB_I_NODEV flag on
the filesystem. Now, when a device node s created in a non-initial user
namespace a call to open() on said device node will fail due to:

bool may_open_dev(const struct path *path)
{
        return !(path->mnt->mnt_flags & MNT_NODEV) &&
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

The problem with this is that as of the aforementioned commit mknod()
creates partially functional device nodes in non-initial user namespaces.
In particular, it has the consequence that as of the aforementioned commit
open() will be more privileged with respect to device nodes than mknod().
Before it was the other way around. Specifically, if mknod() succeeded
then it was transparent for any userspace application that a fatal error
must have occured when open() failed.

All of this breaks multiple userspace workloads and a widespread assumption
about how to handle mknod(). Basically, all container runtimes and systemd
live by the slogan "ask for forgiveness not permission" when running user
namespace workloads. For mknod() the assumption is that if the syscall
succeeds the device nodes are useable irrespective of whether it succeeds
in a non-initial user namespace or not. This logic was chosen explicitly
to allow for the glorious day when mknod() will actually be able to create
fully functional device nodes in user namespaces.
A specific problem people are already running into when running 4.18 rc
kernels are failing systemd services. For any distro that is run in a
container systemd services started with the PrivateDevices= property set
will fail to start since the device nodes in question cannot be
opened (cf. the arguments in [1]).

Full disclosure, Seth made the very sound argument that it is already
possible to end up with partially functional device nodes. Any filesystem
mounted with MS_NODEV set will allow mknod() to succeed but will not allow
open() to succeed. The difference to the case here is that the MS_NODEV
case is transparent to userspace since it is an explicitly set mount option
while the SB_I_NODEV case is an implicit property enforced by the kernel
and hence opaque to userspace.

[1]: https://github.com/systemd/systemd/pull/9483

Signed-off-by: Christian Brauner <christian@brauner.io>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Seth Forshee <seth.forshee@canonical.com>
Cc: Serge Hallyn <serge@hallyn.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/namei.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0cab6494978c..914178cdbe94 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3701,8 +3701,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	if (error)
 		return error;
 
-	if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
-	    !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD))
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
 		return -EPERM;
 
 	if (!dir->i_op->mknod)
-- 
cgit v1.2.3


From 0736458856868c920079f42a1e881d271e201390 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 7 Nov 2018 23:04:43 +0100
Subject: ubifs: Handle re-linking of inodes correctly while recovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit e58725d51fa8da9133f3f1c54170aa2e43056b91 upstream.

UBIFS's recovery code strictly assumes that a deleted inode will never
come back, therefore it removes all data which belongs to that inode
as soon it faces an inode with link count 0 in the replay list.
Before O_TMPFILE this assumption was perfectly fine. With O_TMPFILE
it can lead to data loss upon a power-cut.

Consider a journal with entries like:
0: inode X (nlink = 0) /* O_TMPFILE was created */
1: data for inode X /* Someone writes to the temp file */
2: inode X (nlink = 0) /* inode was changed, xattr, chmod, … */
3: inode X (nlink = 1) /* inode was re-linked via linkat() */

Upon replay of entry #2 UBIFS will drop all data that belongs to inode X,
this will lead to an empty file after mounting.

As solution for this problem, scan the replay list for a re-link entry
before dropping data.

Fixes: 474b93704f32 ("ubifs: Implement O_TMPFILE")
Cc: stable@vger.kernel.org
Cc: Russell Senior <russell@personaltelco.net>
Cc: Rafał Miłecki <zajec5@gmail.com>
Reported-by: Russell Senior <russell@personaltelco.net>
Reported-by: Rafał Miłecki <zajec5@gmail.com>
Tested-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ubifs/replay.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 4844538eb926..c6f9b2225387 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -209,6 +209,38 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
 	return ubifs_tnc_remove_range(c, &min_key, &max_key);
 }
 
+/**
+ * inode_still_linked - check whether inode in question will be re-linked.
+ * @c: UBIFS file-system description object
+ * @rino: replay entry to test
+ *
+ * O_TMPFILE files can be re-linked, this means link count goes from 0 to 1.
+ * This case needs special care, otherwise all references to the inode will
+ * be removed upon the first replay entry of an inode with link count 0
+ * is found.
+ */
+static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino)
+{
+	struct replay_entry *r;
+
+	ubifs_assert(c, rino->deletion);
+	ubifs_assert(c, key_type(c, &rino->key) == UBIFS_INO_KEY);
+
+	/*
+	 * Find the most recent entry for the inode behind @rino and check
+	 * whether it is a deletion.
+	 */
+	list_for_each_entry_reverse(r, &c->replay_list, list) {
+		ubifs_assert(c, r->sqnum >= rino->sqnum);
+		if (key_inum(c, &r->key) == key_inum(c, &rino->key))
+			return r->deletion == 0;
+
+	}
+
+	ubifs_assert(c, 0);
+	return false;
+}
+
 /**
  * apply_replay_entry - apply a replay entry to the TNC.
  * @c: UBIFS file-system description object
@@ -236,6 +268,11 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
 			{
 				ino_t inum = key_inum(c, &r->key);
 
+				if (inode_still_linked(c, r)) {
+					err = 0;
+					break;
+				}
+
 				err = ubifs_tnc_remove_ino(c, inum);
 				break;
 			}
-- 
cgit v1.2.3


From 6bb41321166fe7db834fd7137b596d4312e38273 Mon Sep 17 00:00:00 2001
From: Ivan Delalande <colona@arista.com>
Date: Thu, 13 Dec 2018 15:20:52 -0800
Subject: proc/sysctl: don't return ENOMEM on lookup when a table is
 unregistering

commit ea5751ccd665a2fd1b24f9af81f6167f0718c5f6 upstream.

proc_sys_lookup can fail with ENOMEM instead of ENOENT when the
corresponding sysctl table is being unregistered. In our case we see
this upon opening /proc/sys/net/*/conf files while network interfaces
are being deleted, which confuses our configuration daemon.

The problem was successfully reproduced and this fix tested on v4.9.122
and v4.20-rc6.

v2: return ERR_PTRs in all cases when proc_sys_make_inode fails instead
of mixing them with NULL. Thanks Al Viro for the feedback.

Fixes: ace0c791e6c3 ("proc/sysctl: Don't grab i_lock under sysctl_lock.")
Cc: stable@vger.kernel.org
Signed-off-by: Ivan Delalande <colona@arista.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/proc_sysctl.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 89921a0d2ebb..4d598a399bbf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -464,7 +464,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 
 	inode = new_inode(sb);
 	if (!inode)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
 	inode->i_ino = get_next_ino();
 
@@ -474,8 +474,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	if (unlikely(head->unregistering)) {
 		spin_unlock(&sysctl_lock);
 		iput(inode);
-		inode = NULL;
-		goto out;
+		return ERR_PTR(-ENOENT);
 	}
 	ei->sysctl = head;
 	ei->sysctl_entry = table;
@@ -500,7 +499,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	if (root->set_ownership)
 		root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
 
-out:
 	return inode;
 }
 
@@ -549,10 +547,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
 			goto out;
 	}
 
-	err = ERR_PTR(-ENOMEM);
 	inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
-	if (!inode)
+	if (IS_ERR(inode)) {
+		err = ERR_CAST(inode);
 		goto out;
+	}
 
 	d_set_d_op(dentry, &proc_sys_dentry_operations);
 	err = d_splice_alias(inode, dentry);
@@ -685,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file,
 		if (d_in_lookup(child)) {
 			struct dentry *res;
 			inode = proc_sys_make_inode(dir->d_sb, head, table);
-			if (!inode) {
+			if (IS_ERR(inode)) {
 				d_lookup_done(child);
 				dput(child);
 				return false;
-- 
cgit v1.2.3


From b878c8a7f08f0c225b6a46ba1ac867e9c5d17807 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 25 Nov 2018 17:20:31 -0500
Subject: ext4: add ext4_sb_bread() to disambiguate ENOMEM cases

commit fb265c9cb49e2074ddcdd4de99728aefdd3b3592 upstream.

Today, when sb_bread() returns NULL, this can either be because of an
I/O error or because the system failed to allocate the buffer.  Since
it's an old interface, changing would require changing many call
sites.

So instead we create our own ext4_sb_bread(), which also allows us to
set the REQ_META flag.

Also fixed a problem in the xattr code where a NULL return in a
function could also mean that the xattr was not found, which could
lead to the wrong error getting returned to userspace.

Fixes: ac27a0ec112a ("ext4: initial copy of files from ext3")
Cc: stable@kernel.org # 2.6.19
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4.h    |  2 ++
 fs/ext4/migrate.c | 36 +++++++++++++-------------
 fs/ext4/resize.c  | 72 ++++++++++++++++++++++++++--------------------------
 fs/ext4/super.c   | 23 +++++++++++++++++
 fs/ext4/xattr.c   | 76 ++++++++++++++++++++++++++-----------------------------
 5 files changed, 115 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5cfb1e2f6a5b..05eddfbd3404 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2542,6 +2542,8 @@ extern int ext4_group_extend(struct super_block *sb,
 extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
 
 /* super.c */
+extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
+					 sector_t block, int op_flags);
 extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
 extern int ext4_calculate_overhead(struct super_block *sb);
 extern void ext4_superblock_csum_set(struct super_block *sb);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 61a9d1927817..a98bfca9c463 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -116,9 +116,9 @@ static int update_ind_extent_range(handle_t *handle, struct inode *inode,
 	int i, retval = 0;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, pblock);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	i_data = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -145,9 +145,9 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode,
 	int i, retval = 0;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, pblock);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	i_data = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -175,9 +175,9 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode,
 	int i, retval = 0;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, pblock);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, pblock, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	i_data = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -224,9 +224,9 @@ static int free_dind_blocks(handle_t *handle,
 	struct buffer_head *bh;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	tmp_idata = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -254,9 +254,9 @@ static int free_tind_blocks(handle_t *handle,
 	struct buffer_head *bh;
 	unsigned long max_entries = inode->i_sb->s_blocksize >> 2;
 
-	bh = sb_bread(inode->i_sb, le32_to_cpu(i_data));
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	tmp_idata = (__le32 *)bh->b_data;
 	for (i = 0; i < max_entries; i++) {
@@ -382,9 +382,9 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
 	struct ext4_extent_header *eh;
 
 	block = ext4_idx_pblock(ix);
-	bh = sb_bread(inode->i_sb, block);
-	if (!bh)
-		return -EIO;
+	bh = ext4_sb_bread(inode->i_sb, block, 0);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 
 	eh = (struct ext4_extent_header *)bh->b_data;
 	if (eh->eh_depth != 0) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a5efee34415f..87350b642681 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -127,10 +127,12 @@ static int verify_group_input(struct super_block *sb,
 	else if (free_blocks_count < 0)
 		ext4_warning(sb, "Bad blocks count %u",
 			     input->blocks_count);
-	else if (!(bh = sb_bread(sb, end - 1)))
+	else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) {
+		err = PTR_ERR(bh);
+		bh = NULL;
 		ext4_warning(sb, "Cannot read last block (%llu)",
 			     end - 1);
-	else if (outside(input->block_bitmap, start, end))
+	} else if (outside(input->block_bitmap, start, end))
 		ext4_warning(sb, "Block bitmap not in group (block %llu)",
 			     (unsigned long long)input->block_bitmap);
 	else if (outside(input->inode_bitmap, start, end))
@@ -781,11 +783,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 	unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
 	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
-	struct buffer_head **o_group_desc, **n_group_desc;
-	struct buffer_head *dind;
-	struct buffer_head *gdb_bh;
+	struct buffer_head **o_group_desc, **n_group_desc = NULL;
+	struct buffer_head *dind = NULL;
+	struct buffer_head *gdb_bh = NULL;
 	int gdbackups;
-	struct ext4_iloc iloc;
+	struct ext4_iloc iloc = { .bh = NULL };
 	__le32 *data;
 	int err;
 
@@ -794,21 +796,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
 		       gdb_num);
 
-	gdb_bh = sb_bread(sb, gdblock);
-	if (!gdb_bh)
-		return -EIO;
+	gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+	if (IS_ERR(gdb_bh))
+		return PTR_ERR(gdb_bh);
 
 	gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
 	if (gdbackups < 0) {
 		err = gdbackups;
-		goto exit_bh;
+		goto errout;
 	}
 
 	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
-	dind = sb_bread(sb, le32_to_cpu(*data));
-	if (!dind) {
-		err = -EIO;
-		goto exit_bh;
+	dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+	if (IS_ERR(dind)) {
+		err = PTR_ERR(dind);
+		dind = NULL;
+		goto errout;
 	}
 
 	data = (__le32 *)dind->b_data;
@@ -816,18 +819,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 		ext4_warning(sb, "new group %u GDT block %llu not reserved",
 			     group, gdblock);
 		err = -EINVAL;
-		goto exit_dind;
+		goto errout;
 	}
 
 	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
 	if (unlikely(err))
-		goto exit_dind;
+		goto errout;
 
 	BUFFER_TRACE(gdb_bh, "get_write_access");
 	err = ext4_journal_get_write_access(handle, gdb_bh);
 	if (unlikely(err))
-		goto exit_dind;
+		goto errout;
 
 	BUFFER_TRACE(dind, "get_write_access");
 	err = ext4_journal_get_write_access(handle, dind);
@@ -837,7 +840,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	/* ext4_reserve_inode_write() gets a reference on the iloc */
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
 	if (unlikely(err))
-		goto exit_dind;
+		goto errout;
 
 	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
 				     sizeof(struct buffer_head *),
@@ -846,7 +849,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 		err = -ENOMEM;
 		ext4_warning(sb, "not enough memory for %lu groups",
 			     gdb_num + 1);
-		goto exit_inode;
+		goto errout;
 	}
 
 	/*
@@ -862,7 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	err = ext4_handle_dirty_metadata(handle, NULL, dind);
 	if (unlikely(err)) {
 		ext4_std_error(sb, err);
-		goto exit_inode;
+		goto errout;
 	}
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >>
 			   (9 - EXT4_SB(sb)->s_cluster_bits);
@@ -871,8 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
 	if (unlikely(err)) {
 		ext4_std_error(sb, err);
-		iloc.bh = NULL;
-		goto exit_inode;
+		goto errout;
 	}
 	brelse(dind);
 
@@ -888,15 +890,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	err = ext4_handle_dirty_super(handle, sb);
 	if (err)
 		ext4_std_error(sb, err);
-
 	return err;
-
-exit_inode:
+errout:
 	kvfree(n_group_desc);
 	brelse(iloc.bh);
-exit_dind:
 	brelse(dind);
-exit_bh:
 	brelse(gdb_bh);
 
 	ext4_debug("leaving with error %d\n", err);
@@ -916,9 +914,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
 
 	gdblock = ext4_meta_bg_first_block_no(sb, group) +
 		   ext4_bg_has_super(sb, group);
-	gdb_bh = sb_bread(sb, gdblock);
-	if (!gdb_bh)
-		return -EIO;
+	gdb_bh = ext4_sb_bread(sb, gdblock, 0);
+	if (IS_ERR(gdb_bh))
+		return PTR_ERR(gdb_bh);
 	n_group_desc = ext4_kvmalloc((gdb_num + 1) *
 				     sizeof(struct buffer_head *),
 				     GFP_NOFS);
@@ -975,9 +973,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 		return -ENOMEM;
 
 	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
-	dind = sb_bread(sb, le32_to_cpu(*data));
-	if (!dind) {
-		err = -EIO;
+	dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0);
+	if (IS_ERR(dind)) {
+		err = PTR_ERR(dind);
+		dind = NULL;
 		goto exit_free;
 	}
 
@@ -996,9 +995,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 			err = -EINVAL;
 			goto exit_bh;
 		}
-		primary[res] = sb_bread(sb, blk);
-		if (!primary[res]) {
-			err = -EIO;
+		primary[res] = ext4_sb_bread(sb, blk, 0);
+		if (IS_ERR(primary[res])) {
+			err = PTR_ERR(primary[res]);
+			primary[res] = NULL;
 			goto exit_bh;
 		}
 		gdbackups = verify_reserved_gdb(sb, group, primary[res]);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a149df1c6a1..804ce1e5a1e7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -140,6 +140,29 @@ MODULE_ALIAS_FS("ext3");
 MODULE_ALIAS("ext3");
 #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 
+/*
+ * This works like sb_bread() except it uses ERR_PTR for error
+ * returns.  Currently with sb_bread it's impossible to distinguish
+ * between ENOMEM and EIO situations (since both result in a NULL
+ * return.
+ */
+struct buffer_head *
+ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
+{
+	struct buffer_head *bh = sb_getblk(sb, block);
+
+	if (bh == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (buffer_uptodate(bh))
+		return bh;
+	ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	put_bh(bh);
+	return ERR_PTR(-EIO);
+}
+
 static int ext4_verify_csum_type(struct super_block *sb,
 				 struct ext4_super_block *es)
 {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 4380c8630539..52856d98afd3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -522,14 +522,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
 		  name_index, name, buffer, (long)buffer_size);
 
-	error = -ENODATA;
 	if (!EXT4_I(inode)->i_file_acl)
-		goto cleanup;
+		return -ENODATA;
 	ea_idebug(inode, "reading block %llu",
 		  (unsigned long long)EXT4_I(inode)->i_file_acl);
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh)
-		goto cleanup;
+	bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	error = ext4_xattr_check_block(inode, bh);
@@ -696,26 +695,23 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
 
-	error = 0;
 	if (!EXT4_I(inode)->i_file_acl)
-		goto cleanup;
+		return 0;
 	ea_idebug(inode, "reading block %llu",
 		  (unsigned long long)EXT4_I(inode)->i_file_acl);
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	error = -EIO;
-	if (!bh)
-		goto cleanup;
+	bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+	if (IS_ERR(bh))
+		return PTR_ERR(bh);
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	error = ext4_xattr_check_block(inode, bh);
 	if (error)
 		goto cleanup;
 	ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh);
-	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
-
+	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer,
+					buffer_size);
 cleanup:
 	brelse(bh);
-
 	return error;
 }
 
@@ -830,9 +826,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
 	}
 
 	if (EXT4_I(inode)->i_file_acl) {
-		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-		if (!bh) {
-			ret = -EIO;
+		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			ret = PTR_ERR(bh);
 			goto out;
 		}
 
@@ -1825,16 +1821,15 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 
 	if (EXT4_I(inode)->i_file_acl) {
 		/* The inode already has an extended attribute block. */
-		bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		if (!bs->bh)
-			goto cleanup;
+		bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bs->bh))
+			return PTR_ERR(bs->bh);
 		ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		error = ext4_xattr_check_block(inode, bs->bh);
 		if (error)
-			goto cleanup;
+			return error;
 		/* Find the named attribute. */
 		bs->s.base = BHDR(bs->bh);
 		bs->s.first = BFIRST(bs->bh);
@@ -1843,13 +1838,10 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 		error = xattr_find_entry(inode, &bs->s.here, bs->s.end,
 					 i->name_index, i->name, 1);
 		if (error && error != -ENODATA)
-			goto cleanup;
+			return error;
 		bs->s.not_found = error;
 	}
-	error = 0;
-
-cleanup:
-	return error;
+	return 0;
 }
 
 static int
@@ -2278,9 +2270,9 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode)
 
 	if (!EXT4_I(inode)->i_file_acl)
 		return NULL;
-	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-	if (!bh)
-		return ERR_PTR(-EIO);
+	bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+	if (IS_ERR(bh))
+		return bh;
 	error = ext4_xattr_check_block(inode, bh);
 	if (error) {
 		brelse(bh);
@@ -2750,10 +2742,11 @@ retry:
 	if (EXT4_I(inode)->i_file_acl) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-		error = -EIO;
-		if (!bh)
+		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
 			goto cleanup;
+		}
 		error = ext4_xattr_check_block(inode, bh);
 		if (error) {
 			brelse(bh);
@@ -2907,11 +2900,12 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	}
 
 	if (EXT4_I(inode)->i_file_acl) {
-		bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
-		if (!bh) {
-			EXT4_ERROR_INODE(inode, "block %llu read error",
-					 EXT4_I(inode)->i_file_acl);
-			error = -EIO;
+		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			error = PTR_ERR(bh);
+			if (error == -EIO)
+				EXT4_ERROR_INODE(inode, "block %llu read error",
+						 EXT4_I(inode)->i_file_acl);
 			goto cleanup;
 		}
 		error = ext4_xattr_check_block(inode, bh);
@@ -3064,8 +3058,10 @@ ext4_xattr_block_cache_find(struct inode *inode,
 	while (ce) {
 		struct buffer_head *bh;
 
-		bh = sb_bread(inode->i_sb, ce->e_value);
-		if (!bh) {
+		bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
+		if (IS_ERR(bh)) {
+			if (PTR_ERR(bh) == -ENOMEM)
+				return NULL;
 			EXT4_ERROR_INODE(inode, "block %lu read error",
 					 (unsigned long)ce->e_value);
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
-- 
cgit v1.2.3


From 0a1c177dd90348342ccf918423d91aebb1e32bfb Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Mon, 3 Dec 2018 23:28:02 -0500
Subject: ext4: fix possible use after free in ext4_quota_enable

commit 61157b24e60fb3cd1f85f2c76a7b1d628f970144 upstream.

The function frees qf_inode via iput but then pass qf_inode to
lockdep_set_quota_inode on the failure path. This may result in a
use-after-free bug. The patch frees df_inode only when it is never used.

Fixes: daf647d2dd5 ("ext4: add lockdep annotations for i_data_sem")
Cc: stable@kernel.org # 4.6
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Pan Bian <bianpan2016@163.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 804ce1e5a1e7..98b60657e1dd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5712,9 +5712,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 	qf_inode->i_flags |= S_NOQUOTA;
 	lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
 	err = dquot_enable(qf_inode, type, format_id, flags);
-	iput(qf_inode);
 	if (err)
 		lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
+	iput(qf_inode);
 
 	return err;
 }
-- 
cgit v1.2.3


From 0d078853b87a929b9e37055932a7c873647327cb Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi <mlombard@redhat.com>
Date: Tue, 4 Dec 2018 00:06:53 -0500
Subject: ext4: missing unlock/put_page() in ext4_try_to_write_inline_data()

commit 132d00becb31e88469334e1e62751c81345280e0 upstream.

In case of error, ext4_try_to_write_inline_data() should unlock
and release the page it holds.

Fixes: f19d5870cbf7 ("ext4: add normal write support for inline data")
Cc: stable@kernel.org # 3.8
Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inline.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9c4bac18cc6c..27373d88b5f0 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -705,8 +705,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		ret = ext4_read_inline_page(inode, page);
-		if (ret < 0)
+		if (ret < 0) {
+			unlock_page(page);
+			put_page(page);
 			goto out_up_read;
+		}
 	}
 
 	ret = 1;
-- 
cgit v1.2.3


From 11bb168baef2dc9f613e021749b6f22966ae0959 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?ruippan=20=28=E6=BD=98=E7=9D=BF=29?= <ruippan@tencent.com>
Date: Tue, 4 Dec 2018 01:04:12 -0500
Subject: ext4: fix EXT4_IOC_GROUP_ADD ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit e647e29196b7f802f8242c39ecb7cc937f5ef217 upstream.

Commit e2b911c53584 ("ext4: clean up feature test macros with
predicate functions") broke the EXT4_IOC_GROUP_ADD ioctl.  This was
not noticed since only very old versions of resize2fs (before
e2fsprogs 1.42) use this ioctl.  However, using a new kernel with an
enterprise Linux userspace will cause attempts to use online resize to
fail with "No reserved GDT blocks".

Fixes: e2b911c53584 ("ext4: clean up feature test macros with predicate...")
Cc: stable@kernel.org # v4.4
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: ruippan (潘睿) <ruippan@tencent.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/resize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 87350b642681..bc8ee0c498cc 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1631,7 +1631,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	}
 
 	if (reserved_gdb || gdb_off == 0) {
-		if (ext4_has_feature_resize_inode(sb) ||
+		if (!ext4_has_feature_resize_inode(sb) ||
 		    !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext4_warning(sb,
 				     "No reserved GDT blocks, can't resize");
-- 
cgit v1.2.3


From 6633fcb231a0864ca5c961a7664b137474ca3779 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 19 Dec 2018 12:28:13 -0500
Subject: ext4: include terminating u32 in size of xattr entries when expanding
 inodes

commit a805622a757b6d7f65def4141d29317d8e37b8a1 upstream.

In ext4_expand_extra_isize_ea(), we calculate the total size of the
xattr header, plus the xattr entries so we know how much of the
beginning part of the xattrs to move when expanding the inode extra
size.  We need to include the terminating u32 at the end of the xattr
entries, or else if there is uninitialized, non-zero bytes after the
xattr entries and before the xattr values, the list of xattr entries
won't be properly terminated.

Reported-by: Steve Graham <stgraham2000@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 52856d98afd3..8ebf0967424f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2725,7 +2725,7 @@ retry:
 	base = IFIRST(header);
 	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
 	min_offs = end - base;
-	total_ino = sizeof(struct ext4_xattr_ibody_header);
+	total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
 
 	error = xattr_check_inode(inode, header, end);
 	if (error)
-- 
cgit v1.2.3


From 263663888d2f00421887e251d93ebc52650a37ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 19 Dec 2018 12:29:13 -0500
Subject: ext4: avoid declaring fs inconsistent due to invalid file handles

commit 8a363970d1dc38c4ec4ad575c862f776f468d057 upstream.

If we receive a file handle, either from NFS or open_by_handle_at(2),
and it points at an inode which has not been initialized, and the file
system has metadata checksums enabled, we shouldn't try to get the
inode, discover the checksum is invalid, and then declare the file
system as being inconsistent.

This can be reproduced by creating a test file system via "mke2fs -t
ext4 -O metadata_csum /tmp/foo.img 8M", mounting it, cd'ing into that
directory, and then running the following program.

#define _GNU_SOURCE
#include <fcntl.h>

struct handle {
	struct file_handle fh;
	unsigned char fid[MAX_HANDLE_SZ];
};

int main(int argc, char **argv)
{
	struct handle h = {{8, 1 }, { 12, }};

	open_by_handle_at(AT_FDCWD, &h.fh, O_RDONLY);
	return 0;
}

Google-Bug-Id: 120690101
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4.h   | 15 +++++++++++++--
 fs/ext4/ialloc.c |  2 +-
 fs/ext4/inode.c  | 54 +++++++++++++++++++++++++++++++++++++-----------------
 fs/ext4/ioctl.c  |  2 +-
 fs/ext4/namei.c  |  4 ++--
 fs/ext4/resize.c |  5 +++--
 fs/ext4/super.c  | 19 +++++--------------
 fs/ext4/xattr.c  |  5 +++--
 8 files changed, 65 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 05eddfbd3404..032cf9b92665 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2459,8 +2459,19 @@ int do_journal_get_write_access(handle_t *handle,
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA	 2
 
-extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
+typedef enum {
+	EXT4_IGET_NORMAL =	0,
+	EXT4_IGET_SPECIAL =	0x0001, /* OK to iget a system inode */
+	EXT4_IGET_HANDLE = 	0x0002	/* Inode # is from a handle */
+} ext4_iget_flags;
+
+extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+				 ext4_iget_flags flags, const char *function,
+				 unsigned int line);
+
+#define ext4_iget(sb, ino, flags) \
+	__ext4_iget((sb), (ino), (flags), __func__, __LINE__)
+
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct dentry *, struct iattr *);
 extern int  ext4_getattr(const struct path *, struct kstat *, u32, unsigned int);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 2addcb8730e1..091a18a51c99 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1225,7 +1225,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
 	if (!ext4_test_bit(bit, bitmap_bh->b_data))
 		goto bad_orphan;
 
-	inode = ext4_iget(sb, ino);
+	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		ext4_error(sb, "couldn't read orphan inode %lu (err %d)",
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 244531d3065a..bcf08801f935 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4786,7 +4786,9 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
 		return inode_peek_iversion(inode);
 }
 
-struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
+			  ext4_iget_flags flags, const char *function,
+			  unsigned int line)
 {
 	struct ext4_iloc iloc;
 	struct ext4_inode *raw_inode;
@@ -4800,6 +4802,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	gid_t i_gid;
 	projid_t i_projid;
 
+	if (((flags & EXT4_IGET_NORMAL) &&
+	     (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
+	    (ino < EXT4_ROOT_INO) ||
+	    (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+		if (flags & EXT4_IGET_HANDLE)
+			return ERR_PTR(-ESTALE);
+		__ext4_error(sb, function, line,
+			     "inode #%lu: comm %s: iget: illegal inode #",
+			     ino, current->comm);
+		return ERR_PTR(-EFSCORRUPTED);
+	}
+
 	inode = iget_locked(sb, ino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -4815,18 +4829,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	raw_inode = ext4_raw_inode(&iloc);
 
 	if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
-		EXT4_ERROR_INODE(inode, "root inode unallocated");
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: root inode unallocated");
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
 
+	if ((flags & EXT4_IGET_HANDLE) &&
+	    (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
 	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
 		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
 		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
 			EXT4_INODE_SIZE(inode->i_sb) ||
 		    (ei->i_extra_isize & 3)) {
-			EXT4_ERROR_INODE(inode,
-					 "bad extra_isize %u (inode size %u)",
+			ext4_error_inode(inode, function, line, 0,
+					 "iget: bad extra_isize %u "
+					 "(inode size %u)",
 					 ei->i_extra_isize,
 					 EXT4_INODE_SIZE(inode->i_sb));
 			ret = -EFSCORRUPTED;
@@ -4848,7 +4870,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 
 	if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
-		EXT4_ERROR_INODE(inode, "checksum invalid");
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: checksum invalid");
 		ret = -EFSBADCRC;
 		goto bad_inode;
 	}
@@ -4905,7 +4928,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
 	inode->i_size = ext4_isize(sb, raw_inode);
 	if ((size = i_size_read(inode)) < 0) {
-		EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: bad i_size value: %lld", size);
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
 	}
@@ -4981,7 +5005,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: bad extended attribute block %llu",
 				 ei->i_file_acl);
 		ret = -EFSCORRUPTED;
 		goto bad_inode;
@@ -5009,8 +5034,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	} else if (S_ISLNK(inode->i_mode)) {
 		/* VFS does not allow setting these so must be corruption */
 		if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
-			EXT4_ERROR_INODE(inode,
-			  "immutable or append flags not allowed on symlinks");
+			ext4_error_inode(inode, function, line, 0,
+					 "iget: immutable or append flags "
+					 "not allowed on symlinks");
 			ret = -EFSCORRUPTED;
 			goto bad_inode;
 		}
@@ -5040,7 +5066,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		make_bad_inode(inode);
 	} else {
 		ret = -EFSCORRUPTED;
-		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
+		ext4_error_inode(inode, function, line, 0,
+				 "iget: bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
 	brelse(iloc.bh);
@@ -5054,13 +5081,6 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
-{
-	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
-		return ERR_PTR(-EFSCORRUPTED);
-	return ext4_iget(sb, ino);
-}
-
 static int ext4_inode_blocks_set(handle_t *handle,
 				struct ext4_inode *raw_inode,
 				struct ext4_inode_info *ei)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0edee31913d1..d37dafa1d133 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -125,7 +125,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
+	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(inode_bl))
 		return PTR_ERR(inode_bl);
 	ei_bl = EXT4_I(inode_bl);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ffa25753e929..4f8de2b9e87e 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1571,7 +1571,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 					 dentry);
 			return ERR_PTR(-EFSCORRUPTED);
 		}
-		inode = ext4_iget_normal(dir->i_sb, ino);
+		inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL);
 		if (inode == ERR_PTR(-ESTALE)) {
 			EXT4_ERROR_INODE(dir,
 					 "deleted inode referenced: %u",
@@ -1613,7 +1613,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 
-	return d_obtain_alias(ext4_iget_normal(child->d_sb, ino));
+	return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL));
 }
 
 /*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bc8ee0c498cc..48421de803b7 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1637,7 +1637,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
 		}
-		inode = ext4_iget(sb, EXT4_RESIZE_INO);
+		inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL);
 		if (IS_ERR(inode)) {
 			ext4_warning(sb, "Error opening resize inode");
 			return PTR_ERR(inode);
@@ -1965,7 +1965,8 @@ retry:
 		}
 
 		if (!resize_inode)
-			resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
+			resize_inode = ext4_iget(sb, EXT4_RESIZE_INO,
+						 EXT4_IGET_SPECIAL);
 		if (IS_ERR(resize_inode)) {
 			ext4_warning(sb, "Error opening resize inode");
 			return PTR_ERR(resize_inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 98b60657e1dd..2908a212a7a4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1173,20 +1173,11 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 {
 	struct inode *inode;
 
-	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
-		return ERR_PTR(-ESTALE);
-	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
-		return ERR_PTR(-ESTALE);
-
-	/* iget isn't really right if the inode is currently unallocated!!
-	 *
-	 * ext4_read_inode will return a bad_inode if the inode had been
-	 * deleted, so we should be safe.
-	 *
+	/*
 	 * Currently we don't know the generation for parent directory, so
 	 * a generation of 0 means "accept any"
 	 */
-	inode = ext4_iget_normal(sb, ino);
+	inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 	if (generation && inode->i_generation != generation) {
@@ -4350,7 +4341,7 @@ no_journal:
 	 * so we can safely mount the rest of the filesystem now.
 	 */
 
-	root = ext4_iget(sb, EXT4_ROOT_INO);
+	root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(root)) {
 		ext4_msg(sb, KERN_ERR, "get root inode failed");
 		ret = PTR_ERR(root);
@@ -4620,7 +4611,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
 	 * happen if we iget() an unused inode, as the subsequent iput()
 	 * will try to delete it.
 	 */
-	journal_inode = ext4_iget(sb, journal_inum);
+	journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
 	if (IS_ERR(journal_inode)) {
 		ext4_msg(sb, KERN_ERR, "no journal found");
 		return NULL;
@@ -5702,7 +5693,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 	if (!qf_inums[type])
 		return -EPERM;
 
-	qf_inode = ext4_iget(sb, qf_inums[type]);
+	qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
 	if (IS_ERR(qf_inode)) {
 		ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
 		return PTR_ERR(qf_inode);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 8ebf0967424f..c0ba5206cd9d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -384,7 +384,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
 	struct inode *inode;
 	int err;
 
-	inode = ext4_iget(parent->i_sb, ea_ino);
+	inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		ext4_error(parent->i_sb,
@@ -1486,7 +1486,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 	}
 
 	while (ce) {
-		ea_inode = ext4_iget(inode->i_sb, ce->e_value);
+		ea_inode = ext4_iget(inode->i_sb, ce->e_value,
+				     EXT4_IGET_NORMAL);
 		if (!IS_ERR(ea_inode) &&
 		    !is_bad_inode(ea_inode) &&
 		    (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
-- 
cgit v1.2.3


From bf2fd1f970405aeae0dc7cbf42d1ba82e7bf6b70 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 19 Dec 2018 14:07:58 -0500
Subject: ext4: force inode writes when nfsd calls commit_metadata()

commit fde872682e175743e0c3ef939c89e3c6008a1529 upstream.

Some time back, nfsd switched from calling vfs_fsync() to using a new
commit_metadata() hook in export_operations().  If the file system did
not provide a commit_metadata() hook, it fell back to using
sync_inode_metadata().  Unfortunately doesn't work on all file
systems.  In particular, it doesn't work on ext4 due to how the inode
gets journalled --- the VFS writeback code will not always call
ext4_write_inode().

So we need to provide our own ext4_nfs_commit_metdata() method which
calls ext4_write_inode() directly.

Google-Bug-Id: 121195940
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2908a212a7a4..ee0f30852835 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1202,6 +1202,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 				    ext4_nfs_get_inode);
 }
 
+static int ext4_nfs_commit_metadata(struct inode *inode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL
+	};
+
+	trace_ext4_nfs_commit_metadata(inode);
+	return ext4_write_inode(inode, &wbc);
+}
+
 /*
  * Try to release metadata pages (indirect blocks, directories) which are
  * mapped via the block device.  Since these pages could have journal heads
@@ -1406,6 +1416,7 @@ static const struct export_operations ext4_export_ops = {
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
 	.get_parent = ext4_get_parent,
+	.commit_metadata = ext4_nfs_commit_metadata,
 };
 
 enum {
-- 
cgit v1.2.3


From 0cb4f6559087e99c14f129bc1332c40e84fd0559 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 19 Dec 2018 14:36:58 -0500
Subject: ext4: check for shutdown and r/o file system in ext4_write_inode()

commit 18f2c4fcebf2582f96cbd5f2238f4f354a0e4847 upstream.

If the file system has been shut down or is read-only, then
ext4_write_inode() needs to bail out early.

Also use jbd2_complete_transaction() instead of ext4_force_commit() so
we only force a commit if it is needed.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bcf08801f935..36abbdafb26e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5369,9 +5369,13 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	int err;
 
-	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
+	    sb_rdonly(inode->i_sb))
 		return 0;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (EXT4_SB(inode->i_sb)->s_journal) {
 		if (ext4_journal_current_handle()) {
 			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
@@ -5387,7 +5391,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
 			return 0;
 
-		err = ext4_force_commit(inode->i_sb);
+		err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
+						EXT4_I(inode)->i_sync_tid);
 	} else {
 		struct ext4_iloc iloc;
 
-- 
cgit v1.2.3


From 28867a52e48d989a4c0f8476223a5d76918041f5 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sun, 11 Nov 2018 22:22:17 +0800
Subject: btrfs: dev-replace: go back to suspended state if target device is
 missing

commit 0d228ece59a35a9b9e8ff0d40653234a6d90f61e upstream.

At the time of forced unmount we place the running replace to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes
back and expect the target device is missing.

Then let the replace state continue to be in
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub
running as part of replace.

Fixes: e93c89c1aaaa ("Btrfs: add new sources for device replace code")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/dev-replace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 981434764bb9..8eb9307b6394 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -887,6 +887,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 			   "cannot continue dev_replace, tgtdev is missing");
 		btrfs_info(fs_info,
 			   "you may cancel the operation after 'mount -o degraded'");
+		dev_replace->replace_state =
+					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
 		btrfs_dev_replace_write_unlock(dev_replace);
 		return 0;
 	}
-- 
cgit v1.2.3


From c1f90eb019714c51f191f2c1abbfda7ebfaf7c35 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Sun, 11 Nov 2018 22:22:18 +0800
Subject: btrfs: dev-replace: go back to suspend state if another EXCL_OP is
 running

commit 05c49e6bc1e8866ecfd674ebeeb58cdbff9145c2 upstream.

In a secnario where balance and replace co-exists as below,

  - start balance
  - pause balance
  - start replace
  - reboot

and when system restarts, balance resumes first. Then the replace is
attempted to restart but will fail as the EXCL_OP lock is already held
by the balance. If so place the replace state back to
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state.

Fixes: 010a47bde9420 ("btrfs: add proper safety check before resuming dev-replace")
CC: stable@vger.kernel.org # 4.18+
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/dev-replace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8eb9307b6394..b2b283e48439 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -900,6 +900,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 	 * dev-replace to start anyway.
 	 */
 	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+		btrfs_dev_replace_write_lock(dev_replace);
+		dev_replace->replace_state =
+					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+		btrfs_dev_replace_write_unlock(dev_replace);
 		btrfs_info(fs_info,
 		"cannot resume dev-replace, other exclusive operation running");
 		return 0;
-- 
cgit v1.2.3


From 7708a83090bab6e6c7348b0dae4f47681921ca1c Mon Sep 17 00:00:00 2001
From: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Date: Thu, 29 Nov 2018 17:31:32 +0800
Subject: btrfs: skip file_extent generation check for free_space_inode in
 run_delalloc_nocow

commit 27a7ff554e8d349627a90bda275c527b7348adae upstream.

The test case btrfs/001 with inode_cache mount option will encounter the
following warning:

  WARNING: CPU: 1 PID: 23700 at fs/btrfs/inode.c:956 cow_file_range.isra.19+0x32b/0x430 [btrfs]
  CPU: 1 PID: 23700 Comm: btrfs Kdump: loaded Tainted: G        W  O      4.20.0-rc4-custom+ #30
  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
  RIP: 0010:cow_file_range.isra.19+0x32b/0x430 [btrfs]
  Call Trace:
   ? free_extent_buffer+0x46/0x90 [btrfs]
   run_delalloc_nocow+0x455/0x900 [btrfs]
   btrfs_run_delalloc_range+0x1a7/0x360 [btrfs]
   writepage_delalloc+0xf9/0x150 [btrfs]
   __extent_writepage+0x125/0x3e0 [btrfs]
   extent_write_cache_pages+0x1b6/0x3e0 [btrfs]
   ? __wake_up_common_lock+0x63/0xc0
   extent_writepages+0x50/0x80 [btrfs]
   do_writepages+0x41/0xd0
   ? __filemap_fdatawrite_range+0x9e/0xf0
   __filemap_fdatawrite_range+0xbe/0xf0
   btrfs_fdatawrite_range+0x1b/0x50 [btrfs]
   __btrfs_write_out_cache+0x42c/0x480 [btrfs]
   btrfs_write_out_ino_cache+0x84/0xd0 [btrfs]
   btrfs_save_ino_cache+0x551/0x660 [btrfs]
   commit_fs_roots+0xc5/0x190 [btrfs]
   btrfs_commit_transaction+0x2bf/0x8d0 [btrfs]
   btrfs_mksubvol+0x48d/0x4d0 [btrfs]
   btrfs_ioctl_snap_create_transid+0x170/0x180 [btrfs]
   btrfs_ioctl_snap_create_v2+0x124/0x180 [btrfs]
   btrfs_ioctl+0x123f/0x3030 [btrfs]

The file extent generation of the free space inode is equal to the last
snapshot of the file root, so the inode will be passed to cow_file_rage.
But the inode was created and its extents were preallocated in
btrfs_save_ino_cache, there are no cow copies on disk.

The preallocated extent is not yet in the extent tree, and
btrfs_cross_ref_exist will ignore the -ENOENT returned by
check_committed_ref, so we can directly write the inode to the disk.

Fixes: 78d4295b1eee ("btrfs: lift some btrfs_cross_ref_exist checks in nocow path")
CC: stable@vger.kernel.org # 4.18+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Lu Fengqi <lufq.fnst@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7158b5b77c9d..8baef9e9160e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1373,7 +1373,8 @@ next_slot:
 			 * Do the same check as in btrfs_cross_ref_exist but
 			 * without the unnecessary search.
 			 */
-			if (btrfs_file_extent_generation(leaf, fi) <=
+			if (!nolock &&
+			    btrfs_file_extent_generation(leaf, fi) <=
 			    btrfs_root_last_snapshot(&root->root_item))
 				goto out_check;
 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
-- 
cgit v1.2.3


From 10b04210aabf4488930fac768994ff7d1d359ac3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 28 Nov 2018 14:54:28 +0000
Subject: Btrfs: fix fsync of files with multiple hard links in new directories

commit 41bd60676923822de1df2c50b3f9a10171f4338a upstream.

The log tree has a long standing problem that when a file is fsync'ed we
only check for new ancestors, created in the current transaction, by
following only the hard link for which the fsync was issued. We follow the
ancestors using the VFS' dget_parent() API. This means that if we create a
new link for a file in a directory that is new (or in an any other new
ancestor directory) and then fsync the file using an old hard link, we end
up not logging the new ancestor, and on log replay that new hard link and
ancestor do not exist. In some cases, involving renames, the file will not
exist at all.

Example:

  mkfs.btrfs -f /dev/sdb
  mount /dev/sdb /mnt

  mkdir /mnt/A
  touch /mnt/foo
  ln /mnt/foo /mnt/A/bar
  xfs_io -c fsync /mnt/foo

  <power failure>

In this example after log replay only the hard link named 'foo' exists
and directory A does not exist, which is unexpected. In other major linux
filesystems, such as ext4, xfs and f2fs for example, both hard links exist
and so does directory A after mounting again the filesystem.

Checking if any new ancestors are new and need to be logged was added in
2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"),
however only for the ancestors of the hard link (dentry) for which the
fsync was issued, instead of checking for all ancestors for all of the
inode's hard links.

So fix this by tracking the id of the last transaction where a hard link
was created for an inode and then on fsync fallback to a full transaction
commit when an inode has more than one hard link and at least one new hard
link was created in the current transaction. This is the simplest solution
since this is not a common use case (adding frequently hard links for
which there's an ancestor created in the current transaction and then
fsync the file). In case it ever becomes a common use case, a solution
that consists of iterating the fs/subvol btree for each hard link and
check if any ancestor is new, could be implemented.

This solves many unexpected scenarios reported by Jayashree Mohan and
Vijay Chidambaram, and for which there is a new test case for fstests
under review.

Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
CC: stable@vger.kernel.org # 4.4+
Reported-by: Vijay Chidambaram <vvijay03@gmail.com>
Reported-by: Jayashree Mohan <jayashree2912@gmail.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/btrfs_inode.h |  6 ++++++
 fs/btrfs/inode.c       | 17 +++++++++++++++++
 fs/btrfs/tree-log.c    | 16 ++++++++++++++++
 3 files changed, 39 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..7177d1d33584 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -146,6 +146,12 @@ struct btrfs_inode {
 	 */
 	u64 last_unlink_trans;
 
+	/*
+	 * Track the transaction id of the last transaction used to create a
+	 * hard link for the inode. This is used by the log tree (fsync).
+	 */
+	u64 last_link_trans;
+
 	/*
 	 * Number of bytes outstanding that are going to need csums.  This is
 	 * used in ENOSPC accounting.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8baef9e9160e..14c85e61134d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3689,6 +3689,21 @@ cache_index:
 	 * inode is not a directory, logging its parent unnecessarily.
 	 */
 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+	/*
+	 * Similar reasoning for last_link_trans, needs to be set otherwise
+	 * for a case like the following:
+	 *
+	 * mkdir A
+	 * touch foo
+	 * ln foo A/bar
+	 * echo 2 > /proc/sys/vm/drop_caches
+	 * fsync foo
+	 * <power failure>
+	 *
+	 * Would result in link bar and directory A not existing after the power
+	 * failure.
+	 */
+	BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans;
 
 	path->slots[0]++;
 	if (inode->i_nlink != 1 ||
@@ -6647,6 +6662,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 			if (err)
 				goto fail;
 		}
+		BTRFS_I(inode)->last_link_trans = trans->transid;
 		d_instantiate(dentry, inode);
 		ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
 					 true, NULL);
@@ -9175,6 +9191,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->index_cnt = (u64)-1;
 	ei->dir_index = 0;
 	ei->last_unlink_trans = 0;
+	ei->last_link_trans = 0;
 	ei->last_log_commit = 0;
 
 	spin_lock_init(&ei->lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 16ecb76fa53c..0805f8c5e72d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5781,6 +5781,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 			goto end_trans;
 	}
 
+	/*
+	 * If a new hard link was added to the inode in the current transaction
+	 * and its link count is now greater than 1, we need to fallback to a
+	 * transaction commit, otherwise we can end up not logging all its new
+	 * parents for all the hard links. Here just from the dentry used to
+	 * fsync, we can not visit the ancestor inodes for all the other hard
+	 * links to figure out if any is new, so we fallback to a transaction
+	 * commit (instead of adding a lot of complexity of scanning a btree,
+	 * since this scenario is not a common use case).
+	 */
+	if (inode->vfs_inode.i_nlink > 1 &&
+	    inode->last_link_trans > last_committed) {
+		ret = -EMLINK;
+		goto end_trans;
+	}
+
 	while (1) {
 		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
 			break;
-- 
cgit v1.2.3


From 6911b074a0055f9b350d85fde530f61abbef2171 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 30 Nov 2018 11:52:14 -0500
Subject: btrfs: run delayed items before dropping the snapshot

commit 0568e82dbe2510fc1fa664f58e5c997d3f1e649e upstream.

With my delayed refs patches in place we started seeing a large amount
of aborts in __btrfs_free_extent:

 BTRFS error (device sdb1): unable to find ref byte nr 91947008 parent 0 root 35964  owner 1 offset 0
 Call Trace:
  ? btrfs_merge_delayed_refs+0xaf/0x340
  __btrfs_run_delayed_refs+0x6ea/0xfc0
  ? btrfs_set_path_blocking+0x31/0x60
  btrfs_run_delayed_refs+0xeb/0x180
  btrfs_commit_transaction+0x179/0x7f0
  ? btrfs_check_space_for_delayed_refs+0x30/0x50
  ? should_end_transaction.isra.19+0xe/0x40
  btrfs_drop_snapshot+0x41c/0x7c0
  btrfs_clean_one_deleted_snapshot+0xb5/0xd0
  cleaner_kthread+0xf6/0x120
  kthread+0xf8/0x130
  ? btree_invalidatepage+0x90/0x90
  ? kthread_bind+0x10/0x10
  ret_from_fork+0x35/0x40

This was because btrfs_drop_snapshot depends on the root not being
modified while it's dropping the snapshot.  It will unlock the root node
(and really every node) as it walks down the tree, only to re-lock it
when it needs to do something.  This is a problem because if we modify
the tree we could cow a block in our path, which frees our reference to
that block.  Then once we get back to that shared block we'll free our
reference to it again, and get ENOENT when trying to lookup our extent
reference to that block in __btrfs_free_extent.

This is ultimately happening because we have delayed items left to be
processed for our deleted snapshot _after_ all of the inodes are closed
for the snapshot.  We only run the delayed inode item if we're deleting
the inode, and even then we do not run the delayed insertions or delayed
removals.  These can be run at any point after our final inode does its
last iput, which is what triggers the snapshot deletion.  We can end up
with the snapshot deletion happening and then have the delayed items run
on that file system, resulting in the above problem.

This problem has existed forever, however my patches made it much easier
to hit as I wake up the cleaner much more often to deal with delayed
iputs, which made us more likely to start the snapshot dropping work
before the transaction commits, which is when the delayed items would
generally be run.  Before, generally speaking, we would run the delayed
items, commit the transaction, and wakeup the cleaner thread to start
deleting snapshots, which means we were less likely to hit this problem.
You could still hit it if you had multiple snapshots to be deleted and
ended up with lots of delayed items, but it was definitely harder.

Fix for now by simply running all the delayed items before starting to
drop the snapshot.  We could make this smarter in the future by making
the delayed items per-root, and then simply drop any delayed items for
roots that we are going to delete.  But for now just a quick and easy
solution is the safest.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 51e41e53d4ae..a16760b410b1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8911,6 +8911,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 		goto out_free;
 	}
 
+	err = btrfs_run_delayed_items(trans);
+	if (err)
+		goto out_end_trans;
+
 	if (block_rsv)
 		trans->block_rsv = block_rsv;
 
-- 
cgit v1.2.3


From 9eec74b48477df26e65994115eecf7380374c49b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 11 Dec 2018 10:19:45 +0000
Subject: Btrfs: send, fix race with transaction commits that create snapshots

commit be6821f82c3cc36e026f5afd10249988852b35ea upstream.

If we create a snapshot of a snapshot currently being used by a send
operation, we can end up with send failing unexpectedly (returning
-ENOENT error to user space for example). The following diagram shows
how this happens.

            CPU 1                                   CPU2                                CPU3

 btrfs_ioctl_send()
  (...)
                                     create_snapshot()
                                      -> creates snapshot of a
                                         root used by the send
                                         task
                                      btrfs_commit_transaction()
                                       create_pending_snapshot()
  __get_inode_info()
   btrfs_search_slot()
    btrfs_search_slot_get_root()
     down_read commit_root_sem

     get reference on eb of the
     commit root
      -> eb with bytenr == X

     up_read commit_root_sem

                                        btrfs_cow_block(root node)
                                         btrfs_free_tree_block()
                                          -> creates delayed ref to
                                             free the extent

                                       btrfs_run_delayed_refs()
                                        -> runs the delayed ref,
                                           adds extent to
                                           fs_info->pinned_extents

                                       btrfs_finish_extent_commit()
                                        unpin_extent_range()
                                         -> marks extent as free
                                            in the free space cache

                                      transaction commit finishes

                                                                       btrfs_start_transaction()
                                                                        (...)
                                                                        btrfs_cow_block()
                                                                         btrfs_alloc_tree_block()
                                                                          btrfs_reserve_extent()
                                                                           -> allocates extent at
                                                                              bytenr == X
                                                                          btrfs_init_new_buffer(bytenr X)
                                                                           btrfs_find_create_tree_block()
                                                                            alloc_extent_buffer(bytenr X)
                                                                             find_extent_buffer(bytenr X)
                                                                              -> returns existing eb,
                                                                                 which the send task got

                                                                        (...)
                                                                         -> modifies content of the
                                                                            eb with bytenr == X

    -> uses an eb that now
       belongs to some other
       tree and no more matches
       the commit root of the
       snapshot, resuts will be
       unpredictable

The consequences of this race can be various, and can lead to searches in
the commit root performed by the send task failing unexpectedly (unable to
find inode items, returning -ENOENT to user space, for example) or not
failing because an inode item with the same number was added to the tree
that reused the metadata extent, in which case send can behave incorrectly
in the worst case or just fail later for some reason.

Fix this by performing a copy of the commit root's extent buffer when doing
a search in the context of a send operation.

CC: stable@vger.kernel.org # 4.4.x: 1fc28d8e2e9: Btrfs: move get root out of btrfs_search_slot to a helper
CC: stable@vger.kernel.org # 4.4.x: f9ddfd0592a: Btrfs: remove unused check of skip_locking
CC: stable@vger.kernel.org # 4.4.x
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ctree.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 089b46c4d97f..fa18520529f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2624,14 +2624,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
 	root_lock = BTRFS_READ_LOCK;
 
 	if (p->search_commit_root) {
-		/* The commit roots are read only so we always do read locks */
-		if (p->need_commit_sem)
+		/*
+		 * The commit roots are read only so we always do read locks,
+		 * and we always must hold the commit_root_sem when doing
+		 * searches on them, the only exception is send where we don't
+		 * want to block transaction commits for a long time, so
+		 * we need to clone the commit root in order to avoid races
+		 * with transaction commits that create a snapshot of one of
+		 * the roots used by a send operation.
+		 */
+		if (p->need_commit_sem) {
 			down_read(&fs_info->commit_root_sem);
-		b = root->commit_root;
-		extent_buffer_get(b);
-		level = btrfs_header_level(b);
-		if (p->need_commit_sem)
+			b = btrfs_clone_extent_buffer(root->commit_root);
 			up_read(&fs_info->commit_root_sem);
+			if (!b)
+				return ERR_PTR(-ENOMEM);
+
+		} else {
+			b = root->commit_root;
+			extent_buffer_get(b);
+		}
+		level = btrfs_header_level(b);
 		/*
 		 * Ensure that all callers have set skip_locking when
 		 * p->search_commit_root = 1.
@@ -2757,6 +2770,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 again:
 	prev_cmp = -1;
 	b = btrfs_search_slot_get_root(root, p, write_lock_level);
+	if (IS_ERR(b)) {
+		ret = PTR_ERR(b);
+		goto done;
+	}
 
 	while (b) {
 		level = btrfs_header_level(b);
-- 
cgit v1.2.3


From c555772c2a4e1042a12ea27413e93a89d01b04b7 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Sat, 5 Jan 2019 11:45:08 -0800
Subject: dax: Don't access a freed inode

commit 55e56f06ed71d9441f3abd5b1d3c1a870812b3fe upstream.

After we drop the i_pages lock, the inode can be freed at any time.
The get_unlocked_entry() code has no choice but to reacquire the lock,
so it can't be used here.  Create a new wait_entry_unlocked() which takes
care not to acquire the lock or dereference the address_space in any way.

Fixes: c2a7d2a11552 ("filesystem-dax: Introduce dax_lock_mapping_entry()")
Cc: <stable@vger.kernel.org>
Signed-off-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/dax.c | 69 ++++++++++++++++++++++++++++++----------------------------------
 1 file changed, 32 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 3a2682a6c832..415605fafaeb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -229,8 +229,8 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
  *
  * Must be called with the i_pages lock held.
  */
-static void *__get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp, bool (*wait_fn)(void))
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+		pgoff_t index, void ***slotp)
 {
 	void *entry, **slot;
 	struct wait_exceptional_entry_queue ewait;
@@ -240,8 +240,6 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
 	ewait.wait.func = wake_exceptional_entry_func;
 
 	for (;;) {
-		bool revalidate;
-
 		entry = __radix_tree_lookup(&mapping->i_pages, index, NULL,
 					  &slot);
 		if (!entry ||
@@ -256,30 +254,39 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping,
 		prepare_to_wait_exclusive(wq, &ewait.wait,
 					  TASK_UNINTERRUPTIBLE);
 		xa_unlock_irq(&mapping->i_pages);
-		revalidate = wait_fn();
+		schedule();
 		finish_wait(wq, &ewait.wait);
 		xa_lock_irq(&mapping->i_pages);
-		if (revalidate) {
-			put_unlocked_mapping_entry(mapping, index, entry);
-			return ERR_PTR(-EAGAIN);
-		}
 	}
 }
 
-static bool entry_wait(void)
+/*
+ * The only thing keeping the address space around is the i_pages lock
+ * (it's cycled in clear_inode() after removing the entries from i_pages)
+ * After we call xas_unlock_irq(), we cannot touch xas->xa.
+ */
+static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index,
+		void ***slotp, void *entry)
 {
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq;
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+
+	wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
+	prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
+	xa_unlock_irq(&mapping->i_pages);
 	schedule();
+	finish_wait(wq, &ewait.wait);
+
 	/*
-	 * Never return an ERR_PTR() from
-	 * __get_unlocked_mapping_entry(), just keep looping.
+	 * Entry lock waits are exclusive. Wake up the next waiter since
+	 * we aren't sure we will acquire the entry lock and thus wake
+	 * the next waiter up on unlock.
 	 */
-	return false;
-}
-
-static void *get_unlocked_mapping_entry(struct address_space *mapping,
-		pgoff_t index, void ***slotp)
-{
-	return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait);
+	if (waitqueue_active(wq))
+		__wake_up(wq, TASK_NORMAL, 1, &ewait.key);
 }
 
 static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
@@ -398,19 +405,6 @@ static struct page *dax_busy_page(void *entry)
 	return NULL;
 }
 
-static bool entry_wait_revalidate(void)
-{
-	rcu_read_unlock();
-	schedule();
-	rcu_read_lock();
-
-	/*
-	 * Tell __get_unlocked_mapping_entry() to take a break, we need
-	 * to revalidate page->mapping after dropping locks
-	 */
-	return true;
-}
-
 bool dax_lock_mapping_entry(struct page *page)
 {
 	pgoff_t index;
@@ -446,14 +440,15 @@ bool dax_lock_mapping_entry(struct page *page)
 		}
 		index = page->index;
 
-		entry = __get_unlocked_mapping_entry(mapping, index, &slot,
-				entry_wait_revalidate);
+		entry = __radix_tree_lookup(&mapping->i_pages, index,
+						NULL, &slot);
 		if (!entry) {
 			xa_unlock_irq(&mapping->i_pages);
 			break;
-		} else if (IS_ERR(entry)) {
-			xa_unlock_irq(&mapping->i_pages);
-			WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
+		} else if (slot_locked(mapping, slot)) {
+			rcu_read_unlock();
+			wait_entry_unlocked(mapping, index, &slot, entry);
+			rcu_read_lock();
 			continue;
 		}
 		lock_slot(mapping, slot);
-- 
cgit v1.2.3


From 9621ea6b9c4c8268d715785759678b401ff36509 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 5 Jan 2019 11:45:13 -0800
Subject: dax: Use non-exclusive wait in wait_entry_unlocked()

commit d8a706414af4827fc0b4b1c0c631c607351938b9 upstream.

get_unlocked_entry() uses an exclusive wait because it is guaranteed to
eventually obtain the lock and follow on with an unlock+wakeup cycle.
The wait_entry_unlocked() path does not have the same guarantee. Rather
than open-code an extra wakeup, just switch to a non-exclusive wait.

Cc: Matthew Wilcox <willy@infradead.org>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/dax.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 415605fafaeb..09fa70683c41 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -275,18 +275,16 @@ static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index,
 	ewait.wait.func = wake_exceptional_entry_func;
 
 	wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
-	prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
+	/*
+	 * Unlike get_unlocked_entry() there is no guarantee that this
+	 * path ever successfully retrieves an unlocked entry before an
+	 * inode dies. Perform a non-exclusive wait in case this path
+	 * never successfully performs its own wake up.
+	 */
+	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
 	xa_unlock_irq(&mapping->i_pages);
 	schedule();
 	finish_wait(wq, &ewait.wait);
-
-	/*
-	 * Entry lock waits are exclusive. Wake up the next waiter since
-	 * we aren't sure we will acquire the entry lock and thus wake
-	 * the next waiter up on unlock.
-	 */
-	if (waitqueue_active(wq))
-		__wake_up(wq, TASK_NORMAL, 1, &ewait.key);
 }
 
 static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
-- 
cgit v1.2.3


From ce5b0057f7687fab11c23b586b94766b1fc40e96 Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Thu, 22 Nov 2018 18:58:46 +0800
Subject: f2fs: read page index before freeing

commit 0ea295dd853e0879a9a30ab61f923c26be35b902 upstream.

The function truncate_node frees the page with f2fs_put_page. However,
the page index is read after that. So, the patch reads the index before
freeing the page.

Fixes: bf39c00a9a7f ("f2fs: drop obsolete node page when it is truncated")
Cc: <stable@vger.kernel.org>
Signed-off-by: Pan Bian <bianpan2016@163.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/f2fs/node.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 42ea42acb487..19a0d83aae65 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -827,6 +827,7 @@ static int truncate_node(struct dnode_of_data *dn)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct node_info ni;
 	int err;
+	pgoff_t index;
 
 	err = f2fs_get_node_info(sbi, dn->nid, &ni);
 	if (err)
@@ -846,10 +847,11 @@ static int truncate_node(struct dnode_of_data *dn)
 	clear_node_page_dirty(dn->node_page);
 	set_sbi_flag(sbi, SBI_IS_DIRTY);
 
+	index = dn->node_page->index;
 	f2fs_put_page(dn->node_page, 1);
 
 	invalidate_mapping_pages(NODE_MAPPING(sbi),
-			dn->node_page->index, dn->node_page->index);
+			index, index);
 
 	dn->node_page = NULL;
 	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
-- 
cgit v1.2.3


From 58d7ab7163d9119a05c576506692a4d90ca22f65 Mon Sep 17 00:00:00 2001
From: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Date: Sat, 22 Dec 2018 11:22:26 +0100
Subject: f2fs: fix validation of the block count in sanity_check_raw_super

commit 88960068f25fcc3759455d85460234dcc9d43fef upstream.

Treat "block_count" from struct f2fs_super_block as 64-bit little endian
value in sanity_check_raw_super() because struct f2fs_super_block
declares "block_count" as "__le64".

This fixes a bug where the superblock validation fails on big endian
devices with the following error:
  F2FS-fs (sda1): Wrong segment_count / block_count (61439 > 0)
  F2FS-fs (sda1): Can't find valid F2FS filesystem in 1th superblock
  F2FS-fs (sda1): Wrong segment_count / block_count (61439 > 0)
  F2FS-fs (sda1): Can't find valid F2FS filesystem in 2th superblock
As result of this the partition cannot be mounted.

With this patch applied the superblock validation works fine and the
partition can be mounted again:
  F2FS-fs (sda1): Mounted with checkpoint version = 7c84

My little endian x86-64 hardware was able to mount the partition without
this fix.
To confirm that mounting f2fs filesystems works on big endian machines
again I tested this on a 32-bit MIPS big endian (lantiq) device.

Fixes: 0cfe75c5b01199 ("f2fs: enhance sanity_check_raw_super() to avoid potential overflows")
Cc: stable@vger.kernel.org
Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/f2fs/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 287c9fe9fff9..338138b34993 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2267,10 +2267,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
 		return 1;
 	}
 
-	if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) {
+	if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) {
 		f2fs_msg(sb, KERN_INFO,
-			"Wrong segment_count / block_count (%u > %u)",
-			segment_count, le32_to_cpu(raw_super->block_count));
+			"Wrong segment_count / block_count (%u > %llu)",
+			segment_count, le64_to_cpu(raw_super->block_count));
 		return 1;
 	}
 
-- 
cgit v1.2.3


From 5036fcd9b14516f62efae6ed0c42dfbb9798b643 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Wed, 26 Dec 2018 19:54:07 -0800
Subject: f2fs: sanity check of xattr entry size

commit 64beba0558fce7b59e9a8a7afd77290e82a22163 upstream.

There is a security report where f2fs_getxattr() has a hole to expose wrong
memory region when the image is malformed like this.

f2fs_getxattr: entry->e_name_len: 4, size: 12288, buffer_size: 16384, len: 4

Cc: <stable@vger.kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/f2fs/xattr.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 77a010e625f5..087e53a2d96c 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -291,7 +291,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
 static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 				unsigned int index, unsigned int len,
 				const char *name, struct f2fs_xattr_entry **xe,
-				void **base_addr)
+				void **base_addr, int *base_size)
 {
 	void *cur_addr, *txattr_addr, *last_addr = NULL;
 	nid_t xnid = F2FS_I(inode)->i_xattr_nid;
@@ -302,8 +302,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 	if (!size && !inline_size)
 		return -ENODATA;
 
-	txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode),
-			inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS);
+	*base_size = inline_size + size + XATTR_PADDING_SIZE;
+	txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
 	if (!txattr_addr)
 		return -ENOMEM;
 
@@ -315,8 +315,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 
 		*xe = __find_inline_xattr(inode, txattr_addr, &last_addr,
 						index, len, name);
-		if (*xe)
+		if (*xe) {
+			*base_size = inline_size;
 			goto check;
+		}
 	}
 
 	/* read from xattr node block */
@@ -477,6 +479,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
 	int error = 0;
 	unsigned int size, len;
 	void *base_addr = NULL;
+	int base_size;
 
 	if (name == NULL)
 		return -EINVAL;
@@ -487,7 +490,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
 
 	down_read(&F2FS_I(inode)->i_xattr_sem);
 	error = lookup_all_xattrs(inode, ipage, index, len, name,
-				&entry, &base_addr);
+				&entry, &base_addr, &base_size);
 	up_read(&F2FS_I(inode)->i_xattr_sem);
 	if (error)
 		return error;
@@ -501,6 +504,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
 
 	if (buffer) {
 		char *pval = entry->e_name + entry->e_name_len;
+
+		if (base_size - (pval - (char *)base_addr) < size) {
+			error = -ERANGE;
+			goto out;
+		}
 		memcpy(buffer, pval, size);
 	}
 	error = size;
-- 
cgit v1.2.3


From 1827d1c439bc7e0e7e1341aabd7e0ca6b6e194f6 Mon Sep 17 00:00:00 2001
From: Georgy A Bystrenin <gkot@altlinux.org>
Date: Fri, 21 Dec 2018 00:11:42 -0600
Subject: CIFS: Fix error mapping for SMB2_LOCK command which caused OFD lock
 problem

commit 9a596f5b39593414c0ec80f71b94a226286f084e upstream.

While resolving a bug with locks on samba shares found a strange behavior.
When a file locked by one node and we trying to lock it from another node
it fail with errno 5 (EIO) but in that case errno must be set to
(EACCES | EAGAIN).
This isn't happening when we try to lock file second time on same node.
In this case it returns EACCES as expected.
Also this issue not reproduces when we use SMB1 protocol (vers=1.0 in
mount options).

Further investigation showed that the mapping from status_to_posix_error
is different for SMB1 and SMB2+ implementations.
For SMB1 mapping is [NT_STATUS_LOCK_NOT_GRANTED to ERRlock]
(See fs/cifs/netmisc.c line 66)
but for SMB2+ mapping is [STATUS_LOCK_NOT_GRANTED to -EIO]
(see fs/cifs/smb2maperror.c line 383)

Quick changes in SMB2+ mapping from EIO to EACCES has fixed issue.

BUG: https://bugzilla.kernel.org/show_bug.cgi?id=201971

Signed-off-by: Georgy A Bystrenin <gkot@altlinux.org>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2maperror.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 20a2d304c603..c3ae8c1d6089 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -379,8 +379,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
 	{STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"},
 	{STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"},
 	{STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"},
-	{STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"},
-	{STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"},
+	{STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"},
+	{STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"},
 	{STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"},
 	{STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS,
 	"STATUS_CTL_FILE_NOT_SUPPORTED"},
-- 
cgit v1.2.3


From ba77e8c7f7044394c213e1221e630943c3379a6b Mon Sep 17 00:00:00 2001
From: Paul Aurich <paul@darkrain42.org>
Date: Mon, 31 Dec 2018 14:13:34 -0800
Subject: smb3: fix large reads on encrypted connections

commit 6d2f84eee098540ae857998fe32f29b9e2cd9613 upstream.

When passing a large read to receive_encrypted_read(), ensure that the
demultiplex_thread knows that a MID was processed.  Without this, those
operations never complete.

This is a similar issue/fix to lease break handling:
commit 7af929d6d05ba5564139718e30d5bc96bdbc716a
("smb3: fix lease break problem introduced by compounding")

CC: Stable <stable@vger.kernel.org> # 4.19+
Fixes: b24df3e30cbf ("cifs: update receive_encrypted_standard to handle compounded responses")
Signed-off-by: Paul Aurich <paul@darkrain42.org>
Tested-by: Yves-Alexis Perez <corsac@corsac.net>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2ops.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 812da3e56a22..f44bb4a304e9 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -3184,8 +3184,10 @@ smb3_receive_transform(struct TCP_Server_Info *server,
 	}
 
 	/* TODO: add support for compounds containing READ. */
-	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+	if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) {
+		*num_mids = 1;
 		return receive_encrypted_read(server, &mids[0]);
+	}
 
 	return receive_encrypted_standard(server, mids, bufs, num_mids);
 }
-- 
cgit v1.2.3


From 03acbec28a24fd961b0e78904d5644a30bb5cfc5 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 15 Nov 2018 13:15:05 +0300
Subject: dlm: fixed memory leaks after failed ls_remove_names allocation

commit b982896cdb6e6a6b89d86dfb39df489d9df51e14 upstream.

If allocation fails on last elements of array need to free already
allocated elements.

v2: just move existing out_rsbtbl label to right place

Fixes 789924ba635f ("dlm: fix race between remove and lookup")
Cc: stable@kernel.org # 3.6

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dlm/lockspace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 5ba94be006ee..6a1529e478f3 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -680,11 +680,11 @@ static int new_lockspace(const char *name, const char *cluster,
 	kfree(ls->ls_recover_buf);
  out_lkbidr:
 	idr_destroy(&ls->ls_lkbidr);
+ out_rsbtbl:
 	for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) {
 		if (ls->ls_remove_names[i])
 			kfree(ls->ls_remove_names[i]);
 	}
- out_rsbtbl:
 	vfree(ls->ls_rsbtbl);
  out_lsfree:
 	if (do_unreg)
-- 
cgit v1.2.3


From 294776562f96efbd9b6a64348020b9e96ccf6e9e Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 15 Nov 2018 13:18:18 +0300
Subject: dlm: possible memory leak on error path in create_lkb()

commit 23851e978f31eda8b2d01bd410d3026659ca06c7 upstream.

Fixes 3d6aa675fff9 ("dlm: keep lkbs in idr")
Cc: stable@kernel.org # 3.1

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dlm/lock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index cc91963683de..2cb125cc21c9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1209,6 +1209,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
 
 	if (rv < 0) {
 		log_error(ls, "create_lkb idr error %d", rv);
+		dlm_free_lkb(lkb);
 		return rv;
 	}
 
-- 
cgit v1.2.3


From c5fa01a0153fc4df23777034bc47eae31ac27731 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 15 Nov 2018 13:18:24 +0300
Subject: dlm: lost put_lkb on error path in receive_convert() and
 receive_unlock()

commit c0174726c3976e67da8649ac62cae43220ae173a upstream.

Fixes 6d40c4a708e0 ("dlm: improve error and debug messages")
Cc: stable@kernel.org # 3.5

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dlm/lock.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 2cb125cc21c9..03d767b94f7b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -4180,6 +4180,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
 			  (unsigned long long)lkb->lkb_recover_seq,
 			  ms->m_header.h_nodeid, ms->m_lkid);
 		error = -ENOENT;
+		dlm_put_lkb(lkb);
 		goto fail;
 	}
 
@@ -4233,6 +4234,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
 			  lkb->lkb_id, lkb->lkb_remid,
 			  ms->m_header.h_nodeid, ms->m_lkid);
 		error = -ENOENT;
+		dlm_put_lkb(lkb);
 		goto fail;
 	}
 
-- 
cgit v1.2.3


From 30d3dfd4c42005311a0a06aa5bb9d3557fdc2e0e Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 15 Nov 2018 13:18:56 +0300
Subject: dlm: memory leaks on error path in dlm_user_request()

commit d47b41aceeadc6b58abc9c7c6485bef7cfb75636 upstream.

According to comment in dlm_user_request() ua should be freed
in dlm_free_lkb() after successful attach to lkb.

However ua is attached to lkb not in set_lock_args() but later,
inside request_lock().

Fixes 597d0cae0f99 ("[DLM] dlm: user locks")
Cc: stable@kernel.org # 2.6.19

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dlm/lock.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 03d767b94f7b..a928ba008d7d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5795,20 +5795,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 			goto out;
 		}
 	}
-
-	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
-	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
-	   lock and that lkb_astparam is the dlm_user_args structure. */
-
 	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
 			      fake_astfn, ua, fake_bastfn, &args);
-	lkb->lkb_flags |= DLM_IFL_USER;
-
 	if (error) {
+		kfree(ua->lksb.sb_lvbptr);
+		ua->lksb.sb_lvbptr = NULL;
+		kfree(ua);
 		__put_lkb(ls, lkb);
 		goto out;
 	}
 
+	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
+	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
+	   lock and that lkb_astparam is the dlm_user_args structure. */
+	lkb->lkb_flags |= DLM_IFL_USER;
 	error = request_lock(ls, lkb, name, namelen, &args);
 
 	switch (error) {
-- 
cgit v1.2.3


From 310486107d4105ceb0a1f7884074ff0da32e7d1c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Mon, 26 Nov 2018 18:45:35 +0100
Subject: gfs2: Get rid of potential double-freeing in gfs2_create_inode

commit 6ff9b09e00a441599f3aacdf577254455a048bc9 upstream.

In gfs2_create_inode, after setting and releasing the acl / default_acl, the
acl / default_acl pointers are not set to NULL as they should be.  In that
state, when the function reaches label fail_free_acls, gfs2_create_inode will
try to release the same acls again.

Fix that by setting the pointers to NULL after releasing the acls.  Slightly
simplify the logic.  Also, posix_acl_release checks for NULL already, so
there is no need to duplicate those checks here.

Fixes: e01580bf9e4d ("gfs2: use generic posix ACL infrastructure")
Reported-by: Pan Bian <bianpan2016@163.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org # v4.9+
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/inode.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 648f0ca1ad57..998051c4aea7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -744,17 +744,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 			       the gfs2 structures. */
 	if (default_acl) {
 		error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		if (error)
+			goto fail_gunlock3;
 		posix_acl_release(default_acl);
+		default_acl = NULL;
 	}
 	if (acl) {
-		if (!error)
-			error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		if (error)
+			goto fail_gunlock3;
 		posix_acl_release(acl);
+		acl = NULL;
 	}
 
-	if (error)
-		goto fail_gunlock3;
-
 	error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
 					     &gfs2_initxattrs, NULL);
 	if (error)
@@ -789,10 +791,8 @@ fail_free_inode:
 	}
 	gfs2_rsqa_delete(ip, NULL);
 fail_free_acls:
-	if (default_acl)
-		posix_acl_release(default_acl);
-	if (acl)
-		posix_acl_release(acl);
+	posix_acl_release(default_acl);
+	posix_acl_release(acl);
 fail_gunlock:
 	gfs2_dir_no_add(&da);
 	gfs2_glock_dq_uninit(ghs);
-- 
cgit v1.2.3


From 6ef56c9ad7f151f62992f524acce093f1a15d45b Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Tue, 4 Dec 2018 15:06:27 +0100
Subject: gfs2: Fix loop in gfs2_rbm_find

commit 2d29f6b96d8f80322ed2dd895bca590491c38d34 upstream.

Fix the resource group wrap-around logic in gfs2_rbm_find that commit
e579ed4f44 broke.  The bug can lead to unnecessary repeated scanning of the
same bitmaps; there is a risk that future changes will turn this into an
endless loop.

Fixes: e579ed4f44 ("GFS2: Introduce rbm field bii")
Cc: stable@vger.kernel.org # v3.13+
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/rgrp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 449d0cb45a84..e978f6930575 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1747,9 +1747,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
 			goto next_iter;
 		}
 		if (ret == -E2BIG) {
+			n += rbm->bii - initial_bii;
 			rbm->bii = 0;
 			rbm->offset = 0;
-			n += (rbm->bii - initial_bii);
 			goto res_covered_end_of_rgrp;
 		}
 		return ret;
-- 
cgit v1.2.3


From 0b6001b941af35cde42ac3fdd4079488f2f19a82 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 1 Nov 2018 13:39:49 -0400
Subject: lockd: Show pid of lockd for remote locks

commit b8eee0e90f9797b747113638bc75e739b192ad38 upstream.

Commit 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid
for remote locks") specified that the l_pid returned for F_GETLK on a local
file that has a remote lock should be the pid of the lock manager process.
That commit, while updating other filesystems, failed to update lockd, such
that locks created by lockd had their fl_pid set to that of the remote
process holding the lock.  Fix that here to be the pid of lockd.

Also, fix the client case so that the returned lock pid is negative, which
indicates a remote lock on a remote file.

Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific...")
Cc: stable@vger.kernel.org

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/lockd/clntproc.c | 2 +-
 fs/lockd/xdr.c      | 4 ++--
 fs/lockd/xdr4.c     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index d20b92f271c2..0a67dd4250e9 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -442,7 +442,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
 			fl->fl_start = req->a_res.lock.fl.fl_start;
 			fl->fl_end = req->a_res.lock.fl.fl_end;
 			fl->fl_type = req->a_res.lock.fl.fl_type;
-			fl->fl_pid = 0;
+			fl->fl_pid = -req->a_res.lock.fl.fl_pid;
 			break;
 		default:
 			status = nlm_stat_to_errno(req->a_res.status);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 7147e4aebecc..9846f7e95282 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -127,7 +127,7 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 
 	locks_init_lock(fl);
 	fl->fl_owner = current->files;
-	fl->fl_pid   = (pid_t)lock->svid;
+	fl->fl_pid   = current->tgid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	start = ntohl(*p++);
@@ -269,7 +269,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
 	lock->svid = ~(u32) 0;
-	lock->fl.fl_pid = (pid_t)lock->svid;
+	lock->fl.fl_pid = current->tgid;
 
 	if (!(p = nlm_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 7ed9edf9aed4..70154f376695 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -119,7 +119,7 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 
 	locks_init_lock(fl);
 	fl->fl_owner = current->files;
-	fl->fl_pid   = (pid_t)lock->svid;
+	fl->fl_pid   = current->tgid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	p = xdr_decode_hyper(p, &start);
@@ -266,7 +266,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
 	lock->svid = ~(u32) 0;
-	lock->fl.fl_pid = (pid_t)lock->svid;
+	lock->fl.fl_pid = current->tgid;
 
 	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
-- 
cgit v1.2.3


From 3f26e68af5c0a3a0704a44dd0286c785b603a5c8 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 15 Nov 2018 11:21:40 -0500
Subject: nfsd4: zero-length WRITE should succeed

commit fdec6114ee1f0f43b1ad081ad8d46b23ba126d70 upstream.

Zero-length writes are legal; from 5661 section 18.32.3: "If the count
is zero, the WRITE will succeed and return a count of zero subject to
permissions checking".

This check is unnecessary and is causing zero-length reads to return
EINVAL.

Cc: stable@vger.kernel.org
Fixes: 3fd9557aec91 "NFSD: Refactor the generic write vector fill helper"
Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs4proc.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9d6b4f0f1a25..f35aa9f88b5e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1015,8 +1015,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist,
 				      &write->wr_head, write->wr_buflen);
-	if (!nvecs)
-		return nfserr_io;
 	WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
 
 	status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp,
-- 
cgit v1.2.3


From 9797226bf661499ddfb7d754d155f594d5742f5e Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 29 Nov 2018 11:22:50 +0800
Subject: ceph: don't update importing cap's mseq when handing cap export

commit 3c1392d4c49962a31874af14ae9ff289cb2b3851 upstream.

Updating mseq makes client think importer mds has accepted all prior
cap messages and importer mds knows what caps client wants. Actually
some cap messages may have been dropped because of mseq mismatch.

If mseq is left untouched, importing cap's mds_wanted later will get
reset by cap import message.

Cc: stable@vger.kernel.org
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/caps.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index dd7dfdd2ba13..eadffaa39f4e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3566,7 +3566,6 @@ retry:
 			tcap->cap_id = t_cap_id;
 			tcap->seq = t_seq - 1;
 			tcap->issue_seq = t_seq - 1;
-			tcap->mseq = t_mseq;
 			tcap->issued |= issued;
 			tcap->implemented |= issued;
 			if (cap == ci->i_auth_cap)
-- 
cgit v1.2.3


From d7068618ae1fbb80fc16bd7c58798e208a696483 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 8 Jan 2019 11:44:41 +0000
Subject: Btrfs: fix deadlock when using free space tree due to block group
 creation

commit a6d8654d885d7d79a3fb82da64eaa489ca332a82 upstream.

When modifying the free space tree we can end up COWing one of its extent
buffers which in turn might result in allocating a new chunk, which in
turn can result in flushing (finish creation) of pending block groups. If
that happens we can deadlock because creating a pending block group needs
to update the free space tree, and if any of the updates tries to modify
the same extent buffer that we are COWing, we end up in a deadlock since
we try to write lock twice the same extent buffer.

So fix this by skipping pending block group creation if we are COWing an
extent buffer from the free space tree. This is a case missed by commit
5ce555578e091 ("Btrfs: fix deadlock when writing out free space caches").

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202173
Fixes: 5ce555578e091 ("Btrfs: fix deadlock when writing out free space caches")
CC: stable@vger.kernel.org # 4.18+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ctree.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fa18520529f3..7ad6f2eec711 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1051,19 +1051,21 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		parent_start = parent->start;
 
 	/*
-	 * If we are COWing a node/leaf from the extent, chunk or device trees,
-	 * make sure that we do not finish block group creation of pending block
-	 * groups. We do this to avoid a deadlock.
+	 * If we are COWing a node/leaf from the extent, chunk, device or free
+	 * space trees, make sure that we do not finish block group creation of
+	 * pending block groups. We do this to avoid a deadlock.
 	 * COWing can result in allocation of a new chunk, and flushing pending
 	 * block groups (btrfs_create_pending_block_groups()) can be triggered
 	 * when finishing allocation of a new chunk. Creation of a pending block
-	 * group modifies the extent, chunk and device trees, therefore we could
-	 * deadlock with ourselves since we are holding a lock on an extent
-	 * buffer that btrfs_create_pending_block_groups() may try to COW later.
+	 * group modifies the extent, chunk, device and free space trees,
+	 * therefore we could deadlock with ourselves since we are holding a
+	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
+	 * try to COW later.
 	 */
 	if (root == fs_info->extent_root ||
 	    root == fs_info->chunk_root ||
-	    root == fs_info->dev_root)
+	    root == fs_info->dev_root ||
+	    root == fs_info->free_space_root)
 		trans->can_flush_pending_bgs = false;
 
 	cow = btrfs_alloc_tree_block(trans, root, parent_start,
-- 
cgit v1.2.3


From d1130682d1270495f49282535854221bbdf20638 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Wed, 19 Dec 2018 22:49:09 +0000
Subject: CIFS: Fix adjustment of credits for MTU requests

commit b983f7e92348d7e7d091db1b78b7915e9dd3d63a upstream.

Currently for MTU requests we allocate maximum possible credits
in advance and then adjust them according to the request size.
While we were adjusting the number of credits belonging to the
server, we were skipping adjustment of credits belonging to the
request. This patch fixes it by setting request credits to
CreditCharge field value of SMB2 packet header.

Also ask 1 credit more for async read and write operations to
increase parallelism and match the behavior of other operations.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index f54d07bda067..dba986524917 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3185,12 +3185,14 @@ smb2_async_readv(struct cifs_readdata *rdata)
 	if (rdata->credits) {
 		shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
 						SMB2_MAX_BUFFER_SIZE));
-		shdr->CreditRequest = shdr->CreditCharge;
+		shdr->CreditRequest =
+			cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
 		spin_lock(&server->req_lock);
 		server->credits += rdata->credits -
 						le16_to_cpu(shdr->CreditCharge);
 		spin_unlock(&server->req_lock);
 		wake_up(&server->request_q);
+		rdata->credits = le16_to_cpu(shdr->CreditCharge);
 		flags |= CIFS_HAS_CREDITS;
 	}
 
@@ -3462,12 +3464,14 @@ smb2_async_writev(struct cifs_writedata *wdata,
 	if (wdata->credits) {
 		shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
 						    SMB2_MAX_BUFFER_SIZE));
-		shdr->CreditRequest = shdr->CreditCharge;
+		shdr->CreditRequest =
+			cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1);
 		spin_lock(&server->req_lock);
 		server->credits += wdata->credits -
 						le16_to_cpu(shdr->CreditCharge);
 		spin_unlock(&server->req_lock);
 		wake_up(&server->request_q);
+		wdata->credits = le16_to_cpu(shdr->CreditCharge);
 		flags |= CIFS_HAS_CREDITS;
 	}
 
-- 
cgit v1.2.3


From c3606c64678369dc108f397109569cfa0ce0c101 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Thu, 3 Jan 2019 16:45:13 -0800
Subject: CIFS: Do not set credits to 1 if the server didn't grant anything

commit 33fa5c8b8a7dbe6353a56eaa654b790348890d42 upstream.

Currently we reset the number of total credits granted by the server
to 1 if the server didn't grant us anything int the response. This
violates the SMB3 protocol - we need to trust the server and use
the credit values from the response. Fix this by removing the
corresponding code.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/transport.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 333729cf46cd..a610381f8140 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -885,8 +885,6 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 	for (i = 0; i < num_rqst; i++)
 		if (midQ[i]->resp_buf)
 			credits += ses->server->ops->get_credits(midQ[i]);
-	if (!credits)
-		credits = 1;
 
 	for (i = 0; i < num_rqst; i++) {
 		if (rc < 0)
-- 
cgit v1.2.3


From d2f76f6f9fa963d337e72616f6e06b8bbac5eb7f Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Thu, 10 Jan 2019 11:27:28 -0800
Subject: CIFS: Do not hide EINTR after sending network packets

commit ee13919c2e8d1f904e035ad4b4239029a8994131 upstream.

Currently we hide EINTR code returned from sock_sendmsg()
and return 0 instead. This makes a caller think that we
successfully completed the network operation which is not
true. Fix this by properly returning EINTR to callers.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index a610381f8140..005c2af9d696 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -378,7 +378,7 @@ smbd_done:
 	if (rc < 0 && rc != -EINTR)
 		cifs_dbg(VFS, "Error %d sending data on socket to server\n",
 			 rc);
-	else
+	else if (rc > 0)
 		rc = 0;
 
 	return rc;
-- 
cgit v1.2.3


From 7dcc5b36ea7f5f5d15907e439befa14cecffc9f1 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Sat, 22 Dec 2018 12:40:05 -0800
Subject: CIFS: Fix credit computation for compounded requests

commit 8544f4aa9dd19a04d1244dae10feecc813ccf175 upstream.

In SMB3 protocol every part of the compound chain consumes credits
individually, so we need to call wait_for_free_credits() for each
of the PDUs in the chain. If an operation is interrupted, we must
ensure we return all credits taken from the server structure back.

Without this patch server can sometimes disconnect the session
due to credit mismatches, especially when first operation(s)
are large writes.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/transport.c | 59 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 005c2af9d696..66348b3d28e6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -786,7 +786,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 	int i, j, rc = 0;
 	int timeout, optype;
 	struct mid_q_entry *midQ[MAX_COMPOUND];
-	unsigned int credits = 0;
+	bool cancelled_mid[MAX_COMPOUND] = {false};
+	unsigned int credits[MAX_COMPOUND] = {0};
 	char *buf;
 
 	timeout = flags & CIFS_TIMEOUT_MASK;
@@ -804,13 +805,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 		return -ENOENT;
 
 	/*
-	 * Ensure that we do not send more than 50 overlapping requests
-	 * to the same server. We may make this configurable later or
-	 * use ses->maxReq.
+	 * Ensure we obtain 1 credit per request in the compound chain.
+	 * It can be optimized further by waiting for all the credits
+	 * at once but this can wait long enough if we don't have enough
+	 * credits due to some heavy operations in progress or the server
+	 * not granting us much, so a fallback to the current approach is
+	 * needed anyway.
 	 */
-	rc = wait_for_free_request(ses->server, timeout, optype);
-	if (rc)
-		return rc;
+	for (i = 0; i < num_rqst; i++) {
+		rc = wait_for_free_request(ses->server, timeout, optype);
+		if (rc) {
+			/*
+			 * We haven't sent an SMB packet to the server yet but
+			 * we already obtained credits for i requests in the
+			 * compound chain - need to return those credits back
+			 * for future use. Note that we need to call add_credits
+			 * multiple times to match the way we obtained credits
+			 * in the first place and to account for in flight
+			 * requests correctly.
+			 */
+			for (j = 0; j < i; j++)
+				add_credits(ses->server, 1, optype);
+			return rc;
+		}
+		credits[i] = 1;
+	}
 
 	/*
 	 * Make sure that we sign in the same order that we send on this socket
@@ -826,8 +845,10 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 			for (j = 0; j < i; j++)
 				cifs_delete_mid(midQ[j]);
 			mutex_unlock(&ses->server->srv_mutex);
+
 			/* Update # of requests on wire to server */
-			add_credits(ses->server, 1, optype);
+			for (j = 0; j < num_rqst; j++)
+				add_credits(ses->server, credits[j], optype);
 			return PTR_ERR(midQ[i]);
 		}
 
@@ -874,17 +895,16 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 			if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
 				midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
 				midQ[i]->callback = DeleteMidQEntry;
-				spin_unlock(&GlobalMid_Lock);
-				add_credits(ses->server, 1, optype);
-				return rc;
+				cancelled_mid[i] = true;
 			}
 			spin_unlock(&GlobalMid_Lock);
 		}
 	}
 
 	for (i = 0; i < num_rqst; i++)
-		if (midQ[i]->resp_buf)
-			credits += ses->server->ops->get_credits(midQ[i]);
+		if (!cancelled_mid[i] && midQ[i]->resp_buf
+		    && (midQ[i]->mid_state == MID_RESPONSE_RECEIVED))
+			credits[i] = ses->server->ops->get_credits(midQ[i]);
 
 	for (i = 0; i < num_rqst; i++) {
 		if (rc < 0)
@@ -892,8 +912,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 
 		rc = cifs_sync_mid_result(midQ[i], ses->server);
 		if (rc != 0) {
-			add_credits(ses->server, credits, optype);
-			return rc;
+			/* mark this mid as cancelled to not free it below */
+			cancelled_mid[i] = true;
+			goto out;
 		}
 
 		if (!midQ[i]->resp_buf ||
@@ -940,9 +961,11 @@ out:
 	 * This is prevented above by using a noop callback that will not
 	 * wake this thread except for the very last PDU.
 	 */
-	for (i = 0; i < num_rqst; i++)
-		cifs_delete_mid(midQ[i]);
-	add_credits(ses->server, credits, optype);
+	for (i = 0; i < num_rqst; i++) {
+		if (!cancelled_mid[i])
+			cifs_delete_mid(midQ[i]);
+		add_credits(ses->server, credits[i], optype);
+	}
 
 	return rc;
 }
-- 
cgit v1.2.3


From 2a71a47e03ffa7aa68893406f8134977cb164b59 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Tue, 8 Jan 2019 18:30:57 +0000
Subject: cifs: Fix potential OOB access of lock element array

commit b9a74cde94957d82003fb9f7ab4777938ca851cd upstream.

If maxBuf is small but non-zero, it could result in a zero sized lock
element array which we would then try and access OOB.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/file.c     | 8 ++++----
 fs/cifs/smb2file.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8d41ca7bfcf1..7b637fc27990 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1120,10 +1120,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 
 	/*
 	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
-	 * and check it for zero before using.
+	 * and check it before using.
 	 */
 	max_buf = tcon->ses->server->maxBuf;
-	if (!max_buf) {
+	if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) {
 		free_xid(xid);
 		return -EINVAL;
 	}
@@ -1460,10 +1460,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
 
 	/*
 	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
-	 * and check it for zero before using.
+	 * and check it before using.
 	 */
 	max_buf = tcon->ses->server->maxBuf;
-	if (!max_buf)
+	if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE)))
 		return -EINVAL;
 
 	max_num = (max_buf - sizeof(struct smb_hdr)) /
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 4ed10dd086e6..2fc3d31967ee 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -122,10 +122,10 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
 
 	/*
 	 * Accessing maxBuf is racy with cifs_reconnect - need to store value
-	 * and check it for zero before using.
+	 * and check it before using.
 	 */
 	max_buf = tcon->ses->server->maxBuf;
-	if (!max_buf)
+	if (max_buf < sizeof(struct smb2_lock_element))
 		return -EINVAL;
 
 	max_num = max_buf / sizeof(struct smb2_lock_element);
-- 
cgit v1.2.3


From 7c2ea25e1364d9cdcbc7c44a92dccaf86e799f3a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 24 Dec 2018 20:27:08 -0500
Subject: ext4: make sure enough credits are reserved for dioread_nolock writes

commit 812c0cab2c0dfad977605dbadf9148490ca5d93f upstream.

There are enough credits reserved for most dioread_nolock writes;
however, if the extent tree is sufficiently deep, and/or quota is
enabled, the code was not allowing for all eventualities when
reserving journal credits for the unwritten extent conversion.

This problem can be seen using xfstests ext4/034:

   WARNING: CPU: 1 PID: 257 at fs/ext4/ext4_jbd2.c:271 __ext4_handle_dirty_metadata+0x10c/0x180
   Workqueue: ext4-rsv-conversion ext4_end_io_rsv_work
   RIP: 0010:__ext4_handle_dirty_metadata+0x10c/0x180
   	...
   EXT4-fs: ext4_free_blocks:4938: aborting transaction: error 28 in __ext4_handle_dirty_metadata
   EXT4: jbd2_journal_dirty_metadata failed: handle type 11 started at line 4921, credits 4/0, errcode -28
   EXT4-fs error (device dm-1) in ext4_free_blocks:4950: error 28

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 36abbdafb26e..0a5388347fca 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2748,7 +2748,8 @@ static int ext4_writepages(struct address_space *mapping,
 		 * We may need to convert up to one extent per block in
 		 * the page and we may dirty the inode.
 		 */
-		rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
+		rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
+						PAGE_SIZE >> inode->i_blkbits);
 	}
 
 	/*
-- 
cgit v1.2.3


From 926cdac10439eda9ab7a11fd5f28eda203ccbb26 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 25 Dec 2018 00:56:33 -0500
Subject: ext4: fix a potential fiemap/page fault deadlock w/ inline_data

commit 2b08b1f12cd664dc7d5c84ead9ff25ae97ad5491 upstream.

The ext4_inline_data_fiemap() function calls fiemap_fill_next_extent()
while still holding the xattr semaphore.  This is not necessary and it
triggers a circular lockdep warning.  This is because
fiemap_fill_next_extent() could trigger a page fault when it writes
into page which triggers a page fault.  If that page is mmaped from
the inline file in question, this could very well result in a
deadlock.

This problem can be reproduced using generic/519 with a file system
configuration which has the inline_data feature enabled.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inline.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 27373d88b5f0..56f6e1782d5f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1890,12 +1890,12 @@ int ext4_inline_data_fiemap(struct inode *inode,
 	physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
 	physical += offsetof(struct ext4_inode, i_block);
 
-	if (physical)
-		error = fiemap_fill_next_extent(fieinfo, start, physical,
-						inline_len, flags);
 	brelse(iloc.bh);
 out:
 	up_read(&EXT4_I(inode)->xattr_sem);
+	if (physical)
+		error = fiemap_fill_next_extent(fieinfo, start, physical,
+						inline_len, flags);
 	return (error < 0 ? error : 0);
 }
 
-- 
cgit v1.2.3


From 01db6e5cf81f905028de903f2290c10172226043 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 30 Dec 2018 23:20:39 -0500
Subject: ext4: avoid kernel warning when writing the superblock to a dead
 device

commit e86807862e6880809f191c4cea7f88a489f0ed34 upstream.

The xfstests generic/475 test switches the underlying device with
dm-error while running a stress test.  This results in a large number
of file system errors, and since we can't lock the buffer head when
marking the superblock dirty in the ext4_grp_locked_error() case, it's
possible the superblock to be !buffer_uptodate() without
buffer_write_io_error() being true.

We need to set buffer_uptodate() before we call mark_buffer_dirty() or
this will trigger a WARN_ON.  It's safe to do this since the
superblock must have been properly read into memory or the mount would
have been successful.  So if buffer_uptodate() is not set, we can
safely assume that this happened due to a failed attempt to write the
superblock.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ee0f30852835..a1cf7d68b4f0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4904,7 +4904,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	ext4_superblock_csum_set(sb);
 	if (sync)
 		lock_buffer(sbh);
-	if (buffer_write_io_error(sbh)) {
+	if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the
 		 * superblock failed.  This could happen because the
-- 
cgit v1.2.3


From da38a1b47b02b01f96976bbf75466bc1ae519d76 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Dec 2018 00:10:48 -0500
Subject: ext4: use ext4_write_inode() when fsyncing w/o a journal

commit ad211f3e94b314a910d4af03178a0b52a7d1ee0a upstream.

In no-journal mode, we previously used __generic_file_fsync() in
no-journal mode.  This triggers a lockdep warning, and in addition,
it's not safe to depend on the inode writeback mechanism in the case
ext4.  We can solve both problems by calling ext4_write_inode()
directly.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/fsync.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 26a7fe5c4fd3..87a7ff00ef62 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -116,8 +116,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 
+	ret = file_write_and_wait_range(file, start, end);
+	if (ret)
+		return ret;
+
 	if (!journal) {
-		ret = __generic_file_fsync(file, start, end, datasync);
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL
+		};
+
+		ret = ext4_write_inode(inode, &wbc);
 		if (!ret)
 			ret = ext4_sync_parent(inode);
 		if (test_opt(inode->i_sb, BARRIER))
@@ -125,9 +133,6 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 
-	ret = file_write_and_wait_range(file, start, end);
-	if (ret)
-		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-- 
cgit v1.2.3


From bb80ad0dc3924f193eaa752e9da2e9384f58e627 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Dec 2018 00:11:07 -0500
Subject: ext4: track writeback errors using the generic tracking
 infrastructure

commit 95cb67138746451cc84cf8e516e14989746e93b0 upstream.

We already using mapping_set_error() in fs/ext4/page_io.c, so all we
need to do is to use file_check_and_advance_wb_err() when handling
fsync() requests in ext4_sync_file().

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/fsync.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 87a7ff00ef62..712f00995390 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -164,6 +164,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 			ret = err;
 	}
 out:
+	err = file_check_and_advance_wb_err(file);
+	if (ret == 0)
+		ret = err;
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
-- 
cgit v1.2.3


From 5dc41af3d19e94253c95f62e8ac01158dd8df078 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 31 Dec 2018 22:34:31 -0500
Subject: ext4: fix special inode number checks in __ext4_iget()

commit 191ce17876c9367819c4b0a25b503c0f6d9054d8 upstream.

The check for special (reserved) inode number checks in __ext4_iget()
was broken by commit 8a363970d1dc: ("ext4: avoid declaring fs
inconsistent due to invalid file handles").  This was caused by a
botched reversal of the sense of the flag now known as
EXT4_IGET_SPECIAL (when it was previously named EXT4_IGET_NORMAL).
Fix the logic appropriately.

Fixes: 8a363970d1dc ("ext4: avoid declaring fs inconsistent...")
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0a5388347fca..2c43c5b92229 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4803,7 +4803,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	gid_t i_gid;
 	projid_t i_projid;
 
-	if (((flags & EXT4_IGET_NORMAL) &&
+	if ((!(flags & EXT4_IGET_SPECIAL) &&
 	     (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) ||
 	    (ino < EXT4_ROOT_INO) ||
 	    (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
-- 
cgit v1.2.3


From 829431a2a5a8495ee3a144de1be12e34dec45f91 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 19 Nov 2018 09:48:12 +0000
Subject: Btrfs: fix access to available allocation bits when starting balance

commit 5a8067c0d17feb7579db0476191417b441a8996e upstream.

The available allocation bits members from struct btrfs_fs_info are
protected by a sequence lock, and when starting balance we access them
incorrectly in two different ways:

1) In the read sequence lock loop at btrfs_balance() we use the values we
   read from fs_info->avail_*_alloc_bits and we can immediately do actions
   that have side effects and can not be undone (printing a message and
   jumping to a label). This is wrong because a retry might be needed, so
   our actions must not have side effects and must be repeatable as long
   as read_seqretry() returns a non-zero value. In other words, we were
   essentially ignoring the sequence lock;

2) Right below the read sequence lock loop, we were reading the values
   from avail_metadata_alloc_bits and avail_data_alloc_bits without any
   protection from concurrent writers, that is, reading them outside of
   the read sequence lock critical section.

So fix this by making sure we only read the available allocation bits
while in a read sequence lock critical section and that what we do in the
critical section is repeatable (has nothing that can not be undone) so
that any eventual retry that is needed is handled properly.

Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits")
Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or metadata")
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/volumes.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4405e430da6..223334f08530 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3712,6 +3712,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 	int ret;
 	u64 num_devices;
 	unsigned seq;
+	bool reducing_integrity;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3796,24 +3797,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
 		     !(bctl->sys.target & allowed)) ||
 		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 		     (fs_info->avail_metadata_alloc_bits & allowed) &&
-		     !(bctl->meta.target & allowed))) {
-			if (bctl->flags & BTRFS_BALANCE_FORCE) {
-				btrfs_info(fs_info,
-				"balance: force reducing metadata integrity");
-			} else {
-				btrfs_err(fs_info,
-	"balance: reduces metadata integrity, use --force if you want this");
-				ret = -EINVAL;
-				goto out;
-			}
-		}
+		     !(bctl->meta.target & allowed)))
+			reducing_integrity = true;
+		else
+			reducing_integrity = false;
+
+		/* if we're not converting, the target field is uninitialized */
+		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
+		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
+			bctl->data.target : fs_info->avail_data_alloc_bits;
 	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
-	/* if we're not converting, the target field is uninitialized */
-	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
-		bctl->meta.target : fs_info->avail_metadata_alloc_bits;
-	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
-		bctl->data.target : fs_info->avail_data_alloc_bits;
+	if (reducing_integrity) {
+		if (bctl->flags & BTRFS_BALANCE_FORCE) {
+			btrfs_info(fs_info,
+				   "balance: force reducing metadata integrity");
+		} else {
+			btrfs_err(fs_info,
+	  "balance: reduces metadata integrity, use --force if you want this");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
 	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
 		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
 		int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
-- 
cgit v1.2.3


From 79aa5c0daa5c7a2c4de945f4964702f3fcb6f8a3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 19 Nov 2018 14:15:36 +0000
Subject: Btrfs: fix deadlock when enabling quotas due to concurrent snapshot
 creation

commit 9a6f209e36500efac51528132a3e3083586eda5f upstream.

If the quota enable and snapshot creation ioctls are called concurrently
we can get into a deadlock where the task enabling quotas will deadlock
on the fs_info->qgroup_ioctl_lock mutex because it attempts to lock it
twice, or the task creating a snapshot tries to commit the transaction
while the task enabling quota waits for the former task to commit the
transaction while holding the mutex. The following time diagrams show how
both cases happen.

First scenario:

           CPU 0                                    CPU 1

 btrfs_ioctl()
  btrfs_ioctl_quota_ctl()
   btrfs_quota_enable()
    mutex_lock(fs_info->qgroup_ioctl_lock)
    btrfs_start_transaction()

                                             btrfs_ioctl()
                                              btrfs_ioctl_snap_create_v2
                                               create_snapshot()
                                                --> adds snapshot to the
                                                    list pending_snapshots
                                                    of the current
                                                    transaction

    btrfs_commit_transaction()
     create_pending_snapshots()
       create_pending_snapshot()
        qgroup_account_snapshot()
         btrfs_qgroup_inherit()
	   mutex_lock(fs_info->qgroup_ioctl_lock)
	    --> deadlock, mutex already locked
	        by this task at
		btrfs_quota_enable()

Second scenario:

           CPU 0                                    CPU 1

 btrfs_ioctl()
  btrfs_ioctl_quota_ctl()
   btrfs_quota_enable()
    mutex_lock(fs_info->qgroup_ioctl_lock)
    btrfs_start_transaction()

                                             btrfs_ioctl()
                                              btrfs_ioctl_snap_create_v2
                                               create_snapshot()
                                                --> adds snapshot to the
                                                    list pending_snapshots
                                                    of the current
                                                    transaction

                                                btrfs_commit_transaction()
                                                 --> waits for task at
                                                     CPU 0 to release
                                                     its transaction
                                                     handle

    btrfs_commit_transaction()
     --> sees another task started
         the transaction commit first
     --> releases its transaction
         handle
     --> waits for the transaction
         commit to be completed by
         the task at CPU 1

                                                 create_pending_snapshot()
                                                  qgroup_account_snapshot()
                                                   btrfs_qgroup_inherit()
                                                    mutex_lock(fs_info->qgroup_ioctl_lock)
                                                     --> deadlock, task at CPU 0
                                                         has the mutex locked but
                                                         it is waiting for us to
                                                         finish the transaction
                                                         commit

So fix this by setting the quota enabled flag in fs_info after committing
the transaction at btrfs_quota_enable(). This ends up serializing quota
enable and snapshot creation as if the snapshot creation happened just
before the quota enable request. The quota rescan task, scheduled after
committing the transaction in btrfs_quote_enable(), will do the accounting.

Fixes: 6426c7ad697d ("btrfs: qgroup: Fix qgroup accounting when creating snapshot")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/qgroup.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index ff434663d65b..e1fcb28ad4cc 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1013,16 +1013,22 @@ out_add_root:
 		btrfs_abort_transaction(trans, ret);
 		goto out_free_path;
 	}
-	spin_lock(&fs_info->qgroup_lock);
-	fs_info->quota_root = quota_root;
-	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
-	spin_unlock(&fs_info->qgroup_lock);
 
 	ret = btrfs_commit_transaction(trans);
 	trans = NULL;
 	if (ret)
 		goto out_free_path;
 
+	/*
+	 * Set quota enabled flag after committing the transaction, to avoid
+	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
+	 * creation.
+	 */
+	spin_lock(&fs_info->qgroup_lock);
+	fs_info->quota_root = quota_root;
+	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+	spin_unlock(&fs_info->qgroup_lock);
+
 	ret = qgroup_rescan_init(fs_info, 0, 1);
 	if (!ret) {
 	        qgroup_rescan_zero_tracking(fs_info);
-- 
cgit v1.2.3


From 7a1b9b76bac76498657b9322164957c2a50c573b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 10 Dec 2018 17:53:35 +0000
Subject: Btrfs: use nofs context when initializing security xattrs to avoid
 deadlock

commit 827aa18e7b903c5ff3b3cd8fec328a99b1dbd411 upstream.

When initializing the security xattrs, we are holding a transaction handle
therefore we need to use a GFP_NOFS context in order to avoid a deadlock
with reclaim in case it's triggered.

Fixes: 39a27ec1004e8 ("btrfs: use GFP_KERNEL for xattr and acl allocations")
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/xattr.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ea78c3d6dcfc..f141b45ce349 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -11,6 +11,7 @@
 #include <linux/security.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/iversion.h>
+#include <linux/sched/mm.h>
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "transaction.h"
@@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode,
 {
 	const struct xattr *xattr;
 	struct btrfs_trans_handle *trans = fs_info;
+	unsigned int nofs_flag;
 	char *name;
 	int err = 0;
 
+	/*
+	 * We're holding a transaction handle, so use a NOFS memory allocation
+	 * context to avoid deadlock if reclaim happens.
+	 */
+	nofs_flag = memalloc_nofs_save();
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
 		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
 			       strlen(xattr->name) + 1, GFP_KERNEL);
@@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode,
 		if (err < 0)
 			break;
 	}
+	memalloc_nofs_restore(nofs_flag);
 	return err;
 }
 
-- 
cgit v1.2.3


From 4675f90ef892b7071c09e30f80565adf628fff3b Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 9 Jan 2019 15:02:23 +0100
Subject: Revert "btrfs: balance dirty metadata pages in
 btrfs_finish_ordered_io"

commit 77b7aad195099e7c6da11e94b7fa6ef5e6fb0025 upstream.

This reverts commit e73e81b6d0114d4a303205a952ab2e87c44bd279.

This patch causes a few problems:

- adds latency to btrfs_finish_ordered_io
- as btrfs_finish_ordered_io is used for free space cache, generating
  more work from btrfs_btree_balance_dirty_nodelay could end up in the
  same workque, effectively deadlocking

12260 kworker/u96:16+btrfs-freespace-write D
[<0>] balance_dirty_pages+0x6e6/0x7ad
[<0>] balance_dirty_pages_ratelimited+0x6bb/0xa90
[<0>] btrfs_finish_ordered_io+0x3da/0x770
[<0>] normal_work_helper+0x1c5/0x5a0
[<0>] process_one_work+0x1ee/0x5a0
[<0>] worker_thread+0x46/0x3d0
[<0>] kthread+0xf5/0x130
[<0>] ret_from_fork+0x24/0x30
[<0>] 0xffffffffffffffff

Transaction commit will wait on the freespace cache:

838 btrfs-transacti D
[<0>] btrfs_start_ordered_extent+0x154/0x1e0
[<0>] btrfs_wait_ordered_range+0xbd/0x110
[<0>] __btrfs_wait_cache_io+0x49/0x1a0
[<0>] btrfs_write_dirty_block_groups+0x10b/0x3b0
[<0>] commit_cowonly_roots+0x215/0x2b0
[<0>] btrfs_commit_transaction+0x37e/0x910
[<0>] transaction_kthread+0x14d/0x180
[<0>] kthread+0xf5/0x130
[<0>] ret_from_fork+0x24/0x30
[<0>] 0xffffffffffffffff

And then writepages ends up waiting on transaction commit:

9520 kworker/u96:13+flush-btrfs-1 D
[<0>] wait_current_trans+0xac/0xe0
[<0>] start_transaction+0x21b/0x4b0
[<0>] cow_file_range_inline+0x10b/0x6b0
[<0>] cow_file_range.isra.69+0x329/0x4a0
[<0>] run_delalloc_range+0x105/0x3c0
[<0>] writepage_delalloc+0x119/0x180
[<0>] __extent_writepage+0x10c/0x390
[<0>] extent_write_cache_pages+0x26f/0x3d0
[<0>] extent_writepages+0x4f/0x80
[<0>] do_writepages+0x17/0x60
[<0>] __writeback_single_inode+0x59/0x690
[<0>] writeback_sb_inodes+0x291/0x4e0
[<0>] __writeback_inodes_wb+0x87/0xb0
[<0>] wb_writeback+0x3bb/0x500
[<0>] wb_workfn+0x40d/0x610
[<0>] process_one_work+0x1ee/0x5a0
[<0>] worker_thread+0x1e0/0x3d0
[<0>] kthread+0xf5/0x130
[<0>] ret_from_fork+0x24/0x30
[<0>] 0xffffffffffffffff

Eventually, we have every process in the system waiting on
balance_dirty_pages(), and nobody is able to make progress on page
writeback.

The original patch tried to fix an OOM condition, that happened on 4.4 but no
success reproducing that on later kernels (4.19 and 4.20). This is more likely
a problem in OOM itself.

Link: https://lore.kernel.org/linux-btrfs/20180528054821.9092-1-ethanlien@synology.com/
Reported-by: Chris Mason <clm@fb.com>
CC: stable@vger.kernel.org # 4.18+
CC: ethanlien <ethanlien@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 14c85e61134d..4f6dc56b4f4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3151,9 +3151,6 @@ out:
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
-	/* Try to release some metadata so we don't get an OOM but don't wait */
-	btrfs_btree_balance_dirty_nodelay(fs_info);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 01634ac56393272de45f1f88be9d28f52f91a272 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 21 Nov 2018 14:05:45 -0500
Subject: btrfs: wait on ordered extents on abort cleanup

commit 74d5d229b1bf60f93bff244b2dfc0eb21ec32a07 upstream.

If we flip read-only before we initiate writeback on all dirty pages for
ordered extents we've created then we'll have ordered extents left over
on umount, which results in all sorts of bad things happening.  Fix this
by making sure we wait on ordered extents if we have to do the aborted
transaction cleanup stuff.

generic/475 can produce this warning:

 [ 8531.177332] WARNING: CPU: 2 PID: 11997 at fs/btrfs/disk-io.c:3856 btrfs_free_fs_root+0x95/0xa0 [btrfs]
 [ 8531.183282] CPU: 2 PID: 11997 Comm: umount Tainted: G        W 5.0.0-rc1-default+ #394
 [ 8531.185164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014
 [ 8531.187851] RIP: 0010:btrfs_free_fs_root+0x95/0xa0 [btrfs]
 [ 8531.193082] RSP: 0018:ffffb1ab86163d98 EFLAGS: 00010286
 [ 8531.194198] RAX: ffff9f3449494d18 RBX: ffff9f34a2695000 RCX:0000000000000000
 [ 8531.195629] RDX: 0000000000000002 RSI: 0000000000000001 RDI:0000000000000000
 [ 8531.197315] RBP: ffff9f344e930000 R08: 0000000000000001 R09:0000000000000000
 [ 8531.199095] R10: 0000000000000000 R11: ffff9f34494d4ff8 R12:ffffb1ab86163dc0
 [ 8531.200870] R13: ffff9f344e9300b0 R14: ffffb1ab86163db8 R15:0000000000000000
 [ 8531.202707] FS:  00007fc68e949fc0(0000) GS:ffff9f34bd800000(0000)knlGS:0000000000000000
 [ 8531.204851] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [ 8531.205942] CR2: 00007ffde8114dd8 CR3: 000000002dfbd000 CR4:00000000000006e0
 [ 8531.207516] Call Trace:
 [ 8531.208175]  btrfs_free_fs_roots+0xdb/0x170 [btrfs]
 [ 8531.210209]  ? wait_for_completion+0x5b/0x190
 [ 8531.211303]  close_ctree+0x157/0x350 [btrfs]
 [ 8531.212412]  generic_shutdown_super+0x64/0x100
 [ 8531.213485]  kill_anon_super+0x14/0x30
 [ 8531.214430]  btrfs_kill_super+0x12/0xa0 [btrfs]
 [ 8531.215539]  deactivate_locked_super+0x29/0x60
 [ 8531.216633]  cleanup_mnt+0x3b/0x70
 [ 8531.217497]  task_work_run+0x98/0xc0
 [ 8531.218397]  exit_to_usermode_loop+0x83/0x90
 [ 8531.219324]  do_syscall_64+0x15b/0x180
 [ 8531.220192]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 [ 8531.221286] RIP: 0033:0x7fc68e5e4d07
 [ 8531.225621] RSP: 002b:00007ffde8116608 EFLAGS: 00000246 ORIG_RAX:00000000000000a6
 [ 8531.227512] RAX: 0000000000000000 RBX: 00005580c2175970 RCX:00007fc68e5e4d07
 [ 8531.229098] RDX: 0000000000000001 RSI: 0000000000000000 RDI:00005580c2175b80
 [ 8531.230730] RBP: 0000000000000000 R08: 00005580c2175ba0 R09:00007ffde8114e80
 [ 8531.232269] R10: 0000000000000000 R11: 0000000000000246 R12:00005580c2175b80
 [ 8531.233839] R13: 00007fc68eac61c4 R14: 00005580c2175a68 R15:0000000000000000

Leaving a tree in the rb-tree:

3853 void btrfs_free_fs_root(struct btrfs_root *root)
3854 {
3855         iput(root->ino_cache_inode);
3856         WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));

CC: stable@vger.kernel.org
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add stacktrace ]
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/disk-io.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d4a7f7ca4145..d96d1390068a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4155,6 +4155,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 		spin_lock(&fs_info->ordered_root_lock);
 	}
 	spin_unlock(&fs_info->ordered_root_lock);
+
+	/*
+	 * We need this here because if we've been flipped read-only we won't
+	 * get sync() from the umount, so we need to make sure any ordered
+	 * extents that haven't had their dirty pages IO start writeout yet
+	 * actually get run and error out properly.
+	 */
+	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 }
 
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
-- 
cgit v1.2.3


From 483ac8e65a8a645d0ba31ab7077ac50debeb4127 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 20 Jan 2019 14:33:34 -0800
Subject: pstore/ram: Avoid allocation and leak of platform data

commit 5631e8576a3caf606cdc375f97425a67983b420c upstream.

Yue Hu noticed that when parsing device tree the allocated platform data
was never freed. Since it's not used beyond the function scope, this
switches to using a stack variable instead.

Reported-by: Yue Hu <huyue2@yulong.com>
Fixes: 35da60941e44 ("pstore/ram: add Device Tree bindings")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/ram.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 03cd59375abe..eb67bb7f04de 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -713,18 +713,15 @@ static int ramoops_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct ramoops_platform_data *pdata = dev->platform_data;
+	struct ramoops_platform_data pdata_local;
 	struct ramoops_context *cxt = &oops_cxt;
 	size_t dump_mem_sz;
 	phys_addr_t paddr;
 	int err = -EINVAL;
 
 	if (dev_of_node(dev) && !pdata) {
-		pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL);
-		if (!pdata) {
-			pr_err("cannot allocate platform data buffer\n");
-			err = -ENOMEM;
-			goto fail_out;
-		}
+		pdata = &pdata_local;
+		memset(pdata, 0, sizeof(*pdata));
 
 		err = ramoops_parse_dt(pdev, pdata);
 		if (err < 0)
-- 
cgit v1.2.3


From 1e11b1d630f8fdbca3843259e581fdec0941dc03 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 14 Jan 2019 09:48:10 +0100
Subject: blockdev: Fix livelocks on loop device

commit 04906b2f542c23626b0ef6219b808406f8dddbe9 upstream.

bd_set_size() updates also block device's block size. This is somewhat
unexpected from its name and at this point, only blkdev_open() uses this
functionality. Furthermore, this can result in changing block size under
a filesystem mounted on a loop device which leads to livelocks inside
__getblk_gfp() like:

Sending NMI from CPU 0 to CPUs 1:
NMI backtrace for cpu 1
CPU: 1 PID: 10863 Comm: syz-executor0 Not tainted 4.18.0-rc5+ #151
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google
01/01/2011
RIP: 0010:__sanitizer_cov_trace_pc+0x3f/0x50 kernel/kcov.c:106
...
Call Trace:
 init_page_buffers+0x3e2/0x530 fs/buffer.c:904
 grow_dev_page fs/buffer.c:947 [inline]
 grow_buffers fs/buffer.c:1009 [inline]
 __getblk_slow fs/buffer.c:1036 [inline]
 __getblk_gfp+0x906/0xb10 fs/buffer.c:1313
 __bread_gfp+0x2d/0x310 fs/buffer.c:1347
 sb_bread include/linux/buffer_head.h:307 [inline]
 fat12_ent_bread+0x14e/0x3d0 fs/fat/fatent.c:75
 fat_ent_read_block fs/fat/fatent.c:441 [inline]
 fat_alloc_clusters+0x8ce/0x16e0 fs/fat/fatent.c:489
 fat_add_cluster+0x7a/0x150 fs/fat/inode.c:101
 __fat_get_block fs/fat/inode.c:148 [inline]
...

Trivial reproducer for the problem looks like:

truncate -s 1G /tmp/image
losetup /dev/loop0 /tmp/image
mkfs.ext4 -b 1024 /dev/loop0
mount -t ext4 /dev/loop0 /mnt
losetup -c /dev/loop0
l /mnt

Fix the problem by moving initialization of a block device block size
into a separate function and call it when needed.

Thanks to Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> for help with
debugging the problem.

Reported-by: syzbot+9933e4476f365f5d5a1b@syzkaller.appspotmail.com
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/block_dev.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 38b8ce05cbc7..cdbb888a8d4a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev)
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
+static void set_init_blocksize(struct block_device *bdev)
+{
+	unsigned bsize = bdev_logical_block_size(bdev);
+	loff_t size = i_size_read(bdev->bd_inode);
+
+	while (bsize < PAGE_SIZE) {
+		if (size & bsize)
+			break;
+		bsize <<= 1;
+	}
+	bdev->bd_block_size = bsize;
+	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
 int set_blocksize(struct block_device *bdev, int size)
 {
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
@@ -1408,18 +1422,9 @@ EXPORT_SYMBOL(check_disk_change);
 
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
-	unsigned bsize = bdev_logical_block_size(bdev);
-
 	inode_lock(bdev->bd_inode);
 	i_size_write(bdev->bd_inode, size);
 	inode_unlock(bdev->bd_inode);
-	while (bsize < PAGE_SIZE) {
-		if (size & bsize)
-			break;
-		bsize <<= 1;
-	}
-	bdev->bd_block_size = bsize;
-	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 }
 EXPORT_SYMBOL(bd_set_size);
 
@@ -1496,8 +1501,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				}
 			}
 
-			if (!ret)
+			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+				set_init_blocksize(bdev);
+			}
 
 			/*
 			 * If the device is invalidated, rescan partition
@@ -1532,6 +1539,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+			set_init_blocksize(bdev);
 		}
 
 		if (bdev->bd_bdi == &noop_backing_dev_info)
-- 
cgit v1.2.3


From c356972f27cc8194ecb1c0473d318a607300e7c2 Mon Sep 17 00:00:00 2001
From: Daniel Santos <daniel.santos@pobox.com>
Date: Fri, 19 Oct 2018 03:30:20 -0500
Subject: jffs2: Fix use of uninitialized delayed_work, lockdep breakage

[ Upstream commit a788c5272769ddbcdbab297cf386413eeac04463 ]

jffs2_sync_fs makes the assumption that if CONFIG_JFFS2_FS_WRITEBUFFER
is defined then a write buffer is available and has been initialized.
However, this does is not the case when the mtd device has no
out-of-band buffer:

int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
{
        if (!c->mtd->oobsize)
                return 0;
...

The resulting call to cancel_delayed_work_sync passing a uninitialized
(but zeroed) delayed_work struct forces lockdep to become disabled.

[   90.050639] overlayfs: upper fs does not support tmpfile.
[   90.652264] INFO: trying to register non-static key.
[   90.662171] the code is fine but needs lockdep annotation.
[   90.673090] turning off the locking correctness validator.
[   90.684021] CPU: 0 PID: 1762 Comm: mount_root Not tainted 4.14.63 #0
[   90.696672] Stack : 00000000 00000000 80d8f6a2 00000038 805f0000 80444600 8fe364f4 805dfbe7
[   90.713349]         80563a30 000006e2 8068370c 00000001 00000000 00000001 8e2fdc48 ffffffff
[   90.730020]         00000000 00000000 80d90000 00000000 00000106 00000000 6465746e 312e3420
[   90.746690]         6b636f6c 03bf0000 f8000000 20676e69 00000000 80000000 00000000 8e2c2a90
[   90.763362]         80d90000 00000001 00000000 8e2c2a90 00000003 80260dc0 08052098 80680000
[   90.780033]         ...
[   90.784902] Call Trace:
[   90.789793] [<8000f0d8>] show_stack+0xb8/0x148
[   90.798659] [<8005a000>] register_lock_class+0x270/0x55c
[   90.809247] [<8005cb64>] __lock_acquire+0x13c/0xf7c
[   90.818964] [<8005e314>] lock_acquire+0x194/0x1dc
[   90.828345] [<8003f27c>] flush_work+0x200/0x24c
[   90.837374] [<80041dfc>] __cancel_work_timer+0x158/0x210
[   90.847958] [<801a8770>] jffs2_sync_fs+0x20/0x54
[   90.857173] [<80125cf4>] iterate_supers+0xf4/0x120
[   90.866729] [<80158fc4>] sys_sync+0x44/0x9c
[   90.875067] [<80014424>] syscall_common+0x34/0x58

Signed-off-by: Daniel Santos <daniel.santos@pobox.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Boris Brezillon <boris.brezillon@bootlin.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/jffs2/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 902a7dd10e5c..bb6ae387469f 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
-	cancel_delayed_work_sync(&c->wbuf_dwork);
+	if (jffs2_is_writebuffered(c))
+		cancel_delayed_work_sync(&c->wbuf_dwork);
 #endif
 
 	mutex_lock(&c->alloc_sem);
-- 
cgit v1.2.3


From 265242d82a3c6a8bd9120d06b4801f8d7ae9a346 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sat, 3 Nov 2018 16:38:18 -0700
Subject: pstore/ram: Do not treat empty buffers as valid

[ Upstream commit 30696378f68a9e3dad6bfe55938b112e72af00c2 ]

The ramoops backend currently calls persistent_ram_save_old() even
if a buffer is empty. While this appears to work, it is does not seem
like the right thing to do and could lead to future bugs so lets avoid
that. It also prevents misleading prints in the logs which claim the
buffer is valid.

I got something like:

	found existing buffer, size 0, start 0

When I was expecting:

	no valid data in buffer (sig = ...)

This bails out early (and reports with pr_debug()), since it's an
acceptable state.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/pstore/ram_core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 0792595ebcfb..3c777ec80d47 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -496,6 +496,11 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
 	sig ^= PERSISTENT_RAM_SIG;
 
 	if (prz->buffer->sig == sig) {
+		if (buffer_size(prz) == 0) {
+			pr_debug("found existing empty buffer\n");
+			return 0;
+		}
+
 		if (buffer_size(prz) > prz->buffer_size ||
 		    buffer_start(prz) > buffer_size(prz))
 			pr_info("found existing invalid buffer, size %zu, start %zu\n",
-- 
cgit v1.2.3


From bb5717a4a16592d22f3d5ca244b3e0a341b4062c Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 5 Oct 2018 17:45:54 +0800
Subject: btrfs: volumes: Make sure there is no overlap of dev extents at mount
 time

[ Upstream commit 5eb193812a42dc49331f25137a38dfef9612d3e4 ]

Enhance btrfs_verify_dev_extents() to remember previous checked dev
extents, so it can verify no dev extents can overlap.

Analysis from Hans:

"Imagine allocating a DATA|DUP chunk.

 In the chunk allocator, we first set...
   max_stripe_size = SZ_1G;
   max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE
 ... which is 10GiB.

 Then...
   /* we don't want a chunk larger than 10% of writeable space */
   max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
       		 max_chunk_size);

 Imagine we only have one 7880MiB block device in this filesystem. Now
 max_chunk_size is down to 788MiB.

 The next step in the code is to search for max_stripe_size * dev_stripes
 amount of free space on the device, which is in our example 1GiB * 2 =
 2GiB. Imagine the device has exactly 1578MiB free in one contiguous
 piece. This amount of bytes will be put in devices_info[ndevs - 1].max_avail

 Next we recalculate the stripe_size (which is actually the device extent
 length), based on the actual maximum amount of available raw disk space:
   stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);

 stripe_size is now 789MiB

 Next we do...
   data_stripes = num_stripes / ncopies
 ...where data_stripes ends up as 1, because num_stripes is 2 (the amount
 of device extents we're going to have), and DUP has ncopies 2.

 Next there's a check...
   if (stripe_size * data_stripes > max_chunk_size)
 ...which matches because 789MiB * 1 > 788MiB.

 We go into the if code, and next is...
   stripe_size = div_u64(max_chunk_size, data_stripes);
 ...which resets stripe_size to max_chunk_size: 788MiB

 Next is a fun one...
   /* bump the answer up to a 16MB boundary */
   stripe_size = round_up(stripe_size, SZ_16M);
 ...which changes stripe_size from 788MiB to 800MiB.

 We're not done changing stripe_size yet...
   /* But don't go higher than the limits we found while searching
    * for free extents
    */
   stripe_size = min(devices_info[ndevs - 1].max_avail,
       	      stripe_size);

 This is bad. max_avail is twice the stripe_size (we need to fit 2 device
 extents on the same device for DUP).

 The result here is that 800MiB < 1578MiB, so it's unchanged. However,
 the resulting DUP chunk will need 1600MiB disk space, which isn't there,
 and the second dev_extent might extend into the next thing (next
 dev_extent? end of device?) for 22MiB.

 The last shown line of code relies on a situation where there's twice
 the value of stripe_size present as value for the variable stripe_size
 when it's DUP. This was actually the case before commit 92e222df7b
 "btrfs: alloc_chunk: fix DUP stripe size handling", from which I quote:
   "[...] in the meantime there's a check to see if the stripe_size does
 not exceed max_chunk_size. Since during this check stripe_size is twice
 the amount as intended, the check will reduce the stripe_size to
 max_chunk_size if the actual correct to be used stripe_size is more than
 half the amount of max_chunk_size."

 In the previous version of the code, the 16MiB alignment (why is this
 done, by the way?) would result in a 50% chance that it would actually
 do an 8MiB alignment for the individual dev_extents, since it was
 operating on double the size. Does this matter?

 Does it matter that stripe_size can be set to anything which is not
 16MiB aligned because of the amount of remaining available disk space
 which is just taken?

 What is the main purpose of this round_up?

 The most straightforward thing to do seems something like...
   stripe_size = min(
       div_u64(devices_info[ndevs - 1].max_avail, dev_stripes),
       stripe_size
   )
 ..just putting half of the max_avail into stripe_size."

Link: https://lore.kernel.org/linux-btrfs/b3461a38-e5f8-f41d-c67c-2efac8129054@mendix.com/
Reported-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ add analysis from report ]
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 223334f08530..dbb893d0ae81 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7474,6 +7474,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 	struct btrfs_path *path;
 	struct btrfs_root *root = fs_info->dev_root;
 	struct btrfs_key key;
+	u64 prev_devid = 0;
+	u64 prev_dev_ext_end = 0;
 	int ret = 0;
 
 	key.objectid = 1;
@@ -7518,10 +7520,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
 		physical_len = btrfs_dev_extent_length(leaf, dext);
 
+		/* Check if this dev extent overlaps with the previous one */
+		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
+			btrfs_err(fs_info,
+"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
+				  devid, physical_offset, prev_dev_ext_end);
+			ret = -EUCLEAN;
+			goto out;
+		}
+
 		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
 					    physical_offset, physical_len);
 		if (ret < 0)
 			goto out;
+		prev_devid = devid;
+		prev_dev_ext_end = physical_offset + physical_len;
+
 		ret = btrfs_next_item(root, path);
 		if (ret < 0)
 			goto out;
-- 
cgit v1.2.3


From 720b86a53a1024996708121abfd7bc490d68768f Mon Sep 17 00:00:00 2001
From: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Date: Thu, 4 Oct 2018 23:24:40 +0200
Subject: btrfs: alloc_chunk: fix more DUP stripe size handling

[ Upstream commit baf92114c7e6dd6124aa3d506e4bc4b694da3bc3 ]

Commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling"
fixed calculating the stripe_size for a new DUP chunk.

However, the same calculation reappears a bit later, and that one was
not changed yet. The resulting bug that is exposed is that the newly
allocated device extents ('stripes') can have a few MiB overlap with the
next thing stored after them, which is another device extent or the end
of the disk.

The scenario in which this can happen is:
* The block device for the filesystem is less than 10GiB in size.
* The amount of contiguous free unallocated disk space chosen to use for
  chunk allocation is 20% of the total device size, or a few MiB more or
  less.

An example:
- The filesystem device is 7880MiB (max_chunk_size gets set to 788MiB)
- There's 1578MiB unallocated raw disk space left in one contiguous
  piece.

In this case stripe_size is first calculated as 789MiB, (half of
1578MiB).

Since 789MiB (stripe_size * data_stripes) > 788MiB (max_chunk_size), we
enter the if block. Now stripe_size value is immediately overwritten
while calculating an adjusted value based on max_chunk_size, which ends
up as 788MiB.

Next, the value is rounded up to a 16MiB boundary, 800MiB, which is
actually more than the value we had before. However, the last comparison
fails to detect this, because it's comparing the value with the total
amount of free space, which is about twice the size of stripe_size.

In the example above, this means that the resulting raw disk space being
allocated is 1600MiB, while only a gap of 1578MiB has been found. The
second device extent object for this DUP chunk will overlap for 22MiB
with whatever comes next.

The underlying problem here is that the stripe_size is reused all the
time for different things. So, when entering the code in the if block,
stripe_size is immediately overwritten with something else. If later we
decide we want to have the previous value back, then the logic to
compute it was copy pasted in again.

With this change, the value in stripe_size is not unnecessarily
destroyed, so the duplicated calculation is not needed any more.

Signed-off-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dbb893d0ae81..0ee1cd4b56fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4768,19 +4768,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	/*
 	 * Use the number of data stripes to figure out how big this chunk
 	 * is really going to be in terms of logical address space,
-	 * and compare that answer with the max chunk size
+	 * and compare that answer with the max chunk size. If it's higher,
+	 * we try to reduce stripe_size.
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
-		stripe_size = div_u64(max_chunk_size, data_stripes);
-
-		/* bump the answer up to a 16MB boundary */
-		stripe_size = round_up(stripe_size, SZ_16M);
-
 		/*
-		 * But don't go higher than the limits we found while searching
-		 * for free extents
+		 * Reduce stripe_size, round it up to a 16MB boundary again and
+		 * then use it, unless it ends up being even bigger than the
+		 * previous value we had already.
 		 */
-		stripe_size = min(devices_info[ndevs - 1].max_avail,
+		stripe_size = min(round_up(div_u64(max_chunk_size,
+						   data_stripes), SZ_16M),
 				  stripe_size);
 	}
 
-- 
cgit v1.2.3


From 38b17eee7074490c15128b596a35c837d9d94c60 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 14 Nov 2018 13:50:26 +0800
Subject: btrfs: fix use-after-free due to race between replace start and
 cancel

[ Upstream commit d189dd70e2556181732598956d808ea53cc8774e ]

The device replace cancel thread can race with the replace start thread
and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will
fail to stop the scrub thread.

The scrub thread continues with the scrub for replace which then will
try to write to the target device and which is already freed by the
cancel thread.

scrub_setup_ctx() warns as tgtdev is NULL.

  struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
  {
  ...
	  if (is_dev_replace) {
		  WARN_ON(!fs_info->dev_replace.tgtdev);  <===
		  sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
		  sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		  sctx->flush_all_writes = false;
	  }

  [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started
  [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled
  [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
  ...
  [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs]
  ...
  [ 6852.432970] Call Trace:
  [ 6852.433202]  btrfs_scrub_dev+0x19b/0x5c0 [btrfs]
  [ 6852.433471]  btrfs_dev_replace_start+0x48c/0x6a0 [btrfs]
  [ 6852.433800]  btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs]
  [ 6852.434097]  btrfs_ioctl+0x2476/0x2d20 [btrfs]
  [ 6852.434365]  ? do_sigaction+0x7d/0x1e0
  [ 6852.434623]  do_vfs_ioctl+0xa9/0x6c0
  [ 6852.434865]  ? syscall_trace_enter+0x1c8/0x310
  [ 6852.435124]  ? syscall_trace_enter+0x1c8/0x310
  [ 6852.435387]  ksys_ioctl+0x60/0x90
  [ 6852.435663]  __x64_sys_ioctl+0x16/0x20
  [ 6852.435907]  do_syscall_64+0x50/0x180
  [ 6852.436150]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Further, as the replace thread enters scrub_write_page_to_dev_replace()
without the target device it panics:

  static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				      struct scrub_page *spage)
  {
  ...
	bio_set_dev(bio, sbio->dev->bdev); <======

  [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0
  ..
  [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs]
  [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260
  [btrfs]
  ..
  [ 6929.721430] Call Trace:
  [ 6929.721663]  scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs]
  [ 6929.721975]  scrub_bio_end_io_worker+0x1af/0x490 [btrfs]
  [ 6929.722277]  normal_work_helper+0xf0/0x4c0 [btrfs]
  [ 6929.722552]  process_one_work+0x1f4/0x520
  [ 6929.722805]  ? process_one_work+0x16e/0x520
  [ 6929.723063]  worker_thread+0x46/0x3d0
  [ 6929.723313]  kthread+0xf8/0x130
  [ 6929.723544]  ? process_one_work+0x520/0x520
  [ 6929.723800]  ? kthread_delayed_work_timer_fn+0x80/0x80
  [ 6929.724081]  ret_from_fork+0x3a/0x50

Fix this by letting the btrfs_dev_replace_finishing() to do the job of
cleaning after the cancel, including freeing of the target device.
btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns
along with the scrub return status.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/dev-replace.c | 63 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index b2b283e48439..8fed470bb7e1 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -800,39 +800,58 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
 		btrfs_dev_replace_write_unlock(dev_replace);
-		goto leave;
+		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+		tgt_device = dev_replace->tgtdev;
+		src_device = dev_replace->srcdev;
+		btrfs_dev_replace_write_unlock(dev_replace);
+		btrfs_scrub_cancel(fs_info);
+		/* btrfs_dev_replace_finishing() will handle the cleanup part */
+		btrfs_info_in_rcu(fs_info,
+			"dev_replace from %s (devid %llu) to %s canceled",
+			btrfs_dev_name(src_device), src_device->devid,
+			btrfs_dev_name(tgt_device));
+		break;
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+		/*
+		 * Scrub doing the replace isn't running so we need to do the
+		 * cleanup step of btrfs_dev_replace_finishing() here
+		 */
 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 		tgt_device = dev_replace->tgtdev;
 		src_device = dev_replace->srcdev;
 		dev_replace->tgtdev = NULL;
 		dev_replace->srcdev = NULL;
-		break;
-	}
-	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
-	dev_replace->time_stopped = ktime_get_real_seconds();
-	dev_replace->item_needs_writeback = 1;
-	btrfs_dev_replace_write_unlock(dev_replace);
-	btrfs_scrub_cancel(fs_info);
+		dev_replace->replace_state =
+				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+		dev_replace->time_stopped = ktime_get_real_seconds();
+		dev_replace->item_needs_writeback = 1;
 
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans)) {
-		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
-		return PTR_ERR(trans);
-	}
-	ret = btrfs_commit_transaction(trans);
-	WARN_ON(ret);
+		btrfs_dev_replace_write_unlock(dev_replace);
 
-	btrfs_info_in_rcu(fs_info,
-		"dev_replace from %s (devid %llu) to %s canceled",
-		btrfs_dev_name(src_device), src_device->devid,
-		btrfs_dev_name(tgt_device));
+		btrfs_scrub_cancel(fs_info);
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+			return PTR_ERR(trans);
+		}
+		ret = btrfs_commit_transaction(trans);
+		WARN_ON(ret);
 
-	if (tgt_device)
-		btrfs_destroy_dev_replace_tgtdev(tgt_device);
+		btrfs_info_in_rcu(fs_info,
+		"suspended dev_replace from %s (devid %llu) to %s canceled",
+			btrfs_dev_name(src_device), src_device->devid,
+			btrfs_dev_name(tgt_device));
+
+		if (tgt_device)
+			btrfs_destroy_dev_replace_tgtdev(tgt_device);
+		break;
+	default:
+		result = -EINVAL;
+	}
 
-leave:
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 	return result;
 }
-- 
cgit v1.2.3


From 310f8296d6305249ea1f4bbb6fb9a9690e003451 Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Wed, 12 Dec 2018 15:14:17 +0100
Subject: btrfs: improve error handling of btrfs_add_link

[ Upstream commit 1690dd41e0cb1dade80850ed8a3eb0121b96d22f ]

In the error handling block, err holds the return value of either
btrfs_del_root_ref() or btrfs_del_inode_ref() but it hasn't been checked
since it's introduction with commit fe66a05a0679 (Btrfs: improve error
handling for btrfs_insert_dir_item callers) in 2012.

If the error handling in the error handling fails, there's not much left
to do and the abort either happened earlier in the callees or is
necessary here.

So if one of btrfs_del_root_ref() or btrfs_del_inode_ref() failed, abort
the transaction, but still return the original code of the failure
stored in 'ret' as this will be reported to the user.

Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f6dc56b4f4d..83b3a626c796 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6440,14 +6440,19 @@ fail_dir_item:
 		err = btrfs_del_root_ref(trans, key.objectid,
 					 root->root_key.objectid, parent_ino,
 					 &local_index, name, name_len);
-
+		if (err)
+			btrfs_abort_transaction(trans, err);
 	} else if (add_backref) {
 		u64 local_index;
 		int err;
 
 		err = btrfs_del_inode_ref(trans, root, name, name_len,
 					  ino, parent_ino, &local_index);
+		if (err)
+			btrfs_abort_transaction(trans, err);
 	}
+
+	/* Return the original error code */
 	return ret;
 }
 
-- 
cgit v1.2.3


From 876b79b973f2ca50cd9a57cc4263f91576cf25af Mon Sep 17 00:00:00 2001
From: Javier Barrio <javier.barrio.mart@gmail.com>
Date: Thu, 13 Dec 2018 01:06:29 +0100
Subject: quota: Lock s_umount in exclusive mode for Q_XQUOTA{ON,OFF}
 quotactls.

[ Upstream commit 41c4f85cdac280d356df1f483000ecec4a8868be ]

Commit 1fa5efe3622db58cb8c7b9a50665e9eb9a6c7e97 (ext4: Use generic helpers for quotaon
and quotaoff) made possible to call quotactl(Q_XQUOTAON/OFF) on ext4 filesystems
with sysfile quota support. This leads to calling dquot_enable/disable without s_umount
held in excl. mode, because quotactl_cmd_onoff checks only for Q_QUOTAON/OFF.

The following WARN_ON_ONCE triggers (in this case for dquot_enable, ext4, latest Linus' tree):

[  117.807056] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: quota,prjquota

[...]

[  155.036847] WARNING: CPU: 0 PID: 2343 at fs/quota/dquot.c:2469 dquot_enable+0x34/0xb9
[  155.036851] Modules linked in: quota_v2 quota_tree ipv6 af_packet joydev mousedev psmouse serio_raw pcspkr i2c_piix4 intel_agp intel_gtt e1000 ttm drm_kms_helper drm agpgart fb_sys_fops syscopyarea sysfillrect sysimgblt i2c_core input_leds kvm_intel kvm irqbypass qemu_fw_cfg floppy evdev parport_pc parport button crc32c_generic dm_mod ata_generic pata_acpi ata_piix libata loop ext4 crc16 mbcache jbd2 usb_storage usbcore sd_mod scsi_mod
[  155.036901] CPU: 0 PID: 2343 Comm: qctl Not tainted 4.20.0-rc6-00025-gf5d582777bcb #9
[  155.036903] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
[  155.036911] RIP: 0010:dquot_enable+0x34/0xb9
[  155.036915] Code: 41 56 41 55 41 54 55 53 4c 8b 6f 28 74 02 0f 0b 4d 8d 7d 70 49 89 fc 89 cb 41 89 d6 89 f5 4c 89 ff e8 23 09 ea ff 85 c0 74 0a <0f> 0b 4c 89 ff e8 8b 09 ea ff 85 db 74 6a 41 8b b5 f8 00 00 00 0f
[  155.036918] RSP: 0018:ffffb09b00493e08 EFLAGS: 00010202
[  155.036922] RAX: 0000000000000001 RBX: 0000000000000008 RCX: 0000000000000008
[  155.036924] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9781b67cd870
[  155.036926] RBP: 0000000000000002 R08: 0000000000000000 R09: 61c8864680b583eb
[  155.036929] R10: ffffb09b00493e48 R11: ffffffffff7ce7d4 R12: ffff9781b7ee8d78
[  155.036932] R13: ffff9781b67cd800 R14: 0000000000000004 R15: ffff9781b67cd870
[  155.036936] FS:  00007fd813250b88(0000) GS:ffff9781ba000000(0000) knlGS:0000000000000000
[  155.036939] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  155.036942] CR2: 00007fd812ff61d6 CR3: 000000007c882000 CR4: 00000000000006b0
[  155.036951] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  155.036953] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  155.036955] Call Trace:
[  155.037004]  dquot_quota_enable+0x8b/0xd0
[  155.037011]  kernel_quotactl+0x628/0x74e
[  155.037027]  ? do_mprotect_pkey+0x2a6/0x2cd
[  155.037034]  __x64_sys_quotactl+0x1a/0x1d
[  155.037041]  do_syscall_64+0x55/0xe4
[  155.037078]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  155.037105] RIP: 0033:0x7fd812fe1198
[  155.037109] Code: 02 77 0d 48 89 c1 48 c1 e9 3f 75 04 48 8b 04 24 48 83 c4 50 5b c3 48 83 ec 08 49 89 ca 48 63 d2 48 63 ff b8 b3 00 00 00 0f 05 <48> 89 c7 e8 c1 eb ff ff 5a c3 48 63 ff b8 bb 00 00 00 0f 05 48 89
[  155.037112] RSP: 002b:00007ffe8cd7b050 EFLAGS: 00000206 ORIG_RAX: 00000000000000b3
[  155.037116] RAX: ffffffffffffffda RBX: 00007ffe8cd7b148 RCX: 00007fd812fe1198
[  155.037119] RDX: 0000000000000000 RSI: 00007ffe8cd7cea9 RDI: 0000000000580102
[  155.037121] RBP: 00007ffe8cd7b0f0 R08: 000055fc8eba8a9d R09: 0000000000000000
[  155.037124] R10: 00007ffe8cd7b074 R11: 0000000000000206 R12: 00007ffe8cd7b168
[  155.037126] R13: 000055fc8eba8897 R14: 0000000000000000 R15: 0000000000000000
[  155.037131] ---[ end trace 210f864257175c51 ]---

and then the syscall proceeds without s_umount locking.

This patch locks the superblock ->s_umount sem. in exclusive mode for all Q_XQUOTAON/OFF
quotactls too in addition to Q_QUOTAON/OFF.

AFAICT, other than ext4, only xfs and ocfs2 are affected by this change.
The VFS will now call in xfs_quota_* functions with s_umount held, which wasn't the case
before. This looks good to me but I can not say for sure. Ext4 and ocfs2 where already
beeing called with s_umount exclusive via quota_quotaon/off which is basically the same.

Signed-off-by: Javier Barrio <javier.barrio.mart@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/quota/quota.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index f0cbf58ad4da..fd5dd806f1b9 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -791,7 +791,8 @@ static int quotactl_cmd_write(int cmd)
 /* Return true if quotactl command is manipulating quota on/off state */
 static bool quotactl_cmd_onoff(int cmd)
 {
-	return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF);
+	return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) ||
+		 (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF);
 }
 
 /*
-- 
cgit v1.2.3


From c9dcb871b1a99301ba65685be95cbd93a5c540fe Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 21 Dec 2018 08:42:50 -0800
Subject: iomap: don't search past page end in iomap_is_partially_uptodate

[ Upstream commit 3cc31fa65d85610574c0f6a474e89f4c419923d5 ]

iomap_is_partially_uptodate() is intended to check wither blocks within
the selected range of a not-uptodate page are uptodate; if the range we
care about is up to date, it's an optimization.

However, the iomap implementation continues to check all blocks up to
from+count, which is beyond the page, and can even be well beyond the
iop->uptodate bitmap.

I think the worst that will happen is that we may eventually find a zero
bit and return "not partially uptodate" when it would have otherwise
returned true, and skip the optimization.  Still, it's clearly an invalid
memory access that must be fixed.

So: fix this by limiting the search to within the page as is done in the
non-iomap variant, block_is_partially_uptodate().

Zorro noticed thiswhen KASAN went off for 512 byte blocks on a 64k
page system:

 BUG: KASAN: slab-out-of-bounds in iomap_is_partially_uptodate+0x1a0/0x1e0
 Read of size 8 at addr ffff800120c3a318 by task fsstress/22337

Reported-by: Zorro Lang <zlang@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/iomap.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/iomap.c b/fs/iomap.c
index ec15cf2ec696..e57fb1e534c5 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -488,16 +488,29 @@ done:
 }
 EXPORT_SYMBOL_GPL(iomap_readpages);
 
+/*
+ * iomap_is_partially_uptodate checks whether blocks within a page are
+ * uptodate or not.
+ *
+ * Returns true if all blocks which correspond to a file portion
+ * we want to read within the page are uptodate.
+ */
 int
 iomap_is_partially_uptodate(struct page *page, unsigned long from,
 		unsigned long count)
 {
 	struct iomap_page *iop = to_iomap_page(page);
 	struct inode *inode = page->mapping->host;
-	unsigned first = from >> inode->i_blkbits;
-	unsigned last = (from + count - 1) >> inode->i_blkbits;
+	unsigned len, first, last;
 	unsigned i;
 
+	/* Limit range to one page */
+	len = min_t(unsigned, PAGE_SIZE - from, count);
+
+	/* First and last blocks in range within page */
+	first = from >> inode->i_blkbits;
+	last = (from + len - 1) >> inode->i_blkbits;
+
 	if (iop) {
 		for (i = first; i <= last; i++)
 			if (!test_bit(i, iop->uptodate))
-- 
cgit v1.2.3


From 5a404f39f8fad2249387dfd0d15c43942f6bb4fb Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Fri, 28 Dec 2018 00:32:50 -0800
Subject: ocfs2: fix panic due to unrecovered local alloc

[ Upstream commit 532e1e54c8140188e192348c790317921cb2dc1c ]

mount.ocfs2 ignore the inconsistent error that journal is clean but
local alloc is unrecovered.  After mount, local alloc not empty, then
reserver cluster didn't alloc a new local alloc window, reserveration
map is empty(ocfs2_reservation_map.m_bitmap_len = 0), that triggered the
following panic.

This issue was reported at

  https://oss.oracle.com/pipermail/ocfs2-devel/2015-May/010854.html

and was advised to fixed during mount.  But this is a very unusual
inconsistent state, usually journal dirty flag should be cleared at the
last stage of umount until every other things go right.  We may need do
further debug to check that.  Any way to avoid possible futher
corruption, mount should be abort and fsck should be run.

  (mount.ocfs2,1765,1):ocfs2_load_local_alloc:353 ERROR: Local alloc hasn't been recovered!
  found = 6518, set = 6518, taken = 8192, off = 15912372
  ocfs2: Mounting device (202,64) on (node 0, slot 3) with ordered data mode.
  o2dlm: Joining domain 89CEAC63CC4F4D03AC185B44E0EE0F3F ( 0 1 2 3 4 5 6 8 ) 8 nodes
  ocfs2: Mounting device (202,80) on (node 0, slot 3) with ordered data mode.
  o2hb: Region 89CEAC63CC4F4D03AC185B44E0EE0F3F (xvdf) is now a quorum device
  o2net: Accepted connection from node yvwsoa17p (num 7) at 172.22.77.88:7777
  o2dlm: Node 7 joins domain 64FE421C8C984E6D96ED12C55FEE2435 ( 0 1 2 3 4 5 6 7 8 ) 9 nodes
  o2dlm: Node 7 joins domain 89CEAC63CC4F4D03AC185B44E0EE0F3F ( 0 1 2 3 4 5 6 7 8 ) 9 nodes
  ------------[ cut here ]------------
  kernel BUG at fs/ocfs2/reservations.c:507!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: ocfs2 rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs fscache lockd grace ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sunrpc ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 ovmapi ppdev parport_pc parport xen_netfront fb_sys_fops sysimgblt sysfillrect syscopyarea acpi_cpufreq pcspkr i2c_piix4 i2c_core sg ext4 jbd2 mbcache2 sr_mod cdrom xen_blkfront pata_acpi ata_generic ata_piix floppy dm_mirror dm_region_hash dm_log dm_mod
  CPU: 0 PID: 4349 Comm: startWebLogic.s Not tainted 4.1.12-124.19.2.el6uek.x86_64 #2
  Hardware name: Xen HVM domU, BIOS 4.4.4OVM 09/06/2018
  task: ffff8803fb04e200 ti: ffff8800ea4d8000 task.ti: ffff8800ea4d8000
  RIP: 0010:[<ffffffffa05e96a8>]  [<ffffffffa05e96a8>] __ocfs2_resv_find_window+0x498/0x760 [ocfs2]
  Call Trace:
    ocfs2_resmap_resv_bits+0x10d/0x400 [ocfs2]
    ocfs2_claim_local_alloc_bits+0xd0/0x640 [ocfs2]
    __ocfs2_claim_clusters+0x178/0x360 [ocfs2]
    ocfs2_claim_clusters+0x1f/0x30 [ocfs2]
    ocfs2_convert_inline_data_to_extents+0x634/0xa60 [ocfs2]
    ocfs2_write_begin_nolock+0x1c6/0x1da0 [ocfs2]
    ocfs2_write_begin+0x13e/0x230 [ocfs2]
    generic_perform_write+0xbf/0x1c0
    __generic_file_write_iter+0x19c/0x1d0
    ocfs2_file_write_iter+0x589/0x1360 [ocfs2]
    __vfs_write+0xb8/0x110
    vfs_write+0xa9/0x1b0
    SyS_write+0x46/0xb0
    system_call_fastpath+0x18/0xd7
  Code: ff ff 8b 75 b8 39 75 b0 8b 45 c8 89 45 98 0f 84 e5 fe ff ff 45 8b 74 24 18 41 8b 54 24 1c e9 56 fc ff ff 85 c0 0f 85 48 ff ff ff <0f> 0b 48 8b 05 cf c3 de ff 48 ba 00 00 00 00 00 00 00 10 48 85
  RIP   __ocfs2_resv_find_window+0x498/0x760 [ocfs2]
   RSP <ffff8800ea4db668>
  ---[ end trace 566f07529f2edf3c ]---
  Kernel panic - not syncing: Fatal exception
  Kernel Offset: disabled

Link: http://lkml.kernel.org/r/20181121020023.3034-2-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Acked-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ocfs2/localalloc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 7642b6712c39..30208233f65b 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 	if (num_used
 	    || alloc->id1.bitmap1.i_used
 	    || alloc->id1.bitmap1.i_total
-	    || la->la_bm_off)
-		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
+	    || la->la_bm_off) {
+		mlog(ML_ERROR, "inconsistent detected, clean journal with"
+		     " unrecovered local alloc, please run fsck.ocfs2!\n"
 		     "found = %u, set = %u, taken = %u, off = %u\n",
 		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
 		     le32_to_cpu(alloc->id1.bitmap1.i_total),
 		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
 
+		status = -EINVAL;
+		goto bail;
+	}
+
 	osb->local_alloc_bh = alloc_bh;
 	osb->local_alloc_state = OCFS2_LA_ENABLED;
 
-- 
cgit v1.2.3


From 2011eb74180356d56afa47c238b8861f8e45ecf6 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Fri, 28 Dec 2018 00:38:47 -0800
Subject: userfaultfd: clear flag if remap event not enabled

[ Upstream commit 3cfd22be0ad663248fadfc8f6ffa3e255c394552 ]

When the process being tracked does mremap() without
UFFD_FEATURE_EVENT_REMAP on the corresponding tracking uffd file handle,
we should not generate the remap event, and at the same time we should
clear all the uffd flags on the new VMA.  Without this patch, we can still
have the VM_UFFD_MISSING|VM_UFFD_WP flags on the new VMA even the fault
handling process does not even know the existance of the VMA.

Link: http://lkml.kernel.org/r/20181211053409.20317-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Hugh Dickins <hughd@google.com>
Cc: Pavel Emelyanov <xemul@virtuozzo.com>
Cc: Pravin Shedge <pravin.shedge4linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/userfaultfd.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7a85e609fc27..d8b8323e80f4 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -736,10 +736,18 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	struct userfaultfd_ctx *ctx;
 
 	ctx = vma->vm_userfaultfd_ctx.ctx;
-	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
+
+	if (!ctx)
+		return;
+
+	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
 		WRITE_ONCE(ctx->mmap_changing, true);
+	} else {
+		/* Drop uffd context if remap feature not enabled */
+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
 	}
 }
 
-- 
cgit v1.2.3


From 3e05ceedf1439eea0a1306e30935ef5ee65e5d4f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 10 Jan 2019 15:41:09 +0800
Subject: ceph: clear inode pointer when snap realm gets dropped by its inode

commit d95e674c01cfb5461e8b9fdeebf6d878c9b80b2f upstream.

snap realm and corresponding inode have pointers to each other.
The two pointer should get clear at the same time. Otherwise,
snap realm's pointer may reference freed inode.

Cc: stable@vger.kernel.org # 4.17+
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Luis Henriques <lhenriques@suse.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/caps.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index eadffaa39f4e..c7542e8dd096 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1030,6 +1030,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci)
 	list_del_init(&ci->i_snap_realm_item);
 	ci->i_snap_realm_counter++;
 	ci->i_snap_realm = NULL;
+	if (realm->ino == ci->i_vino.ino)
+		realm->inode = NULL;
 	spin_unlock(&realm->inodes_with_caps_lock);
 	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
 			    realm);
-- 
cgit v1.2.3


From a719cbe07847346f6f42a6133e231603d752c69d Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 1 Jan 2019 18:54:26 +0900
Subject: inotify: Fix fd refcount leak in inotify_add_watch().

commit 125892edfe69915a227d8d125ff0e1cd713178f4 upstream.

Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for
inotify_add_watch()") forgot to call fdput() before bailing out.

Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()")
CC: stable@vger.kernel.org
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/notify/inotify/inotify_user.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ac6978d3208c..780bba695453 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -724,8 +724,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 		return -EBADF;
 
 	/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
-	if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
-		return -EINVAL;
+	if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
 
 	/* verify that this is indeed an inotify instance */
 	if (unlikely(f.file->f_op != &inotify_fops)) {
-- 
cgit v1.2.3


From 07b9e5e35e8f4189a0a3b87beaca1094d5c18b24 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Thu, 17 Jan 2019 08:21:24 -0800
Subject: CIFS: Fix possible hang during async MTU reads and writes

commit acc58d0bab55a50e02c25f00bd6a210ee121595f upstream.

When doing MTU i/o we need to leave some credits for
possible reopen requests and other operations happening
in parallel. Currently we leave 1 credit which is not
enough even for reopen only: we need at least 2 credits
if durable handle reconnect fails. Also there may be
other operations at the same time including compounding
ones which require 3 credits at a time each. Fix this
by leaving 8 credits which is big enough to cover most
scenarios.

Was able to reproduce this when server was configured
to give out fewer credits than usual.

The proper fix would be to reconnect a file handle first
and then obtain credits for an MTU request but this leads
to bigger code changes and should happen in other patches.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2ops.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f44bb4a304e9..0c2bbeab5f64 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -154,14 +154,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
 
 			scredits = server->credits;
 			/* can deadlock with reopen */
-			if (scredits == 1) {
+			if (scredits <= 8) {
 				*num = SMB2_MAX_BUFFER_SIZE;
 				*credits = 0;
 				break;
 			}
 
-			/* leave one credit for a possible reopen */
-			scredits--;
+			/* leave some credits for reopen and other ops */
+			scredits -= 8;
 			*num = min_t(unsigned int, size,
 				     scredits * SMB2_MAX_BUFFER_SIZE);
 
-- 
cgit v1.2.3


From 0380ed9b1cd39a582cd14b21072d59fb66bc7d50 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Thu, 17 Jan 2019 15:29:26 -0800
Subject: CIFS: Fix credits calculations for reads with errors

commit 8004c78c68e894e4fd5ac3c22cc22eb7dc24cabc upstream.

Currently we mark MID as malformed if we get an error from server
in a read response. This leads to not properly processing credits
in the readv callback. Fix this by marking such a response as
normal received response and process it appropriately.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/cifssmb.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 5657b79dbc99..269471c8f42b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1458,18 +1458,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server)
 }
 
 static int
-cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+		     bool malformed)
 {
 	int length;
-	struct cifs_readdata *rdata = mid->callback_data;
 
 	length = cifs_discard_remaining_data(server);
-	dequeue_mid(mid, rdata->result);
+	dequeue_mid(mid, malformed);
 	mid->resp_buf = server->smallbuf;
 	server->smallbuf = NULL;
 	return length;
 }
 
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	struct cifs_readdata *rdata = mid->callback_data;
+
+	return  __cifs_readv_discard(server, mid, rdata->result);
+}
+
 int
 cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 {
@@ -1511,12 +1519,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 		return -1;
 	}
 
+	/* set up first two iov for signature check and to get credits */
+	rdata->iov[0].iov_base = buf;
+	rdata->iov[0].iov_len = 4;
+	rdata->iov[1].iov_base = buf + 4;
+	rdata->iov[1].iov_len = server->total_read - 4;
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+	cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
 	/* Was the SMB read successful? */
 	rdata->result = server->ops->map_error(buf, false);
 	if (rdata->result != 0) {
 		cifs_dbg(FYI, "%s: server returned error %d\n",
 			 __func__, rdata->result);
-		return cifs_readv_discard(server, mid);
+		/* normal error on read response */
+		return __cifs_readv_discard(server, mid, false);
 	}
 
 	/* Is there enough to get to the rest of the READ_RSP header? */
@@ -1560,14 +1579,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 		server->total_read += length;
 	}
 
-	/* set up first iov for signature check */
-	rdata->iov[0].iov_base = buf;
-	rdata->iov[0].iov_len = 4;
-	rdata->iov[1].iov_base = buf + 4;
-	rdata->iov[1].iov_len = server->total_read - 4;
-	cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n",
-		 rdata->iov[0].iov_base, server->total_read);
-
 	/* how much data is in the response? */
 #ifdef CONFIG_CIFS_SMB_DIRECT
 	use_rdma_mr = rdata->mr;
-- 
cgit v1.2.3


From 2ae6fedbd5cb4144f5cf40681b9a3ea5464d31bd Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Fri, 18 Jan 2019 15:38:11 -0800
Subject: CIFS: Fix credit calculation for encrypted reads with errors

commit ec678eae746dd25766a61c4095e2b649d3b20b09 upstream.

We do need to account for credits received in error responses
to read requests on encrypted sessions.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2ops.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 0c2bbeab5f64..ffb55c174482 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2901,11 +2901,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
 			server->ops->is_status_pending(buf, server, 0))
 		return -1;
 
-	rdata->result = server->ops->map_error(buf, false);
+	/* set up first two iov to get credits */
+	rdata->iov[0].iov_base = buf;
+	rdata->iov[0].iov_len = 4;
+	rdata->iov[1].iov_base = buf + 4;
+	rdata->iov[1].iov_len =
+		min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4;
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+	cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[1].iov_base, rdata->iov[1].iov_len);
+
+	rdata->result = server->ops->map_error(buf, true);
 	if (rdata->result != 0) {
 		cifs_dbg(FYI, "%s: server returned error %d\n",
 			 __func__, rdata->result);
-		dequeue_mid(mid, rdata->result);
+		/* normal error on read response */
+		dequeue_mid(mid, false);
 		return 0;
 	}
 
@@ -2978,14 +2990,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
 		return 0;
 	}
 
-	/* set up first iov for signature check */
-	rdata->iov[0].iov_base = buf;
-	rdata->iov[0].iov_len = 4;
-	rdata->iov[1].iov_base = buf + 4;
-	rdata->iov[1].iov_len = server->vals->read_rsp_size - 4;
-	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
-		 rdata->iov[0].iov_base, server->vals->read_rsp_size);
-
 	length = rdata->copy_into_pages(server, rdata, &iter);
 
 	kfree(bvec);
-- 
cgit v1.2.3


From 779c65bb77391b9e096a0cd1d22c039e9a133911 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Fri, 18 Jan 2019 17:25:36 -0800
Subject: CIFS: Do not reconnect TCP session in add_credits()

commit ef68e831840c40c7d01b328b3c0f5d8c4796c232 upstream.

When executing add_credits() we currently call cifs_reconnect()
if the number of credits is zero and there are no requests in
flight. In this case we may call cifs_reconnect() recursively
twice and cause memory corruption given the following sequence
of functions:

mid1.callback() -> add_credits() -> cifs_reconnect() ->
-> mid2.callback() -> add_credits() -> cifs_reconnect().

Fix this by avoiding to call cifs_reconnect() in add_credits()
and checking for zero credits in the demultiplex thread.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/connect.c | 21 +++++++++++++++++++++
 fs/cifs/smb2ops.c | 32 +++++++++++++++++++++++++-------
 2 files changed, 46 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 52d71b64c0c6..d0bba175117c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -533,6 +533,21 @@ server_unresponsive(struct TCP_Server_Info *server)
 	return false;
 }
 
+static inline bool
+zero_credits(struct TCP_Server_Info *server)
+{
+	int val;
+
+	spin_lock(&server->req_lock);
+	val = server->credits + server->echo_credits + server->oplock_credits;
+	if (server->in_flight == 0 && val == 0) {
+		spin_unlock(&server->req_lock);
+		return true;
+	}
+	spin_unlock(&server->req_lock);
+	return false;
+}
+
 static int
 cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
 {
@@ -545,6 +560,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
 	for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
 		try_to_freeze();
 
+		/* reconnect if no credits and no requests in flight */
+		if (zero_credits(server)) {
+			cifs_reconnect(server);
+			return -ECONNABORTED;
+		}
+
 		if (server_unresponsive(server))
 			return -ECONNABORTED;
 		if (cifs_rdma_enabled(server) && server->smbd_conn)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index ffb55c174482..237d7281ada3 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -34,6 +34,7 @@
 #include "cifs_ioctl.h"
 #include "smbdirect.h"
 
+/* Change credits for different ops and return the total number of credits */
 static int
 change_conf(struct TCP_Server_Info *server)
 {
@@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server)
 	server->oplock_credits = server->echo_credits = 0;
 	switch (server->credits) {
 	case 0:
-		return -1;
+		return 0;
 	case 1:
 		server->echoes = false;
 		server->oplocks = false;
-		cifs_dbg(VFS, "disabling echoes and oplocks\n");
 		break;
 	case 2:
 		server->echoes = true;
 		server->oplocks = false;
 		server->echo_credits = 1;
-		cifs_dbg(FYI, "disabling oplocks\n");
 		break;
 	default:
 		server->echoes = true;
@@ -64,14 +63,15 @@ change_conf(struct TCP_Server_Info *server)
 		server->echo_credits = 1;
 	}
 	server->credits -= server->echo_credits + server->oplock_credits;
-	return 0;
+	return server->credits + server->echo_credits + server->oplock_credits;
 }
 
 static void
 smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
 		 const int optype)
 {
-	int *val, rc = 0;
+	int *val, rc = -1;
+
 	spin_lock(&server->req_lock);
 	val = server->ops->get_credits_field(server, optype);
 	*val += add;
@@ -95,8 +95,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add,
 	}
 	spin_unlock(&server->req_lock);
 	wake_up(&server->request_q);
-	if (rc)
-		cifs_reconnect(server);
+
+	if (server->tcpStatus == CifsNeedReconnect)
+		return;
+
+	switch (rc) {
+	case -1:
+		/* change_conf hasn't been executed */
+		break;
+	case 0:
+		cifs_dbg(VFS, "Possible client or server bug - zero credits\n");
+		break;
+	case 1:
+		cifs_dbg(VFS, "disabling echoes and oplocks\n");
+		break;
+	case 2:
+		cifs_dbg(FYI, "disabling oplocks\n");
+		break;
+	default:
+		cifs_dbg(FYI, "add %u credits total=%d\n", add, rc);
+	}
 }
 
 static void
-- 
cgit v1.2.3


From 06d9f987201feeada1e48e501db5f4a202541db3 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Wed, 23 Jan 2019 16:20:38 +1000
Subject: smb3: add credits we receive from oplock/break PDUs

commit 2e5700bdde438ed708b36d8acd0398dc73cbf759 upstream.

Otherwise we gradually leak credits leading to potential
hung session.

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
CC: Stable <stable@vger.kernel.org>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2misc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 6a9c47541c53..7b8b58fb4d3f 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -648,6 +648,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 	if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK)
 		return false;
 
+	if (rsp->sync_hdr.CreditRequest) {
+		spin_lock(&server->req_lock);
+		server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest);
+		spin_unlock(&server->req_lock);
+		wake_up(&server->request_q);
+	}
+
 	if (rsp->StructureSize !=
 				smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
 		if (le16_to_cpu(rsp->StructureSize) == 44)
-- 
cgit v1.2.3


From c6961288a5f465d7753408a8b6b8158907476ea6 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Sat, 26 Jan 2019 12:21:32 -0800
Subject: CIFS: Do not count -ENODATA as failure for query directory

commit 8e6e72aeceaaed5aeeb1cb43d3085de7ceb14f79 upstream.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index dba986524917..ff13f8325540 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3714,8 +3714,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 		    rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) {
 			srch_inf->endOfSearch = true;
 			rc = 0;
-		}
-		cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
+		} else
+			cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
 		goto qdir_exit;
 	}
 
-- 
cgit v1.2.3


From 6e7045ec336be8eafa26270efe013e5a17371cf6 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Fri, 25 Jan 2019 11:38:53 -0800
Subject: CIFS: Fix trace command logging for SMB2 reads and writes

commit 7d42e72fe8ee5ab70b1af843dd7d8615e6fb0abe upstream.

Currently we log success once we send an async IO request to
the server. Instead we need to analyse a response and then log
success or failure for a particular command. Also fix argument
list for read logging.

Cc: <stable@vger.kernel.org> # 4.18
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index ff13f8325540..f5a8139f61c0 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3127,8 +3127,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
 		rdata->mr = NULL;
 	}
 #endif
-	if (rdata->result)
+	if (rdata->result) {
 		cifs_stats_fail_inc(tcon, SMB2_READ_HE);
+		trace_smb3_read_err(0 /* xid */,
+				    rdata->cfile->fid.persistent_fid,
+				    tcon->tid, tcon->ses->Suid, rdata->offset,
+				    rdata->bytes, rdata->result);
+	} else
+		trace_smb3_read_done(0 /* xid */,
+				     rdata->cfile->fid.persistent_fid,
+				     tcon->tid, tcon->ses->Suid,
+				     rdata->offset, rdata->got_bytes);
 
 	queue_work(cifsiod_wq, &rdata->work);
 	DeleteMidQEntry(mid);
@@ -3203,13 +3212,11 @@ smb2_async_readv(struct cifs_readdata *rdata)
 	if (rc) {
 		kref_put(&rdata->refcount, cifs_readdata_release);
 		cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
-		trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid,
-				   io_parms.tcon->tid, io_parms.tcon->ses->Suid,
-				   io_parms.offset, io_parms.length);
-	} else
-		trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid,
-				   io_parms.tcon->tid, io_parms.tcon->ses->Suid,
-				   io_parms.offset, io_parms.length);
+		trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid,
+				    io_parms.tcon->tid,
+				    io_parms.tcon->ses->Suid,
+				    io_parms.offset, io_parms.length, rc);
+	}
 
 	cifs_small_buf_release(buf);
 	return rc;
@@ -3253,10 +3260,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 		if (rc != -ENODATA) {
 			cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE);
 			cifs_dbg(VFS, "Send error in read = %d\n", rc);
+			trace_smb3_read_err(xid, req->PersistentFileId,
+					    io_parms->tcon->tid, ses->Suid,
+					    io_parms->offset, io_parms->length,
+					    rc);
 		}
-		trace_smb3_read_err(rc, xid, req->PersistentFileId,
-				    io_parms->tcon->tid, ses->Suid,
-				    io_parms->offset, io_parms->length);
 		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 		return rc == -ENODATA ? 0 : rc;
 	} else
@@ -3342,8 +3350,17 @@ smb2_writev_callback(struct mid_q_entry *mid)
 		wdata->mr = NULL;
 	}
 #endif
-	if (wdata->result)
+	if (wdata->result) {
 		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
+		trace_smb3_write_err(0 /* no xid */,
+				     wdata->cfile->fid.persistent_fid,
+				     tcon->tid, tcon->ses->Suid, wdata->offset,
+				     wdata->bytes, wdata->result);
+	} else
+		trace_smb3_write_done(0 /* no xid */,
+				      wdata->cfile->fid.persistent_fid,
+				      tcon->tid, tcon->ses->Suid,
+				      wdata->offset, wdata->bytes);
 
 	queue_work(cifsiod_wq, &wdata->work);
 	DeleteMidQEntry(mid);
@@ -3485,10 +3502,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
 				     wdata->bytes, rc);
 		kref_put(&wdata->refcount, release);
 		cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
-	} else
-		trace_smb3_write_done(0 /* no xid */, req->PersistentFileId,
-				     tcon->tid, tcon->ses->Suid, wdata->offset,
-				     wdata->bytes);
+	}
 
 async_writev_out:
 	cifs_small_buf_release(req);
-- 
cgit v1.2.3


From e9d56f920bb26eee1a1e574463e7a1fea5a1164e Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Fri, 18 Jan 2019 15:54:34 -0800
Subject: CIFS: Do not consider -ENODATA as stat failure for reads

commit 082aaa8700415f6471ec9c5ef0c8307ca214989a upstream.

When doing reads beyound the end of a file the server returns
error STATUS_END_OF_FILE error which is mapped to -ENODATA.
Currently we report it as a failure which confuses read stats.
Change it to not consider -ENODATA as failure for stat purposes.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index f5a8139f61c0..8a01e89ff827 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3127,7 +3127,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
 		rdata->mr = NULL;
 	}
 #endif
-	if (rdata->result) {
+	if (rdata->result && rdata->result != -ENODATA) {
 		cifs_stats_fail_inc(tcon, SMB2_READ_HE);
 		trace_smb3_read_err(0 /* xid */,
 				    rdata->cfile->fid.persistent_fid,
-- 
cgit v1.2.3


From bb4e1ff5a8dde1f036e2aefcbdbe098f4098c92d Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Wed, 30 Jan 2019 13:52:36 -0500
Subject: fs/dcache: Fix incorrect nr_dentry_unused accounting in
 shrink_dcache_sb()

commit 1dbd449c9943e3145148cc893c2461b72ba6fef0 upstream.

The nr_dentry_unused per-cpu counter tracks dentries in both the LRU
lists and the shrink lists where the DCACHE_LRU_LIST bit is set.

The shrink_dcache_sb() function moves dentries from the LRU list to a
shrink list and subtracts the dentry count from nr_dentry_unused.  This
is incorrect as the nr_dentry_unused count will also be decremented in
shrink_dentry_list() via d_shrink_del().

To fix this double decrement, the decrement in the shrink_dcache_sb()
function is taken out.

Fixes: 4e717f5c1083 ("list_lru: remove special case function list_lru_dispose_all."
Cc: stable@kernel.org
Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dcache.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 2e7e8d85e9b4..cb515f183482 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1202,15 +1202,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-	long freed;
-
 	do {
 		LIST_HEAD(dispose);
 
-		freed = list_lru_walk(&sb->s_dentry_lru,
+		list_lru_walk(&sb->s_dentry_lru,
 			dentry_lru_isolate_shrink, &dispose, 1024);
-
-		this_cpu_sub(nr_dentry_unused, freed);
 		shrink_dentry_list(&dispose);
 	} while (list_lru_count(&sb->s_dentry_lru) > 0);
 }
-- 
cgit v1.2.3


From 0a3275d785858074d5e462222c7182b34a89bdc9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trondmy@gmail.com>
Date: Tue, 29 Jan 2019 15:52:55 -0500
Subject: NFS: Fix up return value on fatal errors in nfs_page_async_flush()

commit 8fc75bed96bb94e23ca51bd9be4daf65c57697bf upstream.

Ensure that we return the fatal error value that caused us to exit
nfs_page_async_flush().

Fixes: c373fff7bd25 ("NFSv4: Don't special case "launder"")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: stable@vger.kernel.org # v4.12+
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/write.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 586726a590d8..d790faff8e47 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 	nfs_set_page_writeback(page);
 	WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
 
-	ret = 0;
+	ret = req->wb_context->error;
 	/* If there is a fatal error that covers this write, just exit */
-	if (nfs_error_is_fatal_on_server(req->wb_context->error))
+	if (nfs_error_is_fatal_on_server(ret))
 		goto out_launder;
 
+	ret = 0;
 	if (!nfs_pageio_add_request(pgio, req)) {
 		ret = pgio->pg_error;
 		/*
@@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 			nfs_context_set_write_error(req->wb_context, ret);
 			if (nfs_error_is_fatal_on_server(ret))
 				goto out_launder;
-		}
+		} else
+			ret = -EAGAIN;
 		nfs_redirty_request(req);
-		ret = -EAGAIN;
 	} else
 		nfs_add_stats(page_file_mapping(page)->host,
 				NFSIOS_WRITEPAGES, 1);
-- 
cgit v1.2.3


From 8b9be9db8a2ac5ec2046cc95df35487e5aaa136c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 30 Jan 2019 21:30:36 +0100
Subject: gfs2: Revert "Fix loop in gfs2_rbm_find"

commit e74c98ca2d6ae4376cc15fa2a22483430909d96b upstream.

This reverts commit 2d29f6b96d8f80322ed2dd895bca590491c38d34.

It turns out that the fix can lead to a ~20 percent performance regression
in initial writes to the page cache according to iozone.  Let's revert this
for now to have more time for a proper fix.

Cc: stable@vger.kernel.org # v3.13+
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/rgrp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index e978f6930575..449d0cb45a84 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1747,9 +1747,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
 			goto next_iter;
 		}
 		if (ret == -E2BIG) {
-			n += rbm->bii - initial_bii;
 			rbm->bii = 0;
 			rbm->offset = 0;
+			n += (rbm->bii - initial_bii);
 			goto res_covered_end_of_rgrp;
 		}
 		return ret;
-- 
cgit v1.2.3


From 5bce143671f36cf63f70eab2185b2c5910203873 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 25 Jan 2019 11:48:51 +0000
Subject: Btrfs: fix deadlock when allocating tree block during leaf/node split

commit a6279470762c19ba97e454f90798373dccdf6148 upstream.

When splitting a leaf or node from one of the trees that are modified when
flushing pending block groups (extent, chunk, device and free space trees),
we need to allocate a new tree block, which in turn can result in the need
to allocate a new block group. After allocating the new block group we may
need to flush new block groups that were previously allocated during the
course of the current transaction, which is what may cause a deadlock due
to attempts to write lock twice the same leaf or node, as when splitting
a leaf or node we are holding a write lock on it and its parent node.

The same type of deadlock can also happen when increasing the tree's
height, since we are holding a lock on the existing root while allocating
the tree block to use as the new root node.

An example trace when the deadlock happens during the leaf split path is:

  [27175.293054] CPU: 0 PID: 3005 Comm: kworker/u17:6 Tainted: G        W         4.19.16 #1
  [27175.293942] Hardware name: Penguin Computing Relion 1900/MD90-FS0-ZB-XX, BIOS R15 06/25/2018
  [27175.294846] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
  (...)
  [27175.298384] RSP: 0018:ffffab2087107758 EFLAGS: 00010246
  [27175.299269] RAX: 0000000000000bbd RBX: ffff9fadc7141c48 RCX: 0000000000000001
  [27175.300155] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9fadc7141c48
  [27175.301023] RBP: 0000000000000001 R08: ffff9faeb6ac1040 R09: ffff9fa9c0000000
  [27175.301887] R10: 0000000000000000 R11: 0000000000000040 R12: ffff9fb21aac8000
  [27175.302743] R13: ffff9fb1a64d6a20 R14: 0000000000000001 R15: ffff9fb1a64d6a18
  [27175.303601] FS:  0000000000000000(0000) GS:ffff9fb21fa00000(0000) knlGS:0000000000000000
  [27175.304468] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [27175.305339] CR2: 00007fdc8743ead8 CR3: 0000000763e0a006 CR4: 00000000003606f0
  [27175.306220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [27175.307087] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [27175.307940] Call Trace:
  [27175.308802]  btrfs_search_slot+0x779/0x9a0 [btrfs]
  [27175.309669]  ? update_space_info+0xba/0xe0 [btrfs]
  [27175.310534]  btrfs_insert_empty_items+0x67/0xc0 [btrfs]
  [27175.311397]  btrfs_insert_item+0x60/0xd0 [btrfs]
  [27175.312253]  btrfs_create_pending_block_groups+0xee/0x210 [btrfs]
  [27175.313116]  do_chunk_alloc+0x25f/0x300 [btrfs]
  [27175.313984]  find_free_extent+0x706/0x10d0 [btrfs]
  [27175.314855]  btrfs_reserve_extent+0x9b/0x1d0 [btrfs]
  [27175.315707]  btrfs_alloc_tree_block+0x100/0x5b0 [btrfs]
  [27175.316548]  split_leaf+0x130/0x610 [btrfs]
  [27175.317390]  btrfs_search_slot+0x94d/0x9a0 [btrfs]
  [27175.318235]  btrfs_insert_empty_items+0x67/0xc0 [btrfs]
  [27175.319087]  alloc_reserved_file_extent+0x84/0x2c0 [btrfs]
  [27175.319938]  __btrfs_run_delayed_refs+0x596/0x1150 [btrfs]
  [27175.320792]  btrfs_run_delayed_refs+0xed/0x1b0 [btrfs]
  [27175.321643]  delayed_ref_async_start+0x81/0x90 [btrfs]
  [27175.322491]  normal_work_helper+0xd0/0x320 [btrfs]
  [27175.323328]  ? move_linked_works+0x6e/0xa0
  [27175.324160]  process_one_work+0x191/0x370
  [27175.324976]  worker_thread+0x4f/0x3b0
  [27175.325763]  kthread+0xf8/0x130
  [27175.326531]  ? rescuer_thread+0x320/0x320
  [27175.327284]  ? kthread_create_worker_on_cpu+0x50/0x50
  [27175.328027]  ret_from_fork+0x35/0x40
  [27175.328741] ---[ end trace 300a1b9f0ac30e26 ]---

Fix this by preventing the flushing of new blocks groups when splitting a
leaf/node and when inserting a new root node for one of the trees modified
by the flushing operation, similar to what is done when COWing a node/leaf
from on of these trees.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202383
Reported-by: Eli V <eliventer@gmail.com>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ctree.c | 78 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 50 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7ad6f2eec711..48ac8b7c43a5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1003,6 +1003,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static struct extent_buffer *alloc_tree_block_no_bg_flush(
+					  struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  u64 parent_start,
+					  const struct btrfs_disk_key *disk_key,
+					  int level,
+					  u64 hint,
+					  u64 empty_size)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct extent_buffer *ret;
+
+	/*
+	 * If we are COWing a node/leaf from the extent, chunk, device or free
+	 * space trees, make sure that we do not finish block group creation of
+	 * pending block groups. We do this to avoid a deadlock.
+	 * COWing can result in allocation of a new chunk, and flushing pending
+	 * block groups (btrfs_create_pending_block_groups()) can be triggered
+	 * when finishing allocation of a new chunk. Creation of a pending block
+	 * group modifies the extent, chunk, device and free space trees,
+	 * therefore we could deadlock with ourselves since we are holding a
+	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
+	 * try to COW later.
+	 * For similar reasons, we also need to delay flushing pending block
+	 * groups when splitting a leaf or node, from one of those trees, since
+	 * we are holding a write lock on it and its parent or when inserting a
+	 * new root node for one of those trees.
+	 */
+	if (root == fs_info->extent_root ||
+	    root == fs_info->chunk_root ||
+	    root == fs_info->dev_root ||
+	    root == fs_info->free_space_root)
+		trans->can_flush_pending_bgs = false;
+
+	ret = btrfs_alloc_tree_block(trans, root, parent_start,
+				     root->root_key.objectid, disk_key, level,
+				     hint, empty_size);
+	trans->can_flush_pending_bgs = true;
+
+	return ret;
+}
+
 /*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -1050,28 +1092,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
 		parent_start = parent->start;
 
-	/*
-	 * If we are COWing a node/leaf from the extent, chunk, device or free
-	 * space trees, make sure that we do not finish block group creation of
-	 * pending block groups. We do this to avoid a deadlock.
-	 * COWing can result in allocation of a new chunk, and flushing pending
-	 * block groups (btrfs_create_pending_block_groups()) can be triggered
-	 * when finishing allocation of a new chunk. Creation of a pending block
-	 * group modifies the extent, chunk, device and free space trees,
-	 * therefore we could deadlock with ourselves since we are holding a
-	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
-	 * try to COW later.
-	 */
-	if (root == fs_info->extent_root ||
-	    root == fs_info->chunk_root ||
-	    root == fs_info->dev_root ||
-	    root == fs_info->free_space_root)
-		trans->can_flush_pending_bgs = false;
-
-	cow = btrfs_alloc_tree_block(trans, root, parent_start,
-			root->root_key.objectid, &disk_key, level,
-			search_start, empty_size);
-	trans->can_flush_pending_bgs = true;
+	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
+					   level, search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -3383,8 +3405,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-				   &lower_key, level, root->node->start, 0);
+	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
+					 root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -3513,8 +3535,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	mid = (c_nritems + 1) / 2;
 	btrfs_node_key(c, &disk_key, mid);
 
-	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-			&disk_key, level, c->start, 0);
+	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
+					     c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -4298,8 +4320,8 @@ again:
 	else
 		btrfs_item_key(l, &disk_key, mid);
 
-	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
-			&disk_key, 0, l->start, 0);
+	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
+					     l->start, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
-- 
cgit v1.2.3


From 9ee5987f311fc2df9b75fe08fac51cea520f8e84 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 30 Jan 2019 07:54:12 -0600
Subject: btrfs: On error always free subvol_name in btrfs_mount

commit 532b618bdf237250d6d4566536d4b6ce3d0a31fe upstream.

The subvol_name is allocated in btrfs_parse_subvol_options and is
consumed and freed in mount_subvol.  Add a free to the error paths that
don't call mount_subvol so that it is guaranteed that subvol_name is
freed when an error happens.

Fixes: 312c89fbca06 ("btrfs: cleanup btrfs_mount() using btrfs_mount_root()")
Cc: stable@vger.kernel.org # v4.19+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/super.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8ad145820ea8..8888337a95b6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1677,6 +1677,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 				flags | SB_RDONLY, device_name, data);
 			if (IS_ERR(mnt_root)) {
 				root = ERR_CAST(mnt_root);
+				kfree(subvol_name);
 				goto out;
 			}
 
@@ -1686,12 +1687,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 			if (error < 0) {
 				root = ERR_PTR(error);
 				mntput(mnt_root);
+				kfree(subvol_name);
 				goto out;
 			}
 		}
 	}
 	if (IS_ERR(mnt_root)) {
 		root = ERR_CAST(mnt_root);
+		kfree(subvol_name);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From c0be624777ba33bac9bf17e2f9854e90ac6d78d8 Mon Sep 17 00:00:00 2001
From: Paulo Alcantara <paulo@paulo.ac>
Date: Tue, 20 Nov 2018 15:16:36 -0200
Subject: cifs: Always resolve hostname before reconnecting

commit 28eb24ff75c5ac130eb326b3b4d0dcecfc0f427d upstream.

In case a hostname resolves to a different IP address (e.g. long
running mounts), make sure to resolve it every time prior to calling
generic_ip_connect() in reconnect.

Suggested-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Paulo Alcantara <palcantara@suse.de>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/connect.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d0bba175117c..a5ea742654aa 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -50,6 +50,7 @@
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "dns_resolve.h"
 #include "ntlmssp.h"
 #include "nterr.h"
 #include "rfc1002pdu.h"
@@ -317,6 +318,53 @@ static void cifs_prune_tlinks(struct work_struct *work);
 static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 					const char *devname, bool is_smb3);
 
+/*
+ * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may
+ * get their ip addresses changed at some point.
+ *
+ * This should be called with server->srv_mutex held.
+ */
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+	int rc;
+	int len;
+	char *unc, *ipaddr = NULL;
+
+	if (!server->hostname)
+		return -EINVAL;
+
+	len = strlen(server->hostname) + 3;
+
+	unc = kmalloc(len, GFP_KERNEL);
+	if (!unc) {
+		cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__);
+		return -ENOMEM;
+	}
+	snprintf(unc, len, "\\\\%s", server->hostname);
+
+	rc = dns_resolve_server_name_to_ip(unc, &ipaddr);
+	kfree(unc);
+
+	if (rc < 0) {
+		cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n",
+			 __func__, server->hostname, rc);
+		return rc;
+	}
+
+	rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
+				  strlen(ipaddr));
+	kfree(ipaddr);
+
+	return !rc ? -1 : 0;
+}
+#else
+static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
+{
+	return 0;
+}
+#endif
+
 /*
  * cifs tcp session reconnection
  *
@@ -417,6 +465,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
 			rc = generic_ip_connect(server);
 		if (rc) {
 			cifs_dbg(FYI, "reconnect error %d\n", rc);
+			rc = reconn_set_ipaddr(server);
+			if (rc) {
+				cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+					 __func__, rc);
+			}
 			mutex_unlock(&server->srv_mutex);
 			msleep(3000);
 		} else {
-- 
cgit v1.2.3


From ee9268a9b55b82bcc77c64bf137eb9aa7cf87aa9 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 8 Nov 2018 14:04:50 -0500
Subject: dlm: Don't swamp the CPU with callbacks queued during recovery

[ Upstream commit 216f0efd19b9cc32207934fd1b87a45f2c4c593e ]

Before this patch, recovery would cause all callbacks to be delayed,
put on a queue, and afterward they were all queued to the callback
work queue. This patch does the same thing, but occasionally takes
a break after 25 of them so it won't swamp the CPU at the expense
of other RT processes like corosync.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/dlm/ast.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 562fa8c3edff..47ee66d70109 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -292,6 +292,8 @@ void dlm_callback_suspend(struct dlm_ls *ls)
 		flush_workqueue(ls->ls_callback_wq);
 }
 
+#define MAX_CB_QUEUE 25
+
 void dlm_callback_resume(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb, *safe;
@@ -302,15 +304,23 @@ void dlm_callback_resume(struct dlm_ls *ls)
 	if (!ls->ls_callback_wq)
 		return;
 
+more:
 	mutex_lock(&ls->ls_cb_mutex);
 	list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
 		list_del_init(&lkb->lkb_cb_list);
 		queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
 		count++;
+		if (count == MAX_CB_QUEUE)
+			break;
 	}
 	mutex_unlock(&ls->ls_cb_mutex);
 
 	if (count)
 		log_rinfo(ls, "dlm_callback_resume %d", count);
+	if (count == MAX_CB_QUEUE) {
+		count = 0;
+		cond_resched();
+		goto more;
+	}
 }
 
-- 
cgit v1.2.3


From c96d2b9d35b9e3e75360152d45cfc8d6fa0e521a Mon Sep 17 00:00:00 2001
From: Yunlei He <heyunlei@huawei.com>
Date: Tue, 6 Nov 2018 10:25:29 +0800
Subject: f2fs: move dir data flush to write checkpoint process

[ Upstream commit b61ac5b720146c619c7cdf17eff2551b934399e5 ]

This patch move dir data flush to write checkpoint process, by
doing this, it may reduce some time for dir fsync.

pre:
	-f2fs_do_sync_file enter
		-file_write_and_wait_range  <- flush & wait
		-write_checkpoint
			-do_checkpoint	    <- wait all
	-f2fs_do_sync_file exit

now:
	-f2fs_do_sync_file enter
		-write_checkpoint
			-block_operations   <- flush dir & no wait
			-do_checkpoint	    <- wait all
	-f2fs_do_sync_file exit

Signed-off-by: Yunlei He <heyunlei@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/file.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5474aaa274b9..fd36aa6569dc 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -220,6 +220,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
 
 	trace_f2fs_sync_file_enter(inode);
 
+	if (S_ISDIR(inode->i_mode))
+		goto go_write;
+
 	/* if fdatasync is triggered, let's do in-place-update */
 	if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
 		set_inode_flag(inode, FI_NEED_IPU);
-- 
cgit v1.2.3


From ddab3d0a38a5824462a16a7a123ad4abd165f3ad Mon Sep 17 00:00:00 2001
From: Sheng Yong <shengyong1@huawei.com>
Date: Wed, 14 Nov 2018 19:34:28 +0800
Subject: f2fs: fix race between write_checkpoint and write_begin

[ Upstream commit 2866fb16d67992195b0526d19e65acb6640fb87f ]

The following race could lead to inconsistent SIT bitmap:

Task A                          Task B
======                          ======
f2fs_write_checkpoint
  block_operations
    f2fs_lock_all
      down_write(node_change)
      down_write(node_write)
      ... sync ...
      up_write(node_change)
                                f2fs_file_write_iter
                                  set_inode_flag(FI_NO_PREALLOC)
                                  ......
                                  f2fs_write_begin(index=0, has inline data)
                                    prepare_write_begin
                                      __do_map_lock(AIO) => down_read(node_change)
                                      f2fs_convert_inline_page => update SIT
                                      __do_map_lock(AIO) => up_read(node_change)
  f2fs_flush_sit_entries <= inconsistent SIT
  finish write checkpoint
  sudden-power-off

If SPO occurs after checkpoint is finished, SIT bitmap will be set
incorrectly.

Signed-off-by: Sheng Yong <shengyong1@huawei.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/data.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 11f28342f641..08314fb42652 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2259,6 +2259,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
 	bool locked = false;
 	struct extent_info ei = {0,0,0};
 	int err = 0;
+	int flag;
 
 	/*
 	 * we already allocated all the blocks, so we don't need to get
@@ -2268,9 +2269,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
 			!is_inode_flag_set(inode, FI_NO_PREALLOC))
 		return 0;
 
+	/* f2fs_lock_op avoids race between write CP and convert_inline_page */
+	if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode))
+		flag = F2FS_GET_BLOCK_DEFAULT;
+	else
+		flag = F2FS_GET_BLOCK_PRE_AIO;
+
 	if (f2fs_has_inline_data(inode) ||
 			(pos & PAGE_MASK) >= i_size_read(inode)) {
-		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true);
+		__do_map_lock(sbi, flag, true);
 		locked = true;
 	}
 restart:
@@ -2308,6 +2315,7 @@ restart:
 				f2fs_put_dnode(&dn);
 				__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
 								true);
+				WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
 				locked = true;
 				goto restart;
 			}
@@ -2321,7 +2329,7 @@ out:
 	f2fs_put_dnode(&dn);
 unlock_out:
 	if (locked)
-		__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
+		__do_map_lock(sbi, flag, false);
 	return err;
 }
 
-- 
cgit v1.2.3


From d54d612a73a6105201ed8662e283665d079ff6ab Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <kernelpatch@126.com>
Date: Wed, 21 Nov 2018 07:21:38 +0800
Subject: f2fs: fix wrong return value of f2fs_acl_create

[ Upstream commit f6176473a0c7472380eef72ebeb330cf9485bf0a ]

When call f2fs_acl_create_masq() failed, the caller f2fs_acl_create()
should return -EIO instead of -ENOMEM, this patch makes it consistent
with posix_acl_create() which has been fixed in commit beaf226b863a
("posix_acl: don't ignore return value of posix_acl_create_masq()").

Fixes: 83dfe53c185e ("f2fs: fix reference leaks in f2fs_acl_create")
Signed-off-by: Tiezhu Yang <kernelpatch@126.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/acl.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 111824199a88..b9fe937a3c70 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -352,12 +352,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
 		return PTR_ERR(p);
 
 	clone = f2fs_acl_clone(p, GFP_NOFS);
-	if (!clone)
-		goto no_mem;
+	if (!clone) {
+		ret = -ENOMEM;
+		goto release_acl;
+	}
 
 	ret = f2fs_acl_create_masq(clone, mode);
 	if (ret < 0)
-		goto no_mem_clone;
+		goto release_clone;
 
 	if (ret == 0)
 		posix_acl_release(clone);
@@ -371,11 +373,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
 
 	return 0;
 
-no_mem_clone:
+release_clone:
 	posix_acl_release(clone);
-no_mem:
+release_acl:
 	posix_acl_release(p);
-	return -ENOMEM;
+	return ret;
 }
 
 int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
-- 
cgit v1.2.3


From 2f0fb76b3e411462b69e7df2e3cb0063051f2492 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 27 Nov 2018 15:54:17 -0500
Subject: nfsd4: fix crash on writing v4_end_grace before nfsd startup

[ Upstream commit 62a063b8e7d1db684db3f207261a466fa3194e72 ]

Anatoly Trosinenko reports that this:

1) Checkout fresh master Linux branch (tested with commit e195ca6cb)
2) Copy x84_64-config-4.14 to .config, then enable NFS server v4 and build
3) From `kvm-xfstests shell`:

results in NULL dereference in locks_end_grace.

Check that nfsd has been started before trying to end the grace period.

Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfsd/nfsctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7fb9f7c667b1..899174c7a8ae 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,6 +1126,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
 		case 'Y':
 		case 'y':
 		case '1':
+			if (nn->nfsd_serv)
+				return -EBUSY;
 			nfsd4_end_grace(nn);
 			break;
 		default:
-- 
cgit v1.2.3


From 6a1d712b43811fa2992b56108a633531406eac48 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Dec 2018 14:29:20 +0100
Subject: udf: Fix BUG on corrupted inode

[ Upstream commit d288d95842f1503414b7eebce3773bac3390457e ]

When inode is corrupted so that extent type is invalid, some functions
(such as udf_truncate_extents()) will just BUG. Check that extent type
is valid when loading the inode to memory.

Reported-by: Anatoly Trosinenko <anatoly.trosinenko@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/udf/inode.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 5df554a9f9c9..ae796e10f68b 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1357,6 +1357,12 @@ reread:
 
 	iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) &
 							ICBTAG_FLAG_AD_MASK;
+	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT &&
+	    iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG &&
+	    iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
+		ret = -EIO;
+		goto out;
+	}
 	iinfo->i_unique = 0;
 	iinfo->i_lenEAttr = 0;
 	iinfo->i_lenExtents = 0;
-- 
cgit v1.2.3


From 3733632e8bc09971dccf8550e3872fd696d89d92 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Mon, 15 Oct 2018 10:45:17 +0800
Subject: btrfs: harden agaist duplicate fsid on scanned devices

[ Upstream commit a9261d4125c97ce8624e9941b75dee1b43ad5df9 ]

It's not that impossible to imagine that a device OR a btrfs image is
copied just by using the dd or the cp command. Which in case both the
copies of the btrfs will have the same fsid. If on the system with
automount enabled, the copied FS gets scanned.

We have a known bug in btrfs, that we let the device path be changed
after the device has been mounted. So using this loop hole the new
copied device would appears as if its mounted immediately after it's
been copied.

For example:

Initially.. /dev/mmcblk0p4 is mounted as /

  $ lsblk
  NAME        MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
  mmcblk0     179:0    0 29.2G  0 disk
  |-mmcblk0p4 179:4    0    4G  0 part /
  |-mmcblk0p2 179:2    0  500M  0 part /boot
  |-mmcblk0p3 179:3    0  256M  0 part [SWAP]
  `-mmcblk0p1 179:1    0  256M  0 part /boot/efi

  $ btrfs fi show
     Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
     Total devices 1 FS bytes used 1.40GiB
     devid    1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4

Copy mmcblk0 to sda

  $ dd if=/dev/mmcblk0 of=/dev/sda

And immediately after the copy completes the change in the device
superblock is notified which the automount scans using btrfs device scan
and the new device sda becomes the mounted root device.

  $ lsblk
  NAME        MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
  sda           8:0    1 14.9G  0 disk
  |-sda4        8:4    1    4G  0 part /
  |-sda2        8:2    1  500M  0 part
  |-sda3        8:3    1  256M  0 part
  `-sda1        8:1    1  256M  0 part
  mmcblk0     179:0    0 29.2G  0 disk
  |-mmcblk0p4 179:4    0    4G  0 part
  |-mmcblk0p2 179:2    0  500M  0 part /boot
  |-mmcblk0p3 179:3    0  256M  0 part [SWAP]
  `-mmcblk0p1 179:1    0  256M  0 part /boot/efi

  $ btrfs fi show /
    Label: none  uuid: 07892354-ddaa-4443-90ea-f76a06accaba
    Total devices 1 FS bytes used 1.40GiB
    devid    1 size 4.00GiB used 3.00GiB path /dev/sda4

The bug is quite nasty that you can't either unmount /dev/sda4 or
/dev/mmcblk0p4. And the problem does not get solved until you take sda
out of the system on to another system to change its fsid using the
'btrfstune -u' command.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/volumes.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0ee1cd4b56fb..285f64f2de5f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -850,6 +850,35 @@ static noinline struct btrfs_device *device_list_add(const char *path,
 			return ERR_PTR(-EEXIST);
 		}
 
+		/*
+		 * We are going to replace the device path for a given devid,
+		 * make sure it's the same device if the device is mounted
+		 */
+		if (device->bdev) {
+			struct block_device *path_bdev;
+
+			path_bdev = lookup_bdev(path);
+			if (IS_ERR(path_bdev)) {
+				mutex_unlock(&fs_devices->device_list_mutex);
+				return ERR_CAST(path_bdev);
+			}
+
+			if (device->bdev != path_bdev) {
+				bdput(path_bdev);
+				mutex_unlock(&fs_devices->device_list_mutex);
+				btrfs_warn_in_rcu(device->fs_info,
+			"duplicate device fsid:devid for %pU:%llu old:%s new:%s",
+					disk_super->fsid, devid,
+					rcu_str_deref(device->name), path);
+				return ERR_PTR(-EEXIST);
+			}
+			bdput(path_bdev);
+			btrfs_info_in_rcu(device->fs_info,
+				"device fsid %pU devid %llu moved old:%s new:%s",
+				disk_super->fsid, devid,
+				rcu_str_deref(device->name), path);
+		}
+
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name) {
 			mutex_unlock(&fs_devices->device_list_mutex);
-- 
cgit v1.2.3


From f5d5b54349125f4765b51153960676ea4d81b82b Mon Sep 17 00:00:00 2001
From: Ethan Lien <ethanlien@synology.com>
Date: Thu, 1 Nov 2018 14:49:03 +0800
Subject: btrfs: use tagged writepage to mitigate livelock of snapshot

[ Upstream commit 3cd24c698004d2f7668e0eb9fc1f096f533c791b ]

Snapshot is expected to be fast. But if there are writers steadily
creating dirty pages in our subvolume, the snapshot may take a very long
time to complete. To fix the problem, we use tagged writepage for
snapshot flusher as we do in the generic write_cache_pages(), so we can
omit pages dirtied after the snapshot command.

This does not change the semantics regarding which data get to the
snapshot, if there are pages being dirtied during the snapshotting
operation.  There's a sync called before snapshot is taken in old/new
case, any IO in flight just after that may be in the snapshot but this
depends on other system effects that might still sync the IO.

We do a simple snapshot speed test on a Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
--direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched:  6.54sec

This is the best case for this patch since for a sequential write case,
we omit nearly all pages dirtied after the snapshot command.

For a multi writers, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
--direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched:  10.35sec

The improvement is smaller compared to the sequential write case,
since we omit only half of the pages dirtied after snapshot command.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Ethan Lien <ethanlien@synology.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent_io.c   | 17 +++++++++++++++--
 fs/btrfs/inode.c       | 11 +++++++----
 fs/btrfs/ioctl.c       |  2 +-
 5 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7177d1d33584..45f5cf9cd203 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ enum {
 	BTRFS_INODE_IN_DELALLOC_LIST,
 	BTRFS_INODE_READDIO_NEED_LOCK,
 	BTRFS_INODE_HAS_PROPS,
+	BTRFS_INODE_SNAPSHOT_FLUSH,
 };
 
 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..82682da5a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode, u64 new_size,
 			       u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..79f82f2ec4d5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3928,12 +3928,25 @@ static int extent_write_cache_pages(struct address_space *mapping,
 			range_whole = 1;
 		scanned = 1;
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
+
+	/*
+	 * We do the tagged writepage as long as the snapshot flush bit is set
+	 * and we are the first one who do the filemap_flush() on this inode.
+	 *
+	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
+	 * not race in and drop the bit.
+	 */
+	if (range_whole && wbc->nr_to_write == LONG_MAX &&
+	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+			       &BTRFS_I(inode)->runtime_flags))
+		wbc->tagged_writepages = 1;
+
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && !nr_to_write_done && (index <= end) &&
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 83b3a626c796..59f361f7d0c1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10005,7 +10005,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -10033,6 +10033,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
 		}
 		spin_unlock(&root->delalloc_lock);
 
+		if (snapshot)
+			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
+				&binode->runtime_flags);
 		work = btrfs_alloc_delalloc_work(inode);
 		if (!work) {
 			iput(inode);
@@ -10066,7 +10069,7 @@ out:
 	return ret;
 }
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -10074,7 +10077,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	ret = start_delalloc_inodes(root, -1);
+	ret = start_delalloc_inodes(root, -1, true);
 	if (ret > 0)
 		ret = 0;
 	return ret;
@@ -10103,7 +10106,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
 			       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = start_delalloc_inodes(root, nr);
+		ret = start_delalloc_inodes(root, nr, false);
 		btrfs_put_fs_root(root);
 		if (ret < 0)
 			goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c9152155fcbf..8bf9cce11213 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -778,7 +778,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	wait_event(root->subv_writers->wait,
 		   percpu_counter_sum(&root->subv_writers->counter) == 0);
 
-	ret = btrfs_start_delalloc_inodes(root);
+	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret)
 		goto dec_and_free;
 
-- 
cgit v1.2.3


From 8c642d71906b54820ee30c91dbbdd593eabf796e Mon Sep 17 00:00:00 2001
From: Chris Perl <cperl@janestreet.com>
Date: Mon, 17 Dec 2018 10:56:38 -0500
Subject: NFS: nfs_compare_mount_options always compare auth flavors.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 594d1644cd59447f4fceb592448d5cd09eb09b5e ]

This patch removes the check from nfs_compare_mount_options to see if a
`sec' option was passed for the current mount before comparing auth
flavors and instead just always compares auth flavors.

Consider the following scenario:

You have a server with the address 192.168.1.1 and two exports /export/a
and /export/b.  The first export supports `sys' and `krb5' security, the
second just `sys'.

Assume you start with no mounts from the server.

The following results in EIOs being returned as the kernel nfs client
incorrectly thinks it can share the underlying `struct nfs_server's:

$ mkdir /tmp/{a,b}
$ sudo mount -t nfs -o vers=3,sec=krb5 192.168.1.1:/export/a /tmp/a
$ sudo mount -t nfs -o vers=3          192.168.1.1:/export/b /tmp/b
$ df >/dev/null
df: ‘/tmp/b’: Input/output error

Signed-off-by: Chris Perl <cperl@janestreet.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/super.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ac4b2f005778..5ef2c71348bd 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2409,8 +2409,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
 		goto Ebusy;
 	if (a->acdirmax != b->acdirmax)
 		goto Ebusy;
-	if (b->auth_info.flavor_len > 0 &&
-	   clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
+	if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
 		goto Ebusy;
 	return 1;
 Ebusy:
-- 
cgit v1.2.3


From 5d3b4cd8734b41ea04e384d231a09244cfcaebca Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Thu, 13 Dec 2018 08:06:16 +1000
Subject: cifs: check ntwrk_buf_start for NULL before dereferencing it

[ Upstream commit 59a63e479ce36a3f24444c3a36efe82b78e4a8e0 ]

RHBZ: 1021460

There is an issue where when multiple threads open/close the same directory
ntwrk_buf_start might end up being NULL, causing the call to smbCalcSize
later to oops with a NULL deref.

The real bug is why this happens and why this can become NULL for an
open cfile, which should not be allowed.
This patch tries to avoid a oops until the time when we fix the underlying
issue.

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/readdir.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index e169e1a5fd35..3925a7bfc74d 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -655,7 +655,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		/* scan and find it */
 		int i;
 		char *cur_ent;
-		char *end_of_smb = cfile->srch_inf.ntwrk_buf_start +
+		char *end_of_smb;
+
+		if (cfile->srch_inf.ntwrk_buf_start == NULL) {
+			cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n");
+			return -EIO;
+		}
+
+		end_of_smb = cfile->srch_inf.ntwrk_buf_start +
 			server->ops->calc_smb_size(
 					cfile->srch_inf.ntwrk_buf_start,
 					server);
-- 
cgit v1.2.3


From 69e7f87745e70ca8d9b8a72e045738236d367670 Mon Sep 17 00:00:00 2001
From: Sahitya Tummala <stummala@codeaurora.org>
Date: Wed, 26 Dec 2018 11:20:29 +0530
Subject: f2fs: fix use-after-free issue when accessing sbi->stat_info

[ Upstream commit 60aa4d5536ab7fe32433ca1173bd9d6633851f27 ]

iput() on sbi->node_inode can update sbi->stat_info
in the below context, if the f2fs_write_checkpoint()
has failed with error.

f2fs_balance_fs_bg+0x1ac/0x1ec
f2fs_write_node_pages+0x4c/0x260
do_writepages+0x80/0xbc
__writeback_single_inode+0xdc/0x4ac
writeback_single_inode+0x9c/0x144
write_inode_now+0xc4/0xec
iput+0x194/0x22c
f2fs_put_super+0x11c/0x1e8
generic_shutdown_super+0x70/0xf4
kill_block_super+0x2c/0x5c
kill_f2fs_super+0x44/0x50
deactivate_locked_super+0x60/0x8c
deactivate_super+0x68/0x74
cleanup_mnt+0x40/0x78

Fix this by moving f2fs_destroy_stats() further below iput() in
both f2fs_put_super() and f2fs_fill_super() paths.

Signed-off-by: Sahitya Tummala <stummala@codeaurora.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/super.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 338138b34993..c9639ef0e8d5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1039,9 +1039,6 @@ static void f2fs_put_super(struct super_block *sb)
 		f2fs_write_checkpoint(sbi, &cpc);
 	}
 
-	/* f2fs_write_checkpoint can update stat informaion */
-	f2fs_destroy_stats(sbi);
-
 	/*
 	 * normally superblock is clean, so we need to release this.
 	 * In addition, EIO will skip do checkpoint, we need this as well.
@@ -1061,6 +1058,12 @@ static void f2fs_put_super(struct super_block *sb)
 	iput(sbi->node_inode);
 	iput(sbi->meta_inode);
 
+	/*
+	 * iput() can update stat information, if f2fs_write_checkpoint()
+	 * above failed with error.
+	 */
+	f2fs_destroy_stats(sbi);
+
 	/* destroy f2fs internal modules */
 	f2fs_destroy_node_manager(sbi);
 	f2fs_destroy_segment_manager(sbi);
@@ -2980,30 +2983,30 @@ try_onemore:
 
 	f2fs_build_gc_manager(sbi);
 
+	err = f2fs_build_stats(sbi);
+	if (err)
+		goto free_nm;
+
 	/* get an inode for node space */
 	sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
 	if (IS_ERR(sbi->node_inode)) {
 		f2fs_msg(sb, KERN_ERR, "Failed to read node inode");
 		err = PTR_ERR(sbi->node_inode);
-		goto free_nm;
+		goto free_stats;
 	}
 
-	err = f2fs_build_stats(sbi);
-	if (err)
-		goto free_node_inode;
-
 	/* read root inode and dentry */
 	root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
 	if (IS_ERR(root)) {
 		f2fs_msg(sb, KERN_ERR, "Failed to read root inode");
 		err = PTR_ERR(root);
-		goto free_stats;
+		goto free_node_inode;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks ||
 			!root->i_size || !root->i_nlink) {
 		iput(root);
 		err = -EINVAL;
-		goto free_stats;
+		goto free_node_inode;
 	}
 
 	sb->s_root = d_make_root(root); /* allocate root dentry */
@@ -3121,12 +3124,12 @@ free_sysfs:
 free_root_inode:
 	dput(sb->s_root);
 	sb->s_root = NULL;
-free_stats:
-	f2fs_destroy_stats(sbi);
 free_node_inode:
 	f2fs_release_ino_entry(sbi, true);
 	truncate_inode_pages_final(NODE_MAPPING(sbi));
 	iput(sbi->node_inode);
+free_stats:
+	f2fs_destroy_stats(sbi);
 free_nm:
 	f2fs_destroy_node_manager(sbi);
 free_sm:
-- 
cgit v1.2.3


From df13b0369bc007b044b6d3c634892b325f783b83 Mon Sep 17 00:00:00 2001
From: Sahitya Tummala <stummala@codeaurora.org>
Date: Tue, 18 Dec 2018 16:39:24 +0530
Subject: f2fs: fix sbi->extent_list corruption issue

[ Upstream commit e4589fa545e0020dbbc3c9bde35f35f949901392 ]

When there is a failure in f2fs_fill_super() after/during
the recovery of fsync'd nodes, it frees the current sbi and
retries again. This time the mount is successful, but the files
that got recovered before retry, still holds the extent tree,
whose extent nodes list is corrupted since sbi and sbi->extent_list
is freed up. The list_del corruption issue is observed when the
file system is getting unmounted and when those recoverd files extent
node is being freed up in the below context.

list_del corruption. prev->next should be fffffff1e1ef5480, but was (null)
<...>
kernel BUG at kernel/msm-4.14/lib/list_debug.c:53!
lr : __list_del_entry_valid+0x94/0xb4
pc : __list_del_entry_valid+0x94/0xb4
<...>
Call trace:
__list_del_entry_valid+0x94/0xb4
__release_extent_node+0xb0/0x114
__free_extent_tree+0x58/0x7c
f2fs_shrink_extent_tree+0xdc/0x3b0
f2fs_leave_shrinker+0x28/0x7c
f2fs_put_super+0xfc/0x1e0
generic_shutdown_super+0x70/0xf4
kill_block_super+0x2c/0x5c
kill_f2fs_super+0x44/0x50
deactivate_locked_super+0x60/0x8c
deactivate_super+0x68/0x74
cleanup_mnt+0x40/0x78
__cleanup_mnt+0x1c/0x28
task_work_run+0x48/0xd0
do_notify_resume+0x678/0xe98
work_pending+0x8/0x14

Fix this by not creating extents for those recovered files if shrinker is
not registered yet. Once mount is successful and shrinker is registered,
those files can have extents again.

Signed-off-by: Sahitya Tummala <stummala@codeaurora.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/f2fs.h     | 11 ++++++++++-
 fs/f2fs/shrinker.c |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ecb735142276..42aef5c94927 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2613,10 +2613,19 @@ static inline bool is_dot_dotdot(const struct qstr *str)
 
 static inline bool f2fs_may_extent_tree(struct inode *inode)
 {
-	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+	if (!test_opt(sbi, EXTENT_CACHE) ||
 			is_inode_flag_set(inode, FI_NO_EXTENT))
 		return false;
 
+	/*
+	 * for recovered files during mount do not create extents
+	 * if shrinker is not registered.
+	 */
+	if (list_empty(&sbi->s_list))
+		return false;
+
 	return S_ISREG(inode->i_mode);
 }
 
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 36cfd816c160..29042e6d5126 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -138,6 +138,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
 	f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
 
 	spin_lock(&f2fs_list_lock);
-	list_del(&sbi->s_list);
+	list_del_init(&sbi->s_list);
 	spin_unlock(&f2fs_list_lock);
 }
-- 
cgit v1.2.3


From 69e63b49ddf9bc786f1930d5aaa9b8673c326ee8 Mon Sep 17 00:00:00 2001
From: Junxiao Bi <junxiao.bi@oracle.com>
Date: Fri, 28 Dec 2018 00:32:57 -0800
Subject: ocfs2: don't clear bh uptodate for block read

[ Upstream commit 70306d9dce75abde855cefaf32b3f71eed8602a3 ]

For sync io read in ocfs2_read_blocks_sync(), first clear bh uptodate flag
and submit the io, second wait io done, last check whether bh uptodate, if
not return io error.

If two sync io for the same bh were issued, it could be the first io done
and set uptodate flag, but just before check that flag, the second io came
in and cleared uptodate, then ocfs2_read_blocks_sync() for the first io
will return IO error.

Indeed it's not necessary to clear uptodate flag, as the io end handler
end_buffer_read_sync() will set or clear it based on io succeed or failed.

The following message was found from a nfs server but the underlying
storage returned no error.

[4106438.567376] (nfsd,7146,3):ocfs2_get_suballoc_slot_bit:2780 ERROR: read block 1238823695 failed -5
[4106438.567569] (nfsd,7146,3):ocfs2_get_suballoc_slot_bit:2812 ERROR: status = -5
[4106438.567611] (nfsd,7146,3):ocfs2_test_inode_bit:2894 ERROR: get alloc slot and bit failed -5
[4106438.567643] (nfsd,7146,3):ocfs2_test_inode_bit:2932 ERROR: status = -5
[4106438.567675] (nfsd,7146,3):ocfs2_get_dentry:94 ERROR: test inode bit failed -5

Same issue in non sync read ocfs2_read_blocks(), fixed it as well.

Link: http://lkml.kernel.org/r/20181121020023.3034-4-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Changwei Ge <ge.changwei@h3c.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Jun Piao <piaojun@huawei.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ocfs2/buffer_head_io.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 1d098c3c00e0..9f8250df99f1 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -152,7 +152,6 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 #endif
 		}
 
-		clear_buffer_uptodate(bh);
 		get_bh(bh); /* for end_buffer_read_sync() */
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(REQ_OP_READ, 0, bh);
@@ -306,7 +305,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 				continue;
 			}
 
-			clear_buffer_uptodate(bh);
 			get_bh(bh); /* for end_buffer_read_sync() */
 			if (validate)
 				set_buffer_needs_validate(bh);
-- 
cgit v1.2.3


From 066206bc6e9e3cacd4334703a18d2d6b04fe763a Mon Sep 17 00:00:00 2001
From: Larry Chen <lchen@suse.com>
Date: Fri, 28 Dec 2018 00:32:46 -0800
Subject: ocfs2: improve ocfs2 Makefile

[ Upstream commit 9e6aea22802b5684c7e1d69822aeb0844dd01953 ]

Included file path was hard-wired in the ocfs2 makefile, which might
causes some confusion when compiling ocfs2 as an external module.

Say if we compile ocfs2 module as following.
cp -r /kernel/tree/fs/ocfs2 /other/dir/ocfs2
cd /other/dir/ocfs2
make -C /path/to/kernel_source M=`pwd` modules

Acutally, the compiler wil try to find included file in
/kernel/tree/fs/ocfs2, rather than the directory /other/dir/ocfs2.

To fix this little bug, we introduce the var $(src) provided by kbuild.
$(src) means the absolute path of the running kbuild file.

Link: http://lkml.kernel.org/r/20181108085546.15149-1-lchen@suse.com
Signed-off-by: Larry Chen <lchen@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ocfs2/Makefile       | 2 +-
 fs/ocfs2/dlm/Makefile   | 2 +-
 fs/ocfs2/dlmfs/Makefile | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 99ee093182cb..cc9b32b9db7c 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)
 
 obj-$(CONFIG_OCFS2_FS) += 	\
 	ocfs2.o			\
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index bd1aab1f49a4..ef2854422a6e 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)/..
 
 obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
 
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index eed3db8c5b49..33431a0296a3 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
-ccflags-y := -Ifs/ocfs2
+ccflags-y := -I$(src)/..
 
 obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
 
-- 
cgit v1.2.3


From 9cb8f8088d9a105af7e763a315771e39eb8c16a4 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Thu, 3 Jan 2019 15:27:09 -0800
Subject: fs/epoll: drop ovflist branch prediction

[ Upstream commit 76699a67f3041ff4c7af6d6ee9be2bfbf1ffb671 ]

The ep->ovflist is a secondary ready-list to temporarily store events
that might occur when doing sproc without holding the ep->wq.lock.  This
accounts for every time we check for ready events and also send events
back to userspace; both callbacks, particularly the latter because of
copy_to_user, can account for a non-trivial time.

As such, the unlikely() check to see if the pointer is being used, seems
both misleading and sub-optimal.  In fact, we go to an awful lot of
trouble to sync both lists, and populating the ovflist is far from an
uncommon scenario.

For example, profiling a concurrent epoll_wait(2) benchmark, with
CONFIG_PROFILE_ANNOTATED_BRANCHES shows that for a two threads a 33%
incorrect rate was seen; and when incrementally increasing the number of
epoll instances (which is used, for example for multiple queuing load
balancing models), up to a 90% incorrect rate was seen.

Similarly, by deleting the prediction, 3% throughput boost was seen
across incremental threads.

Link: http://lkml.kernel.org/r/20181108051006.18751-4-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jason Baron <jbaron@akamai.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/eventpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 42bbe6824b4b..58f48ea0db23 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1154,7 +1154,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
 	 * semantics). All the events that happen during that period of time are
 	 * chained in ep->ovflist and requeued later on.
 	 */
-	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+	if (ep->ovflist != EP_UNACTIVE_PTR) {
 		if (epi->next == EP_UNACTIVE_PTR) {
 			epi->next = ep->ovflist;
 			ep->ovflist = epi;
-- 
cgit v1.2.3


From ab5f7407125778bb05274f4a6999c67c4a72d927 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 3 Jan 2019 15:28:07 -0800
Subject: exec: load_script: don't blindly truncate shebang string

[ Upstream commit 8099b047ecc431518b9bb6bdbba3549bbecdc343 ]

load_script() simply truncates bprm->buf and this is very wrong if the
length of shebang string exceeds BINPRM_BUF_SIZE-2.  This can silently
truncate i_arg or (worse) we can execute the wrong binary if buf[2:126]
happens to be the valid executable path.

Change load_script() to return ENOEXEC if it can't find '\n' or zero in
bprm->buf.  Note that '\0' can come from either
prepare_binprm()->memset() or from kernel_read(), we do not care.

Link: http://lkml.kernel.org/r/20181112160931.GA28463@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Ben Woodard <woodard@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/binfmt_script.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index 7cde3f46ad26..d0078cbb718b 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -42,10 +42,14 @@ static int load_script(struct linux_binprm *bprm)
 	fput(bprm->file);
 	bprm->file = NULL;
 
-	bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
-	if ((cp = strchr(bprm->buf, '\n')) == NULL)
-		cp = bprm->buf+BINPRM_BUF_SIZE-1;
+	for (cp = bprm->buf+2;; cp++) {
+		if (cp >= bprm->buf + BINPRM_BUF_SIZE)
+			return -ENOEXEC;
+		if (!*cp || (*cp == '\n'))
+			break;
+	}
 	*cp = '\0';
+
 	while (cp > bprm->buf) {
 		cp--;
 		if ((*cp == ' ') || (*cp == '\t'))
-- 
cgit v1.2.3


From 62c7c0a8709b254060cb5f76e9ed6a39f6aae20f Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Mon, 4 Feb 2019 08:54:18 -0800
Subject: xfs: Fix xqmstats offsets in /proc/fs/xfs/xqmstat

commit 41657e5507b13e963be906d5d874f4f02374fd5c upstream.

The addition of FIBT, RMAP and REFCOUNT changed the offsets into
__xfssats structure.

This caused xqmstat_proc_show() to display garbage data via
/proc/fs/xfs/xqmstat, once it relies on the offsets marked via macros.

Fix it.

Fixes: 00f4e4f9 xfs: add rmap btree stats infrastructure
Fixes: aafc3c24 xfs: support the XFS_BTNUM_FINOBT free inode btree type
Fixes: 46eeb521 xfs: introduce refcount btree definitions
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_stats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 4e4423153071..740ac9674848 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -119,7 +119,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
 	int j;
 
 	seq_printf(m, "qm");
-	for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+	for (j = XFSSTAT_END_REFCOUNT; j < XFSSTAT_END_XQMSTAT; j++)
 		seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
 	seq_putc(m, '\n');
 	return 0;
-- 
cgit v1.2.3


From a585ac0e767b26e74b0156f014331b48b56f0974 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 4 Feb 2019 08:54:19 -0800
Subject: xfs: cancel COW blocks before swapext

commit 96987eea537d6ccd98704a71958f9ba02da80843 upstream.

We need to make sure we have no outstanding COW blocks before we swap
extents, as there is nothing preventing us from having preallocated COW
delalloc on either inode that swapext is called on.  That case can
easily be reproduced by running generic/324 in always_cow mode:

[  620.760572] XFS: Assertion failed: tip->i_delayed_blks == 0, file: fs/xfs/xfs_bmap_util.c, line: 1669
[  620.761608] ------------[ cut here ]------------
[  620.762171] kernel BUG at fs/xfs/xfs_message.c:102!
[  620.762732] invalid opcode: 0000 [#1] SMP PTI
[  620.763272] CPU: 0 PID: 24153 Comm: xfs_fsr Tainted: G        W         4.19.0-rc1+ #4182
[  620.764203] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014
[  620.765202] RIP: 0010:assfail+0x20/0x28
[  620.765646] Code: 31 ff e8 83 fc ff ff 0f 0b c3 48 89 f1 41 89 d0 48 c7 c6 48 ca 8d 82 48 89 fa 38
[  620.767758] RSP: 0018:ffffc9000898bc10 EFLAGS: 00010202
[  620.768359] RAX: 0000000000000000 RBX: ffff88012f14ba40 RCX: 0000000000000000
[  620.769174] RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff828560d9
[  620.769982] RBP: ffff88012f14b300 R08: 0000000000000000 R09: 0000000000000000
[  620.770788] R10: 000000000000000a R11: f000000000000000 R12: ffffc9000898bc98
[  620.771638] R13: ffffc9000898bc9c R14: ffff880130b5e2b8 R15: ffff88012a1fa2a8
[  620.772504] FS:  00007fdc36e0fbc0(0000) GS:ffff88013ba00000(0000) knlGS:0000000000000000
[  620.773475] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  620.774168] CR2: 00007fdc3604d000 CR3: 0000000132afc000 CR4: 00000000000006f0
[  620.774978] Call Trace:
[  620.775274]  xfs_swap_extent_forks+0x2a0/0x2e0
[  620.775792]  xfs_swap_extents+0x38b/0xab0
[  620.776256]  xfs_ioc_swapext+0x121/0x140
[  620.776709]  xfs_file_ioctl+0x328/0xc90
[  620.777154]  ? rcu_read_lock_sched_held+0x50/0x60
[  620.777694]  ? xfs_iunlock+0x233/0x260
[  620.778127]  ? xfs_setattr_nonsize+0x3be/0x6a0
[  620.778647]  do_vfs_ioctl+0x9d/0x680
[  620.779071]  ? ksys_fchown+0x47/0x80
[  620.779552]  ksys_ioctl+0x35/0x70
[  620.780040]  __x64_sys_ioctl+0x11/0x20
[  620.780530]  do_syscall_64+0x4b/0x190
[  620.780927]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  620.781467] RIP: 0033:0x7fdc364d0f07
[  620.781900] Code: b3 66 90 48 8b 05 81 5f 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 28
[  620.784044] RSP: 002b:00007ffe2a766038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  620.784896] RAX: ffffffffffffffda RBX: 0000000000000025 RCX: 00007fdc364d0f07
[  620.785667] RDX: 0000560296ca2fc0 RSI: 00000000c0c0586d RDI: 0000000000000005
[  620.786398] RBP: 0000000000000025 R08: 0000000000001200 R09: 0000000000000000
[  620.787283] R10: 0000000000000432 R11: 0000000000000246 R12: 0000000000000005
[  620.788051] R13: 0000000000000000 R14: 0000000000001000 R15: 0000000000000006
[  620.788927] Modules linked in:
[  620.789340] ---[ end trace 9503b7417ffdbdb0 ]---
[  620.790065] RIP: 0010:assfail+0x20/0x28
[  620.790642] Code: 31 ff e8 83 fc ff ff 0f 0b c3 48 89 f1 41 89 d0 48 c7 c6 48 ca 8d 82 48 89 fa 38
[  620.793038] RSP: 0018:ffffc9000898bc10 EFLAGS: 00010202
[  620.793609] RAX: 0000000000000000 RBX: ffff88012f14ba40 RCX: 0000000000000000
[  620.794317] RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff828560d9
[  620.795025] RBP: ffff88012f14b300 R08: 0000000000000000 R09: 0000000000000000
[  620.795778] R10: 000000000000000a R11: f000000000000000 R12: ffffc9000898bc98
[  620.796675] R13: ffffc9000898bc9c R14: ffff880130b5e2b8 R15: ffff88012a1fa2a8
[  620.797782] FS:  00007fdc36e0fbc0(0000) GS:ffff88013ba00000(0000) knlGS:0000000000000000
[  620.798908] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  620.799594] CR2: 00007fdc3604d000 CR3: 0000000132afc000 CR4: 00000000000006f0
[  620.800424] Kernel panic - not syncing: Fatal exception
[  620.801191] Kernel Offset: disabled
[  620.801597] ---[ end Kernel panic - not syncing: Fatal exception ]---

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_bmap_util.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 6de8d90041ff..9d1e5c3a661e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1824,6 +1824,12 @@ xfs_swap_extents(
 	if (error)
 		goto out_unlock;
 
+	if (xfs_inode_has_cow_data(tip)) {
+		error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+		if (error)
+			return error;
+	}
+
 	/*
 	 * Extent "swapping" with rmap requires a permanent reservation and
 	 * a block reservation because it's really just a remap operation
-- 
cgit v1.2.3


From b6095cbd7841fff310c1ad135ae2c7da14df9da4 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 4 Feb 2019 08:54:20 -0800
Subject: xfs: Fix error code in 'xfs_ioc_getbmap()'

commit 132bf6723749f7219c399831eeb286dbbb985429 upstream.

In this function, once 'buf' has been allocated, we unconditionally
return 0.
However, 'error' is set to some error codes in several error handling
paths.
Before commit 232b51948b99 ("xfs: simplify the xfs_getbmap interface")
this was not an issue because all error paths were returning directly,
but now that some cleanup at the end may be needed, we must propagate the
error code.

Fixes: 232b51948b99 ("xfs: simplify the xfs_getbmap interface")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0ef5ece5634c..bad90479ade2 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1616,7 +1616,7 @@ xfs_ioc_getbmap(
 	error = 0;
 out_free_buf:
 	kmem_free(buf);
-	return 0;
+	return error;
 }
 
 struct getfsmap_info {
-- 
cgit v1.2.3


From a96f3a55143a4716ed10088c8637eced24ed1e75 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 4 Feb 2019 08:54:21 -0800
Subject: xfs: fix overflow in xfs_attr3_leaf_verify

commit 837514f7a4ca4aca06aec5caa5ff56d33ef06976 upstream.

generic/070 on 64k block size filesystems is failing with a verifier
corruption on writeback or an attribute leaf block:

[   94.973083] XFS (pmem0): Metadata corruption detected at xfs_attr3_leaf_verify+0x246/0x260, xfs_attr3_leaf block 0x811480
[   94.975623] XFS (pmem0): Unmount and run xfs_repair
[   94.976720] XFS (pmem0): First 128 bytes of corrupted metadata buffer:
[   94.978270] 000000004b2e7b45: 00 00 00 00 00 00 00 00 3b ee 00 00 00 00 00 00  ........;.......
[   94.980268] 000000006b1db90b: 00 00 00 00 00 81 14 80 00 00 00 00 00 00 00 00  ................
[   94.982251] 00000000433f2407: 22 7b 5c 82 2d 5c 47 4c bb 31 1c 37 fa a9 ce d6  "{\.-\GL.1.7....
[   94.984157] 0000000010dc7dfb: 00 00 00 00 00 81 04 8a 00 0a 18 e8 dd 94 01 00  ................
[   94.986215] 00000000d5a19229: 00 a0 dc f4 fe 98 01 68 f0 d8 07 e0 00 00 00 00  .......h........
[   94.988171] 00000000521df36c: 0c 2d 32 e2 fe 20 01 00 0c 2d 58 65 fe 0c 01 00  .-2.. ...-Xe....
[   94.990162] 000000008477ae06: 0c 2d 5b 66 fe 8c 01 00 0c 2d 71 35 fe 7c 01 00  .-[f.....-q5.|..
[   94.992139] 00000000a4a6bca6: 0c 2d 72 37 fc d4 01 00 0c 2d d8 b8 f0 90 01 00  .-r7.....-......
[   94.994789] XFS (pmem0): xfs_do_force_shutdown(0x8) called from line 1453 of file fs/xfs/xfs_buf.c. Return address = ffffffff815365f3

This is failing this check:

                end = ichdr.freemap[i].base + ichdr.freemap[i].size;
                if (end < ichdr.freemap[i].base)
>>>>>                   return __this_address;
                if (end > mp->m_attr_geo->blksize)
                        return __this_address;

And from the buffer output above, the freemap array is:

	freemap[0].base = 0x00a0
	freemap[0].size = 0xdcf4	end = 0xdd94
	freemap[1].base = 0xfe98
	freemap[1].size = 0x0168	end = 0x10000
	freemap[2].base = 0xf0d8
	freemap[2].size = 0x07e0	end = 0xf8b8

These all look valid - the block size is 0x10000 and so from the
last check in the above verifier fragment we know that the end
of freemap[1] is valid. The problem is that end is declared as:

	uint16_t	end;

And (uint16_t)0x10000 = 0. So we have a verifier bug here, not a
corruption. Fix the verifier to use uint32_t types for the check and
hence avoid the overflow.

Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=201577
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/libxfs/xfs_attr_leaf.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6fc5425b1474..2652d00842d6 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -243,7 +243,7 @@ xfs_attr3_leaf_verify(
 	struct xfs_mount		*mp = bp->b_target->bt_mount;
 	struct xfs_attr_leafblock	*leaf = bp->b_addr;
 	struct xfs_attr_leaf_entry	*entries;
-	uint16_t			end;
+	uint32_t			end;	/* must be 32bit - see below */
 	int				i;
 
 	xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
@@ -293,6 +293,11 @@ xfs_attr3_leaf_verify(
 	/*
 	 * Quickly check the freemap information.  Attribute data has to be
 	 * aligned to 4-byte boundaries, and likewise for the free space.
+	 *
+	 * Note that for 64k block size filesystems, the freemap entries cannot
+	 * overflow as they are only be16 fields. However, when checking end
+	 * pointer of the freemap, we have to be careful to detect overflows and
+	 * so use uint32_t for those checks.
 	 */
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
 		if (ichdr.freemap[i].base > mp->m_attr_geo->blksize)
@@ -303,7 +308,9 @@ xfs_attr3_leaf_verify(
 			return __this_address;
 		if (ichdr.freemap[i].size & 0x3)
 			return __this_address;
-		end = ichdr.freemap[i].base + ichdr.freemap[i].size;
+
+		/* be care of 16 bit overflows here */
+		end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size;
 		if (end < ichdr.freemap[i].base)
 			return __this_address;
 		if (end > mp->m_attr_geo->blksize)
-- 
cgit v1.2.3


From c3a66bf4ce403f3d4ff083f6fd75604e7d8dc450 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Mon, 4 Feb 2019 08:54:22 -0800
Subject: xfs: fix shared extent data corruption due to missing cow reservation

commit 59e4293149106fb92530f8e56fa3992d8548c5e6 upstream.

Page writeback indirectly handles shared extents via the existence
of overlapping COW fork blocks. If COW fork blocks exist, writeback
always performs the associated copy-on-write regardless if the
underlying blocks are actually shared. If the blocks are shared,
then overlapping COW fork blocks must always exist.

fstests shared/010 reproduces a case where a buffered write occurs
over a shared block without performing the requisite COW fork
reservation.  This ultimately causes writeback to the shared extent
and data corruption that is detected across md5 checks of the
filesystem across a mount cycle.

The problem occurs when a buffered write lands over a shared extent
that crosses an extent size hint boundary and that also happens to
have a partial COW reservation that doesn't cover the start and end
blocks of the data fork extent.

For example, a buffered write occurs across the file offset (in FSB
units) range of [29, 57]. A shared extent exists at blocks [29, 35]
and COW reservation already exists at blocks [32, 34]. After
accommodating a COW extent size hint of 32 blocks and the existing
reservation at offset 32, xfs_reflink_reserve_cow() allocates 32
blocks of reservation at offset 0 and returns with COW reservation
across the range of [0, 34]. The associated data fork extent is
still [29, 35], however, which isn't fully covered by the COW
reservation.

This leads to a buffered write at file offset 35 over a shared
extent without associated COW reservation. Writeback eventually
kicks in, performs an overwrite of the underlying shared block and
causes the associated data corruption.

Update xfs_reflink_reserve_cow() to accommodate the fact that a
delalloc allocation request may not fully cover the extent in the
data fork. Trim the data fork extent appropriately, just as is done
for shared extent boundaries and/or existing COW reservations that
happen to overlap the start of the data fork extent. This prevents
shared/010 failures due to data corruption on reflink enabled
filesystems.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_reflink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 42ea7bab9144..7088f44c0c59 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -302,6 +302,7 @@ xfs_reflink_reserve_cow(
 	if (error)
 		return error;
 
+	xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
 	trace_xfs_reflink_cow_alloc(ip, &got);
 	return 0;
 }
-- 
cgit v1.2.3


From 5a7455e922b4ee1e92390f2f46ccb5269f79728d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 4 Feb 2019 08:54:23 -0800
Subject: xfs: fix transient reference count error in
 xfs_buf_resubmit_failed_buffers

commit d43aaf1685aa471f0593685c9f54d53e3af3cf3f upstream.

When retrying a failed inode or dquot buffer,
xfs_buf_resubmit_failed_buffers() clears all the failed flags from
the inde/dquot log items. In doing so, it also drops all the
reference counts on the buffer that the failed log items hold. This
means it can drop all the active references on the buffer and hence
free the buffer before it queues it for write again.

Putting the buffer on the delwri queue takes a reference to the
buffer (so that it hangs around until it has been written and
completed), but this goes bang if the buffer has already been freed.

Hence we need to add the buffer to the delwri queue before we remove
the failed flags from the log items attached to the buffer to ensure
it always remains referenced during the resubmit process.

Reported-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_buf_item.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 12d8455bfbb2..010db5f8fb00 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1233,9 +1233,23 @@ xfs_buf_iodone(
 }
 
 /*
- * Requeue a failed buffer for writeback
+ * Requeue a failed buffer for writeback.
  *
- * Return true if the buffer has been re-queued properly, false otherwise
+ * We clear the log item failed state here as well, but we have to be careful
+ * about reference counts because the only active reference counts on the buffer
+ * may be the failed log items. Hence if we clear the log item failed state
+ * before queuing the buffer for IO we can release all active references to
+ * the buffer and free it, leading to use after free problems in
+ * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
+ * order we process them in - the buffer is locked, and we own the buffer list
+ * so nothing on them is going to change while we are performing this action.
+ *
+ * Hence we can safely queue the buffer for IO before we clear the failed log
+ * item state, therefore  always having an active reference to the buffer and
+ * avoiding the transient zero-reference state that leads to use-after-free.
+ *
+ * Return true if the buffer was added to the buffer list, false if it was
+ * already on the buffer list.
  */
 bool
 xfs_buf_resubmit_failed_buffers(
@@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
 	struct list_head	*buffer_list)
 {
 	struct xfs_log_item	*lip;
+	bool			ret;
+
+	ret = xfs_buf_delwri_queue(bp, buffer_list);
 
 	/*
-	 * Clear XFS_LI_FAILED flag from all items before resubmit
-	 *
-	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller  this
+	 * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
 	 * function already have it acquired
 	 */
 	list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
 		xfs_clear_li_failed(lip);
 
-	/* Add this buffer back to the delayed write list */
-	return xfs_buf_delwri_queue(bp, buffer_list);
+	return ret;
 }
-- 
cgit v1.2.3


From 886f0de16239a2596aa08a4fbaed10ee5b21a103 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 4 Feb 2019 08:54:24 -0800
Subject: xfs: delalloc -> unwritten COW fork allocation can go wrong

commit 9230a0b65b47fe6856c4468ec0175c4987e5bede upstream.

Long saga. There have been days spent following this through dead end
after dead end in multi-GB event traces. This morning, after writing
a trace-cmd wrapper that enabled me to be more selective about XFS
trace points, I discovered that I could get just enough essential
tracepoints enabled that there was a 50:50 chance the fsx config
would fail at ~115k ops. If it didn't fail at op 115547, I stopped
fsx at op 115548 anyway.

That gave me two traces - one where the problem manifested, and one
where it didn't. After refining the traces to have the necessary
information, I found that in the failing case there was a real
extent in the COW fork compared to an unwritten extent in the
working case.

Walking back through the two traces to the point where the CWO fork
extents actually diverged, I found that the bad case had an extra
unwritten extent in it. This is likely because the bug it led me to
had triggered multiple times in those 115k ops, leaving stray
COW extents around. What I saw was a COW delalloc conversion to an
unwritten extent (as they should always be through
xfs_iomap_write_allocate()) resulted in a /written extent/:

xfs_writepage:        dev 259:0 ino 0x83 pgoff 0x17000 size 0x79a00 offset 0 length 0
xfs_iext_remove:      dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/2 offset 32 block 152 count 20 flag 1 caller xfs_bmap_add_extent_delay_real
xfs_bmap_pre_update:  dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/1 offset 1 block 4503599627239429 count 31 flag 0 caller xfs_bmap_add_extent_delay_real
xfs_bmap_post_update: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/1 offset 1 block 121 count 51 flag 0 caller xfs_bmap_add_ex

Basically, Cow fork before:

	0 1            32          52
	+H+DDDDDDDDDDDD+UUUUUUUUUUU+
	   PREV		RIGHT

COW delalloc conversion allocates:

	  1	       32
	  +uuuuuuuuuuuu+
	  NEW

And the result according to the xfs_bmap_post_update trace was:

	0 1            32          52
	+H+wwwwwwwwwwwwwwwwwwwwwwww+
	   PREV

Which is clearly wrong - it should be a merged unwritten extent,
not an unwritten extent.

That lead me to look at the LEFT_FILLING|RIGHT_FILLING|RIGHT_CONTIG
case in xfs_bmap_add_extent_delay_real(), and sure enough, there's
the bug.

It takes the old delalloc extent (PREV) and adds the length of the
RIGHT extent to it, takes the start block from NEW, removes the
RIGHT extent and then updates PREV with the new extent.

What it fails to do is update PREV.br_state. For delalloc, this is
always XFS_EXT_NORM, while in this case we are converting the
delayed allocation to unwritten, so it needs to be updated to
XFS_EXT_UNWRITTEN. This LF|RF|RC case does not do this, and so
the resultant extent is always written.

And that's the bug I've been chasing for a week - a bmap btree bug,
not a reflink/dedupe/copy_file_range bug, but a BMBT bug introduced
with the recent in core extent tree scalability enhancements.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/libxfs/xfs_bmap.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a47670332326..3a496ffe6551 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1683,10 +1683,13 @@ xfs_bmap_add_extent_delay_real(
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
 		/*
 		 * Filling in all of a previously delayed allocation extent.
-		 * The right neighbor is contiguous, the left is not.
+		 * The right neighbor is contiguous, the left is not. Take care
+		 * with delay -> unwritten extent allocation here because the
+		 * delalloc record we are overwriting is always written.
 		 */
 		PREV.br_startblock = new->br_startblock;
 		PREV.br_blockcount += RIGHT.br_blockcount;
+		PREV.br_state = new->br_state;
 
 		xfs_iext_next(ifp, &bma->icur);
 		xfs_iext_remove(bma->ip, &bma->icur, state);
-- 
cgit v1.2.3


From 757332c643ca474dae6cfdacb5a1fce68580c82d Mon Sep 17 00:00:00 2001
From: Ye Yin <dbyin@tencent.com>
Date: Mon, 4 Feb 2019 08:54:25 -0800
Subject: fs/xfs: fix f_ffree value for statfs when project quota is set

commit de7243057e7cefa923fa5f467c0f1ec24eef41d2 upsream.

When project is set, we should use inode limit minus the used count

Signed-off-by: Ye Yin <dbyin@tencent.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_qm_bhv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 73a1d77ec187..3091e4bc04ef 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -40,7 +40,7 @@ xfs_fill_statvfs_from_dquot(
 		statp->f_files = limit;
 		statp->f_ffree =
 			(statp->f_files > dqp->q_res_icount) ?
-			 (statp->f_ffree - dqp->q_res_icount) : 0;
+			 (statp->f_files - dqp->q_res_icount) : 0;
 	}
 }
 
-- 
cgit v1.2.3


From c6c20af69c617121331e231f797893d9dca0c9c5 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 4 Feb 2019 08:54:26 -0800
Subject: xfs: fix PAGE_MASK usage in xfs_free_file_space

commit a579121f94aba4e8bad1a121a0fad050d6925296 upstream.

In commit e53c4b598, I *tried* to teach xfs to force writeback when we
fzero/fpunch right up to EOF so that if EOF is in the middle of a page,
the post-EOF part of the page gets zeroed before we return to userspace.
Unfortunately, I missed the part where PAGE_MASK is ~(PAGE_SIZE - 1),
which means that we totally fail to zero if we're fpunching and EOF is
within the first page.  Worse yet, the same PAGE_MASK thinko plagues the
filemap_write_and_wait_range call, so we'd initiate writeback of the
entire file, which (mostly) masked the thinko.

Drop the tricky PAGE_MASK and replace it with correct usage of PAGE_SIZE
and the proper rounding macros.

Fixes: e53c4b598 ("xfs: ensure post-EOF zeroing happens after zeroing part of a file")
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/xfs_bmap_util.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 9d1e5c3a661e..211b06e4702e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1175,9 +1175,9 @@ xfs_free_file_space(
 	 * page could be mmap'd and iomap_zero_range doesn't do that for us.
 	 * Writeback of the eof page will do this, albeit clumsily.
 	 */
-	if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) {
+	if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) {
 		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-				(offset + len) & ~PAGE_MASK, LLONG_MAX);
+				round_down(offset + len, PAGE_SIZE), LLONG_MAX);
 	}
 
 	return error;
-- 
cgit v1.2.3


From 0c802cbaa6db4203fc559829548e19ae5ee8c9c2 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 4 Feb 2019 08:54:27 -0800
Subject: xfs: fix inverted return from xfs_btree_sblock_verify_crc

commit 7d048df4e9b05ba89b74d062df59498aa81f3785 upstream.

xfs_btree_sblock_verify_crc is a bool so should not be returning
a failaddr_t; worse, if xfs_log_check_lsn fails it returns
__this_address which looks like a boolean true (i.e. success)
to the caller.

(interestingly xfs_btree_lblock_verify_crc doesn't have the issue)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/xfs/libxfs/xfs_btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 34c6d7bd4d18..bbdae2b4559f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -330,7 +330,7 @@ xfs_btree_sblock_verify_crc(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
-			return __this_address;
+			return false;
 		return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
 	}
 
-- 
cgit v1.2.3


From 1f78052b0991b6c19cdff093e1e9cad297d291cf Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 1 Feb 2019 09:36:36 -0800
Subject: xfs: eof trim writeback mapping as soon as it is cached

commit aa6ee4ab69293969867ab09b57546d226ace3d7a upstream.

The cached writeback mapping is EOF trimmed to try and avoid races
between post-eof block management and writeback that result in
sending cached data to a stale location. The cached mapping is
currently trimmed on the validation check, which leaves a race
window between the time the mapping is cached and when it is trimmed
against the current inode size.

For example, if a new mapping is cached by delalloc conversion on a
blocksize == page size fs, we could cycle various locks, perform
memory allocations, etc.  in the writeback codepath before the
associated mapping is eventually trimmed to i_size. This leaves
enough time for a post-eof truncate and file append before the
cached mapping is trimmed. The former event essentially invalidates
a range of the cached mapping and the latter bumps the inode size
such the trim on the next writepage event won't trim all of the
invalid blocks. fstest generic/464 reproduces this scenario
occasionally and causes a lost writeback and stale delalloc blocks
warning on inode inactivation.

To work around this problem, trim the cached writeback mapping as
soon as it is cached in addition to on subsequent validation checks.
This is a minor tweak to tighten the race window as much as possible
until a proper invalidation mechanism is available.

Fixes: 40214d128e07 ("xfs: trim writepage mapping to within eof")
Cc: <stable@vger.kernel.org> # v4.14+
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/xfs/xfs_aops.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 49f5f5896a43..b697866946d2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -449,6 +449,7 @@ xfs_map_blocks(
 	}
 
 	wpc->imap = imap;
+	xfs_trim_extent_eof(&wpc->imap, ip);
 	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
 	return 0;
 allocate_blocks:
@@ -459,6 +460,7 @@ allocate_blocks:
 	ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
 	       imap.br_startoff + imap.br_blockcount <= cow_fsb);
 	wpc->imap = imap;
+	xfs_trim_extent_eof(&wpc->imap, ip);
 	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
 	return 0;
 }
-- 
cgit v1.2.3


From 48be0eb05e7ebd0560b60938c5b82c93538877ba Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Sat, 12 Jan 2019 02:39:05 +0100
Subject: fuse: call pipe_buf_release() under pipe lock

commit 9509941e9c534920ccc4771ae70bd6cbbe79df1c upstream.

Some of the pipe_buf_release() handlers seem to assume that the pipe is
locked - in particular, anon_pipe_buf_release() accesses pipe->tmp_page
without taking any extra locks. From a glance through the callers of
pipe_buf_release(), it looks like FUSE is the only one that calls
pipe_buf_release() without having the pipe locked.

This bug should only lead to a memory leak, nothing terrible.

Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device")
Cc: stable@vger.kernel.org
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/dev.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index bf0da0382c9e..c5e38061a48b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2032,8 +2032,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 
 	ret = fuse_dev_do_write(fud, &cs, len);
 
+	pipe_lock(pipe);
 	for (idx = 0; idx < nbuf; idx++)
 		pipe_buf_release(pipe, &bufs[idx]);
+	pipe_unlock(pipe);
 
 out:
 	kvfree(bufs);
-- 
cgit v1.2.3


From f99027ab63bee0cbbde549ff663aef8b1c1944b5 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 16 Jan 2019 10:27:59 +0100
Subject: fuse: decrement NR_WRITEBACK_TEMP on the right page

commit a2ebba824106dabe79937a9f29a875f837e1b6d4 upstream.

NR_WRITEBACK_TEMP is accounted on the temporary page in the request, not
the page cache page.

Fixes: 8b284dc47291 ("fuse: writepages: handle same page rewrites")
Cc: <stable@vger.kernel.org> # v3.13
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fbd6978479cb..bd500c3b7858 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1778,7 +1778,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 		spin_unlock(&fc->lock);
 
 		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
-		dec_node_page_state(page, NR_WRITEBACK_TEMP);
+		dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
 		wb_writeout_inc(&bdi->wb);
 		fuse_writepage_free(fc, new_req);
 		fuse_request_free(new_req);
-- 
cgit v1.2.3


From 6ccc9e1128e50e518ecdf9e1b120c735e40de25a Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 16 Jan 2019 10:27:59 +0100
Subject: fuse: handle zero sized retrieve correctly

commit 97e1532ef81acb31c30f9e75bf00306c33a77812 upstream.

Dereferencing req->page_descs[0] will Oops if req->max_pages is zero.

Reported-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com
Tested-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com
Fixes: b2430d7567a3 ("fuse: add per-page descriptor <offset, length> to fuse_req")
Cc: <stable@vger.kernel.org> # v3.9
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c5e38061a48b..baaed4d05b22 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1698,7 +1698,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 	req->in.h.nodeid = outarg->nodeid;
 	req->in.numargs = 2;
 	req->in.argpages = 1;
-	req->page_descs[0].offset = offset;
 	req->end = fuse_retrieve_end;
 
 	index = outarg->offset >> PAGE_SHIFT;
@@ -1713,6 +1712,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 
 		this_num = min_t(unsigned, num, PAGE_SIZE - offset);
 		req->pages[req->num_pages] = page;
+		req->page_descs[req->num_pages].offset = offset;
 		req->page_descs[req->num_pages].length = this_num;
 		req->num_pages++;
 
-- 
cgit v1.2.3


From c619140d4843e1e54b32fadadc79e0fba909f55d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 23 Jan 2019 11:27:02 +0100
Subject: debugfs: fix debugfs_rename parameter checking

commit d88c93f090f708c18195553b352b9f205e65418f upstream.

debugfs_rename() needs to check that the dentries passed into it really
are valid, as sometimes they are not (i.e. if the return value of
another debugfs call is passed into this one.)  So fix this up by
properly checking if the two parent directories are errors (they are
allowed to be NULL), and if the dentry to rename is not NULL or an
error.

Cc: stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/debugfs/inode.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 13b01351dd1c..41ef452c1fcf 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -787,6 +787,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry,
 	struct dentry *dentry = NULL, *trap;
 	struct name_snapshot old_name;
 
+	if (IS_ERR(old_dir))
+		return old_dir;
+	if (IS_ERR(new_dir))
+		return new_dir;
+	if (IS_ERR_OR_NULL(old_dentry))
+		return old_dentry;
+
 	trap = lock_rename(new_dir, old_dir);
 	/* Source or destination directories don't exist? */
 	if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir))
-- 
cgit v1.2.3


From 28f49e768d212445dc8d5964df6f37df8d73c70d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 31 Jan 2019 23:41:11 -0500
Subject: Revert "ext4: use ext4_write_inode() when fsyncing w/o a journal"

commit 8fdd60f2ae3682caf2a7258626abc21eb4711892 upstream.

This reverts commit ad211f3e94b314a910d4af03178a0b52a7d1ee0a.

As Jan Kara pointed out, this change was unsafe since it means we lose
the call to sync_mapping_buffers() in the nojournal case.  The
original point of the commit was avoid taking the inode mutex (since
it causes a lockdep warning in generic/113); but we need the mutex in
order to call sync_mapping_buffers().

The real fix to this problem was discussed here:

https://lore.kernel.org/lkml/20181025150540.259281-4-bvanassche@acm.org

The proposed patch was to fix a syzbot complaint, but the problem can
also demonstrated via "kvm-xfstests -c nojournal generic/113".
Multiple solutions were discused in the e-mail thread, but none have
landed in the kernel as of this writing.  Anyway, commit
ad211f3e94b314 is absolutely the wrong way to suppress the lockdep, so
revert it.

Fixes: ad211f3e94b314a910d4af03178a0b52a7d1ee0a ("ext4: use ext4_write_inode() when fsyncing w/o a journal")
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reported: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/fsync.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 712f00995390..5508baa11bb6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -116,16 +116,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 
-	ret = file_write_and_wait_range(file, start, end);
-	if (ret)
-		return ret;
-
 	if (!journal) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL
-		};
-
-		ret = ext4_write_inode(inode, &wbc);
+		ret = __generic_file_fsync(file, start, end, datasync);
 		if (!ret)
 			ret = ext4_sync_parent(inode);
 		if (test_opt(inode->i_sb, BARRIER))
@@ -133,6 +125,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 
+	ret = file_write_and_wait_range(file, start, end);
+	if (ret)
+		return ret;
 	/*
 	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-- 
cgit v1.2.3


From c2109f05b7fd5d2e5472f45dbcc343786c504719 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 14 Feb 2019 15:02:18 -0800
Subject: Revert "exec: load_script: don't blindly truncate shebang string"

commit cb5b020a8d38f77209d0472a0fea755299a8ec78 upstream.

This reverts commit 8099b047ecc431518b9bb6bdbba3549bbecdc343.

It turns out that people do actually depend on the shebang string being
truncated, and on the fact that an interpreter (like perl) will often
just re-interpret it entirely to get the full argument list.

Reported-by: Samuel Dionne-Riel <samuel@dionne-riel.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/binfmt_script.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d0078cbb718b..7cde3f46ad26 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -42,14 +42,10 @@ static int load_script(struct linux_binprm *bprm)
 	fput(bprm->file);
 	bprm->file = NULL;
 
-	for (cp = bprm->buf+2;; cp++) {
-		if (cp >= bprm->buf + BINPRM_BUF_SIZE)
-			return -ENOEXEC;
-		if (!*cp || (*cp == '\n'))
-			break;
-	}
+	bprm->buf[BINPRM_BUF_SIZE - 1] = '\0';
+	if ((cp = strchr(bprm->buf, '\n')) == NULL)
+		cp = bprm->buf+BINPRM_BUF_SIZE-1;
 	*cp = '\0';
-
 	while (cp > bprm->buf) {
 		cp--;
 		if ((*cp == ' ') || (*cp == '\t'))
-- 
cgit v1.2.3


From 63715c1f0a67a1e92631fca7f4b4dff89eb6be23 Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Tue, 8 Jan 2019 18:30:56 +0000
Subject: cifs: Limit memory used by lock request calls to a page

[ Upstream commit 92a8109e4d3a34fb6b115c9098b51767dc933444 ]

The code tries to allocate a contiguous buffer with a size supplied by
the server (maxBuf). This could fail if memory is fragmented since it
results in high order allocations for commonly used server
implementations. It is also wasteful since there are probably
few locks in the usual case. Limit the buffer to be no larger than a
page to avoid memory allocation failures due to fragmentation.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/file.c     | 8 ++++++++
 fs/cifs/smb2file.c | 4 ++++
 2 files changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7b637fc27990..23db881daab5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1128,6 +1128,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
 		return -EINVAL;
 	}
 
+	BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+		     PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+			PAGE_SIZE);
 	max_num = (max_buf - sizeof(struct smb_hdr)) /
 						sizeof(LOCKING_ANDX_RANGE);
 	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
@@ -1466,6 +1470,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
 	if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE)))
 		return -EINVAL;
 
+	BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) >
+		     PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr),
+			PAGE_SIZE);
 	max_num = (max_buf - sizeof(struct smb_hdr)) /
 						sizeof(LOCKING_ANDX_RANGE);
 	buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2fc3d31967ee..b204e84b87fb 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -128,6 +128,8 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
 	if (max_buf < sizeof(struct smb2_lock_element))
 		return -EINVAL;
 
+	BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
 	max_num = max_buf / sizeof(struct smb2_lock_element);
 	buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
 	if (!buf)
@@ -264,6 +266,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
 		return -EINVAL;
 	}
 
+	BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE);
+	max_buf = min_t(unsigned int, max_buf, PAGE_SIZE);
 	max_num = max_buf / sizeof(struct smb2_lock_element);
 	buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL);
 	if (!buf) {
-- 
cgit v1.2.3


From b1765ebd9d12c3aa46e1c94764723a373ad57341 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilov@microsoft.com>
Date: Tue, 15 Jan 2019 15:08:48 -0800
Subject: CIFS: Do not assume one credit for async responses

[ Upstream commit 0fd1d37b0501efc6e295f56ab55cdaff784aa50c ]

If we don't receive a response we can't assume that the server
granted one credit. Assume zero credits in such cases.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/smb2pdu.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 8a01e89ff827..1e5a1171212f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2814,9 +2814,10 @@ smb2_echo_callback(struct mid_q_entry *mid)
 {
 	struct TCP_Server_Info *server = mid->callback_data;
 	struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
-	unsigned int credits_received = 1;
+	unsigned int credits_received = 0;
 
-	if (mid->mid_state == MID_RESPONSE_RECEIVED)
+	if (mid->mid_state == MID_RESPONSE_RECEIVED
+	    || mid->mid_state == MID_RESPONSE_MALFORMED)
 		credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
 
 	DeleteMidQEntry(mid);
@@ -3073,7 +3074,7 @@ smb2_readv_callback(struct mid_q_entry *mid)
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct smb2_sync_hdr *shdr =
 				(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
-	unsigned int credits_received = 1;
+	unsigned int credits_received = 0;
 	struct smb_rqst rqst = { .rq_iov = rdata->iov,
 				 .rq_nvec = 2,
 				 .rq_pages = rdata->pages,
@@ -3112,6 +3113,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
 		task_io_account_read(rdata->got_bytes);
 		cifs_stats_bytes_read(tcon, rdata->got_bytes);
 		break;
+	case MID_RESPONSE_MALFORMED:
+		credits_received = le16_to_cpu(shdr->CreditRequest);
+		/* fall through */
 	default:
 		if (rdata->result != -ENODATA)
 			rdata->result = -EIO;
@@ -3305,7 +3309,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
 	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
 	unsigned int written;
 	struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf;
-	unsigned int credits_received = 1;
+	unsigned int credits_received = 0;
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
@@ -3333,6 +3337,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
 	case MID_RETRY_NEEDED:
 		wdata->result = -EAGAIN;
 		break;
+	case MID_RESPONSE_MALFORMED:
+		credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest);
+		/* fall through */
 	default:
 		wdata->result = -EIO;
 		break;
-- 
cgit v1.2.3


From 93769fef8d6122285b94b13fc8872847182f5440 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 14 Feb 2019 12:33:19 -0500
Subject: Revert "nfsd4: return default lease period"

commit 3bf6b57ec2ec945e5a6edf5c202a754f1e852ecd upstream.

This reverts commit d6ebf5088f09472c1136cd506bdc27034a6763f8.

I forgot that the kernel's default lease period should never be
decreased!

After a kernel upgrade, the kernel has no way of knowing on its own what
the previous lease time was.  Unless userspace tells it otherwise, it
will assume the previous lease period was the same.

So if we decrease this value in a kernel upgrade, we end up enforcing a
grace period that's too short, and clients will fail to reclaim state in
time.  Symptoms may include EIO and log messages like "NFS:
nfs4_reclaim_open_state: Lock reclaim failed!"

There was no real justification for the lease period decrease anyway.

Reported-by: Donald Buczek <buczek@molgen.mpg.de>
Fixes: d6ebf5088f09 "nfsd4: return default lease period"
Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfsctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 899174c7a8ae..39b835d7c445 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1239,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net)
 	retval = nfsd_idmap_init(net);
 	if (retval)
 		goto out_idmap_error;
-	nn->nfsd4_lease = 45;	/* default lease time */
-	nn->nfsd4_grace = 45;
+	nn->nfsd4_lease = 90;	/* default lease time */
+	nn->nfsd4_grace = 90;
 	nn->somebody_reclaimed = false;
 	nn->clverifier_counter = prandom_u32();
 	nn->clientid_counter = prandom_u32();
-- 
cgit v1.2.3


From 8d485d3a628bb46a86f232d9960ef64533aaf29d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 12 Feb 2019 15:35:51 -0800
Subject: Revert "mm: don't reclaim inodes with many attached pages"

commit 69056ee6a8a3d576ed31e38b3b14c70d6c74edcc upstream.

This reverts commit a76cf1a474d7d ("mm: don't reclaim inodes with many
attached pages").

This change causes serious changes to page cache and inode cache
behaviour and balance, resulting in major performance regressions when
combining worklaods such as large file copies and kernel compiles.

  https://bugzilla.kernel.org/show_bug.cgi?id=202441

This change is a hack to work around the problems introduced by changing
how agressive shrinkers are on small caches in commit 172b06c32b94 ("mm:
slowly shrink slabs with a relatively small number of objects").  It
creates more problems than it solves, wasn't adequately reviewed or
tested, so it needs to be reverted.

Link: http://lkml.kernel.org/r/20190130041707.27750-2-david@fromorbit.com
Fixes: a76cf1a474d7d ("mm: don't reclaim inodes with many attached pages")
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Cc: Wolfgang Walter <linux@stwm.de>
Cc: Roman Gushchin <guro@fb.com>
Cc: Spock <dairinin@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/inode.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 65ae154df760..42f6d25f32a5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 		return LRU_REMOVED;
 	}
 
-	/*
-	 * Recently referenced inodes and inodes with many attached pages
-	 * get one more pass.
-	 */
-	if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
 		inode->i_state &= ~I_REFERENCED;
 		spin_unlock(&inode->i_lock);
 		return LRU_ROTATE;
-- 
cgit v1.2.3


From dd5f4d067a2cdda47b9f3b6c119642a2b81edee7 Mon Sep 17 00:00:00 2001
From: Sandeep Patil <sspatil@android.com>
Date: Tue, 12 Feb 2019 15:36:11 -0800
Subject: mm: proc: smaps_rollup: fix pss_locked calculation

commit 27dd768ed8db48beefc4d9e006c58e7a00342bde upstream.

The 'pss_locked' field of smaps_rollup was being calculated incorrectly.
It accumulated the current pss everytime a locked VMA was found.  Fix
that by adding to 'pss_locked' the same time as that of 'pss' if the vma
being walked is locked.

Link: http://lkml.kernel.org/r/20190203065425.14650-1-sspatil@android.com
Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup")
Signed-off-by: Sandeep Patil <sspatil@android.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Daniel Colascione <dancol@google.com>
Cc: <stable@vger.kernel.org>	[4.14.x, 4.19.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/task_mmu.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a027473561c6..d76fe166f6ce 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -423,7 +423,7 @@ struct mem_size_stats {
 };
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty)
+		bool compound, bool young, bool dirty, bool locked)
 {
 	int i, nr = compound ? 1 << compound_order(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
@@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 		else
 			mss->private_clean += size;
 		mss->pss += (u64)size << PSS_SHIFT;
+		if (locked)
+			mss->pss_locked += (u64)size << PSS_SHIFT;
 		return;
 	}
 
 	for (i = 0; i < nr; i++, page++) {
 		int mapcount = page_mapcount(page);
+		unsigned long pss = (PAGE_SIZE << PSS_SHIFT);
 
 		if (mapcount >= 2) {
 			if (dirty || PageDirty(page))
 				mss->shared_dirty += PAGE_SIZE;
 			else
 				mss->shared_clean += PAGE_SIZE;
-			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+			mss->pss += pss / mapcount;
+			if (locked)
+				mss->pss_locked += pss / mapcount;
 		} else {
 			if (dirty || PageDirty(page))
 				mss->private_dirty += PAGE_SIZE;
 			else
 				mss->private_clean += PAGE_SIZE;
-			mss->pss += PAGE_SIZE << PSS_SHIFT;
+			mss->pss += pss;
+			if (locked)
+				mss->pss_locked += pss;
 		}
 	}
 }
@@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = walk->vma;
+	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
 
 	if (pte_present(*pte)) {
@@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = walk->vma;
+	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page;
 
 	/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		/* pass */;
 	else
 		VM_BUG_ON_PAGE(1, page);
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 		}
 	}
 #endif
-
 	/* mmap_sem is held in m_start */
 	walk_page_vma(vma, &smaps_walk);
-	if (vma->vm_flags & VM_LOCKED)
-		mss->pss_locked += mss->pss;
 }
 
 #define SEQ_PUT_DEC(str, val) \
-- 
cgit v1.2.3


From b8d7fb1efb960f52730b231f5f8c6c63d11bc00b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 11 Feb 2019 15:18:52 +0800
Subject: ceph: avoid repeatedly adding inode to mdsc->snap_flush_list

commit 04242ff3ac0abbaa4362f97781dac268e6c3541a upstream.

Otherwise, mdsc->snap_flush_list may get corrupted.

Cc: stable@vger.kernel.org
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/snap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 041c27ea8de1..f74193da0e09 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 	     capsnap->size);
 
 	spin_lock(&mdsc->snap_flush_lock);
-	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+	if (list_empty(&ci->i_snap_flush_item))
+		list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
 	spin_unlock(&mdsc->snap_flush_lock);
 	return 1;  /* caller may want to ceph_flush_snaps */
 }
-- 
cgit v1.2.3


From a89e0d5c603ac3154670df31a4f871df629507c7 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 20 Feb 2019 22:19:42 -0800
Subject: proc, oom: do not report alien mms when setting oom_score_adj

commit b2b469939e93458753cfbf8282ad52636495965e upstream.

Tetsuo has reported that creating a thousands of processes sharing MM
without SIGHAND (aka alien threads) and setting
/proc/<pid>/oom_score_adj will swamp the kernel log and takes ages [1]
to finish.  This is especially worrisome that all that printing is done
under RCU lock and this can potentially trigger RCU stall or softlockup
detector.

The primary reason for the printk was to catch potential users who might
depend on the behavior prior to 44a70adec910 ("mm, oom_adj: make sure
processes sharing mm have same view of oom_score_adj") but after more
than 2 years without a single report I guess it is safe to simply remove
the printk altogether.

The next step should be moving oom_score_adj over to the mm struct and
remove all the tasks crawling as suggested by [2]

[1] http://lkml.kernel.org/r/97fce864-6f75-bca5-14bc-12c9f890e740@i-love.sakura.ne.jp
[2] http://lkml.kernel.org/r/20190117155159.GA4087@dhcp22.suse.cz

Link: http://lkml.kernel.org/r/20190212102129.26288-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Yong-Taek Lee <ytk.lee@samsung.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/base.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7e9f07bf260d..81d77b15b347 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1084,10 +1084,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
 
 			task_lock(p);
 			if (!p->vfork_done && process_shares_mm(p, mm)) {
-				pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
-						task_pid_nr(p), p->comm,
-						p->signal->oom_score_adj, oom_adj,
-						task_pid_nr(task), task->comm);
 				p->signal->oom_score_adj = oom_adj;
 				if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
 					p->signal->oom_score_adj_min = (short)oom_adj;
-- 
cgit v1.2.3


From dc4ec1bad9e2dfc6308c7c53f95f491b22cef897 Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.dionne@auristor.com>
Date: Wed, 9 Jan 2019 17:23:54 +0000
Subject: afs: Don't set vnode->cb_s_break in afs_validate()

[ Upstream commit 4882a27cec24319d10f95e978ecc80050e3e3e15 ]

A cb_interest record is not necessarily attached to the vnode on entry to
afs_validate(), which can cause an oops when we try to bring the vnode's
cb_s_break up to date in the default case (ie. no current callback promise
and the vnode has not been deleted).

Fix this by simply removing the line, as vnode->cb_s_break will be set when
needed by afs_register_server_cb_interest() when we next get a callback
promise from RPC call.

The oops looks something like:

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    ...
    RIP: 0010:afs_validate+0x66/0x250 [kafs]
    ...
    Call Trace:
     afs_d_revalidate+0x8d/0x340 [kafs]
     ? __d_lookup+0x61/0x150
     lookup_dcache+0x44/0x70
     ? lookup_dcache+0x44/0x70
     __lookup_hash+0x24/0xa0
     do_unlinkat+0x11d/0x2c0
     __x64_sys_unlink+0x23/0x30
     do_syscall_64+0x4d/0xf0
     entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: ae3b7361dc0e ("afs: Fix validation/callback interaction")
Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/afs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 071075d775a9..26aa2d111a28 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -411,7 +411,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	} else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		valid = true;
 	} else {
-		vnode->cb_s_break = vnode->cb_interest->server->cb_s_break;
 		vnode->cb_v_break = vnode->volume->cb_v_break;
 		valid = false;
 	}
-- 
cgit v1.2.3


From 5d6af6f9dd2f99381b147431e30a72fbd625fe70 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 9 Jan 2019 17:23:54 +0000
Subject: afs: Fix key refcounting in file locking code

[ Upstream commit 59d49076ae3e6912e6d7df2fd68e2337f3d02036 ]

Fix the refcounting of the authentication keys in the file locking code.
The vnode->lock_key member points to a key on which it expects to be
holding a ref, but it isn't always given an extra ref, however.

Fixes: 0fafdc9f888b ("afs: Fix file locking")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/afs/flock.c | 4 ++--
 fs/afs/inode.c | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index dc62d15a964b..1bb300ef362b 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -208,7 +208,7 @@ again:
 		/* The new front of the queue now owns the state variables. */
 		next = list_entry(vnode->pending_locks.next,
 				  struct file_lock, fl_u.afs.link);
-		vnode->lock_key = afs_file_key(next->fl_file);
+		vnode->lock_key = key_get(afs_file_key(next->fl_file));
 		vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 		vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
 		goto again;
@@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
 	/* The new front of the queue now owns the state variables. */
 	next = list_entry(vnode->pending_locks.next,
 			  struct file_lock, fl_u.afs.link);
-	vnode->lock_key = afs_file_key(next->fl_file);
+	vnode->lock_key = key_get(afs_file_key(next->fl_file));
 	vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 	vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
 	afs_lock_may_be_available(vnode);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 26aa2d111a28..0726e40db0f8 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -542,6 +542,8 @@ void afs_evict_inode(struct inode *inode)
 #endif
 
 	afs_put_permits(rcu_access_pointer(vnode->permit_cache));
+	key_put(vnode->lock_key);
+	vnode->lock_key = NULL;
 	_leave("");
 }
 
-- 
cgit v1.2.3


From c5a1dc256cc2ac6c08f637994e513a8cb65be3d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?=
 <ernesto.mnd.fernandez@gmail.com>
Date: Mon, 8 Oct 2018 20:58:23 -0300
Subject: direct-io: allow direct writes to empty inodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 8b9433eb4de3c26a9226c981c283f9f4896ae030 ]

On a DIO_SKIP_HOLES filesystem, the ->get_block() method is currently
not allowed to create blocks for an empty inode.  This confusion comes
from trying to bit shift a negative number, so check the size of the
inode first.

The problem is most visible for hfsplus, because the fallback to
buffered I/O doesn't happen and the write fails with EIO.  This is in
part the fault of the module, because it gives a wrong return value on
->get_block(); that will be fixed in a separate patch.

Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ernesto A. Fernández <ernesto.mnd.fernandez@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/direct-io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 199146036093..1abb7634b2d5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	int create;
 	unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
+	loff_t i_size;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 		 */
 		create = dio->op == REQ_OP_WRITE;
 		if (dio->flags & DIO_SKIP_HOLES) {
-			if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
-							i_blkbits))
+			i_size = i_size_read(dio->inode);
+			if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits)
 				create = 0;
 		}
 
-- 
cgit v1.2.3


From edca54b897bb16d2ad7c0549b7bce2e5e2bc36e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 12 Dec 2017 08:38:30 -0800
Subject: writeback: synchronize sync(2) against cgroup writeback membership
 switches

[ Upstream commit 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 ]

sync_inodes_sb() can race against cgwb (cgroup writeback) membership
switches and fail to writeback some inodes.  For example, if an inode
switches to another wb while sync_inodes_sb() is in progress, the new
wb might not be visible to bdi_split_work_to_wbs() at all or the inode
might jump from a wb which hasn't issued writebacks yet to one which
already has.

This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
switch path against sync_inodes_sb() so that sync_inodes_sb() is
guaranteed to see all the target wbs and inodes can't jump wbs to
escape syncing.

v2: Fixed misplaced rwsem init.  Spotted by Jiufei.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jiufei Xue <xuejiufei@gmail.com>
Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/fs-writeback.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..82ce6d4f7e31 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
 	struct work_struct	work;
 };
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	up_write(&bdi->wb_switch_rwsem);
+}
+
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	bool switched = false;
 	void **slot;
 
+	/*
+	 * If @inode switches cgwb membership while sync_inodes_sb() is
+	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
+	 */
+	down_read(&bdi->wb_switch_rwsem);
+
 	/*
 	 * By the time control reaches here, RCU grace period has passed
 	 * since I_WB_SWITCH assertion and all wb stat update transactions
@@ -435,6 +452,8 @@ skip_switch:
 	spin_unlock(&new_wb->list_lock);
 	spin_unlock(&old_wb->list_lock);
 
+	up_read(&bdi->wb_switch_rwsem);
+
 	if (switched) {
 		wb_wakeup(new_wb);
 		wb_put(old_wb);
@@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	if (inode->i_state & I_WB_SWITCH)
 		return;
 
+	/*
+	 * Avoid starting new switches while sync_inodes_sb() is in
+	 * progress.  Otherwise, if the down_write protected issue path
+	 * blocks heavily, we might end up starting a large number of
+	 * switches which will block on the rwsem.
+	 */
+	if (!down_read_trylock(&bdi->wb_switch_rwsem))
+		return;
+
 	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 	if (!isw)
-		return;
+		goto out_unlock;
 
 	/* find and pin the new wb */
 	rcu_read_lock();
@@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-	return;
+	goto out_unlock;
 
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
 	kfree(isw);
+out_unlock:
+	up_read(&bdi->wb_switch_rwsem);
 }
 
 /**
@@ -894,6 +924,9 @@ fs_initcall(cgroup_writeback_init);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
 	__releases(&inode->i_lock)
@@ -2420,8 +2453,11 @@ void sync_inodes_sb(struct super_block *sb)
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
 	wb_wait_for_completion(bdi, &done);
+	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
 }
-- 
cgit v1.2.3


From 527cabfffbc5f55c111d510a918b33fb9fbe537d Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Thu, 28 Feb 2019 16:22:02 -0800
Subject: hugetlbfs: fix races and page leaks during migration

commit cb6acd01e2e43fd8bad11155752b7699c3d0fb76 upstream.

hugetlb pages should only be migrated if they are 'active'.  The
routines set/clear_page_huge_active() modify the active state of hugetlb
pages.

When a new hugetlb page is allocated at fault time, set_page_huge_active
is called before the page is locked.  Therefore, another thread could
race and migrate the page while it is being added to page table by the
fault code.  This race is somewhat hard to trigger, but can be seen by
strategically adding udelay to simulate worst case scheduling behavior.
Depending on 'how' the code races, various BUG()s could be triggered.

To address this issue, simply delay the set_page_huge_active call until
after the page is successfully added to the page table.

Hugetlb pages can also be leaked at migration time if the pages are
associated with a file in an explicitly mounted hugetlbfs filesystem.
For example, consider a two node system with 4GB worth of huge pages
available.  A program mmaps a 2G file in a hugetlbfs filesystem.  It
then migrates the pages associated with the file from one node to
another.  When the program exits, huge page counts are as follows:

  node0
  1024    free_hugepages
  1024    nr_hugepages

  node1
  0       free_hugepages
  1024    nr_hugepages

  Filesystem                         Size  Used Avail Use% Mounted on
  nodev                              4.0G  2.0G  2.0G  50% /var/opt/hugepool

That is as expected.  2G of huge pages are taken from the free_hugepages
counts, and 2G is the size of the file in the explicitly mounted
filesystem.  If the file is then removed, the counts become:

  node0
  1024    free_hugepages
  1024    nr_hugepages

  node1
  1024    free_hugepages
  1024    nr_hugepages

  Filesystem                         Size  Used Avail Use% Mounted on
  nodev                              4.0G  2.0G  2.0G  50% /var/opt/hugepool

Note that the filesystem still shows 2G of pages used, while there
actually are no huge pages in use.  The only way to 'fix' the filesystem
accounting is to unmount the filesystem

If a hugetlb page is associated with an explicitly mounted filesystem,
this information in contained in the page_private field.  At migration
time, this information is not preserved.  To fix, simply transfer
page_private from old to new page at migration time if necessary.

There is a related race with removing a huge page from a file and
migration.  When a huge page is removed from the pagecache, the
page_mapping() field is cleared, yet page_private remains set until the
page is actually freed by free_huge_page().  A page could be migrated
while in this state.  However, since page_mapping() is not set the
hugetlbfs specific routine to transfer page_private is not called and we
leak the page count in the filesystem.

To fix that, check for this condition before migrating a huge page.  If
the condition is detected, return EBUSY for the page.

Link: http://lkml.kernel.org/r/74510272-7319-7372-9ea6-ec914734c179@oracle.com
Link: http://lkml.kernel.org/r/20190212221400.3512-1-mike.kravetz@oracle.com
Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: <stable@vger.kernel.org>
[mike.kravetz@oracle.com: v2]
  Link: http://lkml.kernel.org/r/7534d322-d782-8ac6-1c8d-a8dc380eb3ab@oracle.com
[mike.kravetz@oracle.com: update comment and changelog]
  Link: http://lkml.kernel.org/r/420bcfd6-158b-38e4-98da-26d0cd85bd01@oracle.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/hugetlbfs/inode.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 32920a10100e..a7fa037b876b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -859,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
 	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
 	if (rc != MIGRATEPAGE_SUCCESS)
 		return rc;
+
+	/*
+	 * page_private is subpool pointer in hugetlb pages.  Transfer to
+	 * new page.  PagePrivate is not associated with page_private for
+	 * hugetlb pages and can not be set here as only page_huge_active
+	 * pages can be migrated.
+	 */
+	if (page_private(page)) {
+		set_page_private(newpage, page_private(page));
+		set_page_private(page, 0);
+	}
+
 	if (mode != MIGRATE_SYNC_NO_COPY)
 		migrate_page_copy(newpage, page);
 	else
-- 
cgit v1.2.3


From f5e66cdb51fd352d7091cb2516b24734e524cb33 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 8 Feb 2019 16:59:49 -0800
Subject: aio: Fix locking in aio_poll()

commit d3d6a18d7d351cbcc9b33dbedf710e65f8ce1595 upstream.

wake_up_locked() may but does not have to be called with interrupts
disabled. Since the fuse filesystem calls wake_up_locked() without
disabling interrupts aio_poll_wake() may be called with interrupts
enabled. Since the kioctx.ctx_lock may be acquired from IRQ context,
all code that acquires that lock from thread context must disable
interrupts. Hence change the spin_trylock() call in aio_poll_wake()
into a spin_trylock_irqsave() call. This patch fixes the following
lockdep complaint:

=====================================================
WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
5.0.0-rc4-next-20190131 #23 Not tainted
-----------------------------------------------------
syz-executor2/13779 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
0000000098ac1230 (&fiq->waitq){+.+.}, at: spin_lock include/linux/spinlock.h:329 [inline]
0000000098ac1230 (&fiq->waitq){+.+.}, at: aio_poll fs/aio.c:1772 [inline]
0000000098ac1230 (&fiq->waitq){+.+.}, at: __io_submit_one fs/aio.c:1875 [inline]
0000000098ac1230 (&fiq->waitq){+.+.}, at: io_submit_one+0xedf/0x1cf0 fs/aio.c:1908

and this task is already holding:
000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: spin_lock_irq include/linux/spinlock.h:354 [inline]
000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: aio_poll fs/aio.c:1771 [inline]
000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: __io_submit_one fs/aio.c:1875 [inline]
000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: io_submit_one+0xeb6/0x1cf0 fs/aio.c:1908
which would create a new lock dependency:
 (&(&ctx->ctx_lock)->rlock){..-.} -> (&fiq->waitq){+.+.}

but this new dependency connects a SOFTIRQ-irq-safe lock:
 (&(&ctx->ctx_lock)->rlock){..-.}

... which became SOFTIRQ-irq-safe at:
  lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
  __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline]
  _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160
  spin_lock_irq include/linux/spinlock.h:354 [inline]
  free_ioctx_users+0x2d/0x4a0 fs/aio.c:610
  percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline]
  percpu_ref_put include/linux/percpu-refcount.h:301 [inline]
  percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline]
  percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158
  __rcu_reclaim kernel/rcu/rcu.h:240 [inline]
  rcu_do_batch kernel/rcu/tree.c:2486 [inline]
  invoke_rcu_callbacks kernel/rcu/tree.c:2799 [inline]
  rcu_core+0x928/0x1390 kernel/rcu/tree.c:2780
  __do_softirq+0x266/0x95a kernel/softirq.c:292
  run_ksoftirqd kernel/softirq.c:654 [inline]
  run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646
  smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164
  kthread+0x357/0x430 kernel/kthread.c:247
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352

to a SOFTIRQ-irq-unsafe lock:
 (&fiq->waitq){+.+.}

... which became SOFTIRQ-irq-unsafe at:
...
  lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
  __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
  _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
  spin_lock include/linux/spinlock.h:329 [inline]
  flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415
  fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676
  fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687
  fuse_send_init fs/fuse/inode.c:989 [inline]
  fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214
  mount_nodev+0x68/0x110 fs/super.c:1392
  fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239
  legacy_get_tree+0xf2/0x200 fs/fs_context.c:590
  vfs_get_tree+0x123/0x450 fs/super.c:1481
  do_new_mount fs/namespace.c:2610 [inline]
  do_mount+0x1436/0x2c40 fs/namespace.c:2932
  ksys_mount+0xdb/0x150 fs/namespace.c:3148
  __do_sys_mount fs/namespace.c:3162 [inline]
  __se_sys_mount fs/namespace.c:3159 [inline]
  __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159
  do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

other info that might help us debug this:

 Possible interrupt unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&fiq->waitq);
                               local_irq_disable();
                               lock(&(&ctx->ctx_lock)->rlock);
                               lock(&fiq->waitq);
  <Interrupt>
    lock(&(&ctx->ctx_lock)->rlock);

 *** DEADLOCK ***

1 lock held by syz-executor2/13779:
 #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: spin_lock_irq include/linux/spinlock.h:354 [inline]
 #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: aio_poll fs/aio.c:1771 [inline]
 #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: __io_submit_one fs/aio.c:1875 [inline]
 #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: io_submit_one+0xeb6/0x1cf0 fs/aio.c:1908

the dependencies between SOFTIRQ-irq-safe lock and the holding lock:
-> (&(&ctx->ctx_lock)->rlock){..-.} {
   IN-SOFTIRQ-W at:
                    lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
                    __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline]
                    _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160
                    spin_lock_irq include/linux/spinlock.h:354 [inline]
                    free_ioctx_users+0x2d/0x4a0 fs/aio.c:610
                    percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline]
                    percpu_ref_put include/linux/percpu-refcount.h:301 [inline]
                    percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline]
                    percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158
                    __rcu_reclaim kernel/rcu/rcu.h:240 [inline]
                    rcu_do_batch kernel/rcu/tree.c:2486 [inline]
                    invoke_rcu_callbacks kernel/rcu/tree.c:2799 [inline]
                    rcu_core+0x928/0x1390 kernel/rcu/tree.c:2780
                    __do_softirq+0x266/0x95a kernel/softirq.c:292
                    run_ksoftirqd kernel/softirq.c:654 [inline]
                    run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646
                    smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164
                    kthread+0x357/0x430 kernel/kthread.c:247
                    ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
   INITIAL USE at:
                   lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
                   __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline]
                   _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160
                   spin_lock_irq include/linux/spinlock.h:354 [inline]
                   __do_sys_io_cancel fs/aio.c:2052 [inline]
                   __se_sys_io_cancel fs/aio.c:2035 [inline]
                   __x64_sys_io_cancel+0xd5/0x5a0 fs/aio.c:2035
                   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
                   entry_SYSCALL_64_after_hwframe+0x49/0xbe
 }
 ... key      at: [<ffffffff8a574140>] __key.52370+0x0/0x40
 ... acquired at:
   lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
   _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
   spin_lock include/linux/spinlock.h:329 [inline]
   aio_poll fs/aio.c:1772 [inline]
   __io_submit_one fs/aio.c:1875 [inline]
   io_submit_one+0xedf/0x1cf0 fs/aio.c:1908
   __do_sys_io_submit fs/aio.c:1953 [inline]
   __se_sys_io_submit fs/aio.c:1923 [inline]
   __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923
   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

the dependencies between the lock to be acquired
 and SOFTIRQ-irq-unsafe lock:
-> (&fiq->waitq){+.+.} {
   HARDIRQ-ON-W at:
                    lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
                    __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
                    _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
                    spin_lock include/linux/spinlock.h:329 [inline]
                    flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415
                    fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676
                    fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687
                    fuse_send_init fs/fuse/inode.c:989 [inline]
                    fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214
                    mount_nodev+0x68/0x110 fs/super.c:1392
                    fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239
                    legacy_get_tree+0xf2/0x200 fs/fs_context.c:590
                    vfs_get_tree+0x123/0x450 fs/super.c:1481
                    do_new_mount fs/namespace.c:2610 [inline]
                    do_mount+0x1436/0x2c40 fs/namespace.c:2932
                    ksys_mount+0xdb/0x150 fs/namespace.c:3148
                    __do_sys_mount fs/namespace.c:3162 [inline]
                    __se_sys_mount fs/namespace.c:3159 [inline]
                    __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159
                    do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
   SOFTIRQ-ON-W at:
                    lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
                    __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
                    _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
                    spin_lock include/linux/spinlock.h:329 [inline]
                    flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415
                    fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676
                    fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687
                    fuse_send_init fs/fuse/inode.c:989 [inline]
                    fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214
                    mount_nodev+0x68/0x110 fs/super.c:1392
                    fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239
                    legacy_get_tree+0xf2/0x200 fs/fs_context.c:590
                    vfs_get_tree+0x123/0x450 fs/super.c:1481
                    do_new_mount fs/namespace.c:2610 [inline]
                    do_mount+0x1436/0x2c40 fs/namespace.c:2932
                    ksys_mount+0xdb/0x150 fs/namespace.c:3148
                    __do_sys_mount fs/namespace.c:3162 [inline]
                    __se_sys_mount fs/namespace.c:3159 [inline]
                    __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159
                    do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
   INITIAL USE at:
                   lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
                   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
                   _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
                   spin_lock include/linux/spinlock.h:329 [inline]
                   flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415
                   fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676
                   fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687
                   fuse_send_init fs/fuse/inode.c:989 [inline]
                   fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214
                   mount_nodev+0x68/0x110 fs/super.c:1392
                   fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239
                   legacy_get_tree+0xf2/0x200 fs/fs_context.c:590
                   vfs_get_tree+0x123/0x450 fs/super.c:1481
                   do_new_mount fs/namespace.c:2610 [inline]
                   do_mount+0x1436/0x2c40 fs/namespace.c:2932
                   ksys_mount+0xdb/0x150 fs/namespace.c:3148
                   __do_sys_mount fs/namespace.c:3162 [inline]
                   __se_sys_mount fs/namespace.c:3159 [inline]
                   __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159
                   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
                   entry_SYSCALL_64_after_hwframe+0x49/0xbe
 }
 ... key      at: [<ffffffff8a60dec0>] __key.43450+0x0/0x40
 ... acquired at:
   lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
   __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
   _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
   spin_lock include/linux/spinlock.h:329 [inline]
   aio_poll fs/aio.c:1772 [inline]
   __io_submit_one fs/aio.c:1875 [inline]
   io_submit_one+0xedf/0x1cf0 fs/aio.c:1908
   __do_sys_io_submit fs/aio.c:1953 [inline]
   __se_sys_io_submit fs/aio.c:1923 [inline]
   __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923
   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

stack backtrace:
CPU: 0 PID: 13779 Comm: syz-executor2 Not tainted 5.0.0-rc4-next-20190131 #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_bad_irq_dependency kernel/locking/lockdep.c:1573 [inline]
 check_usage.cold+0x60f/0x940 kernel/locking/lockdep.c:1605
 check_irq_usage kernel/locking/lockdep.c:1650 [inline]
 check_prev_add_irq kernel/locking/lockdep_states.h:8 [inline]
 check_prev_add kernel/locking/lockdep.c:1860 [inline]
 check_prevs_add kernel/locking/lockdep.c:1968 [inline]
 validate_chain kernel/locking/lockdep.c:2339 [inline]
 __lock_acquire+0x1f12/0x4790 kernel/locking/lockdep.c:3320
 lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826
 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
 _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
 spin_lock include/linux/spinlock.h:329 [inline]
 aio_poll fs/aio.c:1772 [inline]
 __io_submit_one fs/aio.c:1875 [inline]
 io_submit_one+0xedf/0x1cf0 fs/aio.c:1908
 __do_sys_io_submit fs/aio.c:1953 [inline]
 __se_sys_io_submit fs/aio.c:1923 [inline]
 __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923
 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: <stable@vger.kernel.org>
Fixes: e8693bcfa0b4 ("aio: allow direct aio poll comletions for keyed wakeups") # v4.19
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
[ bvanassche: added a comment ]
Reluctantly-Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 44551d96eaa4..45d5ef8dd0a8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1661,6 +1661,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
 	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
 	__poll_t mask = key_to_poll(key);
+	unsigned long flags;
 
 	req->woken = true;
 
@@ -1669,10 +1670,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		if (!(mask & req->events))
 			return 0;
 
-		/* try to complete the iocb inline if we can: */
-		if (spin_trylock(&iocb->ki_ctx->ctx_lock)) {
+		/*
+		 * Try to complete the iocb inline if we can. Use
+		 * irqsave/irqrestore because not all filesystems (e.g. fuse)
+		 * call this function with IRQs disabled and because IRQs
+		 * have to be disabled before ctx_lock is obtained.
+		 */
+		if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
 			list_del(&iocb->ki_list);
-			spin_unlock(&iocb->ki_ctx->ctx_lock);
+			spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
 
 			list_del_init(&req->wait.entry);
 			aio_poll_complete(iocb, mask);
-- 
cgit v1.2.3


From b60d90b2d3d14c426693a0a34041db11be66d29e Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 19 Feb 2019 10:10:38 +0800
Subject: exec: Fix mem leak in kernel_read_file

commit f612acfae86af7ecad754ae6a46019be9da05b8e upstream.

syzkaller report this:
BUG: memory leak
unreferenced object 0xffffc9000488d000 (size 9195520):
  comm "syz-executor.0", pid 2752, jiffies 4294787496 (age 18.757s)
  hex dump (first 32 bytes):
    ff ff ff ff ff ff ff ff a8 00 00 00 01 00 00 00  ................
    02 00 00 00 00 00 00 00 80 a1 7a c1 ff ff ff ff  ..........z.....
  backtrace:
    [<000000000863775c>] __vmalloc_node mm/vmalloc.c:1795 [inline]
    [<000000000863775c>] __vmalloc_node_flags mm/vmalloc.c:1809 [inline]
    [<000000000863775c>] vmalloc+0x8c/0xb0 mm/vmalloc.c:1831
    [<000000003f668111>] kernel_read_file+0x58f/0x7d0 fs/exec.c:924
    [<000000002385813f>] kernel_read_file_from_fd+0x49/0x80 fs/exec.c:993
    [<0000000011953ff1>] __do_sys_finit_module+0x13b/0x2a0 kernel/module.c:3895
    [<000000006f58491f>] do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290
    [<00000000ee78baf4>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
    [<00000000241f889b>] 0xffffffffffffffff

It should goto 'out_free' lable to free allocated buf while kernel_read
fails.

Fixes: 39d637af5aa7 ("vfs: forbid write access when reading a file into memory")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Thibaut Sautereau <thibaut@sautereau.fr>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 1ebf6e5a521d..433b1257694a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -929,7 +929,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
 		bytes = kernel_read(file, *buf + pos, i_size - pos, &pos);
 		if (bytes < 0) {
 			ret = bytes;
-			goto out;
+			goto out_free;
 		}
 
 		if (bytes == 0)
-- 
cgit v1.2.3


From d23792f53f0de5706520f1302a8a4143e13a1680 Mon Sep 17 00:00:00 2001
From: Piotr Jaroszynski <pjaroszynski@nvidia.com>
Date: Sun, 27 Jan 2019 08:46:45 -0800
Subject: iomap: get/put the page in iomap_page_create/release()

[ Upstream commit 8e47a457321ca1a74ad194ab5dcbca764bc70731 ]

migrate_page_move_mapping() expects pages with private data set to have
a page_count elevated by 1.  This is what used to happen for xfs through
the buffer_heads code before the switch to iomap in commit 82cb14175e7d
("xfs: add support for sub-pagesize writeback without buffer_heads").
Not having the count elevated causes move_pages() to fail on memory
mapped files coming from xfs.

Make iomap compatible with the migrate_page_move_mapping() assumption by
elevating the page count as part of iomap_page_create() and lowering it
in iomap_page_release().

It causes the move_pages() syscall to misbehave on memory mapped files
from xfs.  It does not not move any pages, which I suppose is "just" a
perf issue, but it also ends up returning a positive number which is out
of spec for the syscall.  Talking to Michal Hocko, it sounds like
returning positive numbers might be a necessary update to move_pages()
anyway though.

Fixes: 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads")
Signed-off-by: Piotr Jaroszynski <pjaroszynski@nvidia.com>
[hch: actually get/put the page iomap_migrate_page() to make it work
      properly]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/iomap.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/iomap.c b/fs/iomap.c
index e57fb1e534c5..2e3e64012db7 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -117,6 +117,12 @@ iomap_page_create(struct inode *inode, struct page *page)
 	atomic_set(&iop->read_count, 0);
 	atomic_set(&iop->write_count, 0);
 	bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+
+	/*
+	 * migrate_page_move_mapping() assumes that pages with private data have
+	 * their count elevated by 1.
+	 */
+	get_page(page);
 	set_page_private(page, (unsigned long)iop);
 	SetPagePrivate(page);
 	return iop;
@@ -133,6 +139,7 @@ iomap_page_release(struct page *page)
 	WARN_ON_ONCE(atomic_read(&iop->write_count));
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
+	put_page(page);
 	kfree(iop);
 }
 
@@ -565,8 +572,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
 
 	if (page_has_private(page)) {
 		ClearPagePrivate(page);
+		get_page(newpage);
 		set_page_private(newpage, page_private(page));
 		set_page_private(page, 0);
+		put_page(page);
 		SetPagePrivate(newpage);
 	}
 
-- 
cgit v1.2.3


From d9ba842efdf068c53095b15f68377327259c3d16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 17 Jan 2019 08:58:58 -0800
Subject: iomap: fix a use after free in iomap_dio_rw

[ Upstream commit 4ea899ead2786a30aaa8181fefa81a3df4ad28f6 ]

Introduce a local wait_for_completion variable to avoid an access to the
potentially freed dio struture after dropping the last reference count.

Also use the chance to document the completion behavior to make the
refcounting clear to the reader of the code.

Fixes: ff6a9292e6 ("iomap: implement direct I/O")
Reported-by: Chandan Rajendra <chandan@linux.ibm.com>
Reported-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Chandan Rajendra <chandan@linux.ibm.com>
Tested-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/iomap.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/iomap.c b/fs/iomap.c
index 2e3e64012db7..fac45206418a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1787,6 +1787,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos, start = pos;
 	loff_t end = iocb->ki_pos + count - 1, ret = 0;
 	unsigned int flags = IOMAP_DIRECT;
+	bool wait_for_completion = is_sync_kiocb(iocb);
 	struct blk_plug plug;
 	struct iomap_dio *dio;
 
@@ -1806,7 +1807,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	dio->end_io = end_io;
 	dio->error = 0;
 	dio->flags = 0;
-	dio->wait_for_completion = is_sync_kiocb(iocb);
 
 	dio->submit.iter = iter;
 	dio->submit.waiter = current;
@@ -1861,7 +1861,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		dio_warn_stale_pagecache(iocb->ki_filp);
 	ret = 0;
 
-	if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion &&
+	if (iov_iter_rw(iter) == WRITE && !wait_for_completion &&
 	    !inode->i_sb->s_dio_done_wq) {
 		ret = sb_init_dio_done_wq(inode->i_sb);
 		if (ret < 0)
@@ -1877,7 +1877,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		if (ret <= 0) {
 			/* magic error code to fall back to buffered I/O */
 			if (ret == -ENOTBLK) {
-				dio->wait_for_completion = true;
+				wait_for_completion = true;
 				ret = 0;
 			}
 			break;
@@ -1899,8 +1899,24 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (dio->flags & IOMAP_DIO_WRITE_FUA)
 		dio->flags &= ~IOMAP_DIO_NEED_SYNC;
 
+	/*
+	 * We are about to drop our additional submission reference, which
+	 * might be the last reference to the dio.  There are three three
+	 * different ways we can progress here:
+	 *
+	 *  (a) If this is the last reference we will always complete and free
+	 *	the dio ourselves.
+	 *  (b) If this is not the last reference, and we serve an asynchronous
+	 *	iocb, we must never touch the dio after the decrement, the
+	 *	I/O completion handler will complete and free it.
+	 *  (c) If this is not the last reference, but we serve a synchronous
+	 *	iocb, the I/O completion handler will wake us up on the drop
+	 *	of the final reference, and we will complete and free it here
+	 *	after we got woken by the I/O completion handler.
+	 */
+	dio->wait_for_completion = wait_for_completion;
 	if (!atomic_dec_and_test(&dio->ref)) {
-		if (!dio->wait_for_completion)
+		if (!wait_for_completion)
 			return -EIOCBQUEUED;
 
 		for (;;) {
@@ -1917,9 +1933,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		__set_current_state(TASK_RUNNING);
 	}
 
-	ret = iomap_dio_complete(dio);
-
-	return ret;
+	return iomap_dio_complete(dio);
 
 out_free_dio:
 	kfree(dio);
-- 
cgit v1.2.3


From 5c72ca3bf62502ace23aa6281b6dd6867417d642 Mon Sep 17 00:00:00 2001
From: Yao Liu <yotta.liu@ucloud.cn>
Date: Mon, 28 Jan 2019 19:44:14 +0800
Subject: nfs: Fix NULL pointer dereference of dev_name

[ Upstream commit 80ff00172407e0aad4b10b94ef0816fc3e7813cb ]

There is a NULL pointer dereference of dev_name in nfs_parse_devname()

The oops looks something like:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  ...
  RIP: 0010:nfs_fs_mount+0x3b6/0xc20 [nfs]
  ...
  Call Trace:
   ? ida_alloc_range+0x34b/0x3d0
   ? nfs_clone_super+0x80/0x80 [nfs]
   ? nfs_free_parsed_mount_data+0x60/0x60 [nfs]
   mount_fs+0x52/0x170
   ? __init_waitqueue_head+0x3b/0x50
   vfs_kern_mount+0x6b/0x170
   do_mount+0x216/0xdc0
   ksys_mount+0x83/0xd0
   __x64_sys_mount+0x25/0x30
   do_syscall_64+0x65/0x220
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fix this by adding a NULL check on dev_name

Signed-off-by: Yao Liu <yotta.liu@ucloud.cn>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/super.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5ef2c71348bd..6b666d187907 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1906,6 +1906,11 @@ static int nfs_parse_devname(const char *dev_name,
 	size_t len;
 	char *end;
 
+	if (unlikely(!dev_name || !*dev_name)) {
+		dfprintk(MOUNT, "NFS: device name not specified\n");
+		return -EINVAL;
+	}
+
 	/* Is the host name protected with square brakcets? */
 	if (*dev_name == '[') {
 		end = strchr(++dev_name, ']');
-- 
cgit v1.2.3


From 6efd69d63339904fce3095723e470fd8f86a49a8 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Tue, 29 Jan 2019 12:46:16 +1000
Subject: cifs: fix computation for MAX_SMB2_HDR_SIZE

[ Upstream commit 58d15ed1203f4d858c339ea4d7dafa94bd2a56d3 ]

The size of the fixed part of the create response is 88 bytes not 56.

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/smb2pdu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 8fb7887f2b3d..437257d1116f 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -84,8 +84,8 @@
 
 #define NUMBER_OF_SMB2_COMMANDS	0x0013
 
-/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */
-#define MAX_SMB2_HDR_SIZE 0x00b0
+/* 52 transform hdr + 64 hdr + 88 create rsp */
+#define MAX_SMB2_HDR_SIZE 204
 
 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
 #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
-- 
cgit v1.2.3


From 845d73be1b4cba6c353e99f79eff0f17e0d7c1d8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 1 Feb 2019 14:20:01 -0800
Subject: proc: fix /proc/net/* after setns(2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 1fde6f21d90f8ba5da3cb9c54ca991ed72696c43 ]

/proc entries under /proc/net/* can't be cached into dcache because
setns(2) can change current net namespace.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: avoid vim miscolorization]
[adobriyan@gmail.com: write test, add dummy ->d_revalidate hook: necessary if /proc/net/* is pinned at setns time]
  Link: http://lkml.kernel.org/r/20190108192350.GA12034@avx2
Link: http://lkml.kernel.org/r/20190107162336.GA9239@avx2
Fixes: 1da4d377f943fe4194ffb9fb9c26cc58fad4dd24 ("proc: revalidate misc dentries")
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reported-by: Mateusz Stępień <mateusz.stepien@netrounds.com>
Reported-by: Ahmad Fatoum <a.fatoum@pengutronix.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/proc/generic.c  |  4 +++-
 fs/proc/internal.h |  1 +
 fs/proc/proc_net.c | 20 ++++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ae109429a88..e39bac94dead 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
 		inode = proc_get_inode(dir->i_sb, de);
 		if (!inode)
 			return ERR_PTR(-ENOMEM);
-		d_set_d_op(dentry, &proc_misc_dentry_ops);
+		d_set_d_op(dentry, de->proc_dops);
 		return d_splice_alias(inode, dentry);
 	}
 	read_unlock(&proc_subdir_lock);
@@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	INIT_LIST_HEAD(&ent->pde_openers);
 	proc_set_user(ent, (*parent)->uid, (*parent)->gid);
 
+	ent->proc_dops = &proc_misc_dentry_ops;
+
 out:
 	return ent;
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5185d7f6a51e..95b14196f284 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,6 +44,7 @@ struct proc_dir_entry {
 	struct completion *pde_unload_completion;
 	const struct inode_operations *proc_iops;
 	const struct file_operations *proc_fops;
+	const struct dentry_operations *proc_dops;
 	union {
 		const struct seq_operations *seq_ops;
 		int (*single_show)(struct seq_file *, void *);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index d5e0fcb3439e..a7b12435519e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode)
 	return maybe_get_net(PDE_NET(PDE(inode)));
 }
 
+static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	return 0;
+}
+
+static const struct dentry_operations proc_net_dentry_ops = {
+	.d_revalidate	= proc_net_d_revalidate,
+	.d_delete	= always_delete_dentry,
+};
+
+static void pde_force_lookup(struct proc_dir_entry *pde)
+{
+	/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
+	pde->proc_dops = &proc_net_dentry_ops;
+}
+
 static int seq_open_net(struct inode *inode, struct file *file)
 {
 	unsigned int state_size = PDE(inode)->state_size;
@@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode,
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_seq_fops;
 	p->seq_ops = ops;
 	p->state_size = state_size;
@@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_seq_fops;
 	p->seq_ops = ops;
 	p->state_size = state_size;
@@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode,
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_single_fops;
 	p->single_show = show;
 	return proc_register(parent, p);
@@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
 	p = proc_create_reg(name, mode, &parent, data);
 	if (!p)
 		return NULL;
+	pde_force_lookup(p);
 	p->proc_fops = &proc_net_single_fops;
 	p->single_show = show;
 	p->write = write;
-- 
cgit v1.2.3


From 3d0acc076f5ffb40d28d1210a6b987034d17b46c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 1 Feb 2019 14:21:23 -0800
Subject: fs/drop_caches.c: avoid softlockups in drop_pagecache_sb()

[ Upstream commit c27d82f52f75fc9d8d9d40d120d2a96fdeeada5e ]

When superblock has lots of inodes without any pagecache (like is the
case for /proc), drop_pagecache_sb() will iterate through all of them
without dropping sb->s_inode_list_lock which can lead to softlockups
(one of our customers hit this).

Fix the problem by going to the slow path and doing cond_resched() in
case the process needs rescheduling.

Link: http://lkml.kernel.org/r/20190114085343.15011-1-jack@suse.cz
Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/drop_caches.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 82377017130f..d31b6c72b476 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	spin_lock(&sb->s_inode_list_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		spin_lock(&inode->i_lock);
+		/*
+		 * We must skip inodes in unusual state. We may also skip
+		 * inodes without pages but we deliberately won't in case
+		 * we need to reschedule to avoid softlockups.
+		 */
 		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
-		    (inode->i_mapping->nrpages == 0)) {
+		    (inode->i_mapping->nrpages == 0 && !need_resched())) {
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
@@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&sb->s_inode_list_lock);
 
+		cond_resched();
 		invalidate_mapping_pages(inode->i_mapping, 0, -1);
 		iput(toput_inode);
 		toput_inode = inode;
-- 
cgit v1.2.3


From 1efb234ec25193233d71997c769f6504d283c671 Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Fri, 1 Feb 2019 14:21:26 -0800
Subject: autofs: drop dentry reference only when it is never used

[ Upstream commit 63ce5f552beb9bdb41546b3a26c4374758b21815 ]

autofs_expire_run() calls dput(dentry) to drop the reference count of
dentry.  However, dentry is read via autofs_dentry_ino(dentry) after
that.  This may result in a use-free-bug.  The patch drops the reference
count of dentry only when it is never used.

Link: http://lkml.kernel.org/r/154725122396.11260.16053424107144453867.stgit@pluto-themaw-net
Signed-off-by: Pan Bian <bianpan2016@163.com>
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/autofs/expire.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index d441244b79df..28d9c2b1b3bb 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb,
 	pkt.len = dentry->d_name.len;
 	memcpy(pkt.name, dentry->d_name.name, pkt.len);
 	pkt.name[pkt.len] = '\0';
-	dput(dentry);
 
 	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		ret = -EFAULT;
@@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb,
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
+	dput(dentry);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 929278903367a8a9ca7b701b4d0af9a6ba5110a3 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Fri, 1 Feb 2019 14:21:29 -0800
Subject: autofs: fix error return in autofs_fill_super()

[ Upstream commit f585b283e3f025754c45bbe7533fc6e5c4643700 ]

In autofs_fill_super() on error of get inode/make root dentry the return
should be ENOMEM as this is the only failure case of the called
functions.

Link: http://lkml.kernel.org/r/154725123240.11260.796773942606871359.stgit@pluto-themaw-net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/autofs/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 846c052569dd..3c14a8e45ffb 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -255,8 +255,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	}
 	root_inode = autofs_get_inode(s, S_IFDIR | 0755);
 	root = d_make_root(root_inode);
-	if (!root)
+	if (!root) {
+		ret = -ENOMEM;
 		goto fail_ino;
+	}
 	pipe = NULL;
 
 	root->d_fsdata = ino;
-- 
cgit v1.2.3


From 72426ed2a149c3cd0c33b409ca10b3918f399cc0 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 21 Jan 2019 22:49:37 +0900
Subject: fs: ratelimit __find_get_block_slow() failure message.

[ Upstream commit 43636c804df0126da669c261fc820fb22f62bfc2 ]

When something let __find_get_block_slow() hit all_mapped path, it calls
printk() for 100+ times per a second. But there is no need to print same
message with such high frequency; it is just asking for stall warning, or
at least bloating log files.

  [  399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8
  [  399.873324][T15342] b_state=0x00000029, b_size=512
  [  399.878403][T15342] device loop0 blocksize: 4096
  [  399.883296][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8
  [  399.890400][T15342] b_state=0x00000029, b_size=512
  [  399.895595][T15342] device loop0 blocksize: 4096
  [  399.900556][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8
  [  399.907471][T15342] b_state=0x00000029, b_size=512
  [  399.912506][T15342] device loop0 blocksize: 4096

This patch reduces frequency to up to once per a second, in addition to
concatenating three lines into one.

  [  399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8, b_state=0x00000029, b_size=512, device loop0 blocksize: 4096

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/buffer.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6f1ae3ac9789..c083c4b3c1e7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	struct buffer_head *head;
 	struct page *page;
 	int all_mapped = 1;
+	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 
 	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
@@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	 * file io on the block device and getblk.  It gets dealt with
 	 * elsewhere, don't buffer_error if we had some unmapped buffers
 	 */
-	if (all_mapped) {
-		printk("__find_get_block_slow() failed. "
-			"block=%llu, b_blocknr=%llu\n",
-			(unsigned long long)block,
-			(unsigned long long)bh->b_blocknr);
-		printk("b_state=0x%08lx, b_size=%zu\n",
-			bh->b_state, bh->b_size);
-		printk("device %pg blocksize: %d\n", bdev,
-			1 << bd_inode->i_blkbits);
+	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
+	if (all_mapped && __ratelimit(&last_warned)) {
+		printk("__find_get_block_slow() failed. block=%llu, "
+		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
+		       "device %pg blocksize: %d\n",
+		       (unsigned long long)block,
+		       (unsigned long long)bh->b_blocknr,
+		       bh->b_state, bh->b_size, bdev,
+		       1 << bd_inode->i_blkbits);
 	}
 out_unlock:
 	spin_unlock(&bd_mapping->private_lock);
-- 
cgit v1.2.3


From 4f5a4c8881064c2eac658528390a6f2e7253c9b5 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Wed, 6 Mar 2019 15:41:57 +0100
Subject: gfs2: Fix missed wakeups in find_insert_glock

commit 605b0487f0bc1ae9963bf52ece0f5c8055186f81 upstream.

Mark Syms has reported seeing tasks that are stuck waiting in
find_insert_glock.  It turns out that struct lm_lockname contains four padding
bytes on 64-bit architectures that function glock_waitqueue doesn't skip when
hashing the glock name.  As a result, we can end up waking up the wrong
waitqueue, and the waiting tasks may be stuck forever.

Fix that by using ht_parms.key_len instead of sizeof(struct lm_lockname) for
the key length.

Reported-by: Mark Syms <mark.syms@citrix.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/glock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4614ee25f621..9d566e62684c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode,
 
 static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name)
 {
-	u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0);
+	u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0);
 
 	return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS);
 }
-- 
cgit v1.2.3


From b4d965a37d89cd8611984ea16c85903d01ac967a Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Wed, 24 Oct 2018 11:50:33 +1000
Subject: cifs: allow calling SMB2_xxx_free(NULL)

commit 32a1fb36f6e50183871c2c1fcf5493c633e84732 upstream.

Change these free functions to allow passing NULL as the argument and
treat it as a no-op just like free(NULL) would.
Or, if rqst->rq_iov is NULL.

The second scenario could happen for smb2_queryfs() if the call
to SMB2_query_info_init() fails and we go to qfs_exit to clean up
and free all resources.
In that case we have not yet assigned rqst[2].rq_iov and thus
the rq_iov dereference in SMB2_close_free() will cause a NULL pointer
dereference.

[ bp: upstream patch also fixes SMB2_set_info_free which was introduced in 4.20 ]

Fixes:  1eb9fb52040f ("cifs: create SMB2_open_init()/SMB2_open_free() helpers")

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Aurelien Aptel <aaptel@suse.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 1e5a1171212f..a2d701775c49 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2243,10 +2243,12 @@ SMB2_open_free(struct smb_rqst *rqst)
 {
 	int i;
 
-	cifs_small_buf_release(rqst->rq_iov[0].iov_base);
-	for (i = 1; i < rqst->rq_nvec; i++)
-		if (rqst->rq_iov[i].iov_base != smb2_padding)
-			kfree(rqst->rq_iov[i].iov_base);
+	if (rqst && rqst->rq_iov) {
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base);
+		for (i = 1; i < rqst->rq_nvec; i++)
+			if (rqst->rq_iov[i].iov_base != smb2_padding)
+				kfree(rqst->rq_iov[i].iov_base);
+	}
 }
 
 int
@@ -2535,7 +2537,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
 void
 SMB2_close_free(struct smb_rqst *rqst)
 {
-	cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+	if (rqst && rqst->rq_iov)
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
 }
 
 int
@@ -2685,7 +2688,8 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
 void
 SMB2_query_info_free(struct smb_rqst *rqst)
 {
-	cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
+	if (rqst && rqst->rq_iov)
+		cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
 }
 
 static int
-- 
cgit v1.2.3


From 2835c059726a100c1de78fd5219cbe4395a894b1 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 28 Dec 2018 11:00:38 -0800
Subject: f2fs: wait on atomic writes to count F2FS_CP_WB_DATA

commit 31867b23d7d1ee3535136c6a410a6cf56f666bfc upstream.

Otherwise, we can get wrong counts incurring checkpoint hang.

IO_W (CP:  -24, Data:   24, Flush: (   0    0    1), Discard: (   0    0))

Thread A                        Thread B
- f2fs_write_data_pages
 -  __write_data_page
  - f2fs_submit_page_write
   - inc_page_count(F2FS_WB_DATA)
     type is F2FS_WB_DATA due to file is non-atomic one
- f2fs_ioc_start_atomic_write
 - set_inode_flag(FI_ATOMIC_FILE)
                                - f2fs_write_end_io
                                 - dec_page_count(F2FS_WB_CP_DATA)
                                   type is F2FS_WB_DATA due to file becomes
                                   atomic one

Cc: <stable@vger.kernel.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/f2fs/file.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index fd36aa6569dc..81c1dd635a8d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1736,10 +1736,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 
 	down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
-	if (!get_dirty_pages(inode))
-		goto skip_flush;
-
-	f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
+	/*
+	 * Should wait end_io to count F2FS_WB_CP_DATA correctly by
+	 * f2fs_is_atomic_file.
+	 */
+	if (get_dirty_pages(inode))
+		f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING,
 		"Unexpected flush for atomic writes: ino=%lu, npages=%u",
 					inode->i_ino, get_dirty_pages(inode));
 	ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
@@ -1747,7 +1749,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 		up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 		goto out;
 	}
-skip_flush:
+
 	set_inode_flag(inode, FI_ATOMIC_FILE);
 	clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
 	up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-- 
cgit v1.2.3


From e08ba890dc29250fafdfa7c9dba62ccfeec8ef7f Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Thu, 24 Jan 2019 14:35:13 +0800
Subject: 9p: use inode->i_lock to protect i_size_write() under 32-bit

commit 5e3cc1ee1405a7eb3487ed24f786dec01b4cbe1f upstream.

Use inode->i_lock to protect i_size_write(), else i_size_read() in
generic_fillattr() may loop infinitely in read_seqcount_begin() when
multiple processes invoke v9fs_vfs_getattr() or v9fs_vfs_getattr_dotl()
simultaneously under 32-bit SMP environment, and a soft lockup will be
triggered as show below:

  watchdog: BUG: soft lockup - CPU#5 stuck for 22s! [stat:2217]
  Modules linked in:
  CPU: 5 PID: 2217 Comm: stat Not tainted 5.0.0-rc1-00005-g7f702faf5a9e #4
  Hardware name: Generic DT based system
  PC is at generic_fillattr+0x104/0x108
  LR is at 0xec497f00
  pc : [<802b8898>]    lr : [<ec497f00>]    psr: 200c0013
  sp : ec497e20  ip : ed608030  fp : ec497e3c
  r10: 00000000  r9 : ec497f00  r8 : ed608030
  r7 : ec497ebc  r6 : ec497f00  r5 : ee5c1550  r4 : ee005780
  r3 : 0000052d  r2 : 00000000  r1 : ec497f00  r0 : ed608030
  Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
  Control: 10c5387d  Table: ac48006a  DAC: 00000051
  CPU: 5 PID: 2217 Comm: stat Not tainted 5.0.0-rc1-00005-g7f702faf5a9e #4
  Hardware name: Generic DT based system
  Backtrace:
  [<8010d974>] (dump_backtrace) from [<8010dc88>] (show_stack+0x20/0x24)
  [<8010dc68>] (show_stack) from [<80a1d194>] (dump_stack+0xb0/0xdc)
  [<80a1d0e4>] (dump_stack) from [<80109f34>] (show_regs+0x1c/0x20)
  [<80109f18>] (show_regs) from [<801d0a80>] (watchdog_timer_fn+0x280/0x2f8)
  [<801d0800>] (watchdog_timer_fn) from [<80198658>] (__hrtimer_run_queues+0x18c/0x380)
  [<801984cc>] (__hrtimer_run_queues) from [<80198e60>] (hrtimer_run_queues+0xb8/0xf0)
  [<80198da8>] (hrtimer_run_queues) from [<801973e8>] (run_local_timers+0x28/0x64)
  [<801973c0>] (run_local_timers) from [<80197460>] (update_process_times+0x3c/0x6c)
  [<80197424>] (update_process_times) from [<801ab2b8>] (tick_nohz_handler+0xe0/0x1bc)
  [<801ab1d8>] (tick_nohz_handler) from [<80843050>] (arch_timer_handler_virt+0x38/0x48)
  [<80843018>] (arch_timer_handler_virt) from [<80180a64>] (handle_percpu_devid_irq+0x8c/0x240)
  [<801809d8>] (handle_percpu_devid_irq) from [<8017ac20>] (generic_handle_irq+0x34/0x44)
  [<8017abec>] (generic_handle_irq) from [<8017b344>] (__handle_domain_irq+0x6c/0xc4)
  [<8017b2d8>] (__handle_domain_irq) from [<801022e0>] (gic_handle_irq+0x4c/0x88)
  [<80102294>] (gic_handle_irq) from [<80101a30>] (__irq_svc+0x70/0x98)
  [<802b8794>] (generic_fillattr) from [<8056b284>] (v9fs_vfs_getattr_dotl+0x74/0xa4)
  [<8056b210>] (v9fs_vfs_getattr_dotl) from [<802b8904>] (vfs_getattr_nosec+0x68/0x7c)
  [<802b889c>] (vfs_getattr_nosec) from [<802b895c>] (vfs_getattr+0x44/0x48)
  [<802b8918>] (vfs_getattr) from [<802b8a74>] (vfs_statx+0x9c/0xec)
  [<802b89d8>] (vfs_statx) from [<802b9428>] (sys_lstat64+0x48/0x78)
  [<802b93e0>] (sys_lstat64) from [<80101000>] (ret_fast_syscall+0x0/0x28)

[dominique.martinet@cea.fr: updated comment to not refer to a function
in another subsystem]
Link: http://lkml.kernel.org/r/20190124063514.8571-2-houtao1@huawei.com
Cc: stable@vger.kernel.org
Fixes: 7549ae3e81cc ("9p: Use the i_size_[read, write]() macros instead of using inode->i_size directly.")
Reported-by: Xing Gaopeng <xingaopeng@huawei.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/9p/v9fs_vfs.h       | 23 +++++++++++++++++++++--
 fs/9p/vfs_file.c       |  6 +++++-
 fs/9p/vfs_inode.c      | 23 +++++++++++------------
 fs/9p/vfs_inode_dotl.c | 27 ++++++++++++++-------------
 fs/9p/vfs_super.c      |  4 ++--
 5 files changed, 53 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 5a0db6dec8d1..aaee1e6584e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,6 +40,9 @@
  */
 #define P9_LOCK_TIMEOUT (30*HZ)
 
+/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */
+#define V9FS_STAT2INODE_KEEP_ISIZE 1
+
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
@@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		    struct inode *inode, umode_t mode, dev_t);
 void v9fs_evict_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
-void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *);
+void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
+		      struct super_block *sb, unsigned int flags);
+void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+			   unsigned int flags);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
 int v9fs_file_open(struct inode *inode, struct file *file);
 void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
@@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode)
 }
 
 int v9fs_open_to_dotl_flags(int flags);
+
+static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size)
+{
+	/*
+	 * 32-bit need the lock, concurrent updates could break the
+	 * sequences and make i_size_read() loop forever.
+	 * 64-bit updates are atomic and can skip the locking.
+	 */
+	if (sizeof(i_size) > sizeof(long))
+		spin_lock(&inode->i_lock);
+	i_size_write(inode, i_size);
+	if (sizeof(i_size) > sizeof(long))
+		spin_unlock(&inode->i_lock);
+}
 #endif
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index ab3d5f5dbb00..c87e6d6ec069 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -442,7 +442,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		i_size = i_size_read(inode);
 		if (iocb->ki_pos > i_size) {
 			inode_add_bytes(inode, iocb->ki_pos - i_size);
-			i_size_write(inode, iocb->ki_pos);
+			/*
+			 * Need to serialize against i_size_write() in
+			 * v9fs_stat2inode()
+			 */
+			v9fs_i_size_write(inode, iocb->ki_pos);
 		}
 		return retval;
 	}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 85ff859d3af5..72b779bc0942 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
 	if (retval)
 		goto error;
 
-	v9fs_stat2inode(st, inode, sb);
+	v9fs_stat2inode(st, inode, sb, 0);
 	v9fs_cache_inode_get_cookie(inode);
 	unlock_new_inode(inode);
 	return inode;
@@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat,
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
+	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
 	generic_fillattr(d_inode(dentry), stat);
 
 	p9stat_free(st);
@@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
  * @stat: Plan 9 metadata (mistat) structure
  * @inode: inode to populate
  * @sb: superblock of filesystem
+ * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
  *
  */
 
 void
 v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
-	struct super_block *sb)
+		 struct super_block *sb, unsigned int flags)
 {
 	umode_t mode;
 	char ext[32];
@@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	mode = p9mode2perm(v9ses, stat);
 	mode |= inode->i_mode & ~S_IALLUGO;
 	inode->i_mode = mode;
-	i_size_write(inode, stat->length);
 
+	if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+		v9fs_i_size_write(inode, stat->length);
 	/* not real number of blocks, but 512 byte ones ... */
-	inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
+	inode->i_blocks = (stat->length + 512 - 1) >> 9;
 	v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
 }
 
@@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 {
 	int umode;
 	dev_t rdev;
-	loff_t i_size;
 	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses;
+	unsigned int flags;
 
 	v9ses = v9fs_inode2v9ses(inode);
 	st = p9_client_stat(fid);
@@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
 	if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
 		goto out;
 
-	spin_lock(&inode->i_lock);
 	/*
 	 * We don't want to refresh inode->i_size,
 	 * because we may have cached data
 	 */
-	i_size = inode->i_size;
-	v9fs_stat2inode(st, inode, inode->i_sb);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		inode->i_size = i_size;
-	spin_unlock(&inode->i_lock);
+	flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+		V9FS_STAT2INODE_KEEP_ISIZE : 0;
+	v9fs_stat2inode(st, inode, inode->i_sb, flags);
 out:
 	p9stat_free(st);
 	kfree(st);
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4823e1c46999..a950a927a626 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 	if (retval)
 		goto error;
 
-	v9fs_stat2inode_dotl(st, inode);
+	v9fs_stat2inode_dotl(st, inode, 0);
 	v9fs_cache_inode_get_cookie(inode);
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
@@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat,
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode_dotl(st, d_inode(dentry));
+	v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
 	generic_fillattr(d_inode(dentry), stat);
 	/* Change block size to what the server returned */
 	stat->blksize = st->st_blksize;
@@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
  * v9fs_stat2inode_dotl - populate an inode structure with stat info
  * @stat: stat structure
  * @inode: inode to populate
+ * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE)
  *
  */
 
 void
-v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
+v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
+		      unsigned int flags)
 {
 	umode_t mode;
 	struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 		mode |= inode->i_mode & ~S_IALLUGO;
 		inode->i_mode = mode;
 
-		i_size_write(inode, stat->st_size);
+		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
+			v9fs_i_size_write(inode, stat->st_size);
 		inode->i_blocks = stat->st_blocks;
 	} else {
 		if (stat->st_result_mask & P9_STATS_ATIME) {
@@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
 		}
 		if (stat->st_result_mask & P9_STATS_RDEV)
 			inode->i_rdev = new_decode_dev(stat->st_rdev);
-		if (stat->st_result_mask & P9_STATS_SIZE)
-			i_size_write(inode, stat->st_size);
+		if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
+		    stat->st_result_mask & P9_STATS_SIZE)
+			v9fs_i_size_write(inode, stat->st_size);
 		if (stat->st_result_mask & P9_STATS_BLOCKS)
 			inode->i_blocks = stat->st_blocks;
 	}
@@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry,
 
 int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 {
-	loff_t i_size;
 	struct p9_stat_dotl *st;
 	struct v9fs_session_info *v9ses;
+	unsigned int flags;
 
 	v9ses = v9fs_inode2v9ses(inode);
 	st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
@@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
 	if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
 		goto out;
 
-	spin_lock(&inode->i_lock);
 	/*
 	 * We don't want to refresh inode->i_size,
 	 * because we may have cached data
 	 */
-	i_size = inode->i_size;
-	v9fs_stat2inode_dotl(st, inode);
-	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
-		inode->i_size = i_size;
-	spin_unlock(&inode->i_lock);
+	flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ?
+		V9FS_STAT2INODE_KEEP_ISIZE : 0;
+	v9fs_stat2inode_dotl(st, inode, flags);
 out:
 	kfree(st);
 	return 0;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 48ce50484e80..eeab9953af89 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 			goto release_sb;
 		}
 		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
-		v9fs_stat2inode_dotl(st, d_inode(root));
+		v9fs_stat2inode_dotl(st, d_inode(root), 0);
 		kfree(st);
 	} else {
 		struct p9_wstat *st = NULL;
@@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		}
 
 		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
-		v9fs_stat2inode(st, d_inode(root), sb);
+		v9fs_stat2inode(st, d_inode(root), sb, 0);
 
 		p9stat_free(st);
 		kfree(st);
-- 
cgit v1.2.3


From 6c023d86b3645fc2671bf2a7677fd208744edbb1 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 6 Feb 2019 06:09:43 -0500
Subject: NFS: Don't use page_file_mapping after removing the page

[ Upstream commit d2ceb7e57086750ea6198a31fd942d98099a0786 ]

If nfs_page_async_flush() removes the page from the mapping, then we can't
use page_file_mapping() on it as nfs_updatepate() is wont to do when
receiving an error.  Instead, push the mapping to the stack before the page
is possibly truncated.

Fixes: 8fc75bed96bb ("NFS: Fix up return value on fatal errors in nfs_page_async_flush()")
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/write.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d790faff8e47..51d0b7913c04 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -238,9 +238,9 @@ out:
 }
 
 /* A writeback failed: mark the page as bad, and invalidate the page cache */
-static void nfs_set_pageerror(struct page *page)
+static void nfs_set_pageerror(struct address_space *mapping)
 {
-	nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
+	nfs_zap_mapping(mapping->host, mapping);
 }
 
 /*
@@ -994,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 		nfs_list_remove_request(req);
 		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
 		    (hdr->good_bytes < bytes)) {
-			nfs_set_pageerror(req->wb_page);
+			nfs_set_pageerror(page_file_mapping(req->wb_page));
 			nfs_context_set_write_error(req->wb_context, hdr->error);
 			goto remove_req;
 		}
@@ -1330,7 +1330,8 @@ int nfs_updatepage(struct file *file, struct page *page,
 		unsigned int offset, unsigned int count)
 {
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
-	struct inode	*inode = page_file_mapping(page)->host;
+	struct address_space *mapping = page_file_mapping(page);
+	struct inode	*inode = mapping->host;
 	int		status = 0;
 
 	nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
@@ -1348,7 +1349,7 @@ int nfs_updatepage(struct file *file, struct page *page,
 
 	status = nfs_writepage_setup(ctx, page, offset, count);
 	if (status < 0)
-		nfs_set_pageerror(page);
+		nfs_set_pageerror(mapping);
 	else
 		__set_page_dirty_nobuffers(page);
 out:
-- 
cgit v1.2.3


From 726832821903bdd75ea9a409ba12fa60069a8268 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 14 Feb 2019 16:20:25 +0000
Subject: keys: Fix dependency loop between construction record and auth key

[ Upstream commit 822ad64d7e46a8e2c8b8a796738d7b657cbb146d ]

In the request_key() upcall mechanism there's a dependency loop by which if
a key type driver overrides the ->request_key hook and the userspace side
manages to lose the authorisation key, the auth key and the internal
construction record (struct key_construction) can keep each other pinned.

Fix this by the following changes:

 (1) Killing off the construction record and using the auth key instead.

 (2) Including the operation name in the auth key payload and making the
     payload available outside of security/keys/.

 (3) The ->request_key hook is given the authkey instead of the cons
     record and operation name.

Changes (2) and (3) allow the auth key to naturally be cleaned up if the
keyring it is in is destroyed or cleared or the auth key is unlinked.

Fixes: 7ee02a316600 ("keys: Fix dependency loop between construction record and auth key")
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <james.morris@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/nfs4idmap.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 3f23b6840547..bf34ddaa2ad7 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -44,6 +44,7 @@
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
 #include <keys/user-type.h>
+#include <keys/request_key_auth-type.h>
 #include <linux/module.h>
 
 #include "internal.h"
@@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy;
 struct idmap_legacy_upcalldata {
 	struct rpc_pipe_msg pipe_msg;
 	struct idmap_msg idmap_msg;
-	struct key_construction	*key_cons;
+	struct key	*authkey;
 	struct idmap *idmap;
 };
 
@@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = {
 	{ Opt_find_err, NULL }
 };
 
-static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *);
+static int nfs_idmap_legacy_upcall(struct key *, void *);
 static ssize_t idmap_pipe_downcall(struct file *, const char __user *,
 				   size_t);
 static void idmap_release_pipe(struct inode *);
@@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
 static void
 nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
 {
-	struct key_construction *cons = idmap->idmap_upcall_data->key_cons;
+	struct key *authkey = idmap->idmap_upcall_data->authkey;
 
 	kfree(idmap->idmap_upcall_data);
 	idmap->idmap_upcall_data = NULL;
-	complete_request_key(cons, ret);
+	complete_request_key(authkey, ret);
+	key_put(authkey);
 }
 
 static void
@@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
 		nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
 }
 
-static int nfs_idmap_legacy_upcall(struct key_construction *cons,
-				   const char *op,
-				   void *aux)
+static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
 {
 	struct idmap_legacy_upcalldata *data;
+	struct request_key_auth *rka = get_request_key_auth(authkey);
 	struct rpc_pipe_msg *msg;
 	struct idmap_msg *im;
 	struct idmap *idmap = (struct idmap *)aux;
-	struct key *key = cons->key;
+	struct key *key = rka->target_key;
 	int ret = -ENOKEY;
 
 	if (!aux)
@@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 	msg = &data->pipe_msg;
 	im = &data->idmap_msg;
 	data->idmap = idmap;
-	data->key_cons = cons;
+	data->authkey = key_get(authkey);
 
 	ret = nfs_idmap_prepare_message(key->description, idmap, im, msg);
 	if (ret < 0)
@@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons,
 out2:
 	kfree(data);
 out1:
-	complete_request_key(cons, ret);
+	complete_request_key(authkey, ret);
 	return ret;
 }
 
@@ -651,9 +652,10 @@ out:
 static ssize_t
 idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
+	struct request_key_auth *rka;
 	struct rpc_inode *rpci = RPC_I(file_inode(filp));
 	struct idmap *idmap = (struct idmap *)rpci->private;
-	struct key_construction *cons;
+	struct key *authkey;
 	struct idmap_msg im;
 	size_t namelen_in;
 	int ret = -ENOKEY;
@@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	if (idmap->idmap_upcall_data == NULL)
 		goto out_noupcall;
 
-	cons = idmap->idmap_upcall_data->key_cons;
+	authkey = idmap->idmap_upcall_data->authkey;
+	rka = get_request_key_auth(authkey);
 
 	if (mlen != sizeof(im)) {
 		ret = -ENOSPC;
@@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 
 	ret = nfs_idmap_read_and_verify_message(&im,
 			&idmap->idmap_upcall_data->idmap_msg,
-			cons->key, cons->authkey);
+			rka->target_key, authkey);
 	if (ret >= 0) {
-		key_set_timeout(cons->key, nfs_idmap_cache_timeout);
+		key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
 		ret = mlen;
 	}
 
-- 
cgit v1.2.3


From 7a8b048430c124175a28514e2ed7cb8230998ccc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 6 Jan 2019 11:41:29 -0500
Subject: fix cgroup_do_mount() handling of failure exits

commit 399504e21a10be16dd1408ba0147367d9d82a10c upstream.

same story as with last May fixes in sysfs (7b745a4e4051
"unfuck sysfs_mount()"); new_sb is left uninitialized
in case of early errors in kernfs_mount_ns() and papering
over it by treating any error from kernfs_mount_ns() as
equivalent to !new_ns ends up conflating the cases when
objects had never been transferred to a superblock with
ones when that has happened and resulting new superblock
had been dropped.  Easily fixed (same way as in sysfs
case).  Additionally, there's a superblock leak on
kernfs_node_dentry() failure *and* a dentry leak inside
kernfs_node_dentry() itself - the latter on probably
impossible errors, but the former not impossible to trigger
(as the matter of fact, injecting allocation failures
at that point *does* trigger it).

Cc: stable@kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/mount.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index ff2716f9322e..0b22c39dad47 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 		return dentry;
 
 	knparent = find_next_ancestor(kn, NULL);
-	if (WARN_ON(!knparent))
+	if (WARN_ON(!knparent)) {
+		dput(dentry);
 		return ERR_PTR(-EINVAL);
+	}
 
 	do {
 		struct dentry *dtmp;
@@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 		if (kn == knparent)
 			return dentry;
 		kntmp = find_next_ancestor(kn, knparent);
-		if (WARN_ON(!kntmp))
+		if (WARN_ON(!kntmp)) {
+			dput(dentry);
 			return ERR_PTR(-EINVAL);
+		}
 		dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
 					       strlen(kntmp->name));
 		dput(dentry);
-- 
cgit v1.2.3


From 3ed9f22e28dd3a2b18f65ee5442882ebded6413e Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Wed, 13 Feb 2019 15:43:08 -0800
Subject: CIFS: Do not reset lease state to NONE on lease break

commit 7b9b9edb49ad377b1e06abf14354c227e9ac4b06 upstream.

Currently on lease break the client sets a caching level twice:
when oplock is detected and when oplock is processed. While the
1st attempt sets the level to the value provided by the server,
the 2nd one resets the level to None unconditionally.
This happens because the oplock/lease processing code was changed
to avoid races between page cache flushes and oplock breaks.
The commit c11f1df5003d534 ("cifs: Wait for writebacks to complete
before attempting write.") fixed the races for oplocks but didn't
apply the same changes for leases resulting in overwriting the
server granted value to None. Fix this by properly processing
lease breaks.

Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2misc.c | 17 ++++++++++++++---
 fs/cifs/smb2ops.c  | 15 ++++++++++++---
 2 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7b8b58fb4d3f..58700d2ba8cd 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
 	__u8 lease_state;
 	struct list_head *tmp;
 	struct cifsFileInfo *cfile;
-	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifs_pending_open *open;
 	struct cifsInodeInfo *cinode;
 	int ack_req = le32_to_cpu(rsp->Flags &
@@ -537,13 +536,25 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
 		cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
 			 le32_to_cpu(rsp->NewLeaseState));
 
-		server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
-
 		if (ack_req)
 			cfile->oplock_break_cancelled = false;
 		else
 			cfile->oplock_break_cancelled = true;
 
+		set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+
+		/*
+		 * Set or clear flags depending on the lease state being READ.
+		 * HANDLE caching flag should be added when the client starts
+		 * to defer closing remote file handles with HANDLE leases.
+		 */
+		if (lease_state & SMB2_LEASE_READ_CACHING_HE)
+			set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+				&cinode->flags);
+		else
+			clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+				  &cinode->flags);
+
 		queue_work(cifsoplockd_wq, &cfile->oplock_break);
 		kfree(lw);
 		return true;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 237d7281ada3..b66d630e0058 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2300,6 +2300,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server,
 		server->ops->set_oplock_level(cinode, 0, 0, NULL);
 }
 
+static void
+smb21_downgrade_oplock(struct TCP_Server_Info *server,
+		       struct cifsInodeInfo *cinode, bool set_level2)
+{
+	server->ops->set_oplock_level(cinode,
+				      set_level2 ? SMB2_LEASE_READ_CACHING_HE :
+				      0, 0, NULL);
+}
+
 static void
 smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
 		      unsigned int epoch, bool *purge_cache)
@@ -3351,7 +3360,7 @@ struct smb_version_operations smb21_operations = {
 	.print_stats = smb2_print_stats,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.handle_cancelled_mid = smb2_handle_cancelled_mid,
-	.downgrade_oplock = smb2_downgrade_oplock,
+	.downgrade_oplock = smb21_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -3447,7 +3456,7 @@ struct smb_version_operations smb30_operations = {
 	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.handle_cancelled_mid = smb2_handle_cancelled_mid,
-	.downgrade_oplock = smb2_downgrade_oplock,
+	.downgrade_oplock = smb21_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
@@ -3551,7 +3560,7 @@ struct smb_version_operations smb311_operations = {
 	.dump_share_caps = smb2_dump_share_caps,
 	.is_oplock_break = smb2_is_valid_oplock_break,
 	.handle_cancelled_mid = smb2_handle_cancelled_mid,
-	.downgrade_oplock = smb2_downgrade_oplock,
+	.downgrade_oplock = smb21_downgrade_oplock,
 	.need_neg = smb2_need_neg,
 	.negotiate = smb2_negotiate,
 	.negotiate_wsize = smb2_negotiate_wsize,
-- 
cgit v1.2.3


From dc8e8ad962a8add17737ec29396bb7a754907e52 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 4 Mar 2019 14:02:50 -0800
Subject: CIFS: Do not skip SMB2 message IDs on send failures

commit c781af7e0c1fed9f1d0e0ec31b86f5b21a8dca17 upstream.

When we hit failures during constructing MIDs or sending PDUs
through the network, we end up not using message IDs assigned
to the packet. The next SMB packet will skip those message IDs
and continue with the next one. This behavior may lead to a server
not granting us credits until we use the skipped IDs. Fix this by
reverting the current ID to the original value if any errors occur
before we push the packet through the network stack.

This patch fixes the generic/310 test from the xfs-tests.

Cc: <stable@vger.kernel.org> # 4.19.x
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/cifsglob.h      | 19 +++++++++++++++++++
 fs/cifs/smb2ops.c       | 13 +++++++++++++
 fs/cifs/smb2transport.c | 14 ++++++++++++--
 fs/cifs/transport.c     |  6 +++++-
 4 files changed, 49 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9dcaed031843..80f33582059e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -235,6 +235,8 @@ struct smb_version_operations {
 	int * (*get_credits_field)(struct TCP_Server_Info *, const int);
 	unsigned int (*get_credits)(struct mid_q_entry *);
 	__u64 (*get_next_mid)(struct TCP_Server_Info *);
+	void (*revert_current_mid)(struct TCP_Server_Info *server,
+				   const unsigned int val);
 	/* data offset from read response message */
 	unsigned int (*read_data_offset)(char *);
 	/*
@@ -756,6 +758,22 @@ get_next_mid(struct TCP_Server_Info *server)
 	return cpu_to_le16(mid);
 }
 
+static inline void
+revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+	if (server->ops->revert_current_mid)
+		server->ops->revert_current_mid(server, val);
+}
+
+static inline void
+revert_current_mid_from_hdr(struct TCP_Server_Info *server,
+			    const struct smb2_sync_hdr *shdr)
+{
+	unsigned int num = le16_to_cpu(shdr->CreditCharge);
+
+	return revert_current_mid(server, num > 0 ? num : 1);
+}
+
 static inline __u16
 get_mid(const struct smb_hdr *smb)
 {
@@ -1391,6 +1409,7 @@ struct mid_q_entry {
 	struct kref refcount;
 	struct TCP_Server_Info *server;	/* server corresponding to this mid */
 	__u64 mid;		/* multiplex id */
+	__u16 credits;		/* number of credits consumed by this mid */
 	__u32 pid;		/* process id */
 	__u32 sequence_number;  /* for CIFS signing */
 	unsigned long when_alloc;  /* when mid was created */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b66d630e0058..d4d7d61a6fe2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -204,6 +204,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
 	return mid;
 }
 
+static void
+smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
+{
+	spin_lock(&GlobalMid_Lock);
+	if (server->CurrentMid >= val)
+		server->CurrentMid -= val;
+	spin_unlock(&GlobalMid_Lock);
+}
+
 static struct mid_q_entry *
 smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 {
@@ -3256,6 +3265,7 @@ struct smb_version_operations smb20_operations = {
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = cifs_wait_mtu_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3350,6 +3360,7 @@ struct smb_version_operations smb21_operations = {
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = smb2_wait_mtu_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3445,6 +3456,7 @@ struct smb_version_operations smb30_operations = {
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = smb2_wait_mtu_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
@@ -3549,6 +3561,7 @@ struct smb_version_operations smb311_operations = {
 	.get_credits = smb2_get_credits,
 	.wait_mtu_credits = smb2_wait_mtu_credits,
 	.get_next_mid = smb2_get_next_mid,
+	.revert_current_mid = smb2_revert_current_mid,
 	.read_data_offset = smb2_read_data_offset,
 	.read_data_length = smb2_read_data_length,
 	.map_error = map_smb2_to_linux_error,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 7b351c65ee46..63264db78b89 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
 		     struct TCP_Server_Info *server)
 {
 	struct mid_q_entry *temp;
+	unsigned int credits = le16_to_cpu(shdr->CreditCharge);
 
 	if (server == NULL) {
 		cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n");
@@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
 	memset(temp, 0, sizeof(struct mid_q_entry));
 	kref_init(&temp->refcount);
 	temp->mid = le64_to_cpu(shdr->MessageId);
+	temp->credits = credits > 0 ? credits : 1;
 	temp->pid = current->pid;
 	temp->command = shdr->Command; /* Always LE */
 	temp->when_alloc = jiffies;
@@ -674,13 +676,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
 	smb2_seq_num_into_buf(ses->server, shdr);
 
 	rc = smb2_get_mid_entry(ses, shdr, &mid);
-	if (rc)
+	if (rc) {
+		revert_current_mid_from_hdr(ses->server, shdr);
 		return ERR_PTR(rc);
+	}
+
 	rc = smb2_sign_rqst(rqst, ses->server);
 	if (rc) {
+		revert_current_mid_from_hdr(ses->server, shdr);
 		cifs_delete_mid(mid);
 		return ERR_PTR(rc);
 	}
+
 	return mid;
 }
 
@@ -695,11 +702,14 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 	smb2_seq_num_into_buf(server, shdr);
 
 	mid = smb2_mid_entry_alloc(shdr, server);
-	if (mid == NULL)
+	if (mid == NULL) {
+		revert_current_mid_from_hdr(server, shdr);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	rc = smb2_sign_rqst(rqst, server);
 	if (rc) {
+		revert_current_mid_from_hdr(server, shdr);
 		DeleteMidQEntry(mid);
 		return ERR_PTR(rc);
 	}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 66348b3d28e6..f2938bd95c40 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -638,6 +638,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 	cifs_in_send_dec(server);
 
 	if (rc < 0) {
+		revert_current_mid(server, mid->credits);
 		server->sequence_number -= 2;
 		cifs_delete_mid(mid);
 	}
@@ -842,6 +843,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 	for (i = 0; i < num_rqst; i++) {
 		midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]);
 		if (IS_ERR(midQ[i])) {
+			revert_current_mid(ses->server, i);
 			for (j = 0; j < i; j++)
 				cifs_delete_mid(midQ[j]);
 			mutex_unlock(&ses->server->srv_mutex);
@@ -867,8 +869,10 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
 	for (i = 0; i < num_rqst; i++)
 		cifs_save_when_sent(midQ[i]);
 
-	if (rc < 0)
+	if (rc < 0) {
+		revert_current_mid(ses->server, num_rqst);
 		ses->server->sequence_number -= 2;
+	}
 
 	mutex_unlock(&ses->server->srv_mutex);
 
-- 
cgit v1.2.3


From 43eaa6cc17753f31768ddf1ed314e36abc0f2580 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <piastryyy@gmail.com>
Date: Mon, 4 Mar 2019 17:48:01 -0800
Subject: CIFS: Fix read after write for files with read caching

commit 6dfbd84684700cb58b34e8602c01c12f3d2595c8 upstream.

When we have a READ lease for a file and have just issued a write
operation to the server we need to purge the cache and set oplock/lease
level to NONE to avoid reading stale data. Currently we do that
only if a write operation succedeed thus not covering cases when
a request was sent to the server but a negative error code was
returned later for some other reasons (e.g. -EIOCBQUEUED or -EINTR).
Fix this by turning off caching regardless of the error code being
returned.

The patches fixes generic tests 075 and 112 from the xfs-tests.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/file.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 23db881daab5..08761a6a039d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2871,14 +2871,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
 	 * these pages but not on the region from pos to ppos+len-1.
 	 */
 	written = cifs_user_writev(iocb, from);
-	if (written > 0 && CIFS_CACHE_READ(cinode)) {
+	if (CIFS_CACHE_READ(cinode)) {
 		/*
-		 * Windows 7 server can delay breaking level2 oplock if a write
-		 * request comes - break it on the client to prevent reading
-		 * an old data.
+		 * We have read level caching and we have just sent a write
+		 * request to the server thus making data in the cache stale.
+		 * Zap the cache and set oplock/lease level to NONE to avoid
+		 * reading stale data from the cache. All subsequent read
+		 * operations will read new data from the server.
 		 */
 		cifs_zap_mapping(inode);
-		cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
+		cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n",
 			 inode);
 		cinode->oplock = 0;
 	}
-- 
cgit v1.2.3


From 1c2123ff43122bb9e0578198391ee3cf752049d6 Mon Sep 17 00:00:00 2001
From: Varad Gautam <vrd@amazon.de>
Date: Thu, 24 Jan 2019 14:03:06 +0100
Subject: fs/devpts: always delete dcache dentry-s in dput()

commit 73052b0daee0b750b39af18460dfec683e4f5887 upstream.

d_delete only unhashes an entry if it is reached with
dentry->d_lockref.count != 1. Prior to commit 8ead9dd54716 ("devpts:
more pty driver interface cleanups"), d_delete was called on a dentry
from devpts_pty_kill with two references held, which would trigger the
unhashing, and the subsequent dputs would release it.

Commit 8ead9dd54716 reworked devpts_pty_kill to stop acquiring the second
reference from d_find_alias, and the d_delete call left the dentries
still on the hashed list without actually ever being dropped from dcache
before explicit cleanup. This causes the number of negative dentries for
devpts to pile up, and an `ls /dev/pts` invocation can take seconds to
return.

Provide always_delete_dentry() from simple_dentry_operations
as .d_delete for devpts, to make the dentry be dropped from dcache.

Without this cleanup, the number of dentries in /dev/pts/ can be grown
arbitrarily as:

`python -c 'import pty; pty.spawn(["ls", "/dev/pts"])'`

A systemtap probe on dcache_readdir to count d_subdirs shows this count
to increase with each pty spawn invocation above:

probe kernel.function("dcache_readdir") {
    subdirs = &@cast($file->f_path->dentry, "dentry")->d_subdirs;
    p = subdirs;
    p = @cast(p, "list_head")->next;
    i = 0
    while (p != subdirs) {
      p = @cast(p, "list_head")->next;
      i = i+1;
    }
    printf("number of dentries: %d\n", i);
}

Fixes: 8ead9dd54716 ("devpts: more pty driver interface cleanups")
Signed-off-by: Varad Gautam <vrd@amazon.de>
Reported-by: Zheng Wang <wanz@amazon.de>
Reported-by: Brandon Schwartz <bsschwar@amazon.de>
Root-caused-by: Maximilian Heyne <mheyne@amazon.de>
Root-caused-by: Nicolas Pernas Maradei <npernas@amazon.de>
CC: David Woodhouse <dwmw@amazon.co.uk>
CC: Maximilian Heyne <mheyne@amazon.de>
CC: Stefan Nuernberger <snu@amazon.de>
CC: Amit Shah <aams@amazon.de>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Christian Brauner <christian.brauner@ubuntu.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: Matthew Wilcox <willy@infradead.org>
CC: Eric Biggers <ebiggers@google.com>
CC: <stable@vger.kernel.org> # 4.9+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/devpts/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..553a3f3300ae 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	s->s_blocksize_bits = 10;
 	s->s_magic = DEVPTS_SUPER_MAGIC;
 	s->s_op = &devpts_sops;
+	s->s_d_op = &simple_dentry_operations;
 	s->s_time_gran = 1;
 
 	error = -ENOMEM;
-- 
cgit v1.2.3


From 2af926fd52fc287a92fc782ce074f349d86966aa Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 23 Jan 2019 15:19:17 +0100
Subject: splice: don't merge into linked buffers

commit a0ce2f0aa6ad97c3d4927bf2ca54bcebdf062d55 upstream.

Before this patch, it was possible for two pipes to affect each other after
data had been transferred between them with tee():

============
$ cat tee_test.c

int main(void) {
  int pipe_a[2];
  if (pipe(pipe_a)) err(1, "pipe");
  int pipe_b[2];
  if (pipe(pipe_b)) err(1, "pipe");
  if (write(pipe_a[1], "abcd", 4) != 4) err(1, "write");
  if (tee(pipe_a[0], pipe_b[1], 2, 0) != 2) err(1, "tee");
  if (write(pipe_b[1], "xx", 2) != 2) err(1, "write");

  char buf[5];
  if (read(pipe_a[0], buf, 4) != 4) err(1, "read");
  buf[4] = 0;
  printf("got back: '%s'\n", buf);
}
$ gcc -o tee_test tee_test.c
$ ./tee_test
got back: 'abxx'
$
============

As suggested by Al Viro, fix it by creating a separate type for
non-mergeable pipe buffers, then changing the types of buffers in
splice_pipe_to_pipe() and link_pipe().

Cc: <stable@vger.kernel.org>
Fixes: 7c77f0b3f920 ("splice: implement pipe to pipe splicing")
Fixes: 70524490ee2e ("[PATCH] splice: add support for sys_tee()")
Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pipe.c   | 14 ++++++++++++++
 fs/splice.c |  4 ++++
 2 files changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index bdc5d3c0977d..c51750ed4011 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -234,6 +234,14 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
+	.can_merge = 0,
+	.confirm = generic_pipe_buf_confirm,
+	.release = anon_pipe_buf_release,
+	.steal = anon_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
 static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.can_merge = 0,
 	.confirm = generic_pipe_buf_confirm,
@@ -242,6 +250,12 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
+{
+	if (buf->ops == &anon_pipe_buf_ops)
+		buf->ops = &anon_pipe_buf_nomerge_ops;
+}
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
diff --git a/fs/splice.c b/fs/splice.c
index b3daa971f597..29e92b506394 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1593,6 +1593,8 @@ retry:
 			 */
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
+			pipe_buf_mark_unmergeable(obuf);
+
 			obuf->len = len;
 			opipe->nrbufs++;
 			ibuf->offset += obuf->len;
@@ -1667,6 +1669,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 */
 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 
+		pipe_buf_mark_unmergeable(obuf);
+
 		if (obuf->len > len)
 			obuf->len = len;
 
-- 
cgit v1.2.3


From 6f048ae2d25f94490c15320ccc6ed8b6587f1258 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 11 Jan 2019 19:37:00 +0100
Subject: ovl: During copy up, first copy up data and then xattrs

commit 5f32879ea35523b9842bdbdc0065e13635caada2 upstream.

If a file with capability set (and hence security.capability xattr) is
written kernel clears security.capability xattr. For overlay, during file
copy up if xattrs are copied up first and then data is, copied up. This
means data copy up will result in clearing of security.capability xattr
file on lower has. And this can result into surprises. If a lower file has
CAP_SETUID, then it should not be cleared over copy up (if nothing was
actually written to file).

This also creates problems with chown logic where it first copies up file
and then tries to clear setuid bit. But by that time security.capability
xattr is already gone (due to data copy up), and caller gets -ENODATA.
This has been reported by Giuseppe here.

https://github.com/containers/libpod/issues/2015#issuecomment-447824842

Fix this by copying up data first and then metadta. This is a regression
which has been introduced by my commit as part of metadata only copy up
patches.

TODO: There will be some corner cases where a file is copied up metadata
only and later data copy up happens and that will clear security.capability
xattr. Something needs to be done about that too.

Fixes: bd64e57586d3 ("ovl: During copy up, first copy up metadata and then data")
Cc: <stable@vger.kernel.org> # v4.19+
Reported-by: Giuseppe Scrivano <gscrivan@redhat.com>
Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/overlayfs/copy_up.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 1cc797a08a5b..59080f7ba02f 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -501,6 +501,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
 {
 	int err;
 
+	/*
+	 * Copy up data first and then xattrs. Writing data after
+	 * xattrs will remove security.capability xattr automatically.
+	 */
+	if (S_ISREG(c->stat.mode) && !c->metacopy) {
+		struct path upperpath, datapath;
+
+		ovl_path_upper(c->dentry, &upperpath);
+		if (WARN_ON(upperpath.dentry != NULL))
+			return -EIO;
+		upperpath.dentry = temp;
+
+		ovl_path_lowerdata(c->dentry, &datapath);
+		err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
+		if (err)
+			return err;
+	}
+
 	err = ovl_copy_xattr(c->lowerpath.dentry, temp);
 	if (err)
 		return err;
@@ -518,19 +536,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
 			return err;
 	}
 
-	if (S_ISREG(c->stat.mode) && !c->metacopy) {
-		struct path upperpath, datapath;
-
-		ovl_path_upper(c->dentry, &upperpath);
-		BUG_ON(upperpath.dentry != NULL);
-		upperpath.dentry = temp;
-
-		ovl_path_lowerdata(c->dentry, &datapath);
-		err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
-		if (err)
-			return err;
-	}
-
 	if (c->metacopy) {
 		err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY,
 					 NULL, 0, -EOPNOTSUPP);
-- 
cgit v1.2.3


From 205f149f1a358fc2a054461dd1432e52263cbc1e Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Wed, 30 Jan 2019 14:01:57 -0500
Subject: ovl: Do not lose security.capability xattr over metadata file copy-up

commit 993a0b2aec52754f0897b1dab4c453be8217cae5 upstream.

If a file has been copied up metadata only, and later data is copied up,
upper loses any security.capability xattr it has (underlying filesystem
clears it as upon file write).

From a user's point of view, this is just a file copy-up and that should
not result in losing security.capability xattr.  Hence, before data copy
up, save security.capability xattr (if any) and restore it on upper after
data copy up is complete.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Fixes: 0c2888749363 ("ovl: A new xattr OVL_XATTR_METACOPY for file on upper")
Cc: <stable@vger.kernel.org> # v4.19+
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/overlayfs/copy_up.c   | 28 ++++++++++++++++++++++--
 fs/overlayfs/overlayfs.h |  2 ++
 fs/overlayfs/util.c      | 55 ++++++++++++++++++++++++++++++------------------
 3 files changed, 63 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 59080f7ba02f..75eeee08d848 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -711,6 +711,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 {
 	struct path upperpath, datapath;
 	int err;
+	char *capability = NULL;
+	ssize_t uninitialized_var(cap_size);
 
 	ovl_path_upper(c->dentry, &upperpath);
 	if (WARN_ON(upperpath.dentry == NULL))
@@ -720,15 +722,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
 	if (WARN_ON(datapath.dentry == NULL))
 		return -EIO;
 
+	if (c->stat.size) {
+		err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS,
+					      &capability, 0);
+		if (err < 0 && err != -ENODATA)
+			goto out;
+	}
+
 	err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size);
 	if (err)
-		return err;
+		goto out_free;
+
+	/*
+	 * Writing to upper file will clear security.capability xattr. We
+	 * don't want that to happen for normal copy-up operation.
+	 */
+	if (capability) {
+		err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS,
+				      capability, cap_size, 0);
+		if (err)
+			goto out_free;
+	}
+
 
 	err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY);
 	if (err)
-		return err;
+		goto out_free;
 
 	ovl_set_upperdata(d_inode(c->dentry));
+out_free:
+	kfree(capability);
+out:
 	return err;
 }
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index a3c0d9584312..d9c16ceebfe7 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
 int ovl_check_metacopy_xattr(struct dentry *dentry);
 bool ovl_is_metacopy_dentry(struct dentry *dentry);
 char *ovl_get_redirect_xattr(struct dentry *dentry, int padding);
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+		     size_t padding);
 
 static inline bool ovl_is_impuredir(struct dentry *dentry)
 {
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index ace4fe4c39a9..c9a2e3c6d537 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -867,28 +867,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry)
 	return (oe->numlower > 1);
 }
 
-char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value,
+		     size_t padding)
 {
-	int res;
-	char *s, *next, *buf = NULL;
+	ssize_t res;
+	char *buf = NULL;
 
-	res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0);
+	res = vfs_getxattr(dentry, name, NULL, 0);
 	if (res < 0) {
 		if (res == -ENODATA || res == -EOPNOTSUPP)
-			return NULL;
+			return -ENODATA;
 		goto fail;
 	}
 
-	buf = kzalloc(res + padding + 1, GFP_KERNEL);
-	if (!buf)
-		return ERR_PTR(-ENOMEM);
+	if (res != 0) {
+		buf = kzalloc(res + padding, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
 
-	if (res == 0)
-		goto invalid;
+		res = vfs_getxattr(dentry, name, buf, res);
+		if (res < 0)
+			goto fail;
+	}
+	*value = buf;
+
+	return res;
+
+fail:
+	pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n",
+			    name, res);
+	kfree(buf);
+	return res;
+}
 
-	res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res);
+char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
+{
+	int res;
+	char *s, *next, *buf = NULL;
+
+	res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1);
+	if (res == -ENODATA)
+		return NULL;
 	if (res < 0)
-		goto fail;
+		return ERR_PTR(res);
 	if (res == 0)
 		goto invalid;
 
@@ -904,15 +925,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding)
 	}
 
 	return buf;
-
-err_free:
-	kfree(buf);
-	return ERR_PTR(res);
-fail:
-	pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res);
-	goto err_free;
 invalid:
 	pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf);
 	res = -EINVAL;
-	goto err_free;
+	kfree(buf);
+	return ERR_PTR(res);
 }
-- 
cgit v1.2.3


From 61f9209676e854973a8b7aaed93de16f727c4a32 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 13 Dec 2018 21:16:45 +0000
Subject: Btrfs: setup a nofs context for memory allocation at
 btrfs_create_tree()

commit b89f6d1fcb30a8cbdc18ce00c7d93792076af453 upstream.

We are holding a transaction handle when creating a tree, therefore we can
not allocate the root using GFP_KERNEL, as we could deadlock if reclaim is
triggered by the allocation, therefore setup a nofs context.

Fixes: 74e4d82757f74 ("btrfs: let callers of btrfs_alloc_root pass gfp flags")
CC: stable@vger.kernel.org # 4.9+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/disk-io.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d96d1390068a..b4f61a3d560a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -17,6 +17,7 @@
 #include <linux/semaphore.h>
 #include <linux/error-injection.h>
 #include <linux/crc32c.h>
+#include <linux/sched/mm.h>
 #include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -1236,10 +1237,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root;
 	struct btrfs_key key;
+	unsigned int nofs_flag;
 	int ret = 0;
 	uuid_le uuid = NULL_UUID_LE;
 
+	/*
+	 * We're holding a transaction handle, so use a NOFS memory allocation
+	 * context to avoid deadlock if reclaim happens.
+	 */
+	nofs_flag = memalloc_nofs_save();
 	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
+	memalloc_nofs_restore(nofs_flag);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.2.3


From 6e24f5a1ebb199c8dcd84f7b91fe8ba120ecfe32 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 13 Dec 2018 21:16:56 +0000
Subject: Btrfs: setup a nofs context for memory allocation at __btrfs_set_acl

commit a0873490660246db587849a9e172f2b7b21fa88a upstream.

We are holding a transaction handle when setting an acl, therefore we can
not allocate the xattr value buffer using GFP_KERNEL, as we could deadlock
if reclaim is triggered by the allocation, therefore setup a nofs context.

Fixes: 39a27ec1004e8 ("btrfs: use GFP_KERNEL for xattr and acl allocations")
CC: stable@vger.kernel.org # 4.9+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/acl.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 3b66c957ea6f..5810463dc6d2 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -9,6 +9,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/slab.h>
 
 #include "ctree.h"
@@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 	}
 
 	if (acl) {
+		unsigned int nofs_flag;
+
 		size = posix_acl_xattr_size(acl->a_count);
+		/*
+		 * We're holding a transaction handle, so use a NOFS memory
+		 * allocation context to avoid deadlock if reclaim happens.
+		 */
+		nofs_flag = memalloc_nofs_save();
 		value = kmalloc(size, GFP_KERNEL);
+		memalloc_nofs_restore(nofs_flag);
 		if (!value) {
 			ret = -ENOMEM;
 			goto out;
-- 
cgit v1.2.3


From 1a00f7fd0fbf87faca8737e45193bada759744cb Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn <jthumshirn@suse.de>
Date: Mon, 18 Feb 2019 11:28:37 +0100
Subject: btrfs: ensure that a DUP or RAID1 block group has exactly two stripes

commit 349ae63f40638a28c6fce52e8447c2d14b84cc0c upstream.

We recently had a customer issue with a corrupted filesystem. When
trying to mount this image btrfs panicked with a division by zero in
calc_stripe_length().

The corrupt chunk had a 'num_stripes' value of 1. calc_stripe_length()
takes this value and divides it by the number of copies the RAID profile
is expected to have to calculate the amount of data stripes. As a DUP
profile is expected to have 2 copies this division resulted in 1/2 = 0.
Later then the 'data_stripes' variable is used as a divisor in the
stripe length calculation which results in a division by 0 and thus a
kernel panic.

When encountering a filesystem with a DUP block group and a
'num_stripes' value unequal to 2, refuse mounting as the image is
corrupted and will lead to unexpected behaviour.

Code inspection showed a RAID1 block group has the same issues.

Fixes: e06cd3dd7cea ("Btrfs: add validadtion checks for chunk loading")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 285f64f2de5f..c13f62182513 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6425,10 +6425,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
 	}
 
 	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
-	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
+	    (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) ||
 	    (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
 	    (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
-	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
+	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) ||
 	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
 	     num_stripes != 1)) {
 		btrfs_err(fs_info,
-- 
cgit v1.2.3


From 898488e2988c83555309743d7ce6ffe1f135c6a8 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 14 Feb 2019 15:17:20 +0000
Subject: Btrfs: fix corruption reading shared and compressed extents after
 hole punching

commit 8e928218780e2f1cf2f5891c7575e8f0b284fcce upstream.

In the past we had data corruption when reading compressed extents that
are shared within the same file and they are consecutive, this got fixed
by commit 005efedf2c7d0 ("Btrfs: fix read corruption of compressed and
shared extents") and by commit 808f80b46790f ("Btrfs: update fix for read
corruption of compressed and shared extents"). However there was a case
that was missing in those fixes, which is when the shared and compressed
extents are referenced with a non-zero offset. The following shell script
creates a reproducer for this issue:

  #!/bin/bash

  mkfs.btrfs -f /dev/sdc &> /dev/null
  mount -o compress /dev/sdc /mnt/sdc

  # Create a file with 3 consecutive compressed extents, each has an
  # uncompressed size of 128Kb and a compressed size of 4Kb.
  for ((i = 1; i <= 3; i++)); do
      head -c 4096 /dev/zero
      for ((j = 1; j <= 31; j++)); do
          head -c 4096 /dev/zero | tr '\0' "\377"
      done
  done > /mnt/sdc/foobar
  sync

  echo "Digest after file creation:   $(md5sum /mnt/sdc/foobar)"

  # Clone the first extent into offsets 128K and 256K.
  xfs_io -c "reflink /mnt/sdc/foobar 0 128K 128K" /mnt/sdc/foobar
  xfs_io -c "reflink /mnt/sdc/foobar 0 256K 128K" /mnt/sdc/foobar
  sync

  echo "Digest after cloning:         $(md5sum /mnt/sdc/foobar)"

  # Punch holes into the regions that are already full of zeroes.
  xfs_io -c "fpunch 0 4K" /mnt/sdc/foobar
  xfs_io -c "fpunch 128K 4K" /mnt/sdc/foobar
  xfs_io -c "fpunch 256K 4K" /mnt/sdc/foobar
  sync

  echo "Digest after hole punching:   $(md5sum /mnt/sdc/foobar)"

  echo "Dropping page cache..."
  sysctl -q vm.drop_caches=1
  echo "Digest after hole punching:   $(md5sum /mnt/sdc/foobar)"

  umount /dev/sdc

When running the script we get the following output:

  Digest after file creation:   5a0888d80d7ab1fd31c229f83a3bbcc8  /mnt/sdc/foobar
  linked 131072/131072 bytes at offset 131072
  128 KiB, 1 ops; 0.0033 sec (36.960 MiB/sec and 295.6830 ops/sec)
  linked 131072/131072 bytes at offset 262144
  128 KiB, 1 ops; 0.0015 sec (78.567 MiB/sec and 628.5355 ops/sec)
  Digest after cloning:         5a0888d80d7ab1fd31c229f83a3bbcc8  /mnt/sdc/foobar
  Digest after hole punching:   5a0888d80d7ab1fd31c229f83a3bbcc8  /mnt/sdc/foobar
  Dropping page cache...
  Digest after hole punching:   fba694ae8664ed0c2e9ff8937e7f1484  /mnt/sdc/foobar

This happens because after reading all the pages of the extent in the
range from 128K to 256K for example, we read the hole at offset 256K
and then when reading the page at offset 260K we don't submit the
existing bio, which is responsible for filling all the page in the
range 128K to 256K only, therefore adding the pages from range 260K
to 384K to the existing bio and submitting it after iterating over the
entire range. Once the bio completes, the uncompressed data fills only
the pages in the range 128K to 256K because there's no more data read
from disk, leaving the pages in the range 260K to 384K unfilled. It is
just a slightly different variant of what was solved by commit
005efedf2c7d0 ("Btrfs: fix read corruption of compressed and shared
extents").

Fix this by forcing a bio submit, during readpages(), whenever we find a
compressed extent map for a page that is different from the extent map
for the previous page or has a different starting offset (in case it's
the same compressed extent), instead of the extent map's original start
offset.

A test case for fstests follows soon.

Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Fixes: 808f80b46790f ("Btrfs: update fix for read corruption of compressed and shared extents")
Fixes: 005efedf2c7d0 ("Btrfs: fix read corruption of compressed and shared extents")
Cc: stable@vger.kernel.org # 4.3+
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent_io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 79f82f2ec4d5..90b0a6eff535 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3002,11 +3002,11 @@ static int __do_readpage(struct extent_io_tree *tree,
 		 */
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
 		    prev_em_start && *prev_em_start != (u64)-1 &&
-		    *prev_em_start != em->orig_start)
+		    *prev_em_start != em->start)
 			force_bio_submit = true;
 
 		if (prev_em_start)
-			*prev_em_start = em->orig_start;
+			*prev_em_start = em->start;
 
 		free_extent_map(em);
 		em = NULL;
-- 
cgit v1.2.3


From cdf9941b770748eb007bbe98df618b0c705b7745 Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Mon, 11 Feb 2019 00:02:05 -0500
Subject: ext4: fix check of inode in swap_inode_boot_loader

commit 67a11611e1a5211f6569044fbf8150875764d1d0 upstream.

Before really do swap between inode and boot inode, something need to
check to avoid invalid or not permitted operation, like does this inode
has inline data. But the condition check should be protected by inode
lock to avoid change while swapping. Also some other condition will not
change between swapping, but there has no problem to do this under inode
lock.

Signed-off-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ioctl.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37dafa1d133..597e8b617f92 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -116,15 +116,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	struct inode *inode_bl;
 	struct ext4_inode_info *ei_bl;
 
-	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
-	    IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
-	    ext4_has_inline_data(inode))
-		return -EINVAL;
-
-	if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
-	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(inode_bl))
 		return PTR_ERR(inode_bl);
@@ -137,6 +128,19 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	 * that only 1 swap_inode_boot_loader is running. */
 	lock_two_nondirectories(inode, inode_bl);
 
+	if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) ||
+	    IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) ||
+	    ext4_has_inline_data(inode)) {
+		err = -EINVAL;
+		goto journal_err_out;
+	}
+
+	if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) ||
+	    !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto journal_err_out;
+	}
+
 	/* Wait for all existing dio workers */
 	inode_dio_wait(inode);
 	inode_dio_wait(inode_bl);
-- 
cgit v1.2.3


From 071f68163cc0e98d543685a188dc2d909b6bb464 Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Mon, 11 Feb 2019 00:05:24 -0500
Subject: ext4: cleanup pagecache before swap i_data

commit a46c68a318b08f819047843abf349aeee5d10ac2 upstream.

While do swap, we should make sure there has no new dirty page since we
should swap i_data between two inode:
1.We should lock i_mmap_sem with write to avoid new pagecache from mmap
read/write;
2.Change filemap_flush to filemap_write_and_wait and move them to the
space protected by inode lock to avoid new pagecache from buffer read/write.

Signed-off-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ioctl.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 597e8b617f92..ea05e8d641e9 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -121,9 +121,6 @@ static long swap_inode_boot_loader(struct super_block *sb,
 		return PTR_ERR(inode_bl);
 	ei_bl = EXT4_I(inode_bl);
 
-	filemap_flush(inode->i_mapping);
-	filemap_flush(inode_bl->i_mapping);
-
 	/* Protect orig inodes against a truncate and make sure,
 	 * that only 1 swap_inode_boot_loader is running. */
 	lock_two_nondirectories(inode, inode_bl);
@@ -141,6 +138,15 @@ static long swap_inode_boot_loader(struct super_block *sb,
 		goto journal_err_out;
 	}
 
+	down_write(&EXT4_I(inode)->i_mmap_sem);
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (err)
+		goto err_out;
+
+	err = filemap_write_and_wait(inode_bl->i_mapping);
+	if (err)
+		goto err_out;
+
 	/* Wait for all existing dio workers */
 	inode_dio_wait(inode);
 	inode_dio_wait(inode_bl);
@@ -151,7 +157,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
 	if (IS_ERR(handle)) {
 		err = -EINVAL;
-		goto journal_err_out;
+		goto err_out;
 	}
 
 	/* Protect extent tree against block allocations via delalloc */
@@ -208,6 +214,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	ext4_journal_stop(handle);
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
+err_out:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 journal_err_out:
 	unlock_two_nondirectories(inode, inode_bl);
 	iput(inode_bl);
-- 
cgit v1.2.3


From 048bfb5bc05f7be392f025cdcd655878e88a4eaf Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Mon, 11 Feb 2019 00:14:02 -0500
Subject: ext4: update quota information while swapping boot loader inode

commit aa507b5faf38784defe49f5e64605ac3c4425e26 upstream.

While do swap between two inode, they swap i_data without update
quota information. Also, swap_inode_boot_loader can do "revert"
somtimes, so update the quota while all operations has been finished.

Signed-off-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ioctl.c | 56 +++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ea05e8d641e9..eff68358fae7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -68,8 +68,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
 	ei2 = EXT4_I(inode2);
 
 	swap(inode1->i_version, inode2->i_version);
-	swap(inode1->i_blocks, inode2->i_blocks);
-	swap(inode1->i_bytes, inode2->i_bytes);
 	swap(inode1->i_atime, inode2->i_atime);
 	swap(inode1->i_mtime, inode2->i_mtime);
 
@@ -115,6 +113,9 @@ static long swap_inode_boot_loader(struct super_block *sb,
 	int err;
 	struct inode *inode_bl;
 	struct ext4_inode_info *ei_bl;
+	qsize_t size, size_bl, diff;
+	blkcnt_t blocks;
+	unsigned short bytes;
 
 	inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
 	if (IS_ERR(inode_bl))
@@ -180,6 +181,13 @@ static long swap_inode_boot_loader(struct super_block *sb,
 			memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
 	}
 
+	err = dquot_initialize(inode);
+	if (err)
+		goto err_out1;
+
+	size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes;
+	size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes;
+	diff = size - size_bl;
 	swap_inode_data(inode, inode_bl);
 
 	inode->i_ctime = inode_bl->i_ctime = current_time(inode);
@@ -193,24 +201,46 @@ static long swap_inode_boot_loader(struct super_block *sb,
 
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err < 0) {
+		/* No need to update quota information. */
 		ext4_warning(inode->i_sb,
 			"couldn't mark inode #%lu dirty (err %d)",
 			inode->i_ino, err);
 		/* Revert all changes: */
 		swap_inode_data(inode, inode_bl);
 		ext4_mark_inode_dirty(handle, inode);
-	} else {
-		err = ext4_mark_inode_dirty(handle, inode_bl);
-		if (err < 0) {
-			ext4_warning(inode_bl->i_sb,
-				"couldn't mark inode #%lu dirty (err %d)",
-				inode_bl->i_ino, err);
-			/* Revert all changes: */
-			swap_inode_data(inode, inode_bl);
-			ext4_mark_inode_dirty(handle, inode);
-			ext4_mark_inode_dirty(handle, inode_bl);
-		}
+		goto err_out1;
+	}
+
+	blocks = inode_bl->i_blocks;
+	bytes = inode_bl->i_bytes;
+	inode_bl->i_blocks = inode->i_blocks;
+	inode_bl->i_bytes = inode->i_bytes;
+	err = ext4_mark_inode_dirty(handle, inode_bl);
+	if (err < 0) {
+		/* No need to update quota information. */
+		ext4_warning(inode_bl->i_sb,
+			"couldn't mark inode #%lu dirty (err %d)",
+			inode_bl->i_ino, err);
+		goto revert;
 	}
+
+	/* Bootloader inode should not be counted into quota information. */
+	if (diff > 0)
+		dquot_free_space(inode, diff);
+	else
+		err = dquot_alloc_space(inode, -1 * diff);
+
+	if (err < 0) {
+revert:
+		/* Revert all changes: */
+		inode_bl->i_blocks = blocks;
+		inode_bl->i_bytes = bytes;
+		swap_inode_data(inode, inode_bl);
+		ext4_mark_inode_dirty(handle, inode);
+		ext4_mark_inode_dirty(handle, inode_bl);
+	}
+
+err_out1:
 	ext4_journal_stop(handle);
 	ext4_double_up_write_data_sem(inode, inode_bl);
 
-- 
cgit v1.2.3


From a0d876c77705c1dd2df2d89e040bf570755cd676 Mon Sep 17 00:00:00 2001
From: yangerkun <yangerkun@huawei.com>
Date: Mon, 11 Feb 2019 00:35:06 -0500
Subject: ext4: add mask of ext4 flags to swap

commit abdc644e8cbac2e9b19763680e5a7cf9bab2bee7 upstream.

The reason is that while swapping two inode, we swap the flags too.
Some flags such as EXT4_JOURNAL_DATA_FL can really confuse the things
since we're not resetting the address operations structure.  The
simplest way to keep things sane is to restrict the flags that can be
swapped.

Signed-off-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4.h  | 3 +++
 fs/ext4/ioctl.c | 6 +++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 032cf9b92665..2ddf7833350d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -435,6 +435,9 @@ struct flex_groups {
 /* Flags that are appropriate for non-directories/regular files. */
 #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
 
+/* The only flags that should be swapped */
+#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eff68358fae7..2e76fb55d94a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -63,6 +63,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
 	loff_t isize;
 	struct ext4_inode_info *ei1;
 	struct ext4_inode_info *ei2;
+	unsigned long tmp;
 
 	ei1 = EXT4_I(inode1);
 	ei2 = EXT4_I(inode2);
@@ -72,7 +73,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
 	swap(inode1->i_mtime, inode2->i_mtime);
 
 	memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
-	swap(ei1->i_flags, ei2->i_flags);
+	tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP;
+	ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) |
+		(ei1->i_flags & ~EXT4_FL_SHOULD_SWAP);
+	ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP);
 	swap(ei1->i_disksize, ei2->i_disksize);
 	ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
 	ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
-- 
cgit v1.2.3


From 8a4fdc649ca9ac3b7b4fd363ec713d6d8ef38046 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 11 Feb 2019 13:30:32 -0500
Subject: ext4: fix crash during online resizing

commit f96c3ac8dfc24b4e38fc4c2eba5fea2107b929d1 upstream.

When computing maximum size of filesystem possible with given number of
group descriptor blocks, we forget to include s_first_data_block into
the number of blocks. Thus for filesystems with non-zero
s_first_data_block it can happen that computed maximum filesystem size
is actually lower than current filesystem size which confuses the code
and eventually leads to a BUG_ON in ext4_alloc_group_tables() hitting on
flex_gd->count == 0. The problem can be reproduced like:

truncate -s 100g /tmp/image
mkfs.ext4 -b 1024 -E resize=262144 /tmp/image 32768
mount -t ext4 -o loop /tmp/image /mnt
resize2fs /dev/loop0 262145
resize2fs /dev/loop0 300000

Fix the problem by properly including s_first_data_block into the
computed number of filesystem blocks.

Fixes: 1c6bd7173d66 "ext4: convert file system to meta_bg if needed..."
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/resize.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 48421de803b7..3d9b18505c0c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1960,7 +1960,8 @@ retry:
 				le16_to_cpu(es->s_reserved_gdt_blocks);
 			n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb);
 			n_blocks_count = (ext4_fsblk_t)n_group *
-				EXT4_BLOCKS_PER_GROUP(sb);
+				EXT4_BLOCKS_PER_GROUP(sb) +
+				le32_to_cpu(es->s_first_data_block);
 			n_group--; /* set to last group number */
 		}
 
-- 
cgit v1.2.3


From 62600af3a7cb11d20439f745451db96bde7ff08c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 29 Jan 2019 17:17:24 +0100
Subject: ext2: Fix underflow in ext2_max_size()

commit 1c2d14212b15a60300a2d4f6364753e87394c521 upstream.

When ext2 filesystem is created with 64k block size, ext2_max_size()
will return value less than 0. Also, we cannot write any file in this fs
since the sb->maxbytes is less than 0. The core of the problem is that
the size of block index tree for such large block size is more than
i_blocks can carry. So fix the computation to count with this
possibility.

File size limits computed with the new function for the full range of
possible block sizes look like:

bits file_size
10     17247252480
11    275415851008
12   2196873666560
13   2197948973056
14   2198486220800
15   2198754754560
16   2198888906752

CC: stable@vger.kernel.org
Reported-by: yangerkun <yangerkun@huawei.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext2/super.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0c38e31ec938..364e647d87c0 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -761,7 +761,8 @@ static loff_t ext2_max_size(int bits)
 {
 	loff_t res = EXT2_NDIR_BLOCKS;
 	int meta_blocks;
-	loff_t upper_limit;
+	unsigned int upper_limit;
+	unsigned int ppb = 1 << (bits-2);
 
 	/* This is calculated to be the largest file size for a
 	 * dense, file such that the total number of
@@ -775,24 +776,34 @@ static loff_t ext2_max_size(int bits)
 	/* total blocks in file system block size */
 	upper_limit >>= (bits - 9);
 
+	/* Compute how many blocks we can address by block tree */
+	res += 1LL << (bits-2);
+	res += 1LL << (2*(bits-2));
+	res += 1LL << (3*(bits-2));
+	/* Does block tree limit file size? */
+	if (res < upper_limit)
+		goto check_lfs;
 
+	res = upper_limit;
+	/* How many metadata blocks are needed for addressing upper_limit? */
+	upper_limit -= EXT2_NDIR_BLOCKS;
 	/* indirect blocks */
 	meta_blocks = 1;
+	upper_limit -= ppb;
 	/* double indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2));
-	/* tripple indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
-	upper_limit -= meta_blocks;
-	upper_limit <<= bits;
-
-	res += 1LL << (bits-2);
-	res += 1LL << (2*(bits-2));
-	res += 1LL << (3*(bits-2));
+	if (upper_limit < ppb * ppb) {
+		meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb);
+		res -= meta_blocks;
+		goto check_lfs;
+	}
+	meta_blocks += 1 + ppb;
+	upper_limit -= ppb * ppb;
+	/* tripple indirect blocks for the rest */
+	meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) +
+		DIV_ROUND_UP(upper_limit, ppb*ppb);
+	res -= meta_blocks;
+check_lfs:
 	res <<= bits;
-	if (res > upper_limit)
-		res = upper_limit;
-
 	if (res > MAX_LFS_FILESIZE)
 		res = MAX_LFS_FILESIZE;
 
-- 
cgit v1.2.3


From dbe4bc993836f64e8c26b109addc1ac6eb997bab Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Sun, 10 Feb 2019 23:23:04 -0500
Subject: jbd2: clear dirty flag when revoking a buffer from an older
 transaction

commit 904cdbd41d749a476863a0ca41f6f396774f26e4 upstream.

Now, we capture a data corruption problem on ext4 while we're truncating
an extent index block. Imaging that if we are revoking a buffer which
has been journaled by the committing transaction, the buffer's jbddirty
flag will not be cleared in jbd2_journal_forget(), so the commit code
will set the buffer dirty flag again after refile the buffer.

fsx                               kjournald2
                                  jbd2_journal_commit_transaction
jbd2_journal_revoke                commit phase 1~5...
 jbd2_journal_forget
   belongs to older transaction    commit phase 6
   jbddirty not clear               __jbd2_journal_refile_buffer
                                     __jbd2_journal_unfile_buffer
                                      test_clear_buffer_jbddirty
                                       mark_buffer_dirty

Finally, if the freed extent index block was allocated again as data
block by some other files, it may corrupt the file data after writing
cached pages later, such as during unmount time. (In general,
clean_bdev_aliases() related helpers should be invoked after
re-allocation to prevent the above corruption, but unfortunately we
missed it when zeroout the head of extra extent blocks in
ext4_ext_handle_unwritten_extents()).

This patch mark buffer as freed and set j_next_transaction to the new
transaction when it already belongs to the committing transaction in
jbd2_journal_forget(), so that commit code knows it should clear dirty
bits when it is done with the buffer.

This problem can be reproduced by xfstests generic/455 easily with
seeds (3246 3247 3248 3249).

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/jbd2/transaction.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index c0b66a7a795b..acb052c62c24 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1576,14 +1576,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
 		/* However, if the buffer is still owned by a prior
 		 * (committing) transaction, we can't drop it yet... */
 		JBUFFER_TRACE(jh, "belongs to older transaction");
-		/* ... but we CAN drop it from the new transaction if we
-		 * have also modified it since the original commit. */
+		/* ... but we CAN drop it from the new transaction through
+		 * marking the buffer as freed and set j_next_transaction to
+		 * the new transaction, so that not only the commit code
+		 * knows it should clear dirty bits when it is done with the
+		 * buffer, but also the buffer can be checkpointed only
+		 * after the new transaction commits. */
 
-		if (jh->b_next_transaction) {
-			J_ASSERT(jh->b_next_transaction == transaction);
+		set_buffer_freed(bh);
+
+		if (!jh->b_next_transaction) {
 			spin_lock(&journal->j_list_lock);
-			jh->b_next_transaction = NULL;
+			jh->b_next_transaction = transaction;
 			spin_unlock(&journal->j_list_lock);
+		} else {
+			J_ASSERT(jh->b_next_transaction == transaction);
 
 			/*
 			 * only drop a reference if this transaction modified
-- 
cgit v1.2.3


From 584f390d1039e06512f716f5c30567fdc3763a17 Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Thu, 21 Feb 2019 11:24:09 -0500
Subject: jbd2: fix compile warning when using JBUFFER_TRACE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 01215d3edb0f384ddeaa5e4a22c1ae5ff634149f upstream.

The jh pointer may be used uninitialized in the two cases below and the
compiler complain about it when enabling JBUFFER_TRACE macro, fix them.

In file included from fs/jbd2/transaction.c:19:0:
fs/jbd2/transaction.c: In function ‘jbd2_journal_get_undo_access’:
./include/linux/jbd2.h:1637:38: warning: ‘jh’ is used uninitialized in this function [-Wuninitialized]
 #define JBUFFER_TRACE(jh, info) do { printk("%s: %d\n", __func__, jh->b_jcount);} while (0)
                                      ^
fs/jbd2/transaction.c:1219:23: note: ‘jh’ was declared here
  struct journal_head *jh;
                       ^
In file included from fs/jbd2/transaction.c:19:0:
fs/jbd2/transaction.c: In function ‘jbd2_journal_dirty_metadata’:
./include/linux/jbd2.h:1637:38: warning: ‘jh’ may be used uninitialized in this function [-Wmaybe-uninitialized]
 #define JBUFFER_TRACE(jh, info) do { printk("%s: %d\n", __func__, jh->b_jcount);} while (0)
                                      ^
fs/jbd2/transaction.c:1332:23: note: ‘jh’ was declared here
  struct journal_head *jh;
                       ^

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/jbd2/transaction.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index acb052c62c24..914e725c82c4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1219,11 +1219,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 	struct journal_head *jh;
 	char *committed_data = NULL;
 
-	JBUFFER_TRACE(jh, "entry");
 	if (jbd2_write_access_granted(handle, bh, true))
 		return 0;
 
 	jh = jbd2_journal_add_journal_head(bh);
+	JBUFFER_TRACE(jh, "entry");
+
 	/*
 	 * Do this first --- it can drop the journal lock, so we want to
 	 * make sure that obtaining the committed_data is done
@@ -1334,15 +1335,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 
 	if (is_handle_aborted(handle))
 		return -EROFS;
-	if (!buffer_jbd(bh)) {
-		ret = -EUCLEAN;
-		goto out;
-	}
+	if (!buffer_jbd(bh))
+		return -EUCLEAN;
+
 	/*
 	 * We don't grab jh reference here since the buffer must be part
 	 * of the running transaction.
 	 */
 	jh = bh2jh(bh);
+	jbd_debug(5, "journal_head %p\n", jh);
+	JBUFFER_TRACE(jh, "entry");
+
 	/*
 	 * This and the following assertions are unreliable since we may see jh
 	 * in inconsistent state unless we grab bh_state lock. But this is
@@ -1376,9 +1379,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 	}
 
 	journal = transaction->t_journal;
-	jbd_debug(5, "journal_head %p\n", jh);
-	JBUFFER_TRACE(jh, "entry");
-
 	jbd_lock_bh_state(bh);
 
 	if (jh->b_modified == 0) {
-- 
cgit v1.2.3


From be74fddc976e40a2a535168182783d82154d66e8 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Wed, 13 Feb 2019 09:21:38 -0500
Subject: NFS: Fix I/O request leakages

commit f57dcf4c72113c745d83f1c65f7291299f65c14f upstream.

When we fail to add the request to the I/O queue, we currently leave it
to the caller to free the failed request. However since some of the
requests that fail are actually created by nfs_pageio_add_request()
itself, and are not passed back the caller, this leads to a leakage
issue, which can again cause page locks to leak.

This commit addresses the leakage by freeing the created requests on
error, using desc->pg_completion_ops->error_cleanup()

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Fixes: a7d42ddb30997 ("nfs: add mirroring support to pgio layer")
Cc: stable@vger.kernel.org # v4.0: c18b96a1b862: nfs: clean up rest of reqs
Cc: stable@vger.kernel.org # v4.0: d600ad1f2bdb: NFS41: pop some layoutget
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/pagelist.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 3dbd15b47c27..5dfefae1b47d 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -989,6 +989,17 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
 	}
 }
 
+static void
+nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
+		struct nfs_page *req)
+{
+	LIST_HEAD(head);
+
+	nfs_list_remove_request(req);
+	nfs_list_add_request(req, &head);
+	desc->pg_completion_ops->error_cleanup(&head);
+}
+
 /**
  * nfs_pageio_add_request - Attempt to coalesce a request into a page list.
  * @desc: destination io descriptor
@@ -1026,10 +1037,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 			nfs_page_group_unlock(req);
 			desc->pg_moreio = 1;
 			nfs_pageio_doio(desc);
-			if (desc->pg_error < 0)
-				return 0;
-			if (mirror->pg_recoalesce)
-				return 0;
+			if (desc->pg_error < 0 || mirror->pg_recoalesce)
+				goto out_cleanup_subreq;
 			/* retry add_request for this subreq */
 			nfs_page_group_lock(req);
 			continue;
@@ -1062,6 +1071,10 @@ err_ptr:
 	desc->pg_error = PTR_ERR(subreq);
 	nfs_page_group_unlock(req);
 	return 0;
+out_cleanup_subreq:
+	if (req != subreq)
+		nfs_pageio_cleanup_request(desc, subreq);
+	return 0;
 }
 
 static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1169,11 +1182,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 		if (nfs_pgio_has_mirroring(desc))
 			desc->pg_mirror_idx = midx;
 		if (!nfs_pageio_add_request_mirror(desc, dupreq))
-			goto out_failed;
+			goto out_cleanup_subreq;
 	}
 
 	return 1;
 
+out_cleanup_subreq:
+	if (req != dupreq)
+		nfs_pageio_cleanup_request(desc, dupreq);
 out_failed:
 	/* remember fatal errors */
 	if (nfs_error_is_fatal(desc->pg_error))
-- 
cgit v1.2.3


From 63b0ee126f7ea398e3e68ae7e57598e080865a18 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 15 Feb 2019 14:59:52 -0500
Subject: NFS: Fix an I/O request leakage in nfs_do_recoalesce

commit 4d91969ed4dbcefd0e78f77494f0cb8fada9048a upstream.

Whether we need to exit early, or just reprocess the list, we
must not lost track of the request which failed to get recoalesced.

Fixes: 03d5eb65b538 ("NFS: Fix a memory leak in nfs_do_recoalesce")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/pagelist.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 5dfefae1b47d..7c9b75c546e6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1093,7 +1093,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
 			struct nfs_page *req;
 
 			req = list_first_entry(&head, struct nfs_page, wb_list);
-			nfs_list_remove_request(req);
 			if (__nfs_pageio_add_request(desc, req))
 				continue;
 			if (desc->pg_error < 0) {
-- 
cgit v1.2.3


From 2c648caf630db6f2e3429459b6a7fa4760275c58 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 15 Feb 2019 16:08:25 -0500
Subject: NFS: Don't recoalesce on error in nfs_pageio_complete_mirror()

commit 8127d82705998568b52ac724e28e00941538083d upstream.

If the I/O completion failed with a fatal error, then we should just
exit nfs_pageio_complete_mirror() rather than try to recoalesce.

Fixes: a7d42ddb3099 ("nfs: add mirroring support to pgio layer")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/pagelist.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7c9b75c546e6..0ec6bce3dd69 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1214,7 +1214,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
 		desc->pg_mirror_idx = mirror_idx;
 	for (;;) {
 		nfs_pageio_doio(desc);
-		if (!mirror->pg_recoalesce)
+		if (desc->pg_error < 0 || !mirror->pg_recoalesce)
 			break;
 		if (!nfs_do_recoalesce(desc))
 			break;
-- 
cgit v1.2.3


From 10a68cdf1035689fe9bfb5e413110cefad510508 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Feb 2019 10:47:00 -0500
Subject: nfsd: fix performance-limiting session calculation

commit c54f24e338ed2a35218f117a4a1afb5f9e2b4e64 upstream.

We're unintentionally limiting the number of slots per nfsv4.1 session
to 10.  Often more than 10 simultaneous RPCs are needed for the best
performance.

This calculation was meant to prevent any one client from using up more
than a third of the limit we set for total memory use across all clients
and sessions.  Instead, it's limiting the client to a third of the
maximum for a single session.

Fix this.

Reported-by: Chris Tracy <ctracy@engr.scu.edu>
Cc: stable@vger.kernel.org
Fixes: de766e570413 "nfsd: give out fewer session slots as limit approaches"
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs4state.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9c6d1d57b598..bec75600e692 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1514,16 +1514,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca)
 {
 	u32 slotsize = slot_bytes(ca);
 	u32 num = ca->maxreqs;
-	int avail;
+	unsigned long avail, total_avail;
 
 	spin_lock(&nfsd_drc_lock);
-	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
-		    nfsd_drc_max_mem - nfsd_drc_mem_used);
+	total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used;
+	avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail);
 	/*
 	 * Never use more than a third of the remaining memory,
 	 * unless it's the only way to give this client a slot:
 	 */
-	avail = clamp_t(int, avail, slotsize, avail/3);
+	avail = clamp_t(int, avail, slotsize, total_avail/3);
 	num = min_t(int, num, avail / slotsize);
 	nfsd_drc_mem_used += num * slotsize;
 	spin_unlock(&nfsd_drc_lock);
-- 
cgit v1.2.3


From 8056912c1c75fc27127dd7d092c197e01c63b6ad Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 4 Mar 2019 14:08:22 +1100
Subject: nfsd: fix memory corruption caused by readdir

commit b602345da6cbb135ba68cf042df8ec9a73da7981 upstream.

If the result of an NFSv3 readdir{,plus} request results in the
"offset" on one entry having to be split across 2 pages, and is sized
so that the next directory entry doesn't fit in the requested size,
then memory corruption can happen.

When encode_entry() is called after encoding the last entry that fits,
it notices that ->offset and ->offset1 are set, and so stores the
offset value in the two pages as required.  It clears ->offset1 but
*does not* clear ->offset.

Normally this omission doesn't matter as encode_entry_baggage() will
be called, and will set ->offset to a suitable value (not on a page
boundary).
But in the case where cd->buflen < elen and nfserr_toosmall is
returned, ->offset is not reset.

This means that nfsd3proc_readdirplus will see ->offset with a value 4
bytes before the end of a page, and ->offset1 set to NULL.
It will try to write 8bytes to ->offset.
If we are lucky, the next page will be read-only, and the system will
  BUG: unable to handle kernel paging request at...

If we are unlucky, some innocent page will have the first 4 bytes
corrupted.

nfsd3proc_readdir() doesn't even check for ->offset1, it just blindly
writes 8 bytes to the offset wherever it is.

Fix this by clearing ->offset after it is used, and copying the
->offset handling code from nfsd3_proc_readdirplus into
nfsd3_proc_readdir.

(Note that the commit hash in the Fixes tag is from the 'history'
 tree - this bug predates git).

Fixes: 0b1d57cf7654 ("[PATCH] kNFSd: Fix nfs3 dentry encoding")
Fixes-URL: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=0b1d57cf7654
Cc: stable@vger.kernel.org (v2.6.12+)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs3proc.c | 16 ++++++++++++++--
 fs/nfsd/nfs3xdr.c  |  1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9eb8086ea841..c9cf46e0c040 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
 					&resp->common, nfs3svc_encode_entry);
 	memcpy(resp->verf, argp->verf, 8);
 	resp->count = resp->buffer - argp->buffer;
-	if (resp->offset)
-		xdr_encode_hyper(resp->offset, argp->cookie);
+	if (resp->offset) {
+		loff_t offset = argp->cookie;
+
+		if (unlikely(resp->offset1)) {
+			/* we ended up with offset on a page boundary */
+			*resp->offset = htonl(offset >> 32);
+			*resp->offset1 = htonl(offset & 0xffffffff);
+			resp->offset1 = NULL;
+		} else {
+			xdr_encode_hyper(resp->offset, offset);
+		}
+		resp->offset = NULL;
+	}
 
 	RETURN_STATUS(nfserr);
 }
@@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
 		} else {
 			xdr_encode_hyper(resp->offset, offset);
 		}
+		resp->offset = NULL;
 	}
 
 	RETURN_STATUS(nfserr);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 9b973f4f7d01..83919116d5cb 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -921,6 +921,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
 		} else {
 			xdr_encode_hyper(cd->offset, offset64);
 		}
+		cd->offset = NULL;
 	}
 
 	/*
-- 
cgit v1.2.3


From ecab6ab1c22726d2c439b587d103700f72210b4b Mon Sep 17 00:00:00 2001
From: Yihao Wu <wuyihao@linux.alibaba.com>
Date: Wed, 6 Mar 2019 21:03:50 +0800
Subject: nfsd: fix wrong check in write_v4_end_grace()

commit dd838821f0a29781b185cd8fb8e48d5c177bd838 upstream.

Commit 62a063b8e7d1 "nfsd4: fix crash on writing v4_end_grace before
nfsd startup" is trying to fix a NULL dereference issue, but it
mistakenly checks if the nfsd server is started. So fix it.

Fixes: 62a063b8e7d1 "nfsd4: fix crash on writing v4_end_grace before nfsd startup"
Cc: stable@vger.kernel.org
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfsctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 39b835d7c445..cb69660d0779 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
 		case 'Y':
 		case 'y':
 		case '1':
-			if (nn->nfsd_serv)
+			if (!nn->nfsd_serv)
 				return -EBUSY;
 			nfsd4_end_grace(nn);
 			break;
-- 
cgit v1.2.3


From 4af185feb9df8fae93717df2fe31e51d89139172 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Fri, 1 Mar 2019 12:13:34 -0500
Subject: NFSv4.1: Reinitialise sequence results before retransmitting a
 request

commit c1dffe0bf7f9c3d57d9f237a7cb2a81e62babd2b upstream.

If we have to retransmit a request, we should ensure that we reinitialise
the sequence results structure, since in the event of a signal
we need to treat the request as if it had not been sent.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8220a168282e..e7abcf7629b3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -947,6 +947,13 @@ nfs4_sequence_process_interrupted(struct nfs_client *client,
 
 #endif	/* !CONFIG_NFS_V4_1 */
 
+static void nfs41_sequence_res_init(struct nfs4_sequence_res *res)
+{
+	res->sr_timestamp = jiffies;
+	res->sr_status_flags = 0;
+	res->sr_status = 1;
+}
+
 static
 void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
 		struct nfs4_sequence_res *res,
@@ -958,10 +965,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args,
 	args->sa_slot = slot;
 
 	res->sr_slot = slot;
-	res->sr_timestamp = jiffies;
-	res->sr_status_flags = 0;
-	res->sr_status = 1;
-
 }
 
 int nfs4_setup_sequence(struct nfs_client *client,
@@ -1007,6 +1010,7 @@ int nfs4_setup_sequence(struct nfs_client *client,
 
 	trace_nfs4_setup_sequence(session, args);
 out_start:
+	nfs41_sequence_res_init(res);
 	rpc_call_start(task);
 	return 0;
 
-- 
cgit v1.2.3


From c72e90d94a148be5b04dd47d84303006d645d36e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 11 Mar 2019 15:04:18 +0100
Subject: udf: Fix crash on IO error during truncate

commit d3ca4651d05c0ff7259d087d8c949bcf3e14fb46 upstream.

When truncate(2) hits IO error when reading indirect extent block the
code just bugs with:

kernel BUG at linux-4.15.0/fs/udf/truncate.c:249!
...

Fix the problem by bailing out cleanly in case of IO error.

CC: stable@vger.kernel.org
Reported-by: jean-luc malet <jeanluc.malet@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/udf/truncate.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index b647f0bd150c..94220ba85628 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode)
 			epos.block = eloc;
 			epos.bh = udf_tread(sb,
 					udf_get_lb_pblock(sb, &eloc, 0));
+			/* Error reading indirect block? */
+			if (!epos.bh)
+				return;
 			if (elen)
 				indirect_ext_len =
 					(elen + sb->s_blocksize - 1) >>
-- 
cgit v1.2.3


From 14c52acaac86ec7fb99c3a1287227664cfc5eb54 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Thu, 21 Mar 2019 14:59:02 +1000
Subject: cifs: allow guest mounts to work for smb3.11

commit e71ab2aa06f731a944993120b0eef1556c63b81c upstream.

Fix Guest/Anonymous sessions so that they work with SMB 3.11.

The commit noted below tightened the conditions and forced signing for
the SMB2-TreeConnect commands as per MS-SMB2.
However, this should only apply to normal user sessions and not for
Guest/Anonumous sessions.

Fixes: 6188f28bf608 ("Tree connect for SMB3.1.1 must be signed for non-encrypted shares")

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index a2d701775c49..3d8db0504f5c 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1500,9 +1500,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	iov[1].iov_base = unc_path;
 	iov[1].iov_len = unc_path_len;
 
-	/* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */
+	/*
+	 * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1
+	 * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1
+	 */
 	if ((ses->server->dialect == SMB311_PROT_ID) &&
-	    !smb3_encryption_required(tcon))
+	    !smb3_encryption_required(tcon) &&
+	    !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)))
 		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	memset(&rqst, 0, sizeof(struct smb_rqst));
-- 
cgit v1.2.3


From 38bd575b9aef70c54a2fc9559b01f077b335aaa4 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Fri, 22 Mar 2019 22:31:17 -0500
Subject: SMB3: Fix SMB3.1.1 guest mounts to Samba

commit 8c11a607d1d9cd6e7f01fd6b03923597fb0ef95a upstream.

Workaround problem with Samba responses to SMB3.1.1
null user (guest) mounts.  The server doesn't set the
expected flag in the session setup response so we have
to do a similar check to what is done in smb3_validate_negotiate
where we also check if the user is a null user (but not sec=krb5
since username might not be passed in on mount for Kerberos case).

Note that the commit below tightened the conditions and forced signing
for the SMB2-TreeConnect commands as per MS-SMB2.
However, this should only apply to normal user sessions and not for
cases where there is no user (even if server forgets to set the flag
in the response) since we don't have anything useful to sign with.
This is especially important now that the more secure SMB3.1.1 protocol
is in the default dialect list.

An earlier patch ("cifs: allow guest mounts to work for smb3.11") fixed
the guest mounts to Windows.

    Fixes: 6188f28bf608 ("Tree connect for SMB3.1.1 must be signed for non-encrypted shares")

Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Reviewed-by: Paulo Alcantara <palcantara@suse.de>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 3d8db0504f5c..3d0db37d64ad 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1503,10 +1503,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	/*
 	 * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1
 	 * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1
+	 * (Samba servers don't always set the flag so also check if null user)
 	 */
 	if ((ses->server->dialect == SMB311_PROT_ID) &&
 	    !smb3_encryption_required(tcon) &&
-	    !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)))
+	    !(ses->session_flags &
+		    (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) &&
+	    ((ses->user_name != NULL) || (ses->sectype == Kerberos)))
 		req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
 
 	memset(&rqst, 0, sizeof(struct smb_rqst));
-- 
cgit v1.2.3


From 558331d0205bea630477c2046af32dd1a5540f07 Mon Sep 17 00:00:00 2001
From: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Date: Thu, 14 Mar 2019 23:19:22 -0400
Subject: ext4: fix NULL pointer dereference while journal is aborted

commit fa30dde38aa8628c73a6dded7cb0bba38c27b576 upstream.

We see the following NULL pointer dereference while running xfstests
generic/475:
BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
PGD 8000000c84bad067 P4D 8000000c84bad067 PUD c84e62067 PMD 0
Oops: 0000 [#1] SMP PTI
CPU: 7 PID: 9886 Comm: fsstress Kdump: loaded Not tainted 5.0.0-rc8 #10
RIP: 0010:ext4_do_update_inode+0x4ec/0x760
...
Call Trace:
? jbd2_journal_get_write_access+0x42/0x50
? __ext4_journal_get_write_access+0x2c/0x70
? ext4_truncate+0x186/0x3f0
ext4_mark_iloc_dirty+0x61/0x80
ext4_mark_inode_dirty+0x62/0x1b0
ext4_truncate+0x186/0x3f0
? unmap_mapping_pages+0x56/0x100
ext4_setattr+0x817/0x8b0
notify_change+0x1df/0x430
do_truncate+0x5e/0x90
? generic_permission+0x12b/0x1a0

This is triggered because the NULL pointer handle->h_transaction was
dereferenced in function ext4_update_inode_fsync_trans().
I found that the h_transaction was set to NULL in jbd2__journal_restart
but failed to attached to a new transaction while the journal is aborted.

Fix this by checking the handle before updating the inode.

Fixes: b436b9bef84d ("ext4: Wait for proper transaction commit on fsync")
Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4_jbd2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 15b6dd733780..df908ef79cce 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle,
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (ext4_handle_valid(handle)) {
+	if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) {
 		ei->i_sync_tid = handle->h_transaction->t_tid;
 		if (datasync)
 			ei->i_datasync_tid = handle->h_transaction->t_tid;
-- 
cgit v1.2.3


From 76c9ee6bd5d23caf5e57cd91282b8ff374e60437 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 14 Mar 2019 23:20:25 -0400
Subject: ext4: fix data corruption caused by unaligned direct AIO

commit 372a03e01853f860560eade508794dd274e9b390 upstream.

Ext4 needs to serialize unaligned direct AIO because the zeroing of
partial blocks of two competing unaligned AIOs can result in data
corruption.

However it decides not to serialize if the potentially unaligned aio is
past i_size with the rationale that no pending writes are possible past
i_size. Unfortunately if the i_size is not block aligned and the second
unaligned write lands past i_size, but still into the same block, it has
the potential of corrupting the previous unaligned write to the same
block.

This is (very simplified) reproducer from Frank

    // 41472 = (10 * 4096) + 512
    // 37376 = 41472 - 4096

    ftruncate(fd, 41472);
    io_prep_pwrite(iocbs[0], fd, buf[0], 4096, 37376);
    io_prep_pwrite(iocbs[1], fd, buf[1], 4096, 41472);

    io_submit(io_ctx, 1, &iocbs[1]);
    io_submit(io_ctx, 1, &iocbs[2]);

    io_getevents(io_ctx, 2, 2, events, NULL);

Without this patch the 512B range from 40960 up to the start of the
second unaligned write (41472) is going to be zeroed overwriting the data
written by the first write. This is a data corruption.

00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
*
00009200  30 30 30 30 30 30 30 30  30 30 30 30 30 30 30 30
*
0000a000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
*
0000a200  31 31 31 31 31 31 31 31  31 31 31 31 31 31 31 31

With this patch the data corruption is avoided because we will recognize
the unaligned_aio and wait for the unwritten extent conversion.

00000000  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
*
00009200  30 30 30 30 30 30 30 30  30 30 30 30 30 30 30 30
*
0000a200  31 31 31 31 31 31 31 31  31 31 31 31 31 31 31 31
*
0000b200

Reported-by: Frank Sorenson <fsorenso@redhat.com>
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Fixes: e9e3bcecf44c ("ext4: serialize unaligned asynchronous DIO")
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 69d65d49837b..98ec11f69cd4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
 	struct super_block *sb = inode->i_sb;
 	int blockmask = sb->s_blocksize - 1;
 
-	if (pos >= i_size_read(inode))
+	if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize))
 		return 0;
 
 	if ((pos | iov_iter_alignment(from)) & blockmask)
-- 
cgit v1.2.3


From d12d86411c03d9127f0bc0766e25618b9e5c7568 Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Sat, 23 Mar 2019 11:43:05 -0400
Subject: ext4: brelse all indirect buffer in ext4_ind_remove_space()

commit 674a2b27234d1b7afcb0a9162e81b2e53aeef217 upstream.

All indirect buffers get by ext4_find_shared() should be released no
mater the branch should be freed or not. But now, we forget to release
the lower depth indirect buffers when removing space from the same
higher depth indirect block. It will lead to buffer leak and futher
more, it may lead to quota information corruption when using old quota,
consider the following case.

 - Create and mount an empty ext4 filesystem without extent and quota
   features,
 - quotacheck and enable the user & group quota,
 - Create some files and write some data to them, and then punch hole
   to some files of them, it may trigger the buffer leak problem
   mentioned above.
 - Disable quota and run quotacheck again, it will create two new
   aquota files and write the checked quota information to them, which
   probably may reuse the freed indirect block(the buffer and page
   cache was not freed) as data block.
 - Enable quota again, it will invoke
   vfs_load_quota_inode()->invalidate_bdev() to try to clean unused
   buffers and pagecache. Unfortunately, because of the buffer of quota
   data block is still referenced, quota code cannot read the up to date
   quota info from the device and lead to quota information corruption.

This problem can be reproduced by xfstests generic/231 on ext3 file
system or ext4 file system without extent and quota features.

This patch fix this problem by releasing the missing indirect buffers,
in ext4_ind_remove_space().

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/indirect.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index bf7fa1507e81..9e96a0bd08d9 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1387,10 +1387,14 @@ end_range:
 					   partial->p + 1,
 					   partial2->p,
 					   (chain+n-1) - partial);
-			BUFFER_TRACE(partial->bh, "call brelse");
-			brelse(partial->bh);
-			BUFFER_TRACE(partial2->bh, "call brelse");
-			brelse(partial2->bh);
+			while (partial > chain) {
+				BUFFER_TRACE(partial->bh, "call brelse");
+				brelse(partial->bh);
+			}
+			while (partial2 > chain2) {
+				BUFFER_TRACE(partial2->bh, "call brelse");
+				brelse(partial2->bh);
+			}
 			return 0;
 		}
 
-- 
cgit v1.2.3


From 1fd916e879a9b2fabca931c012a59409e783d241 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Mon, 25 Feb 2019 17:11:03 +0800
Subject: f2fs: fix to avoid deadlock of atomic file operations

commit 48432984d718c95cf13e26d487c2d1b697c3c01f upstream.

Thread A				Thread B
- __fput
 - f2fs_release_file
  - drop_inmem_pages
   - mutex_lock(&fi->inmem_lock)
   - __revoke_inmem_pages
    - lock_page(page)
					- open
					- f2fs_setattr
					- truncate_setsize
					 - truncate_inode_pages_range
					  - lock_page(page)
					  - truncate_cleanup_page
					   - f2fs_invalidate_page
					    - drop_inmem_page
					    - mutex_lock(&fi->inmem_lock);

We may encounter above ABBA deadlock as reported by Kyungtae Kim:

I'm reporting a bug in linux-4.17.19: "INFO: task hung in
drop_inmem_page" (no reproducer)

I think this might be somehow related to the following:
https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ

=========================================
INFO: task syz-executor7:10822 blocked for more than 120 seconds.
      Not tainted 4.17.19 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor7   D27024 10822   6346 0x00000004
Call Trace:
 context_switch kernel/sched/core.c:2867 [inline]
 __schedule+0x721/0x1e60 kernel/sched/core.c:3515
 schedule+0x88/0x1c0 kernel/sched/core.c:3559
 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617
 __mutex_lock_common kernel/locking/mutex.c:833 [inline]
 __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893
 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908
 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327
 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401
 do_invalidatepage mm/truncate.c:165 [inline]
 truncate_cleanup_page+0x261/0x330 mm/truncate.c:187
 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367
 truncate_inode_pages mm/truncate.c:478 [inline]
 truncate_pagecache+0x6d/0x90 mm/truncate.c:801
 truncate_setsize+0x81/0xa0 mm/truncate.c:826
 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781
 notify_change+0xa62/0xe80 fs/attr.c:313
 do_truncate+0x12e/0x1e0 fs/open.c:63
 do_last fs/namei.c:2955 [inline]
 path_openat+0x2042/0x29f0 fs/namei.c:3505
 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540
 do_sys_open+0x35e/0x4e0 fs/open.c:1101
 __do_sys_open fs/open.c:1119 [inline]
 __se_sys_open fs/open.c:1114 [inline]
 __x64_sys_open+0x89/0xc0 fs/open.c:1114
 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4497b9
RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002
RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9
RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080
RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700
INFO: task syz-executor7:10858 blocked for more than 120 seconds.
      Not tainted 4.17.19 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor7   D28880 10858   6346 0x00000004
Call Trace:
 context_switch kernel/sched/core.c:2867 [inline]
 __schedule+0x721/0x1e60 kernel/sched/core.c:3515
 schedule+0x88/0x1c0 kernel/sched/core.c:3559
 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline]
 rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594
 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117
 __down_write arch/x86/include/asm/rwsem.h:142 [inline]
 down_write+0x58/0xa0 kernel/locking/rwsem.c:72
 inode_lock include/linux/fs.h:713 [inline]
 do_truncate+0x120/0x1e0 fs/open.c:61
 do_last fs/namei.c:2955 [inline]
 path_openat+0x2042/0x29f0 fs/namei.c:3505
 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540
 do_sys_open+0x35e/0x4e0 fs/open.c:1101
 __do_sys_open fs/open.c:1119 [inline]
 __se_sys_open fs/open.c:1114 [inline]
 __x64_sys_open+0x89/0xc0 fs/open.c:1114
 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4497b9
RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002
RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9
RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080
RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700
INFO: task syz-executor5:10829 blocked for more than 120 seconds.
      Not tainted 4.17.19 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
syz-executor5   D28760 10829   6308 0x80000002
Call Trace:
 context_switch kernel/sched/core.c:2867 [inline]
 __schedule+0x721/0x1e60 kernel/sched/core.c:3515
 schedule+0x88/0x1c0 kernel/sched/core.c:3559
 io_schedule+0x21/0x80 kernel/sched/core.c:5179
 wait_on_page_bit_common mm/filemap.c:1100 [inline]
 __lock_page+0x2b5/0x390 mm/filemap.c:1273
 lock_page include/linux/pagemap.h:483 [inline]
 __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231
 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306
 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556
 __fput+0x2c7/0x780 fs/file_table.c:209
 ____fput+0x1a/0x20 fs/file_table.c:243
 task_work_run+0x151/0x1d0 kernel/task_work.c:113
 exit_task_work include/linux/task_work.h:22 [inline]
 do_exit+0x8ba/0x30a0 kernel/exit.c:865
 do_group_exit+0x13b/0x3a0 kernel/exit.c:968
 get_signal+0x6bb/0x1650 kernel/signal.c:2482
 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810
 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162
 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
 syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
 do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4497b9
RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80
RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700

This patch tries to use trylock_page to mitigate such deadlock condition
for fix.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 30779aaa9dba..1fa6f8185766 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -216,7 +216,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page)
 }
 
 static int __revoke_inmem_pages(struct inode *inode,
-				struct list_head *head, bool drop, bool recover)
+				struct list_head *head, bool drop, bool recover,
+				bool trylock)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct inmem_pages *cur, *tmp;
@@ -228,7 +229,16 @@ static int __revoke_inmem_pages(struct inode *inode,
 		if (drop)
 			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
 
-		lock_page(page);
+		if (trylock) {
+			/*
+			 * to avoid deadlock in between page lock and
+			 * inmem_lock.
+			 */
+			if (!trylock_page(page))
+				continue;
+		} else {
+			lock_page(page);
+		}
 
 		f2fs_wait_on_page_writeback(page, DATA, true);
 
@@ -317,13 +327,19 @@ void f2fs_drop_inmem_pages(struct inode *inode)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
-	mutex_lock(&fi->inmem_lock);
-	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
-	spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
-	if (!list_empty(&fi->inmem_ilist))
-		list_del_init(&fi->inmem_ilist);
-	spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
-	mutex_unlock(&fi->inmem_lock);
+	while (!list_empty(&fi->inmem_pages)) {
+		mutex_lock(&fi->inmem_lock);
+		__revoke_inmem_pages(inode, &fi->inmem_pages,
+						true, false, true);
+
+		if (list_empty(&fi->inmem_pages)) {
+			spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+			if (!list_empty(&fi->inmem_ilist))
+				list_del_init(&fi->inmem_ilist);
+			spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+		}
+		mutex_unlock(&fi->inmem_lock);
+	}
 
 	clear_inode_flag(inode, FI_ATOMIC_FILE);
 	fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0;
@@ -427,12 +443,15 @@ retry:
 		 * recovery or rewrite & commit last transaction. For other
 		 * error number, revoking was done by filesystem itself.
 		 */
-		err = __revoke_inmem_pages(inode, &revoke_list, false, true);
+		err = __revoke_inmem_pages(inode, &revoke_list,
+						false, true, false);
 
 		/* drop all uncommitted pages */
-		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+		__revoke_inmem_pages(inode, &fi->inmem_pages,
+						true, false, false);
 	} else {
-		__revoke_inmem_pages(inode, &revoke_list, false, false);
+		__revoke_inmem_pages(inode, &revoke_list,
+						false, false, false);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 22dcb30fb9d8199b9a105f699b828dedb5b59d36 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 4 Mar 2019 14:06:12 +0000
Subject: Btrfs: fix incorrect file size after shrinking truncate and fsync

commit bf504110bc8aa05df48b0e5f0aa84bfb81e0574b upstream.

If we do a shrinking truncate against an inode which is already present
in the respective log tree and then rename it, as part of logging the new
name we end up logging an inode item that reflects the old size of the
file (the one which we previously logged) and not the new smaller size.
The decision to preserve the size previously logged was added by commit
1a4bcf470c886b ("Btrfs: fix fsync data loss after adding hard link to
inode") in order to avoid data loss after replaying the log. However that
decision is only needed for the case the logged inode size is smaller then
the current size of the inode, as explained in that commit's change log.
If the current size of the inode is smaller then the previously logged
size, we know a shrinking truncate happened and therefore need to use
that smaller size.

Example to trigger the problem:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ xfs_io -f -c "pwrite -S 0xab 0 8000" /mnt/foo
  $ xfs_io -c "fsync" /mnt/foo
  $ xfs_io -c "truncate 3000" /mnt/foo

  $ mv /mnt/foo /mnt/bar
  $ xfs_io -c "fsync" /mnt/bar

  <power failure>

  $ mount /dev/sdb /mnt
  $ od -t x1 -A d /mnt/bar
  0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab
  *
  0008000

Once we rename the file, we log its name (and inode item), and because
the inode was already logged before in the current transaction, we log it
with a size of 8000 bytes because that is the size we previously logged
(with the first fsync). As part of the rename, besides logging the inode,
we do also sync the log, which is done since commit d4682ba03ef618
("Btrfs: sync log after logging new name"), so the next fsync against our
inode is effectively a no-op, since no new changes happened since the
rename operation. Even if did not sync the log during the rename
operation, the same problem (fize size of 8000 bytes instead of 3000
bytes) would be visible after replaying the log if the log ended up
getting synced to disk through some other means, such as for example by
fsyncing some other modified file. In the example above the fsync after
the rename operation is there just because not every filesystem may
guarantee logging/journalling the inode (and syncing the log/journal)
during the rename operation, for example it is needed for f2fs, but not
for ext4 and xfs.

Fix this scenario by, when logging a new name (which is triggered by
rename and link operations), using the current size of the inode instead
of the previously logged inode size.

A test case for fstests follows soon.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202695
CC: stable@vger.kernel.org # 4.4+
Reported-by: Seulbae Kim <seulbae@gatech.edu>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0805f8c5e72d..b08c47a7976c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4504,6 +4504,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
 		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				      struct btrfs_inode_item);
 		*size_ret = btrfs_inode_size(path->nodes[0], item);
+		/*
+		 * If the in-memory inode's i_size is smaller then the inode
+		 * size stored in the btree, return the inode's i_size, so
+		 * that we get a correct inode size after replaying the log
+		 * when before a power failure we had a shrinking truncate
+		 * followed by addition of a new name (rename / new hard link).
+		 * Otherwise return the inode size from the btree, to avoid
+		 * data loss when replaying a log due to previously doing a
+		 * write that expands the inode's size and logging a new name
+		 * immediately after.
+		 */
+		if (*size_ret > inode->vfs_inode.i_size)
+			*size_ret = inode->vfs_inode.i_size;
 	}
 
 	btrfs_release_path(path);
-- 
cgit v1.2.3


From b57220cc982033dc8e1e8d398cb006b72b275213 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 6 Mar 2019 17:13:04 -0500
Subject: btrfs: remove WARN_ON in log_dir_items

commit 2cc8334270e281815c3850c3adea363c51f21e0d upstream.

When Filipe added the recursive directory logging stuff in
2f2ff0ee5e430 ("Btrfs: fix metadata inconsistencies after directory
fsync") he specifically didn't take the directory i_mutex for the
children directories that we need to log because of lockdep.  This is
generally fine, but can lead to this WARN_ON() tripping if we happen to
run delayed deletion's in between our first search and our second search
of dir_item/dir_indexes for this directory.  We expect this to happen,
so the WARN_ON() isn't necessary.  Drop the WARN_ON() and add a comment
so we know why this case can happen.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b08c47a7976c..5af65d53618b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3532,9 +3532,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	/* find the first key from this transaction again */
+	/*
+	 * Find the first key from this transaction again.  See the note for
+	 * log_new_dir_dentries, if we're logging a directory recursively we
+	 * won't be holding its i_mutex, which means we can modify the directory
+	 * while we're logging it.  If we remove an entry between our first
+	 * search and this search we'll not find the key again and can just
+	 * bail.
+	 */
 	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
-	if (WARN_ON(ret != 0))
+	if (ret != 0)
 		goto done;
 
 	/*
-- 
cgit v1.2.3


From d952c337b25d3f3d3e4b2156b2ab83118c7951fd Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Thu, 7 Mar 2019 15:40:50 +0100
Subject: btrfs: don't report readahead errors and don't update statistics

commit 0cc068e6ee59c1fffbfa977d8bf868b7551d80ac upstream.

As readahead is an optimization, all errors are usually filtered out,
but still properly handled when the real read call is done. The commit
5e9d398240b2 ("btrfs: readpages() should submit IO as read-ahead") added
REQ_RAHEAD to readpages() because that's only used for readahead
(despite what one would expect from the callback name).

This causes a flood of messages and inflated read error stats, so skip
reporting in case it's readahead.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202403
Reported-by: LimeTech <tomm@lime-technology.com>
Fixes: 5e9d398240b2 ("btrfs: readpages() should submit IO as read-ahead")
CC: stable@vger.kernel.org # 4.19+
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c13f62182513..207f4e87445d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6051,7 +6051,7 @@ static void btrfs_end_bio(struct bio *bio)
 				if (bio_op(bio) == REQ_OP_WRITE)
 					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_WRITE_ERRS);
-				else
+				else if (!(bio->bi_opf & REQ_RAHEAD))
 					btrfs_dev_stat_inc_and_print(dev,
 						BTRFS_DEV_STAT_READ_ERRS);
 				if (bio->bi_opf & REQ_PREFLUSH)
-- 
cgit v1.2.3


From 1cf4ab01eb5aa72fab9acf405b4a9037dcc7cefc Mon Sep 17 00:00:00 2001
From: Andrea Righi <andrea.righi@canonical.com>
Date: Thu, 14 Mar 2019 08:56:28 +0100
Subject: btrfs: raid56: properly unmap parity page in finish_parity_scrub()

commit 3897b6f0a859288c22fb793fad11ec2327e60fcd upstream.

Parity page is incorrectly unmapped in finish_parity_scrub(), triggering
a reference counter bug on i386, i.e.:

 [ 157.662401] kernel BUG at mm/highmem.c:349!
 [ 157.666725] invalid opcode: 0000 [#1] SMP PTI

The reason is that kunmap(p_page) was completely left out, so we never
did an unmap for the p_page and the loop unmapping the rbio page was
iterating over the wrong number of stripes: unmapping should be done
with nr_data instead of rbio->real_stripes.

Test case to reproduce the bug:

 - create a raid5 btrfs filesystem:
   # mkfs.btrfs -m raid5 -d raid5 /dev/sdb /dev/sdc /dev/sdd /dev/sde

 - mount it:
   # mount /dev/sdb /mnt

 - run btrfs scrub in a loop:
   # while :; do btrfs scrub start -BR /mnt; done

BugLink: https://bugs.launchpad.net/bugs/1812845
Fixes: 5a6ac9eacb49 ("Btrfs, raid56: support parity scrub on raid56")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/raid56.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index df41d7049936..927f9f3daddb 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2429,8 +2429,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 			bitmap_clear(rbio->dbitmap, pagenr, 1);
 		kunmap(p);
 
-		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
+		for (stripe = 0; stripe < nr_data; stripe++)
 			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+		kunmap(p_page);
 	}
 
 	__free_page(p_page);
-- 
cgit v1.2.3


From 0ae3b84b3fa6cd988c8ee65053f6fa34bfda3034 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 18 Mar 2019 17:45:20 +0200
Subject: btrfs: Avoid possible qgroup_rsv_size overflow in
 btrfs_calculate_inode_block_rsv_size

commit 139a56170de67101791d6e6c8e940c6328393fe9 upstream.

qgroup_rsv_size is calculated as the product of
outstanding_extent * fs_info->nodesize. The product is calculated with
32 bit precision since both variables are defined as u32. Yet
qgroup_rsv_size expects a 64 bit result.

Avoid possible multiplication overflow by casting outstanding_extent to
u64. Such overflow would in the worst case (64K nodesize) require more
than 65536 extents, which is quite large and i'ts not likely that it
would happen in practice.

Fixes-coverity-id: 1435101
Fixes: ff6bc37eb7f6 ("btrfs: qgroup: Use independent and accurate per inode qgroup rsv")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a16760b410b1..c0db7785cede 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5872,7 +5872,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
 	 *
 	 * This is overestimating in most cases.
 	 */
-	qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
+	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
 
 	spin_lock(&block_rsv->lock);
 	block_rsv->size = reserve_size;
-- 
cgit v1.2.3


From fd1b25364fef1cb5d7898b751f531bf754a39af5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 19 Mar 2019 17:18:13 +0000
Subject: Btrfs: fix assertion failure on fsync with NO_HOLES enabled

commit 0ccc3876e4b2a1559a4dbe3126dda4459d38a83b upstream.

Back in commit a89ca6f24ffe4 ("Btrfs: fix fsync after truncate when
no_holes feature is enabled") I added an assertion that is triggered when
an inline extent is found to assert that the length of the (uncompressed)
data the extent represents is the same as the i_size of the inode, since
that is true most of the time I couldn't find or didn't remembered about
any exception at that time. Later on the assertion was expanded twice to
deal with a case of a compressed inline extent representing a range that
matches the sector size followed by an expanding truncate, and another
case where fallocate can update the i_size of the inode without adding
or updating existing extents (if the fallocate range falls entirely within
the first block of the file). These two expansion/fixes of the assertion
were done by commit 7ed586d0a8241 ("Btrfs: fix assertion on fsync of
regular file when using no-holes feature") and commit 6399fb5a0b69a
("Btrfs: fix assertion failure during fsync in no-holes mode").
These however missed the case where an falloc expands the i_size of an
inode to exactly the sector size and inline extent exists, for example:

 $ mkfs.btrfs -f -O no-holes /dev/sdc
 $ mount /dev/sdc /mnt

 $ xfs_io -f -c "pwrite -S 0xab 0 1096" /mnt/foobar
 wrote 1096/1096 bytes at offset 0
 1 KiB, 1 ops; 0.0002 sec (4.448 MiB/sec and 4255.3191 ops/sec)

 $ xfs_io -c "falloc 1096 3000" /mnt/foobar
 $ xfs_io -c "fsync" /mnt/foobar
 Segmentation fault

 $ dmesg
 [701253.602385] assertion failed: len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != BTRFS_COMPRESS_NONE) || (len < i_size && i_size < fs_info->sectorsize), file: fs/btrfs/tree-log.c, line: 4727
 [701253.602962] ------------[ cut here ]------------
 [701253.603224] kernel BUG at fs/btrfs/ctree.h:3533!
 [701253.603503] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI
 [701253.603774] CPU: 2 PID: 7192 Comm: xfs_io Tainted: G        W         5.0.0-rc8-btrfs-next-45 #1
 [701253.604054] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
 [701253.604650] RIP: 0010:assfail.constprop.23+0x18/0x1a [btrfs]
 (...)
 [701253.605591] RSP: 0018:ffffbb48c186bc48 EFLAGS: 00010286
 [701253.605914] RAX: 00000000000000de RBX: ffff921d0a7afc08 RCX: 0000000000000000
 [701253.606244] RDX: 0000000000000000 RSI: ffff921d36b16868 RDI: ffff921d36b16868
 [701253.606580] RBP: ffffbb48c186bcf0 R08: 0000000000000000 R09: 0000000000000000
 [701253.606913] R10: 0000000000000003 R11: 0000000000000000 R12: ffff921d05d2de18
 [701253.607247] R13: ffff921d03b54000 R14: 0000000000000448 R15: ffff921d059ecf80
 [701253.607769] FS:  00007f14da906700(0000) GS:ffff921d36b00000(0000) knlGS:0000000000000000
 [701253.608163] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 [701253.608516] CR2: 000056087ea9f278 CR3: 00000002268e8001 CR4: 00000000003606e0
 [701253.608880] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 [701253.609250] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 [701253.609608] Call Trace:
 [701253.609994]  btrfs_log_inode+0xdfb/0xe40 [btrfs]
 [701253.610383]  btrfs_log_inode_parent+0x2be/0xa60 [btrfs]
 [701253.610770]  ? do_raw_spin_unlock+0x49/0xc0
 [701253.611150]  btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
 [701253.611537]  btrfs_sync_file+0x3b2/0x440 [btrfs]
 [701253.612010]  ? do_sysinfo+0xb0/0xf0
 [701253.612552]  do_fsync+0x38/0x60
 [701253.612988]  __x64_sys_fsync+0x10/0x20
 [701253.613360]  do_syscall_64+0x60/0x1b0
 [701253.613733]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 [701253.614103] RIP: 0033:0x7f14da4e66d0
 (...)
 [701253.615250] RSP: 002b:00007fffa670fdb8 EFLAGS: 00000246 ORIG_RAX: 000000000000004a
 [701253.615647] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f14da4e66d0
 [701253.616047] RDX: 000056087ea9c260 RSI: 000056087ea9c260 RDI: 0000000000000003
 [701253.616450] RBP: 0000000000000001 R08: 0000000000000020 R09: 0000000000000010
 [701253.616854] R10: 000000000000009b R11: 0000000000000246 R12: 000056087ea9c260
 [701253.617257] R13: 000056087ea9c240 R14: 0000000000000000 R15: 000056087ea9dd10
 (...)
 [701253.619941] ---[ end trace e088d74f132b6da5 ]---

Updating the assertion again to allow for this particular case would result
in a meaningless assertion, plus there is currently no risk of logging
content that would result in any corruption after a log replay if the size
of the data encoded in an inline extent is greater than the inode's i_size
(which is not currently possibe either with or without compression),
therefore just remove the assertion.

CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5af65d53618b..2f4f0958e5f2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4685,15 +4685,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
 					struct btrfs_file_extent_item);
 
 		if (btrfs_file_extent_type(leaf, extent) ==
-		    BTRFS_FILE_EXTENT_INLINE) {
-			len = btrfs_file_extent_ram_bytes(leaf, extent);
-			ASSERT(len == i_size ||
-			       (len == fs_info->sectorsize &&
-				btrfs_file_extent_compression(leaf, extent) !=
-				BTRFS_COMPRESS_NONE) ||
-			       (len < i_size && i_size < fs_info->sectorsize));
+		    BTRFS_FILE_EXTENT_INLINE)
 			return 0;
-		}
 
 		len = btrfs_file_extent_num_bytes(leaf, extent);
 		/* Last extent goes beyond i_size, no need to log a hole. */
-- 
cgit v1.2.3


From da57cba4f3f1c63303cffcee97cf4d7bf1ff386a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 19 Mar 2019 11:33:24 +1100
Subject: NFS: fix mount/umount race in nlmclnt.

commit 4a9be28c45bf02fa0436808bb6c0baeba30e120e upstream.

If the last NFSv3 unmount from a given host races with a mount from the
same host, we can destroy an nlm_host that is still in use.

Specifically nlmclnt_lookup_host() can increment h_count on
an nlm_host that nlmclnt_release_host() has just successfully called
refcount_dec_and_test() on.
Once nlmclnt_lookup_host() drops the mutex, nlm_destroy_host_lock()
will be called to destroy the nlmclnt which is now in use again.

The cause of the problem is that the dec_and_test happens outside the
locked region.  This is easily fixed by using
refcount_dec_and_mutex_lock().

Fixes: 8ea6ecc8b075 ("lockd: Create client-side nlm_host cache")
Cc: stable@vger.kernel.org (v2.6.38+)
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/lockd/host.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 93fb7cf0b92b..f0b5c987d6ae 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host)
 
 	WARN_ON_ONCE(host->h_server);
 
-	if (refcount_dec_and_test(&host->h_count)) {
+	if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) {
 		WARN_ON_ONCE(!list_empty(&host->h_lockowners));
 		WARN_ON_ONCE(!list_empty(&host->h_granted));
 		WARN_ON_ONCE(!list_empty(&host->h_reclaim));
 
-		mutex_lock(&nlm_host_mutex);
 		nlm_destroy_host_locked(host);
 		mutex_unlock(&nlm_host_mutex);
 	}
-- 
cgit v1.2.3


From 64751542d3f3a6e9ccf88dab1fd05cc47bcba301 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 19 Mar 2019 12:12:13 -0400
Subject: NFSv4.1 don't free interrupted slot on open

commit 0cb98abb5bd13b9a636bde603d952d722688b428 upstream.

Allow the async rpc task for finish and update the open state if needed,
then free the slot. Otherwise, the async rpc unable to decode the reply.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Fixes: ae55e59da0e4 ("pnfs: Don't release the sequence slot...")
Cc: stable@vger.kernel.org # v4.18+
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e7abcf7629b3..580e37bc3fe2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2909,7 +2909,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	}
 
 out:
-	nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+	if (!opendata->cancelled)
+		nfs4_sequence_free_slot(&opendata->o_res.seq_res);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 72b790c417b9f6ffec9d04b77b29103f451725f6 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 28 Mar 2019 20:43:30 -0700
Subject: fs/open.c: allow opening only regular files during execve()

commit 73601ea5b7b18eb234219ae2adf77530f389da79 upstream.

syzbot is hitting lockdep warning [1] due to trying to open a fifo
during an execve() operation.  But we don't need to open non regular
files during an execve() operation, for all files which we will need are
the executable file itself and the interpreter programs like /bin/sh and
ld-linux.so.2 .

Since the manpage for execve(2) says that execve() returns EACCES when
the file or a script interpreter is not a regular file, and the manpage
for uselib(2) says that uselib() can return EACCES, and we use
FMODE_EXEC when opening for execve()/uselib(), we can bail out if a non
regular file is requested with FMODE_EXEC set.

Since this deadlock followed by khungtaskd warnings is trivially
reproducible by a local unprivileged user, and syzbot's frequent crash
due to this deadlock defers finding other bugs, let's workaround this
deadlock until we get a chance to find a better solution.

[1] https://syzkaller.appspot.com/bug?id=b5095bfec44ec84213bac54742a82483aad578ce

Link: http://lkml.kernel.org/r/1552044017-7890-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
Reported-by: syzbot <syzbot+e93a80c1bb7c5c56e522461c149f8bf55eab1b2b@syzkaller.appspotmail.com>
Fixes: 8924feff66f35fe2 ("splice: lift pipe_lock out of splice_to_pipe()")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <stable@vger.kernel.org>	[4.9+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/open.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 0285ce7dbd51..f1c2f855fd43 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f,
 		return 0;
 	}
 
+	/* Any file opened for execve()/uselib() has to be a regular file. */
+	if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
+		error = -EACCES;
+		goto cleanup_file;
+	}
+
 	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
 		error = get_write_access(inode);
 		if (unlikely(error))
-- 
cgit v1.2.3


From 3b3fcc3d4ffd9046b6ede3f62f473f40b8bd7493 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 28 Mar 2019 20:43:38 -0700
Subject: ocfs2: fix inode bh swapping mixup in ocfs2_reflink_inodes_lock

commit e6a9467ea14bae8691b0f72c500510c42ea8edb8 upstream.

ocfs2_reflink_inodes_lock() can swap the inode1/inode2 variables so that
we always grab cluster locks in order of increasing inode number.

Unfortunately, we forget to swap the inode record buffer head pointers
when we've done this, which leads to incorrect bookkeepping when we're
trying to make the two inodes have the same refcount tree.

This has the effect of causing filesystem shutdowns if you're trying to
reflink data from inode 100 into inode 97, where inode 100 already has a
refcount tree attached and inode 97 doesn't.  The reflink code decides
to copy the refcount tree pointer from 100 to 97, but uses inode 97's
inode record to open the tree root (which it doesn't have) and blows up.
This issue causes filesystem shutdowns and metadata corruption!

Link: http://lkml.kernel.org/r/20190312214910.GK20533@magnolia
Fixes: 29ac8e856cb369 ("ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features")
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ocfs2/refcounttree.c | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7a5ee145c733..fc197e599e8c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4716,22 +4716,23 @@ out:
 
 /* Lock an inode and grab a bh pointing to the inode. */
 static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
-				     struct buffer_head **bh1,
+				     struct buffer_head **bh_s,
 				     struct inode *t_inode,
-				     struct buffer_head **bh2)
+				     struct buffer_head **bh_t)
 {
-	struct inode *inode1;
-	struct inode *inode2;
+	struct inode *inode1 = s_inode;
+	struct inode *inode2 = t_inode;
 	struct ocfs2_inode_info *oi1;
 	struct ocfs2_inode_info *oi2;
+	struct buffer_head *bh1 = NULL;
+	struct buffer_head *bh2 = NULL;
 	bool same_inode = (s_inode == t_inode);
+	bool need_swap = (inode1->i_ino > inode2->i_ino);
 	int status;
 
 	/* First grab the VFS and rw locks. */
 	lock_two_nondirectories(s_inode, t_inode);
-	inode1 = s_inode;
-	inode2 = t_inode;
-	if (inode1->i_ino > inode2->i_ino)
+	if (need_swap)
 		swap(inode1, inode2);
 
 	status = ocfs2_rw_lock(inode1, 1);
@@ -4754,17 +4755,13 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
 	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
 				(unsigned long long)oi2->ip_blkno);
 
-	if (*bh1)
-		*bh1 = NULL;
-	if (*bh2)
-		*bh2 = NULL;
-
 	/* We always want to lock the one with the lower lockid first. */
 	if (oi1->ip_blkno > oi2->ip_blkno)
 		mlog_errno(-ENOLCK);
 
 	/* lock id1 */
-	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET);
+	status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
+					 OI_LS_REFLINK_TARGET);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
@@ -4773,15 +4770,25 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
 
 	/* lock id2 */
 	if (!same_inode) {
-		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
+		status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
 						 OI_LS_REFLINK_TARGET);
 		if (status < 0) {
 			if (status != -ENOENT)
 				mlog_errno(status);
 			goto out_cl1;
 		}
-	} else
-		*bh2 = *bh1;
+	} else {
+		bh2 = bh1;
+	}
+
+	/*
+	 * If we swapped inode order above, we have to swap the buffer heads
+	 * before passing them back to the caller.
+	 */
+	if (need_swap)
+		swap(bh1, bh2);
+	*bh_s = bh1;
+	*bh_t = bh2;
 
 	trace_ocfs2_double_lock_end(
 			(unsigned long long)oi1->ip_blkno,
@@ -4791,8 +4798,7 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
 
 out_cl1:
 	ocfs2_inode_unlock(inode1, 1);
-	brelse(*bh1);
-	*bh1 = NULL;
+	brelse(bh1);
 out_rw2:
 	ocfs2_rw_unlock(inode2, 1);
 out_i2:
-- 
cgit v1.2.3


From 07d0d2bd957ad922cf571e7cabb6c34067142b93 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 28 Mar 2019 20:44:40 -0700
Subject: fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links

commit 23da9588037ecdd4901db76a5b79a42b529c4ec3 upstream.

Syzkaller reports:

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] SMP KASAN PTI
CPU: 1 PID: 5373 Comm: syz-executor.0 Not tainted 5.0.0-rc8+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
RIP: 0010:put_links+0x101/0x440 fs/proc/proc_sysctl.c:1599
Code: 00 0f 85 3a 03 00 00 48 8b 43 38 48 89 44 24 20 48 83 c0 38 48 89 c2 48 89 44 24 28 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 fe 02 00 00 48 8b 74 24 20 48 c7 c7 60 2a 9d 91
RSP: 0018:ffff8881d828f238 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff8881e01b1140 RCX: ffffffff8ee98267
RDX: 0000000000000007 RSI: ffffc90001479000 RDI: ffff8881e01b1178
RBP: dffffc0000000000 R08: ffffed103ee27259 R09: ffffed103ee27259
R10: 0000000000000001 R11: ffffed103ee27258 R12: fffffffffffffff4
R13: 0000000000000006 R14: ffff8881f59838c0 R15: dffffc0000000000
FS:  00007f072254f700(0000) GS:ffff8881f7100000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff8b286668 CR3: 00000001f0542002 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 drop_sysctl_table+0x152/0x9f0 fs/proc/proc_sysctl.c:1629
 get_subdir fs/proc/proc_sysctl.c:1022 [inline]
 __register_sysctl_table+0xd65/0x1090 fs/proc/proc_sysctl.c:1335
 br_netfilter_init+0xbc/0x1000 [br_netfilter]
 do_one_initcall+0xfa/0x5ca init/main.c:887
 do_init_module+0x204/0x5f6 kernel/module.c:3460
 load_module+0x66b2/0x8570 kernel/module.c:3808
 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902
 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x462e99
Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f072254ec58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99
RDX: 0000000000000000 RSI: 0000000020000280 RDI: 0000000000000003
RBP: 00007f072254ec70 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f072254f6bc
R13: 00000000004bcefa R14: 00000000006f6fb0 R15: 0000000000000004
Modules linked in: br_netfilter(+) dvb_usb_dibusb_mc_common dib3000mc dibx000_common dvb_usb_dibusb_common dvb_usb_dw2102 dvb_usb classmate_laptop palmas_regulator cn videobuf2_v4l2 v4l2_common snd_soc_bd28623 mptbase snd_usb_usx2y snd_usbmidi_lib snd_rawmidi wmi libnvdimm lockd sunrpc grace rc_kworld_pc150u rc_core rtc_da9063 sha1_ssse3 i2c_cros_ec_tunnel adxl34x_spi adxl34x nfnetlink lib80211 i5500_temp dvb_as102 dvb_core videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops udc_core lnbp22 leds_lp3952 hid_roccat_ryos s1d13xxxfb mtd vport_geneve openvswitch nf_conncount nf_nat_ipv6 nsh geneve udp_tunnel ip6_udp_tunnel snd_soc_mt6351 sis_agp phylink snd_soc_adau1761_spi snd_soc_adau1761 snd_soc_adau17x1 snd_soc_core snd_pcm_dmaengine ac97_bus snd_compress snd_soc_adau_utils snd_soc_sigmadsp_regmap snd_soc_sigmadsp raid_class hid_roccat_konepure hid_roccat_common hid_roccat c2port_duramar2150 core mdio_bcm_unimac iptable_security iptable_raw iptable_mangle
 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim devlink vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel joydev mousedev ide_pci_generic piix aesni_intel aes_x86_64 ide_core crypto_simd atkbd cryptd glue_helper serio_raw ata_generic pata_acpi i2c_piix4 floppy sch_fq_codel ip_tables x_tables ipv6 [last unloaded: lm73]
Dumping ftrace buffer:
   (ftrace buffer empty)
---[ end trace 770020de38961fd0 ]---

A new dir entry can be created in get_subdir and its 'header->parent' is
set to NULL.  Only after insert_header success, it will be set to 'dir',
otherwise 'header->parent' is set to NULL and drop_sysctl_table is called.
However in err handling path of get_subdir, drop_sysctl_table also be
called on 'new->header' regardless its value of parent pointer.  Then
put_links is called, which triggers NULL-ptr deref when access member of
header->parent.

In fact we have multiple error paths which call drop_sysctl_table() there,
upon failure on insert_links() we also call drop_sysctl_table().And even
in the successful case on __register_sysctl_table() we still always call
drop_sysctl_table().This patch fix it.

Link: http://lkml.kernel.org/r/20190314085527.13244-1-yuehaibing@huawei.com
Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reported-by: Hulk Robot <hulkci@huawei.com>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: <stable@vger.kernel.org>    [3.4+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/proc_sysctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 4d598a399bbf..d65390727541 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1626,7 +1626,8 @@ static void drop_sysctl_table(struct ctl_table_header *header)
 	if (--header->nreg)
 		return;
 
-	put_links(header);
+	if (parent)
+		put_links(header);
 	start_unregistering(header);
 	if (!--header->count)
 		kfree_rcu(header, rcu);
-- 
cgit v1.2.3


From 2dbc7c66d6dafb7781c55d067d8e406d71a34ccc Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Sat, 23 Mar 2019 11:56:01 -0400
Subject: ext4: cleanup bh release code in ext4_ind_remove_space()

commit 5e86bdda41534e17621d5a071b294943cae4376e upstream.

Currently, we are releasing the indirect buffer where we are done with
it in ext4_ind_remove_space(), so we can see the brelse() and
BUFFER_TRACE() everywhere.  It seems fragile and hard to read, and we
may probably forget to release the buffer some day.  This patch cleans
up the code by putting of the code which releases the buffers to the
end of the function.

Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jari Ruusu <jari.ruusu@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/indirect.c | 47 ++++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 9e96a0bd08d9..e1801b288847 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1219,6 +1219,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
 	ext4_lblk_t offsets[4], offsets2[4];
 	Indirect chain[4], chain2[4];
 	Indirect *partial, *partial2;
+	Indirect *p = NULL, *p2 = NULL;
 	ext4_lblk_t max_block;
 	__le32 nr = 0, nr2 = 0;
 	int n = 0, n2 = 0;
@@ -1260,7 +1261,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
 		}
 
 
-		partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+		partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
 		if (nr) {
 			if (partial == chain) {
 				/* Shared branch grows from the inode */
@@ -1285,13 +1286,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
 				partial->p + 1,
 				(__le32 *)partial->bh->b_data+addr_per_block,
 				(chain+n-1) - partial);
-			BUFFER_TRACE(partial->bh, "call brelse");
-			brelse(partial->bh);
 			partial--;
 		}
 
 end_range:
-		partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+		partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
 		if (nr2) {
 			if (partial2 == chain2) {
 				/*
@@ -1321,16 +1320,14 @@ end_range:
 					   (__le32 *)partial2->bh->b_data,
 					   partial2->p,
 					   (chain2+n2-1) - partial2);
-			BUFFER_TRACE(partial2->bh, "call brelse");
-			brelse(partial2->bh);
 			partial2--;
 		}
 		goto do_indirects;
 	}
 
 	/* Punch happened within the same level (n == n2) */
-	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
-	partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
+	partial = p = ext4_find_shared(inode, n, offsets, chain, &nr);
+	partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
 
 	/* Free top, but only if partial2 isn't its subtree. */
 	if (nr) {
@@ -1387,15 +1384,7 @@ end_range:
 					   partial->p + 1,
 					   partial2->p,
 					   (chain+n-1) - partial);
-			while (partial > chain) {
-				BUFFER_TRACE(partial->bh, "call brelse");
-				brelse(partial->bh);
-			}
-			while (partial2 > chain2) {
-				BUFFER_TRACE(partial2->bh, "call brelse");
-				brelse(partial2->bh);
-			}
-			return 0;
+			goto cleanup;
 		}
 
 		/*
@@ -1410,8 +1399,6 @@ end_range:
 					   partial->p + 1,
 					   (__le32 *)partial->bh->b_data+addr_per_block,
 					   (chain+n-1) - partial);
-			BUFFER_TRACE(partial->bh, "call brelse");
-			brelse(partial->bh);
 			partial--;
 		}
 		if (partial2 > chain2 && depth2 <= depth) {
@@ -1419,11 +1406,21 @@ end_range:
 					   (__le32 *)partial2->bh->b_data,
 					   partial2->p,
 					   (chain2+n2-1) - partial2);
-			BUFFER_TRACE(partial2->bh, "call brelse");
-			brelse(partial2->bh);
 			partial2--;
 		}
 	}
+
+cleanup:
+	while (p && p > chain) {
+		BUFFER_TRACE(p->bh, "call brelse");
+		brelse(p->bh);
+		p--;
+	}
+	while (p2 && p2 > chain2) {
+		BUFFER_TRACE(p2->bh, "call brelse");
+		brelse(p2->bh);
+		p2--;
+	}
 	return 0;
 
 do_indirects:
@@ -1431,7 +1428,7 @@ do_indirects:
 	switch (offsets[0]) {
 	default:
 		if (++n >= n2)
-			return 0;
+			break;
 		nr = i_data[EXT4_IND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
@@ -1439,7 +1436,7 @@ do_indirects:
 		}
 	case EXT4_IND_BLOCK:
 		if (++n >= n2)
-			return 0;
+			break;
 		nr = i_data[EXT4_DIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
@@ -1447,7 +1444,7 @@ do_indirects:
 		}
 	case EXT4_DIND_BLOCK:
 		if (++n >= n2)
-			return 0;
+			break;
 		nr = i_data[EXT4_TIND_BLOCK];
 		if (nr) {
 			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
@@ -1456,5 +1453,5 @@ do_indirects:
 	case EXT4_TIND_BLOCK:
 		;
 	}
-	return 0;
+	goto cleanup;
 }
-- 
cgit v1.2.3


From 2938651d36cae8c22e79d293fe9802cc9d5f97f3 Mon Sep 17 00:00:00 2001
From: Aurelien Aptel <aaptel@suse.com>
Date: Thu, 14 Mar 2019 18:44:16 +0100
Subject: CIFS: fix POSIX lock leak and invalid ptr deref

[ Upstream commit bc31d0cdcfbadb6258b45db97e93b1c83822ba33 ]

We have a customer reporting crashes in lock_get_status() with many
"Leaked POSIX lock" messages preceeding the crash.

 Leaked POSIX lock on dev=0x0:0x56 ...
 Leaked POSIX lock on dev=0x0:0x56 ...
 Leaked POSIX lock on dev=0x0:0x56 ...
 Leaked POSIX lock on dev=0x0:0x53 ...
 Leaked POSIX lock on dev=0x0:0x53 ...
 Leaked POSIX lock on dev=0x0:0x53 ...
 Leaked POSIX lock on dev=0x0:0x53 ...
 POSIX: fl_owner=ffff8900e7b79380 fl_flags=0x1 fl_type=0x1 fl_pid=20709
 Leaked POSIX lock on dev=0x0:0x4b ino...
 Leaked locks on dev=0x0:0x4b ino=0xf911400000029:
 POSIX: fl_owner=ffff89f41c870e00 fl_flags=0x1 fl_type=0x1 fl_pid=19592
 stack segment: 0000 [#1] SMP
 Modules linked in: binfmt_misc msr tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag rpcsec_gss_krb5 arc4 ecb auth_rpcgss nfsv4 md4 nfs nls_utf8 lockd grace cifs sunrpc ccm dns_resolver fscache af_packet iscsi_ibft iscsi_boot_sysfs vmw_vsock_vmci_transport vsock xfs libcrc32c sb_edac edac_core crct10dif_pclmul crc32_pclmul ghash_clmulni_intel drbg ansi_cprng vmw_balloon aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd joydev pcspkr vmxnet3 i2c_piix4 vmw_vmci shpchp fjes processor button ac btrfs xor raid6_pq sr_mod cdrom ata_generic sd_mod ata_piix vmwgfx crc32c_intel drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm serio_raw ahci libahci drm libata vmw_pvscsi sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4

 Supported: Yes
 CPU: 6 PID: 28250 Comm: lsof Not tainted 4.4.156-94.64-default #1
 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016
 task: ffff88a345f28740 ti: ffff88c74005c000 task.ti: ffff88c74005c000
 RIP: 0010:[<ffffffff8125dcab>]  [<ffffffff8125dcab>] lock_get_status+0x9b/0x3b0
 RSP: 0018:ffff88c74005fd90  EFLAGS: 00010202
 RAX: ffff89bde83e20ae RBX: ffff89e870003d18 RCX: 0000000049534f50
 RDX: ffffffff81a3541f RSI: ffffffff81a3544e RDI: ffff89bde83e20ae
 RBP: 0026252423222120 R08: 0000000020584953 R09: 000000000000ffff
 R10: 0000000000000000 R11: ffff88c74005fc70 R12: ffff89e5ca7b1340
 R13: 00000000000050e5 R14: ffff89e870003d30 R15: ffff89e5ca7b1340
 FS:  00007fafd64be800(0000) GS:ffff89f41fd00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000001c80018 CR3: 000000a522048000 CR4: 0000000000360670
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 Stack:
  0000000000000208 ffffffff81a3d6b6 ffff89e870003d30 ffff89e870003d18
  ffff89e5ca7b1340 ffff89f41738d7c0 ffff89e870003d30 ffff89e5ca7b1340
  ffffffff8125e08f 0000000000000000 ffff89bc22b67d00 ffff88c74005ff28
 Call Trace:
  [<ffffffff8125e08f>] locks_show+0x2f/0x70
  [<ffffffff81230ad1>] seq_read+0x251/0x3a0
  [<ffffffff81275bbc>] proc_reg_read+0x3c/0x70
  [<ffffffff8120e456>] __vfs_read+0x26/0x140
  [<ffffffff8120e9da>] vfs_read+0x7a/0x120
  [<ffffffff8120faf2>] SyS_read+0x42/0xa0
  [<ffffffff8161cbc3>] entry_SYSCALL_64_fastpath+0x1e/0xb7

When Linux closes a FD (close(), close-on-exec, dup2(), ...) it calls
filp_close() which also removes all posix locks.

The lock struct is initialized like so in filp_close() and passed
down to cifs

	...
        lock.fl_type = F_UNLCK;
        lock.fl_flags = FL_POSIX | FL_CLOSE;
        lock.fl_start = 0;
        lock.fl_end = OFFSET_MAX;
	...

Note the FL_CLOSE flag, which hints the VFS code that this unlocking
is done for closing the fd.

filp_close()
  locks_remove_posix(filp, id);
    vfs_lock_file(filp, F_SETLK, &lock, NULL);
      return filp->f_op->lock(filp, cmd, fl) => cifs_lock()
        rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, xid);
          rc = server->ops->mand_unlock_range(cfile, flock, xid);
          if (flock->fl_flags & FL_POSIX && !rc)
                  rc = locks_lock_file_wait(file, flock)

Notice how we don't call locks_lock_file_wait() which does the
generic VFS lock/unlock/wait work on the inode if rc != 0.

If we are closing the handle, the SMB server is supposed to remove any
locks associated with it. Similarly, cifs.ko frees and wakes up any
lock and lock waiter when closing the file:

cifs_close()
  cifsFileInfo_put(file->private_data)
	/*
	 * Delete any outstanding lock records. We'll lose them when the file
	 * is closed anyway.
	 */
	down_write(&cifsi->lock_sem);
	list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
		list_del(&li->llist);
		cifs_del_lock_waiters(li);
		kfree(li);
	}
	list_del(&cifs_file->llist->llist);
	kfree(cifs_file->llist);
	up_write(&cifsi->lock_sem);

So we can safely ignore unlocking failures in cifs_lock() if they
happen with the FL_CLOSE flag hint set as both the server and the
client take care of it during the actual closing.

This is not a proper fix for the unlocking failure but it's safe and
it seems to prevent the lock leakages and crashes the customer
experiences.

Signed-off-by: Aurelien Aptel <aaptel@suse.com>
Signed-off-by: NeilBrown <neil@brown.name>
Signed-off-by: Steve French <stfrench@microsoft.com>
Acked-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/file.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 08761a6a039d..d847132ab027 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1631,8 +1631,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 		rc = server->ops->mand_unlock_range(cfile, flock, xid);
 
 out:
-	if (flock->fl_flags & FL_POSIX && !rc)
+	if (flock->fl_flags & FL_POSIX) {
+		/*
+		 * If this is a request to remove all locks because we
+		 * are closing the file, it doesn't matter if the
+		 * unlocking failed as both cifs.ko and the SMB server
+		 * remove the lock on file close
+		 */
+		if (rc) {
+			cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
+			if (!(flock->fl_flags & FL_CLOSE))
+				return rc;
+		}
 		rc = locks_lock_file_wait(file, flock);
+	}
 	return rc;
 }
 
-- 
cgit v1.2.3


From 198c99857b302ad46a4717dc6351b069fc5ac17f Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 5 Mar 2019 19:32:26 +0800
Subject: f2fs: fix to adapt small inline xattr space in __find_inline_xattr()

[ Upstream commit 2c28aba8b2e2a51749fa66e01b68e1cd5b53e022 ]

With below testcase, we will fail to find existed xattr entry:

1. mkfs.f2fs -O extra_attr -O flexible_inline_xattr /dev/zram0
2. mount -t f2fs -o inline_xattr_size=1 /dev/zram0 /mnt/f2fs/
3. touch /mnt/f2fs/file
4. setfattr -n "user.name" -v 0 /mnt/f2fs/file
5. getfattr -n "user.name" /mnt/f2fs/file

/mnt/f2fs/file: user.name: No such attribute

The reason is for inode which has very small inline xattr size,
__find_inline_xattr() will fail to traverse any entry due to first
entry may not be loaded from xattr node yet, later, we may skip to
check entire xattr datas in __find_xattr(), result in such wrong
condition.

This patch adds condition to check such case to avoid this issue.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/xattr.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 087e53a2d96c..409a637f7a92 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -227,11 +227,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
 {
 	struct f2fs_xattr_entry *entry;
 	unsigned int inline_size = inline_xattr_size(inode);
+	void *max_addr = base_addr + inline_size;
 
 	list_for_each_xattr(entry, base_addr) {
-		if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
-			(void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) >
-			base_addr + inline_size) {
+		if ((void *)entry + sizeof(__u32) > max_addr ||
+			(void *)XATTR_NEXT_ENTRY(entry) > max_addr) {
 			*last_addr = entry;
 			return NULL;
 		}
@@ -242,6 +242,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
 		if (!memcmp(entry->e_name, name, len))
 			break;
 	}
+
+	/* inline xattr header or entry across max inline xattr size */
+	if (IS_XATTR_LAST_ENTRY(entry) &&
+		(void *)entry + sizeof(__u32) > max_addr) {
+		*last_addr = entry;
+		return NULL;
+	}
 	return entry;
 }
 
-- 
cgit v1.2.3


From d7391962d723c92fbf02d2aa823e9f42413c76a7 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 12 Mar 2019 15:44:27 +0800
Subject: f2fs: fix to avoid deadlock in f2fs_read_inline_dir()

[ Upstream commit aadcef64b22f668c1a107b86d3521d9cac915c24 ]

As Jiqun Li reported in bugzilla:

https://bugzilla.kernel.org/show_bug.cgi?id=202883

sometimes, dead lock when make system call SYS_getdents64 with fsync() is
called by another process.

monkey running on android9.0

1.  task 9785 held sbi->cp_rwsem and waiting lock_page()
2.  task 10349 held mm_sem and waiting sbi->cp_rwsem
3. task 9709 held lock_page() and waiting mm_sem

so this is a dead lock scenario.

task stack is show by crash tools as following

crash_arm64> bt ffffffc03c354080
PID: 9785   TASK: ffffffc03c354080  CPU: 1   COMMAND: "RxIoScheduler-3"
>> #7 [ffffffc01b50fac0] __lock_page at ffffff80081b11e8

crash-arm64> bt 10349
PID: 10349  TASK: ffffffc018b83080  CPU: 1   COMMAND: "BUGLY_ASYNC_UPL"
>> #3 [ffffffc01f8cfa40] rwsem_down_read_failed at ffffff8008a93afc
     PC: 00000033  LR: 00000000  SP: 00000000  PSTATE: ffffffffffffffff

crash-arm64> bt 9709
PID: 9709   TASK: ffffffc03e7f3080  CPU: 1   COMMAND: "IntentService[A"
>> #3 [ffffffc001e67850] rwsem_down_read_failed at ffffff8008a93afc
>> #8 [ffffffc001e67b80] el1_ia at ffffff8008084fc4
     PC: ffffff8008274114  [compat_filldir64+120]
     LR: ffffff80083584d4  [f2fs_fill_dentries+448]
     SP: ffffffc001e67b80  PSTATE: 80400145
    X29: ffffffc001e67b80  X28: 0000000000000000  X27: 000000000000001a
    X26: 00000000000093d7  X25: ffffffc070d52480  X24: 0000000000000008
    X23: 0000000000000028  X22: 00000000d43dfd60  X21: ffffffc001e67e90
    X20: 0000000000000011  X19: ffffff80093a4000  X18: 0000000000000000
    X17: 0000000000000000  X16: 0000000000000000  X15: 0000000000000000
    X14: ffffffffffffffff  X13: 0000000000000008  X12: 0101010101010101
    X11: 7f7f7f7f7f7f7f7f  X10: 6a6a6a6a6a6a6a6a   X9: 7f7f7f7f7f7f7f7f
     X8: 0000000080808000   X7: ffffff800827409c   X6: 0000000080808000
     X5: 0000000000000008   X4: 00000000000093d7   X3: 000000000000001a
     X2: 0000000000000011   X1: ffffffc070d52480   X0: 0000000000800238
>> #9 [ffffffc001e67be0] f2fs_fill_dentries at ffffff80083584d0
     PC: 0000003c  LR: 00000000  SP: 00000000  PSTATE: 000000d9
    X12: f48a02ff X11: d4678960 X10: d43dfc00  X9: d4678ae4
     X8: 00000058  X7: d4678994  X6: d43de800  X5: 000000d9
     X4: d43dfc0c  X3: d43dfc10  X2: d46799c8  X1: 00000000
     X0: 00001068

Below potential deadlock will happen between three threads:
Thread A		Thread B		Thread C
- f2fs_do_sync_file
 - f2fs_write_checkpoint
  - down_write(&sbi->node_change) -- 1)
			- do_page_fault
			 - down_write(&mm->mmap_sem) -- 2)
			  - do_wp_page
			   - f2fs_vm_page_mkwrite
						- getdents64
						 - f2fs_read_inline_dir
						  - lock_page -- 3)
  - f2fs_sync_node_pages
   - lock_page -- 3)
			    - __do_map_lock
			     - down_read(&sbi->node_change) -- 1)
						  - f2fs_fill_dentries
						   - dir_emit
						    - compat_filldir64
						     - do_page_fault
						      - down_read(&mm->mmap_sem) -- 2)

Since f2fs_readdir is protected by inode.i_rwsem, there should not be
any updates in inode page, we're safe to lookup dents in inode page
without its lock held, so taking off the lock to improve concurrency
of readdir and avoid potential deadlock.

Reported-by: Jiqun Li <jiqun.li@unisoc.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/inline.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 115dc219344b..92703efde36e 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -661,6 +661,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
 	if (IS_ERR(ipage))
 		return PTR_ERR(ipage);
 
+	/*
+	 * f2fs_readdir was protected by inode.i_rwsem, it is safe to access
+	 * ipage without page's lock held.
+	 */
+	unlock_page(ipage);
+
 	inline_dentry = inline_data_addr(inode, ipage);
 
 	make_dentry_ptr_inline(inode, &d, inline_dentry);
@@ -669,7 +675,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
 	if (!err)
 		ctx->pos = d.max;
 
-	f2fs_put_page(ipage, 1);
+	f2fs_put_page(ipage, 0);
 	return err < 0 ? err : 0;
 }
 
-- 
cgit v1.2.3


From 20141feb9bdec6e7daed7782c1535b31081aeb71 Mon Sep 17 00:00:00 2001
From: Jia Guo <guojia12@huawei.com>
Date: Tue, 5 Mar 2019 15:41:41 -0800
Subject: ocfs2: fix a panic problem caused by o2cb_ctl

[ Upstream commit cc725ef3cb202ef2019a3c67c8913efa05c3cce6 ]

In the process of creating a node, it will cause NULL pointer
dereference in kernel if o2cb_ctl failed in the interval (mkdir,
o2cb_set_node_attribute(node_num)] in function o2cb_add_node.

The node num is initialized to 0 in function o2nm_node_group_make_item,
o2nm_node_group_drop_item will mistake the node number 0 for a valid
node number when we delete the node before the node number is set
correctly.  If the local node number of the current host happens to be
0, cluster->cl_local_node will be set to O2NM_INVALID_NODE_NUM while
o2hb_thread still running.  The panic stack is generated as follows:

  o2hb_thread
      \-o2hb_do_disk_heartbeat
          \-o2hb_check_own_slot
              |-slot = &reg->hr_slots[o2nm_this_node()];
              //o2nm_this_node() return O2NM_INVALID_NODE_NUM

We need to check whether the node number is set when we delete the node.

Link: http://lkml.kernel.org/r/133d8045-72cc-863e-8eae-5013f9f6bc51@huawei.com
Signed-off-by: Jia Guo <guojia12@huawei.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Acked-by: Jun Piao <piaojun@huawei.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ocfs2/cluster/nodemanager.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 0e4166cc23a0..4ac775e32240 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group,
 	struct o2nm_node *node = to_o2nm_node(item);
 	struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
 
-	o2net_disconnect_node(node);
+	if (cluster->cl_nodes[node->nd_num] == node) {
+		o2net_disconnect_node(node);
 
-	if (cluster->cl_has_local &&
-	    (cluster->cl_local_node == node->nd_num)) {
-		cluster->cl_has_local = 0;
-		cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
-		o2net_stop_listening(node);
+		if (cluster->cl_has_local &&
+		    (cluster->cl_local_node == node->nd_num)) {
+			cluster->cl_has_local = 0;
+			cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
+			o2net_stop_listening(node);
+		}
 	}
 
 	/* XXX call into net to stop this node from trading messages */
-- 
cgit v1.2.3


From 9b4f27667402c303de02192a2ce84f4c1823be69 Mon Sep 17 00:00:00 2001
From: Sahitya Tummala <stummala@codeaurora.org>
Date: Mon, 4 Feb 2019 13:36:53 +0530
Subject: f2fs: do not use mutex lock in atomic context

[ Upstream commit 9083977dabf3833298ddcd40dee28687f1e6b483 ]

Fix below warning coming because of using mutex lock in atomic context.

BUG: sleeping function called from invalid context at kernel/locking/mutex.c:98
in_atomic(): 1, irqs_disabled(): 0, pid: 585, name: sh
Preemption disabled at: __radix_tree_preload+0x28/0x130
Call trace:
 dump_backtrace+0x0/0x2b4
 show_stack+0x20/0x28
 dump_stack+0xa8/0xe0
 ___might_sleep+0x144/0x194
 __might_sleep+0x58/0x8c
 mutex_lock+0x2c/0x48
 f2fs_trace_pid+0x88/0x14c
 f2fs_set_node_page_dirty+0xd0/0x184

Do not use f2fs_radix_tree_insert() to avoid doing cond_resched() with
spin_lock() acquired.

Signed-off-by: Sahitya Tummala <stummala@codeaurora.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/trace.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index a1fcd00bbb2b..8ac1851a21c0 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -17,7 +17,7 @@
 #include "trace.h"
 
 static RADIX_TREE(pids, GFP_ATOMIC);
-static struct mutex pids_lock;
+static spinlock_t pids_lock;
 static struct last_io_info last_io;
 
 static inline void __print_last_io(void)
@@ -61,23 +61,29 @@ void f2fs_trace_pid(struct page *page)
 
 	set_page_private(page, (unsigned long)pid);
 
+retry:
 	if (radix_tree_preload(GFP_NOFS))
 		return;
 
-	mutex_lock(&pids_lock);
+	spin_lock(&pids_lock);
 	p = radix_tree_lookup(&pids, pid);
 	if (p == current)
 		goto out;
 	if (p)
 		radix_tree_delete(&pids, pid);
 
-	f2fs_radix_tree_insert(&pids, pid, current);
+	if (radix_tree_insert(&pids, pid, current)) {
+		spin_unlock(&pids_lock);
+		radix_tree_preload_end();
+		cond_resched();
+		goto retry;
+	}
 
 	trace_printk("%3x:%3x %4x %-16s\n",
 			MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
 			pid, current->comm);
 out:
-	mutex_unlock(&pids_lock);
+	spin_unlock(&pids_lock);
 	radix_tree_preload_end();
 }
 
@@ -122,7 +128,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush)
 
 void f2fs_build_trace_ios(void)
 {
-	mutex_init(&pids_lock);
+	spin_lock_init(&pids_lock);
 }
 
 #define PIDVEC_SIZE	128
@@ -150,7 +156,7 @@ void f2fs_destroy_trace_ios(void)
 	pid_t next_pid = 0;
 	unsigned int found;
 
-	mutex_lock(&pids_lock);
+	spin_lock(&pids_lock);
 	while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
 		unsigned idx;
 
@@ -158,5 +164,5 @@ void f2fs_destroy_trace_ios(void)
 		for (idx = 0; idx < found; idx++)
 			radix_tree_delete(&pids, pid[idx]);
 	}
-	mutex_unlock(&pids_lock);
+	spin_unlock(&pids_lock);
 }
-- 
cgit v1.2.3


From d609ecd887f8dd8288b9ff58042381a0ce801ec0 Mon Sep 17 00:00:00 2001
From: Shuriyc Chu <sureeju@gmail.com>
Date: Tue, 5 Mar 2019 15:41:56 -0800
Subject: fs/file.c: initialize init_files.resize_wait

[ Upstream commit 5704a06810682683355624923547b41540e2801a ]

(Taken from https://bugzilla.kernel.org/show_bug.cgi?id=200647)

'get_unused_fd_flags' in kthread cause kernel crash.  It works fine on
4.1, but causes crash after get 64 fds.  It also cause crash on
ubuntu1404/1604/1804, centos7.5, and the crash messages are almost the
same.

The crash message on centos7.5 shows below:

  start fd 61
  start fd 62
  start fd 63
  BUG: unable to handle kernel NULL pointer dereference at           (null)
  IP: __wake_up_common+0x2e/0x90
  PGD 0
  Oops: 0000 [#1] SMP
  Modules linked in: test(OE) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter devlink sunrpc kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd sg ppdev pcspkr virtio_balloon parport_pc parport i2c_piix4 joydev ip_tables xfs libcrc32c sr_mod cdrom sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi virtio_scsi virtio_console virtio_net cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32c_intel drm ata_piix serio_raw libata virtio_pci virtio_ring i2c_core
   virtio floppy dm_mirror dm_region_hash dm_log dm_mod
  CPU: 2 PID: 1820 Comm: test_fd Kdump: loaded Tainted: G           OE  ------------   3.10.0-862.3.3.el7.x86_64 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014
  task: ffff8e92b9431fa0 ti: ffff8e94247a0000 task.ti: ffff8e94247a0000
  RIP: 0010:__wake_up_common+0x2e/0x90
  RSP: 0018:ffff8e94247a2d18  EFLAGS: 00010086
  RAX: 0000000000000000 RBX: ffffffff9d09daa0 RCX: 0000000000000000
  RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffffff9d09daa0
  RBP: ffff8e94247a2d50 R08: 0000000000000000 R09: ffff8e92b95dfda8
  R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff9d09daa8
  R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000003
  FS:  0000000000000000(0000) GS:ffff8e9434e80000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000000 CR3: 000000017c686000 CR4: 00000000000207e0
  Call Trace:
    __wake_up+0x39/0x50
    expand_files+0x131/0x250
    __alloc_fd+0x47/0x170
    get_unused_fd_flags+0x30/0x40
    test_fd+0x12a/0x1c0 [test]
    kthread+0xd1/0xe0
    ret_from_fork_nospec_begin+0x21/0x21
  Code: 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 54 49 89 fc 49 83 c4 08 53 48 83 ec 10 48 8b 47 08 89 55 cc 4c 89 45 d0 <48> 8b 08 49 39 c4 48 8d 78 e8 4c 8d 69 e8 75 08 eb 3b 4c 89 ef
  RIP   __wake_up_common+0x2e/0x90
   RSP <ffff8e94247a2d18>
  CR2: 0000000000000000

This issue exists since CentOS 7.5 3.10.0-862 and CentOS 7.4
(3.10.0-693.21.1 ) is ok.  Root cause: the item 'resize_wait' is not
initialized before being used.

Reported-by: Richard Zhang <zhang.zijian@h3c.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 7ffd6e9d103d..780d29e58847 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -457,6 +457,7 @@ struct files_struct init_files = {
 		.full_fds_bits	= init_files.full_fds_bits_init,
 	},
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_files.file_lock),
+	.resize_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
 };
 
 static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
-- 
cgit v1.2.3


From 626d98bbdb30126719921f64dacaa3dc8ddc0c8a Mon Sep 17 00:00:00 2001
From: Louis Taylor <louis@kragniz.eu>
Date: Wed, 27 Feb 2019 22:25:15 +0000
Subject: cifs: use correct format characters

[ Upstream commit 259594bea574e515a148171b5cd84ce5cbdc028a ]

When compiling with -Wformat, clang emits the following warnings:

fs/cifs/smb1ops.c:312:20: warning: format specifies type 'unsigned
short' but the argument has type 'unsigned int' [-Wformat]
                         tgt_total_cnt, total_in_tgt);
                                        ^~~~~~~~~~~~

fs/cifs/cifs_dfs_ref.c:289:4: warning: format specifies type 'short'
but the argument has type 'int' [-Wformat]
                 ref->flags, ref->server_type);
                 ^~~~~~~~~~

fs/cifs/cifs_dfs_ref.c:289:16: warning: format specifies type 'short'
but the argument has type 'int' [-Wformat]
                 ref->flags, ref->server_type);
                             ^~~~~~~~~~~~~~~~

fs/cifs/cifs_dfs_ref.c:291:4: warning: format specifies type 'short'
but the argument has type 'int' [-Wformat]
                 ref->ref_flag, ref->path_consumed);
                 ^~~~~~~~~~~~~

fs/cifs/cifs_dfs_ref.c:291:19: warning: format specifies type 'short'
but the argument has type 'int' [-Wformat]
                 ref->ref_flag, ref->path_consumed);
                                ^~~~~~~~~~~~~~~~~~
The types of these arguments are unconditionally defined, so this patch
updates the format character to the correct ones for ints and unsigned
ints.

Link: https://github.com/ClangBuiltLinux/linux/issues/378

Signed-off-by: Louis Taylor <louis@kragniz.eu>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/cifs_dfs_ref.c | 4 ++--
 fs/cifs/smb1ops.c      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 6b61df117fd4..563e2f6268c3 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -271,9 +271,9 @@ static void dump_referral(const struct dfs_info3_param *ref)
 {
 	cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
 	cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
-	cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n",
+	cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
 		 ref->flags, ref->server_type);
-	cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n",
+	cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
 		 ref->ref_flag, ref->path_consumed);
 }
 
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 378151e09e91..47db8eb6cbcf 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
 	remaining = tgt_total_cnt - total_in_tgt;
 
 	if (remaining < 0) {
-		cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n",
+		cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n",
 			 tgt_total_cnt, total_in_tgt);
 		return -EPROTO;
 	}
-- 
cgit v1.2.3


From 4ab78f4d75c644cbe707a6eb858c3737c25c842f Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Fri, 15 Feb 2019 00:08:25 +0800
Subject: f2fs: fix to check inline_xattr_size boundary correctly

[ Upstream commit 500e0b28ecd3c5aade98f3c3a339d18dcb166bb6 ]

We use below condition to check inline_xattr_size boundary:

	if (!F2FS_OPTION(sbi).inline_xattr_size ||
		F2FS_OPTION(sbi).inline_xattr_size >=
				DEF_ADDRS_PER_INODE -
				F2FS_TOTAL_EXTRA_ATTR_SIZE -
				DEF_INLINE_RESERVED_SIZE -
				DEF_MIN_INLINE_SIZE)

There is there problems in that check:
- we should allow inline_xattr_size equaling to min size of inline
{data,dentry} area.
- F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on
different size unit, previous one is 4 bytes, latter one is 1 bytes.
- DEF_MIN_INLINE_SIZE only indicate min size of inline data area,
however, we need to consider min size of inline dentry area as well,
minimal inline dentry should at least contain two entries: '.' and
'..', so that min inline_dentry size is 40 bytes.

.bitmap		1 * 1 = 1
.reserved	1 * 1 = 1
.dentry		11 * 2 = 22
.filename	8 * 2 = 16
total		40

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/f2fs.h  |  1 -
 fs/f2fs/super.c | 13 +++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 42aef5c94927..a3ba20e5946f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -450,7 +450,6 @@ struct f2fs_flush_device {
 
 /* for inline stuff */
 #define DEF_INLINE_RESERVED_SIZE	1
-#define DEF_MIN_INLINE_SIZE		1
 static inline int get_extra_isize(struct inode *inode);
 static inline int get_inline_xattr_addrs(struct inode *inode);
 #define MAX_INLINE_DATA(inode)	(sizeof(__le32) *			\
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index c9639ef0e8d5..79370b7fa9d2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -822,12 +822,13 @@ static int parse_options(struct super_block *sb, char *options)
 					"set with inline_xattr option");
 			return -EINVAL;
 		}
-		if (!F2FS_OPTION(sbi).inline_xattr_size ||
-			F2FS_OPTION(sbi).inline_xattr_size >=
-					DEF_ADDRS_PER_INODE -
-					F2FS_TOTAL_EXTRA_ATTR_SIZE -
-					DEF_INLINE_RESERVED_SIZE -
-					DEF_MIN_INLINE_SIZE) {
+		if (F2FS_OPTION(sbi).inline_xattr_size <
+			sizeof(struct f2fs_xattr_header) / sizeof(__le32) ||
+			F2FS_OPTION(sbi).inline_xattr_size >
+			DEF_ADDRS_PER_INODE -
+			F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) -
+			DEF_INLINE_RESERVED_SIZE -
+			MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) {
 			f2fs_msg(sb, KERN_ERR,
 					"inline xattr size is out of range");
 			return -EINVAL;
-- 
cgit v1.2.3


From d579b4eae836e8f47e28ea373a579fdb9d4e04b5 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <linkinjeon@gmail.com>
Date: Tue, 22 Jan 2019 09:46:45 +0900
Subject: cifs: Accept validate negotiate if server return
 NT_STATUS_NOT_SUPPORTED

[ Upstream commit 969ae8e8d4ee54c99134d3895f2adf96047f5bee ]

Old windows version or Netapp SMB server will return
NT_STATUS_NOT_SUPPORTED since they do not allow or implement
FSCTL_VALIDATE_NEGOTIATE_INFO. The client should accept the response
provided it's properly signed.

See
https://blogs.msdn.microsoft.com/openspecification/2012/06/28/smb3-secure-dialect-negotiation/

and

MS-SMB2 validate negotiate response processing:
https://msdn.microsoft.com/en-us/library/hh880630.aspx

Samba client had already handled it.
https://bugzilla.samba.org/attachment.cgi?id=13285&action=edit

Signed-off-by: Namjae Jeon <linkinjeon@gmail.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/smb2pdu.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 3d0db37d64ad..71f32d983384 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -881,8 +881,14 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
 		FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
 		(char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen);
-
-	if (rc != 0) {
+	if (rc == -EOPNOTSUPP) {
+		/*
+		 * Old Windows versions or Netapp SMB server can return
+		 * not supported error. Client should accept it.
+		 */
+		cifs_dbg(VFS, "Server does not support validate negotiate\n");
+		return 0;
+	} else if (rc != 0) {
 		cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
 		rc = -EIO;
 		goto out_free_inbuf;
-- 
cgit v1.2.3


From 36a3219e617aa4650caf5dff858b98b95efc1d22 Mon Sep 17 00:00:00 2001
From: Yao Liu <yotta.liu@ucloud.cn>
Date: Mon, 28 Jan 2019 19:47:28 +0800
Subject: cifs: Fix NULL pointer dereference of devname

[ Upstream commit 68e2672f8fbd1e04982b8d2798dd318bf2515dd2 ]

There is a NULL pointer dereference of devname in strspn()

The oops looks something like:

  CIFS: Attempting to mount (null)
  BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  ...
  RIP: 0010:strspn+0x0/0x50
  ...
  Call Trace:
   ? cifs_parse_mount_options+0x222/0x1710 [cifs]
   ? cifs_get_volume_info+0x2f/0x80 [cifs]
   cifs_setup_volume_info+0x20/0x190 [cifs]
   cifs_get_volume_info+0x50/0x80 [cifs]
   cifs_smb3_do_mount+0x59/0x630 [cifs]
   ? ida_alloc_range+0x34b/0x3d0
   cifs_do_mount+0x11/0x20 [cifs]
   mount_fs+0x52/0x170
   vfs_kern_mount+0x6b/0x170
   do_mount+0x216/0xdc0
   ksys_mount+0x83/0xd0
   __x64_sys_mount+0x25/0x30
   do_syscall_64+0x65/0x220
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fix this by adding a NULL check on devname in cifs_parse_devname()

Signed-off-by: Yao Liu <yotta.liu@ucloud.cn>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/connect.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a5ea742654aa..f31339db45fd 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1347,6 +1347,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol)
 	const char *delims = "/\\";
 	size_t len;
 
+	if (unlikely(!devname || !*devname)) {
+		cifs_dbg(VFS, "Device name not specified.\n");
+		return -EINVAL;
+	}
+
 	/* make sure we have a valid UNC double delimiter prefix */
 	len = strspn(devname, delims);
 	if (len != 2)
-- 
cgit v1.2.3


From 6a817a7aed1c4074274118d6a27f8b094e2fe0dd Mon Sep 17 00:00:00 2001
From: luojiajun <luojiajun3@huawei.com>
Date: Fri, 1 Mar 2019 00:30:00 -0500
Subject: jbd2: fix invalid descriptor block checksum

[ Upstream commit 6e876c3dd205d30b0db6850e97a03d75457df007 ]

In jbd2_journal_commit_transaction(), if we are in abort mode,
we may flush the buffer without setting descriptor block checksum
by goto start_journal_io. Then fs is mounted,
jbd2_descriptor_block_csum_verify() failed.

[  271.379811] EXT4-fs (vdd): shut down requested (2)
[  271.381827] Aborting journal on device vdd-8.
[  271.597136] JBD2: Invalid checksum recovering block 22199 in log
[  271.598023] JBD2: recovery failed
[  271.598484] EXT4-fs (vdd): error loading journal

Fix this problem by keep setting descriptor block checksum if the
descriptor buffer is not NULL.

This checksum problem can be reproduced by xfstests generic/388.

Signed-off-by: luojiajun <luojiajun3@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/jbd2/commit.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 150cc030b4d7..65ea0355a4f6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -691,9 +691,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                            the last tag we set up. */
 
 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
-
-			jbd2_descriptor_block_csum_set(journal, descriptor);
 start_journal_io:
+			if (descriptor)
+				jbd2_descriptor_block_csum_set(journal,
+							descriptor);
+
 			for (i = 0; i < bufs; i++) {
 				struct buffer_head *bh = wbuf[i];
 				/*
-- 
cgit v1.2.3


From 83c395332fdf0810264c3903dd56d76cde1e083c Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Tue, 26 Feb 2019 11:51:50 +0100
Subject: fs: fix guard_bio_eod to check for real EOD errors

[ Upstream commit dce30ca9e3b676fb288c33c1f4725a0621361185 ]

guard_bio_eod() can truncate a segment in bio to allow it to do IO on
odd last sectors of a device.

It already checks if the IO starts past EOD, but it does not consider
the possibility of an IO request starting within device boundaries can
contain more than one segment past EOD.

In such cases, truncated_bytes can be bigger than PAGE_SIZE, and will
underflow bvec->bv_len.

Fix this by checking if truncated_bytes is lower than PAGE_SIZE.

This situation has been found on filesystems such as isofs and vfat,
which doesn't check the device size before mount, if the device is
smaller than the filesystem itself, a readahead on such filesystem,
which spans EOD, can trigger this situation, leading a call to
zero_user() with a wrong size possibly corrupting memory.

I didn't see any crash, or didn't let the system run long enough to
check if memory corruption will be hit somewhere, but adding
instrumentation to guard_bio_end() to check truncated_bytes size, was
enough to see the error.

The following script can trigger the error.

MNT=/mnt
IMG=./DISK.img
DEV=/dev/loop0

mkfs.vfat $IMG
mount $IMG $MNT
cp -R /etc $MNT &> /dev/null
umount $MNT

losetup -D

losetup --find --show --sizelimit 16247280 $IMG
mount $DEV $MNT

find $MNT -type f -exec cat {} + >/dev/null

Kudos to Eric Sandeen for coming up with the reproducer above

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/buffer.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index c083c4b3c1e7..a550e0d8e965 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3027,6 +3027,13 @@ void guard_bio_eod(int op, struct bio *bio)
 	/* Uhhuh. We've got a bio that straddles the device size! */
 	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
 
+	/*
+	 * The bio contains more than one segment which spans EOD, just return
+	 * and let IO layer turn it into an EIO
+	 */
+	if (truncated_bytes > bvec->bv_len)
+		return;
+
 	/* Truncate the bio.. */
 	bio->bi_iter.bi_size -= truncated_bytes;
 	bvec->bv_len -= truncated_bytes;
-- 
cgit v1.2.3


From dcedd37957de3e54a194a044befefd7687e023a1 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 25 Jan 2019 07:55:27 +0800
Subject: btrfs: qgroup: Make qgroup async transaction commit more aggressive

[ Upstream commit f5fef4593653dfa2a865c485bb81415de51d5c99 ]

[BUG]
Btrfs qgroup will still hit EDQUOT under the following case:

  $ dev=/dev/test/test
  $ mnt=/mnt/btrfs
  $ umount $mnt &> /dev/null
  $ umount $dev &> /dev/null

  $ mkfs.btrfs -f $dev
  $ mount $dev $mnt -o nospace_cache

  $ btrfs subv create $mnt/subv
  $ btrfs quota enable $mnt
  $ btrfs quota rescan -w $mnt
  $ btrfs qgroup limit -e 1G $mnt/subv

  $ fallocate -l 900M $mnt/subv/padding
  $ sync

  $ rm $mnt/subv/padding

  # Hit EDQUOT
  $ xfs_io -f -c "pwrite 0 512M" $mnt/subv/real_file

[CAUSE]
Since commit a514d63882c3 ("btrfs: qgroup: Commit transaction in advance
to reduce early EDQUOT"), btrfs is not forced to commit transaction to
reclaim more quota space.

Instead, we just check pertrans metadata reservation against some
threshold and try to do asynchronously transaction commit.

However in above case, the pertrans metadata reservation is pretty small
thus it will never trigger asynchronous transaction commit.

[FIX]
Instead of only accounting pertrans metadata reservation, we calculate
how much free space we have, and if there isn't much free space left,
commit transaction asynchronously to try to free some space.

This may slow down the fs when we have less than 32M free qgroup space,
but should reduce a lot of false EDQUOT, so the cost should be
acceptable.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/qgroup.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e1fcb28ad4cc..e46e83e87600 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2427,16 +2427,15 @@ out:
 /*
  * Two limits to commit transaction in advance.
  *
- * For RATIO, it will be 1/RATIO of the remaining limit
- * (excluding data and prealloc meta) as threshold.
+ * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
  * For SIZE, it will be in byte unit as threshold.
  */
-#define QGROUP_PERTRANS_RATIO		32
-#define QGROUP_PERTRANS_SIZE		SZ_32M
+#define QGROUP_FREE_RATIO		32
+#define QGROUP_FREE_SIZE		SZ_32M
 static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
 				const struct btrfs_qgroup *qg, u64 num_bytes)
 {
-	u64 limit;
+	u64 free;
 	u64 threshold;
 
 	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
@@ -2455,20 +2454,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
 	 */
 	if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
 			      BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
-		if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
-			limit = qg->max_excl;
-		else
-			limit = qg->max_rfer;
-		threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] -
-			    qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) /
-			    QGROUP_PERTRANS_RATIO;
-		threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE);
+		if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+			free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
+			threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
+					  QGROUP_FREE_SIZE);
+		} else {
+			free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
+			threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
+					  QGROUP_FREE_SIZE);
+		}
 
 		/*
 		 * Use transaction_kthread to commit transaction, so we no
 		 * longer need to bother nested transaction nor lock context.
 		 */
-		if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold)
+		if (free < threshold)
 			btrfs_commit_transaction_locksafe(fs_info);
 	}
 
-- 
cgit v1.2.3


From c26d61ea90db085ad4871b5987f1cf033ce6cfb0 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno <aurelien@aurel32.net>
Date: Thu, 6 Dec 2018 20:05:34 +0100
Subject: vfs: fix preadv64v2 and pwritev64v2 compat syscalls with offset == -1

[ Upstream commit cc4b1242d7e3b42eed73881fc749944146493e4f ]

The preadv2 and pwritev2 syscalls are supposed to emulate the readv and
writev syscalls when offset == -1. Therefore the compat code should
check for offset before calling do_compat_preadv64 and
do_compat_pwritev64. This is the case for the preadv2 and pwritev2
syscalls, but handling of offset == -1 is missing in their 64-bit
equivalent.

This patch fixes that, calling do_compat_readv and do_compat_writev when
offset == -1. This fixes the following glibc tests on x32:
 - misc/tst-preadvwritev2
 - misc/tst-preadvwritev64v2

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: H.J. Lu <hjl.tools@gmail.com>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/read_write.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 8a2737f0d61d..562974a0616c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1241,6 +1241,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
 		const struct compat_iovec __user *,vec,
 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
+	if (pos == -1)
+		return do_compat_readv(fd, vec, vlen, flags);
+
 	return do_compat_preadv64(fd, vec, vlen, pos, flags);
 }
 #endif
@@ -1347,6 +1350,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
 		const struct compat_iovec __user *,vec,
 		unsigned long, vlen, loff_t, pos, rwf_t, flags)
 {
+	if (pos == -1)
+		return do_compat_writev(fd, vec, vlen, flags);
+
 	return do_compat_pwritev64(fd, vec, vlen, pos, flags);
 }
 #endif
-- 
cgit v1.2.3


From 9154420173d56cedc5739fdc3a57799e94ae8238 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 Feb 2019 16:27:14 -0500
Subject: jbd2: fix race when writing superblock

[ Upstream commit 538bcaa6261b77e71d37f5596c33127c1a3ec3f7 ]

The jbd2 superblock is lockless now, so there is probably a race
condition between writing it so disk and modifing contents of it, which
may lead to checksum error. The following race is the one case that we
have captured.

jbd2                                fsstress
jbd2_journal_commit_transaction
 jbd2_journal_update_sb_log_tail
  jbd2_write_superblock
   jbd2_superblock_csum_set         jbd2_journal_revoke
                                     jbd2_journal_set_features(revork)
                                     modify superblock
   submit_bh(checksum incorrect)

Fix this by locking the buffer head before modifing it.  We always
write the jbd2 superblock after we modify it, so this just means
calling the lock_buffer() a little earlier.

This checksum corruption problem can be reproduced by xfstests
generic/475.

Reported-by: zhangyi (F) <yi.zhang@huawei.com>
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/jbd2/journal.c | 52 +++++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8ef6b6daaa7a..88f2a49338a1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1356,6 +1356,10 @@ static int journal_reset(journal_t *journal)
 	return jbd2_journal_start_thread(journal);
 }
 
+/*
+ * This function expects that the caller will have locked the journal
+ * buffer head, and will return with it unlocked
+ */
 static int jbd2_write_superblock(journal_t *journal, int write_flags)
 {
 	struct buffer_head *bh = journal->j_sb_buffer;
@@ -1365,7 +1369,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 	trace_jbd2_write_superblock(journal, write_flags);
 	if (!(journal->j_flags & JBD2_BARRIER))
 		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
-	lock_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the journal
@@ -1424,6 +1427,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
 		  tail_block, tail_tid);
 
+	lock_buffer(journal->j_sb_buffer);
 	sb->s_sequence = cpu_to_be32(tail_tid);
 	sb->s_start    = cpu_to_be32(tail_block);
 
@@ -1454,18 +1458,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 	journal_superblock_t *sb = journal->j_superblock;
 
 	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
-	read_lock(&journal->j_state_lock);
-	/* Is it already empty? */
-	if (sb->s_start == 0) {
-		read_unlock(&journal->j_state_lock);
+	lock_buffer(journal->j_sb_buffer);
+	if (sb->s_start == 0) {		/* Is it already empty? */
+		unlock_buffer(journal->j_sb_buffer);
 		return;
 	}
+
 	jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
 		  journal->j_tail_sequence);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
 	sb->s_start    = cpu_to_be32(0);
-	read_unlock(&journal->j_state_lock);
 
 	jbd2_write_superblock(journal, write_op);
 
@@ -1488,9 +1491,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
 	journal_superblock_t *sb = journal->j_superblock;
 	int errcode;
 
-	read_lock(&journal->j_state_lock);
+	lock_buffer(journal->j_sb_buffer);
 	errcode = journal->j_errno;
-	read_unlock(&journal->j_state_lock);
 	if (errcode == -ESHUTDOWN)
 		errcode = 0;
 	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
@@ -1894,28 +1896,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 
 	sb = journal->j_superblock;
 
+	/* Load the checksum driver if necessary */
+	if ((journal->j_chksum_driver == NULL) &&
+	    INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+		if (IS_ERR(journal->j_chksum_driver)) {
+			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+			journal->j_chksum_driver = NULL;
+			return 0;
+		}
+		/* Precompute checksum seed for all metadata */
+		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+						   sizeof(sb->s_uuid));
+	}
+
+	lock_buffer(journal->j_sb_buffer);
+
 	/* If enabling v3 checksums, update superblock */
 	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
 		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
 		sb->s_feature_compat &=
 			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
-
-		/* Load the checksum driver */
-		if (journal->j_chksum_driver == NULL) {
-			journal->j_chksum_driver = crypto_alloc_shash("crc32c",
-								      0, 0);
-			if (IS_ERR(journal->j_chksum_driver)) {
-				printk(KERN_ERR "JBD2: Cannot load crc32c "
-				       "driver.\n");
-				journal->j_chksum_driver = NULL;
-				return 0;
-			}
-
-			/* Precompute checksum seed for all metadata */
-			journal->j_csum_seed = jbd2_chksum(journal, ~0,
-							   sb->s_uuid,
-							   sizeof(sb->s_uuid));
-		}
 	}
 
 	/* If enabling v1 checksums, downgrade superblock */
@@ -1927,6 +1928,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 	sb->s_feature_compat    |= cpu_to_be32(compat);
 	sb->s_feature_ro_compat |= cpu_to_be32(ro);
 	sb->s_feature_incompat  |= cpu_to_be32(incompat);
+	unlock_buffer(journal->j_sb_buffer);
 
 	return 1;
 #undef COMPAT_FEATURE_ON
-- 
cgit v1.2.3


From dbeca415575f2cbeb4594ab63fa83d97987994c1 Mon Sep 17 00:00:00 2001
From: Sheng Yong <shengyong1@huawei.com>
Date: Tue, 15 Jan 2019 20:02:15 +0000
Subject: f2fs: UBSAN: set boolean value iostat_enable correctly

[ Upstream commit ac92985864e187a1735502f6a02f54eaa655b2aa ]

When setting /sys/fs/f2fs/<DEV>/iostat_enable with non-bool value, UBSAN
reports the following warning.

[ 7562.295484] ================================================================================
[ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10
[ 7562.297651] load of value 64 is not a valid value for type '_Bool'
[ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79
[ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[ 7562.298662] Call Trace:
[ 7562.298760]  dump_stack+0x46/0x5b
[ 7562.298811]  ubsan_epilogue+0x9/0x40
[ 7562.298830]  __ubsan_handle_load_invalid_value+0x72/0x90
[ 7562.298863]  f2fs_file_write_iter+0x29f/0x3f0
[ 7562.298905]  __vfs_write+0x115/0x160
[ 7562.298922]  vfs_write+0xa7/0x190
[ 7562.298934]  ksys_write+0x50/0xc0
[ 7562.298973]  do_syscall_64+0x4a/0xe0
[ 7562.298992]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 7562.299001] RIP: 0033:0x7fa45ec19c00
[ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24
[ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00
[ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001
[ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000
[ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400
[ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000
[ 7562.299091] ================================================================================

So, if iostat_enable is enabled, set its value as true.

Signed-off-by: Sheng Yong <shengyong1@huawei.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/sysfs.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 81c0e5337443..98887187af4c 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -273,10 +273,16 @@ out:
 		return count;
 	}
 
-	*ui = t;
 
-	if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
-		f2fs_reset_iostat(sbi);
+	if (!strcmp(a->attr.name, "iostat_enable")) {
+		sbi->iostat_enable = !!t;
+		if (!sbi->iostat_enable)
+			f2fs_reset_iostat(sbi);
+		return count;
+	}
+
+	*ui = (unsigned int)t;
+
 	return count;
 }
 
-- 
cgit v1.2.3


From 16515acd5bc33e86985fa93ee1b894815e52cdc5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 26 Mar 2019 10:49:56 +0000
Subject: Btrfs: do not allow trimming when a fs is mounted with the
 nologreplay option

commit f35f06c35560a86e841631f0243b83a984dc11a9 upstream.

Whan a filesystem is mounted with the nologreplay mount option, which
requires it to be mounted in RO mode as well, we can not allow discard on
free space inside block groups, because log trees refer to extents that
are not pinned in a block group's free space cache (pinning the extents is
precisely the first phase of replaying a log tree).

So do not allow the fitrim ioctl to do anything when the filesystem is
mounted with the nologreplay option, because later it can be mounted RW
without that option, which causes log replay to happen and result in
either a failure to replay the log trees (leading to a mount failure), a
crash or some silent corruption.

Reported-by: Darrick J. Wong <darrick.wong@oracle.com>
Fixes: 96da09192cda ("btrfs: Introduce new mount option to disable tree log replay")
CC: stable@vger.kernel.org # 4.9+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ioctl.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8bf9cce11213..0eb333c62fe4 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -496,6 +496,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	/*
+	 * If the fs is mounted with nologreplay, which requires it to be
+	 * mounted in RO mode as well, we can not allow discard on free space
+	 * inside block groups, because log trees refer to extents that are not
+	 * pinned in a block group's free space cache (pinning the extents is
+	 * precisely the first phase of replaying a log tree).
+	 */
+	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
+		return -EROFS;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
 				dev_list) {
-- 
cgit v1.2.3


From fbfbb996d58ebf5ddd0c2ae36efcb7015dc50e95 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 2 Apr 2019 18:07:38 +0800
Subject: btrfs: prop: fix zstd compression parameter validation

commit 50398fde997f6be8faebdb5f38e9c9c467370f51 upstream.

We let pass zstd compression parameter even if it is not fully valid.
For example:

  $ btrfs prop set /btrfs compression zst
  $ btrfs prop get /btrfs compression
     compression=zst

zlib and lzo are fine.

Fix it by checking the correct prefix length.

Fixes: 5c1aab1dd544 ("btrfs: Add zstd support")
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/props.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dc6140013ae8..fd19f3078566 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode,
 		btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
 	} else if (!strncmp("zlib", value, 4)) {
 		type = BTRFS_COMPRESS_ZLIB;
-	} else if (!strncmp("zstd", value, len)) {
+	} else if (!strncmp("zstd", value, 4)) {
 		type = BTRFS_COMPRESS_ZSTD;
 		btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
 	} else {
-- 
cgit v1.2.3


From 54fb5c9da6cd665ddee9bcc5645b5f428dbfae2c Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Tue, 2 Apr 2019 18:07:40 +0800
Subject: btrfs: prop: fix vanished compression property after failed set

commit 272e5326c7837697882ce3162029ba893059b616 upstream.

The compression property resets to NULL, instead of the old value if we
fail to set the new compression parameter.

  $ btrfs prop get /btrfs compression
    compression=lzo
  $ btrfs prop set /btrfs compression zli
    ERROR: failed to set compression for /btrfs: Invalid argument
  $ btrfs prop get /btrfs compression

This is because the compression property ->validate() is successful for
'zli' as the strncmp() used the length passed from the userspace.

Fix it by using the expected string length in strncmp().

Fixes: 63541927c8d1 ("Btrfs: add support for inode properties")
Fixes: 5c1aab1dd544 ("btrfs: Add zstd support")
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/props.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index fd19f3078566..61d22a56c0ba 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
 
 static int prop_compression_validate(const char *value, size_t len)
 {
-	if (!strncmp("lzo", value, len))
+	if (!strncmp("lzo", value, 3))
 		return 0;
-	else if (!strncmp("zlib", value, len))
+	else if (!strncmp("zlib", value, 4))
 		return 0;
-	else if (!strncmp("zstd", value, len))
+	else if (!strncmp("zstd", value, 4))
 		return 0;
 
 	return -EINVAL;
-- 
cgit v1.2.3


From 543bb48dc48b4b65a51d91c9680afbdb573a9fa7 Mon Sep 17 00:00:00 2001
From: Jason Yan <yanaijie@huawei.com>
Date: Fri, 12 Apr 2019 10:09:16 +0800
Subject: block: fix the return errno for direct IO

commit a89afe58f1a74aac768a5eb77af95ef4ee15beaa upstream.

If the last bio returned is not dio->bio, the status of the bio will
not assigned to dio->bio if it is error. This will cause the whole IO
status wrong.

    ksoftirqd/21-117   [021] ..s.  4017.966090:   8,0    C   N 4883648 [0]
          <idle>-0     [018] ..s.  4017.970888:   8,0    C  WS 4924800 + 1024 [0]
          <idle>-0     [018] ..s.  4017.970909:   8,0    D  WS 4935424 + 1024 [<idle>]
          <idle>-0     [018] ..s.  4017.970924:   8,0    D  WS 4936448 + 321 [<idle>]
    ksoftirqd/21-117   [021] ..s.  4017.995033:   8,0    C   R 4883648 + 336 [65475]
    ksoftirqd/21-117   [021] d.s.  4018.001988: myprobe1: (blkdev_bio_end_io+0x0/0x168) bi_status=7
    ksoftirqd/21-117   [021] d.s.  4018.001992: myprobe: (aio_complete_rw+0x0/0x148) x0=0xffff802f2595ad80 res=0x12a000 res2=0x0

We always have to assign bio->bi_status to dio->bio.bi_status because we
will only check dio->bio.bi_status when we return the whole IO to
the upper layer.

Fixes: 542ff7bf18c6 ("block: new direct I/O implementation")
Cc: stable@vger.kernel.org
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/block_dev.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdbb888a8d4a..1c25dae083a8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -296,10 +296,10 @@ static void blkdev_bio_end_io(struct bio *bio)
 	struct blkdev_dio *dio = bio->bi_private;
 	bool should_dirty = dio->should_dirty;
 
-	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
-		if (bio->bi_status && !dio->bio.bi_status)
-			dio->bio.bi_status = bio->bi_status;
-	} else {
+	if (bio->bi_status && !dio->bio.bi_status)
+		dio->bio.bi_status = bio->bi_status;
+
+	if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
 		if (!dio->is_sync) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
-- 
cgit v1.2.3


From ca306c17d2edcc8aa3bf1724a5cb1ecefc31ef3b Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Date: Sat, 2 Mar 2019 09:17:32 +0800
Subject: inotify: Fix fsnotify_mark refcount leak in
 inotify_update_existing_watch()

[ Upstream commit 62c9d2674b31d4c8a674bee86b7edc6da2803aea ]

Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for
inotify_add_watch()") forgot to call fsnotify_put_mark() with
IN_MASK_CREATE after fsnotify_find_mark()

Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()")
Signed-off-by: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/notify/inotify/inotify_user.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 780bba695453..97a51690338e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 	if (!fsn_mark)
 		return -ENOENT;
-	else if (create)
-		return -EEXIST;
+	else if (create) {
+		ret = -EEXIST;
+		goto out;
+	}
 
 	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
@@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	/* return the wd */
 	ret = i_mark->wd;
 
+out:
 	/* match the get from fsnotify_find_mark() */
 	fsnotify_put_mark(fsn_mark);
 
-- 
cgit v1.2.3


From 90a1327e4ed3e2c840b86f1f35d5aec712c652c4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 14 Mar 2019 23:46:05 -0400
Subject: ext4: avoid panic during forced reboot

[ Upstream commit 1dc1097ff60e4105216da7cd0aa99032b039a994 ]

When admin calls "reboot -f" - i.e., does a hard system reboot by
directly calling reboot(2) - ext4 filesystem mounted with errors=panic
can panic the system. This happens because the underlying device gets
disabled without unmounting the filesystem and thus some syscall running
in parallel to reboot(2) can result in the filesystem getting IO errors.

This is somewhat surprising to the users so try improve the behavior by
switching to errors=remount-ro behavior when the system is running
reboot(2).

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ext4/super.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a1cf7d68b4f0..abba7ece78e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 	spin_unlock(&sbi->s_md_lock);
 }
 
+static bool system_going_down(void)
+{
+	return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
+		|| system_state == SYSTEM_RESTART;
+}
+
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
  *
@@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb)
 		if (journal)
 			jbd2_journal_abort(journal, -EIO);
 	}
-	if (test_opt(sb, ERRORS_RO)) {
+	/*
+	 * We force ERRORS_RO behavior when system is rebooting. Otherwise we
+	 * could panic during 'reboot -f' as the underlying device got already
+	 * disabled.
+	 */
+	if (test_opt(sb, ERRORS_RO) || system_going_down()) {
 		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		/*
 		 * Make sure updated value of ->s_mount_flags will be visible
@@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb)
 		 */
 		smp_wmb();
 		sb->s_flags |= SB_RDONLY;
-	}
-	if (test_opt(sb, ERRORS_PANIC)) {
+	} else if (test_opt(sb, ERRORS_PANIC)) {
 		if (EXT4_SB(sb)->s_journal &&
 		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 			return;
-- 
cgit v1.2.3


From f5a94fd3b375f44bcb27b81595b22f2f464a47d2 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Fri, 15 Mar 2019 00:15:32 -0400
Subject: ext4: add missing brelse() in add_new_gdb_meta_bg()

[ Upstream commit d64264d6218e6892edd832dc3a5a5857c2856c53 ]

Currently in add_new_gdb_meta_bg() there is a missing brelse of gdb_bh
in case ext4_journal_get_write_access() fails.
Additionally kvfree() is missing in the same error path. Fix it by
moving the ext4_journal_get_write_access() before the ext4 sb update as
Ted suggested and release n_group_desc and gdb_bh in case it fails.

Fixes: 61a9c11e5e7a ("ext4: add missing brelse() add_new_gdb_meta_bg()'s error path")
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ext4/resize.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3d9b18505c0c..90061c3d048b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
 	memcpy(n_group_desc, o_group_desc,
 	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
 	n_group_desc[gdb_num] = gdb_bh;
+
+	BUFFER_TRACE(gdb_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gdb_bh);
+	if (err) {
+		kvfree(n_group_desc);
+		brelse(gdb_bh);
+		return err;
+	}
+
 	EXT4_SB(sb)->s_group_desc = n_group_desc;
 	EXT4_SB(sb)->s_gdb_count++;
 	kvfree(o_group_desc);
-	BUFFER_TRACE(gdb_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gdb_bh);
 	return err;
 }
 
-- 
cgit v1.2.3


From a793860c0f52f28d0d5f052a20981057eb9fb9f1 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Fri, 15 Mar 2019 00:22:28 -0400
Subject: ext4: report real fs size after failed resize

[ Upstream commit 6c7328400e0488f7d49e19e02290ba343b6811b2 ]

Currently when the file system resize using ext4_resize_fs() fails it
will report into log that "resized filesystem to <requested block
count>".  However this may not be true in the case of failure.  Use the
current block count as returned by ext4_blocks_count() to report the
block count.

Additionally, report a warning that "error occurred during file system
resize"

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ext4/resize.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 90061c3d048b..e7ae26e36c9c 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -2080,6 +2080,10 @@ out:
 		free_flex_gd(flex_gd);
 	if (resize_inode != NULL)
 		iput(resize_inode);
-	ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count);
+	if (err)
+		ext4_warning(sb, "error (%d) occurred during "
+			     "file system resize", err);
+	ext4_msg(sb, KERN_INFO, "resized filesystem to %llu",
+		 ext4_blocks_count(es));
 	return err;
 }
-- 
cgit v1.2.3


From 40276e4e2fd050e7bed856b6278770347179ae37 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Sun, 17 Mar 2019 15:58:38 -0500
Subject: fix incorrect error code mapping for OBJECTID_NOT_FOUND

[ Upstream commit 85f9987b236cf46e06ffdb5c225cf1f3c0acb789 ]

It was mapped to EIO which can be confusing when user space
queries for an object GUID for an object for which the server
file system doesn't support (or hasn't saved one).

As Amir Goldstein suggested this is similar to ENOATTR
(equivalently ENODATA in Linux errno definitions) so
changing NT STATUS code mapping for OBJECTID_NOT_FOUND
to ENODATA.

Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/smb2maperror.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index c3ae8c1d6089..18814f1d67d9 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
 	{STATUS_UNFINISHED_CONTEXT_DELETED, -EIO,
 	"STATUS_UNFINISHED_CONTEXT_DELETED"},
 	{STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"},
-	{STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"},
+	/* Note that ENOATTTR and ENODATA are the same errno */
+	{STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"},
 	{STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"},
 	{STATUS_WRONG_CREDENTIAL_HANDLE, -EIO,
 	"STATUS_WRONG_CREDENTIAL_HANDLE"},
-- 
cgit v1.2.3


From 83e3e89d66393f0209d4d236f0663a0de7b1927f Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@redhat.com>
Date: Fri, 8 Mar 2019 11:05:08 +0800
Subject: x86/gart: Exclude GART aperture from kcore

[ Upstream commit ffc8599aa9763f39f6736a79da4d1575e7006f9a ]

On machines where the GART aperture is mapped over physical RAM,
/proc/kcore contains the GART aperture range. Accessing the GART range via
/proc/kcore results in a kernel crash.

vmcore used to have the same issue, until it was fixed with commit
2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")', leveraging
existing hook infrastructure in vmcore to let /proc/vmcore return zeroes
when attempting to read the aperture region, and so it won't read from the
actual memory.

Apply the same workaround for kcore. First implement the same hook
infrastructure for kcore, then reuse the hook functions introduced in the
previous vmcore fix. Just with some minor adjustment, rename some functions
for more general usage, and simplify the hook infrastructure a bit as there
is no module usage yet.

Suggested-by: Baoquan He <bhe@redhat.com>
Signed-off-by: Kairui Song <kasong@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Dave Young <dyoung@redhat.com>
Link: https://lkml.kernel.org/r/20190308030508.13548-1-kasong@redhat.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/proc/kcore.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index d297fe4472a9..d0137e3e585e 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head);
 static DECLARE_RWSEM(kclist_lock);
 static int kcore_need_update = 1;
 
+/*
+ * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
+ * Same as oldmem_pfn_is_ram in vmcore
+ */
+static int (*mem_pfn_is_ram)(unsigned long pfn);
+
+int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn))
+{
+	if (mem_pfn_is_ram)
+		return -EBUSY;
+	mem_pfn_is_ram = fn;
+	return 0;
+}
+
+static int pfn_is_ram(unsigned long pfn)
+{
+	if (mem_pfn_is_ram)
+		return mem_pfn_is_ram(pfn);
+	else
+		return 1;
+}
+
 /* This doesn't grab kclist_lock, so it should only be used at init time. */
 void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
 		       int type)
@@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				goto out;
 			}
 			m = NULL;	/* skip the list anchor */
+		} else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) {
+			if (clear_user(buffer, tsz)) {
+				ret = -EFAULT;
+				goto out;
+			}
 		} else if (m->type == KCORE_VMALLOC) {
 			vread(buf, (char *)start, tsz);
 			/* we have to zero-fill user buffer even if no read */
-- 
cgit v1.2.3


From 6fd66bec6d6a7e52bcb969dc35b67103ed717664 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sat, 23 Mar 2019 12:10:29 -0400
Subject: ext4: prohibit fstrim in norecovery mode

[ Upstream commit 18915b5873f07e5030e6fb108a050fa7c71c59fb ]

The ext4 fstrim implementation uses the block bitmaps to find free space
that can be discarded.  If we haven't replayed the journal, the bitmaps
will be stale and we absolutely *cannot* use stale metadata to zap the
underlying storage.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ext4/ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2e76fb55d94a..5f24fdc140ad 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -999,6 +999,13 @@ resizefs_out:
 		if (!blk_queue_discard(q))
 			return -EOPNOTSUPP;
 
+		/*
+		 * We haven't replayed the journal, so we cannot use our
+		 * block-bitmap-guided storage zapping commands.
+		 */
+		if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb))
+			return -EROFS;
+
 		if (copy_from_user(&range, (struct fstrim_range __user *)arg,
 		    sizeof(range)))
 			return -EFAULT;
-- 
cgit v1.2.3


From f9368366b4d190bacad99601fc5a026e13edc368 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Tue, 4 Sep 2018 03:52:17 +0800
Subject: f2fs: fix to avoid NULL pointer dereference on se->discard_map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit 7d20c8abb2edcf962ca857d51f4d0f9cd4b19053 ]

https://bugzilla.kernel.org/show_bug.cgi?id=200951

These is a NULL pointer dereference issue reported in bugzilla:

Hi,
in the setup there is a SATA SSD connected to a SATA-to-USB bridge.

The disc is "Samsung SSD 850 PRO 256G" which supports TRIM.
There are four partitions:
 sda1: FAT  /boot
 sda2: F2FS /
 sda3: F2FS /home
 sda4: F2FS

The bridge is ASMT1153e which uses the "uas" driver.
There is no TRIM pass-through, so, when mounting it reports:
 mounting with "discard" option, but the device does not support discard

The USB host is USB3.0 and UASP capable. It is the one on RK3399.

Given this everything works fine, except there is no TRIM support.

In order to enable TRIM a new UDEV rule is added [1]:
 /etc/udev/rules.d/10-sata-bridge-trim.rules:
 ACTION=="add|change", ATTRS{idVendor}=="174c", ATTRS{idProduct}=="55aa", SUBSYSTEM=="scsi_disk", ATTR{provisioning_mode}="unmap"
After reboot any F2FS write hangs forever and dmesg reports:
 Unable to handle kernel NULL pointer dereference

Also tested on a x86_64 system: works fine even with TRIM enabled.
 same disc
 same bridge
 different usb host controller
 different cpu architecture
 not root filesystem

Regards,
  Vicenç.

[1] Post #5 in https://bbs.archlinux.org/viewtopic.php?id=236280

 Unable to handle kernel NULL pointer dereference at virtual address 000000000000003e
 Mem abort info:
   ESR = 0x96000004
   Exception class = DABT (current EL), IL = 32 bits
   SET = 0, FnV = 0
   EA = 0, S1PTW = 0
 Data abort info:
   ISV = 0, ISS = 0x00000004
   CM = 0, WnR = 0
 user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000626e3122
 [000000000000003e] pgd=0000000000000000
 Internal error: Oops: 96000004 [#1] SMP
 Modules linked in: overlay snd_soc_hdmi_codec rc_cec dw_hdmi_i2s_audio dw_hdmi_cec snd_soc_simple_card snd_soc_simple_card_utils snd_soc_rockchip_i2s rockchip_rga snd_soc_rockchip_pcm rockchipdrm videobuf2_dma_sg v4l2_mem2mem rtc_rk808 videobuf2_memops analogix_dp videobuf2_v4l2 videobuf2_common dw_hdmi dw_wdt cec rc_core videodev drm_kms_helper media drm rockchip_thermal rockchip_saradc realtek drm_panel_orientation_quirks syscopyarea sysfillrect sysimgblt fb_sys_fops dwmac_rk stmmac_platform stmmac pwm_bl squashfs loop crypto_user gpio_keys hid_kensington
 CPU: 5 PID: 957 Comm: nvim Not tainted 4.19.0-rc1-1-ARCH #1
 Hardware name: Sapphire-RK3399 Board (DT)
 pstate: 00000005 (nzcv daif -PAN -UAO)
 pc : update_sit_entry+0x304/0x4b0
 lr : update_sit_entry+0x108/0x4b0
 sp : ffff00000ca13bd0
 x29: ffff00000ca13bd0 x28: 000000000000003e
 x27: 0000000000000020 x26: 0000000000080000
 x25: 0000000000000048 x24: ffff8000ebb85cf8
 x23: 0000000000000253 x22: 00000000ffffffff
 x21: 00000000000535f2 x20: 00000000ffffffdf
 x19: ffff8000eb9e6800 x18: ffff8000eb9e6be8
 x17: 0000000007ce6926 x16: 000000001c83ffa8
 x15: 0000000000000000 x14: ffff8000f602df90
 x13: 0000000000000006 x12: 0000000000000040
 x11: 0000000000000228 x10: 0000000000000000
 x9 : 0000000000000000 x8 : 0000000000000000
 x7 : 00000000000535f2 x6 : ffff8000ebff3440
 x5 : ffff8000ebff3440 x4 : ffff8000ebe3a6c8
 x3 : 00000000ffffffff x2 : 0000000000000020
 x1 : 0000000000000000 x0 : ffff8000eb9e5800
 Process nvim (pid: 957, stack limit = 0x0000000063a78320)
 Call trace:
  update_sit_entry+0x304/0x4b0
  f2fs_invalidate_blocks+0x98/0x140
  truncate_node+0x90/0x400
  f2fs_remove_inode_page+0xe8/0x340
  f2fs_evict_inode+0x2b0/0x408
  evict+0xe0/0x1e0
  iput+0x160/0x260
  do_unlinkat+0x214/0x298
  __arm64_sys_unlinkat+0x3c/0x68
  el0_svc_handler+0x94/0x118
  el0_svc+0x8/0xc
 Code: f9400800 b9488400 36080140 f9400f01 (387c4820)
 ---[ end trace a0f21a307118c477 ]---

The reason is it is possible to enable discard flag on block queue via
UDEV, but during mount, f2fs will initialize se->discard_map only if
this flag is set, once the flag is set after mount, f2fs may dereference
NULL pointer on se->discard_map.

So this patch does below changes to fix this issue:
- initialize and update se->discard_map all the time.
- don't clear DISCARD option if device has no QUEUE_FLAG_DISCARD flag
during mount.
- don't issue small discard on zoned block device.
- introduce some functions to enhance the readability.

Signed-off-by: Chao Yu <yuchao0@huawei.com>
Tested-by: Vicente Bergas <vicencb@gmail.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/debug.c   |  3 +--
 fs/f2fs/f2fs.h    | 15 ++++++++++---
 fs/f2fs/file.c    |  2 +-
 fs/f2fs/segment.c | 65 ++++++++++++++++++++++++-------------------------------
 fs/f2fs/super.c   | 16 ++++----------
 5 files changed, 46 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 214a968962a1..ebe649d9793c 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -190,8 +190,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
-	if (f2fs_discard_en(sbi))
-		si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
 	si->base_mem += SIT_VBLOCK_MAP_SIZE;
 	if (sbi->segs_per_sec > 1)
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a3ba20e5946f..1f5d5f62bb77 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3409,11 +3409,20 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi,
 }
 #endif
 
-static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi)
+static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
 {
-	struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+	return f2fs_sb_has_blkzoned(sbi->sb);
+}
 
-	return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb);
+static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi)
+{
+	return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev));
+}
+
+static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi)
+{
+	return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) ||
+					f2fs_hw_should_discard(sbi);
 }
 
 static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 81c1dd635a8d..9fc076ca002d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1983,7 +1983,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!blk_queue_discard(q))
+	if (!f2fs_hw_support_discard(F2FS_SB(sb)))
 		return -EOPNOTSUPP;
 
 	if (copy_from_user(&range, (struct fstrim_range __user *)arg,
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1fa6f8185766..ac038563273d 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1744,11 +1744,11 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
 	struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
 	int i;
 
-	if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi))
+	if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi))
 		return false;
 
 	if (!force) {
-		if (!test_opt(sbi, DISCARD) || !se->valid_blocks ||
+		if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
 			SM_I(sbi)->dcc_info->nr_discards >=
 				SM_I(sbi)->dcc_info->max_discards)
 			return false;
@@ -1854,7 +1854,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
 				dirty_i->nr_dirty[PRE]--;
 		}
 
-		if (!test_opt(sbi, DISCARD))
+		if (!f2fs_realtime_discard_enable(sbi))
 			continue;
 
 		if (force && start >= cpc->trim_start &&
@@ -2044,8 +2044,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 			del = 0;
 		}
 
-		if (f2fs_discard_en(sbi) &&
-			!f2fs_test_and_set_bit(offset, se->discard_map))
+		if (!f2fs_test_and_set_bit(offset, se->discard_map))
 			sbi->discard_blks--;
 
 		/* don't overwrite by SSR to keep node chain */
@@ -2073,8 +2072,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 			del = 0;
 		}
 
-		if (f2fs_discard_en(sbi) &&
-			f2fs_test_and_clear_bit(offset, se->discard_map))
+		if (f2fs_test_and_clear_bit(offset, se->discard_map))
 			sbi->discard_blks++;
 	}
 	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
@@ -2690,7 +2688,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	 * discard option. User configuration looks like using runtime discard
 	 * or periodic fstrim instead of it.
 	 */
-	if (test_opt(sbi, DISCARD))
+	if (f2fs_realtime_discard_enable(sbi))
 		goto out;
 
 	start_block = START_BLOCK(sbi, start_segno);
@@ -3781,13 +3779,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 			return -ENOMEM;
 #endif
 
-		if (f2fs_discard_en(sbi)) {
-			sit_i->sentries[start].discard_map
-				= f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE,
-								GFP_KERNEL);
-			if (!sit_i->sentries[start].discard_map)
-				return -ENOMEM;
-		}
+		sit_i->sentries[start].discard_map
+			= f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE,
+							GFP_KERNEL);
+		if (!sit_i->sentries[start].discard_map)
+			return -ENOMEM;
 	}
 
 	sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
@@ -3935,18 +3931,16 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
 				total_node_blocks += se->valid_blocks;
 
 			/* build discard map only one time */
-			if (f2fs_discard_en(sbi)) {
-				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-					memset(se->discard_map, 0xff,
-						SIT_VBLOCK_MAP_SIZE);
-				} else {
-					memcpy(se->discard_map,
-						se->cur_valid_map,
-						SIT_VBLOCK_MAP_SIZE);
-					sbi->discard_blks +=
-						sbi->blocks_per_seg -
-						se->valid_blocks;
-				}
+			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+				memset(se->discard_map, 0xff,
+					SIT_VBLOCK_MAP_SIZE);
+			} else {
+				memcpy(se->discard_map,
+					se->cur_valid_map,
+					SIT_VBLOCK_MAP_SIZE);
+				sbi->discard_blks +=
+					sbi->blocks_per_seg -
+					se->valid_blocks;
 			}
 
 			if (sbi->segs_per_sec > 1)
@@ -3984,16 +3978,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
 		if (IS_NODESEG(se->type))
 			total_node_blocks += se->valid_blocks;
 
-		if (f2fs_discard_en(sbi)) {
-			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-				memset(se->discard_map, 0xff,
-							SIT_VBLOCK_MAP_SIZE);
-			} else {
-				memcpy(se->discard_map, se->cur_valid_map,
-							SIT_VBLOCK_MAP_SIZE);
-				sbi->discard_blks += old_valid_blocks;
-				sbi->discard_blks -= se->valid_blocks;
-			}
+		if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+			memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
+		} else {
+			memcpy(se->discard_map, se->cur_valid_map,
+						SIT_VBLOCK_MAP_SIZE);
+			sbi->discard_blks += old_valid_blocks;
+			sbi->discard_blks -= se->valid_blocks;
 		}
 
 		if (sbi->segs_per_sec > 1) {
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 79370b7fa9d2..6539969a1c7f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -360,7 +360,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
 static int parse_options(struct super_block *sb, char *options)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
-	struct request_queue *q;
 	substring_t args[MAX_OPT_ARGS];
 	char *p, *name;
 	int arg = 0;
@@ -415,14 +414,7 @@ static int parse_options(struct super_block *sb, char *options)
 				return -EINVAL;
 			break;
 		case Opt_discard:
-			q = bdev_get_queue(sb->s_bdev);
-			if (blk_queue_discard(q)) {
-				set_opt(sbi, DISCARD);
-			} else if (!f2fs_sb_has_blkzoned(sb)) {
-				f2fs_msg(sb, KERN_WARNING,
-					"mounting with \"discard\" option, but "
-					"the device does not support discard");
-			}
+			set_opt(sbi, DISCARD);
 			break;
 		case Opt_nodiscard:
 			if (f2fs_sb_has_blkzoned(sb)) {
@@ -1033,7 +1025,8 @@ static void f2fs_put_super(struct super_block *sb)
 	/* be sure to wait for any on-going discard commands */
 	dropped = f2fs_wait_discard_bios(sbi);
 
-	if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) {
+	if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
+					!sbi->discard_blks && !dropped) {
 		struct cp_control cpc = {
 			.reason = CP_UMOUNT | CP_TRIMMED,
 		};
@@ -1403,8 +1396,7 @@ static void default_options(struct f2fs_sb_info *sbi)
 	set_opt(sbi, NOHEAP);
 	sbi->sb->s_flags |= SB_LAZYTIME;
 	set_opt(sbi, FLUSH_MERGE);
-	if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)))
-		set_opt(sbi, DISCARD);
+	set_opt(sbi, DISCARD);
 	if (f2fs_sb_has_blkzoned(sbi->sb))
 		set_opt_mode(sbi, F2FS_MOUNT_LFS);
 	else
-- 
cgit v1.2.3


From db77c7890ed73cfbdeb65954f12f6e0dab51b823 Mon Sep 17 00:00:00 2001
From: Gertjan Halkes <gertjan@google.com>
Date: Wed, 5 Sep 2018 15:41:29 +0900
Subject: 9p: do not trust pdu content for stat item size

[ Upstream commit 2803cf4379ed252894f046cb8812a48db35294e3 ]

v9fs_dir_readdir() could deadloop if a struct was sent with a size set
to -2

Link: http://lkml.kernel.org/r/1536134432-11997-1-git-send-email-asmadeus@codewreck.org
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=88021
Signed-off-by: Gertjan Halkes <gertjan@google.com>
Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/9p/vfs_dir.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 48db9a9f13f9..cb6c4031af55 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -105,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
-	int reclen = 0;
 	struct p9_rdir *rdir;
 	struct kvec kvec;
 
@@ -138,11 +137,10 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 		while (rdir->head < rdir->tail) {
 			err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
 					  rdir->tail - rdir->head, &st);
-			if (err) {
+			if (err <= 0) {
 				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
 				return -EIO;
 			}
-			reclen = st.size+2;
 
 			over = !dir_emit(ctx, st.name, strlen(st.name),
 					 v9fs_qid2ino(&st.qid), dt_type(&st));
@@ -150,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 			if (over)
 				return 0;
 
-			rdir->head += reclen;
-			ctx->pos += reclen;
+			rdir->head += err;
+			ctx->pos += err;
 		}
 	}
 }
-- 
cgit v1.2.3


From 4369f8a38085347d9cf78fe6261b1296f664132c Mon Sep 17 00:00:00 2001
From: Dinu-Razvan Chis-Serban <justcsdr@gmail.com>
Date: Wed, 5 Sep 2018 16:44:12 +0900
Subject: 9p locks: add mount option for lock retry interval

[ Upstream commit 5e172f75e51e3de1b4274146d9b990f803cb5c2a ]

The default P9_LOCK_TIMEOUT can be too long for some users exporting
a local file system to a guest VM (30s), make this configurable at
mount time.

Link: http://lkml.kernel.org/r/1536295827-3181-1-git-send-email-asmadeus@codewreck.org
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195727
Signed-off-by: Dinu-Razvan Chis-Serban <justcsdr@gmail.com>
Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/9p/v9fs.c     | 21 +++++++++++++++++++++
 fs/9p/v9fs.h     |  1 +
 fs/9p/vfs_file.c |  6 +++++-
 3 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 89bac3d2f05b..619128b55837 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -61,6 +61,8 @@ enum {
 	Opt_cache_loose, Opt_fscache, Opt_mmap,
 	/* Access options */
 	Opt_access, Opt_posixacl,
+	/* Lock timeout option */
+	Opt_locktimeout,
 	/* Error token */
 	Opt_err
 };
@@ -80,6 +82,7 @@ static const match_table_t tokens = {
 	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_posixacl, "posixacl"},
+	{Opt_locktimeout, "locktimeout=%u"},
 	{Opt_err, NULL}
 };
 
@@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 #ifdef CONFIG_9P_FSCACHE
 	v9ses->cachetag = NULL;
 #endif
+	v9ses->session_lock_timeout = P9_LOCK_TIMEOUT;
 
 	if (!opts)
 		return 0;
@@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 #endif
 			break;
 
+		case Opt_locktimeout:
+			r = match_int(&args[0], &option);
+			if (r < 0) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
+				ret = r;
+				continue;
+			}
+			if (option < 1) {
+				p9_debug(P9_DEBUG_ERROR,
+					 "locktimeout must be a greater than zero integer.\n");
+				ret = -EINVAL;
+				continue;
+			}
+			v9ses->session_lock_timeout = (long)option * HZ;
+			break;
+
 		default:
 			continue;
 		}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 982e017acadb..129e5243a6bf 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -116,6 +116,7 @@ struct v9fs_session_info {
 	struct p9_client *clnt;	/* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
 	struct rw_semaphore rename_sem;
+	long session_lock_timeout; /* retry interval for blocking locks */
 };
 
 /* cache_validity flags */
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c87e6d6ec069..05454a7e22dc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	uint8_t status = P9_LOCK_ERROR;
 	int res = 0;
 	unsigned char fl_type;
+	struct v9fs_session_info *v9ses;
 
 	fid = filp->private_data;
 	BUG_ON(fid == NULL);
@@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	if (IS_SETLKW(cmd))
 		flock.flags = P9_LOCK_FLAGS_BLOCK;
 
+	v9ses = v9fs_inode2v9ses(file_inode(filp));
+
 	/*
 	 * if its a blocked request and we get P9_LOCK_BLOCKED as the status
 	 * for lock request, keep on trying
@@ -202,7 +205,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 			break;
 		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
 			break;
-		if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+		if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
+				!= 0)
 			break;
 		/*
 		 * p9_client_lock_dotl overwrites flock.client_id with the
-- 
cgit v1.2.3


From 14b183214c08ba2dcd4fee8017879a1ad6f8f0e6 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Thu, 6 Sep 2018 20:34:12 +0800
Subject: f2fs: fix to do sanity check with current segment number

[ Upstream commit 042be0f849e5fc24116d0afecfaf926eed5cac63 ]

https://bugzilla.kernel.org/show_bug.cgi?id=200219

Reproduction way:
- mount image
- run poc code
- umount image

F2FS-fs (loop1): Bitmap was wrongly set, blk:15364
------------[ cut here ]------------
kernel BUG at /home/yuchao/git/devf2fs/segment.c:2061!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 2 PID: 17686 Comm: umount Tainted: G        W  O      4.18.0-rc2+ #39
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
EIP: update_sit_entry+0x459/0x4e0 [f2fs]
Code: e8 1c b5 fd ff 0f 0b 0f 0b 8b 45 e4 c7 44 24 08 9c 7a 6c f8 c7 44 24 04 bc 4a 6c f8 89 44 24 0c 8b 06 89 04 24 e8 f7 b4 fd ff <0f> 0b 8b 45 e4 0f b6 d2 89 54 24 10 c7 44 24 08 60 7a 6c f8 c7 44
EAX: 00000032 EBX: 000000f8 ECX: 00000002 EDX: 00000001
ESI: d7177000 EDI: f520fe68 EBP: d6477c6c ESP: d6477c34
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010282
CR0: 80050033 CR2: b7fbe000 CR3: 2a99b3c0 CR4: 000406f0
Call Trace:
 f2fs_allocate_data_block+0x124/0x580 [f2fs]
 do_write_page+0x78/0x150 [f2fs]
 f2fs_do_write_node_page+0x25/0xa0 [f2fs]
 __write_node_page+0x2bf/0x550 [f2fs]
 f2fs_sync_node_pages+0x60e/0x6d0 [f2fs]
 ? sync_inode_metadata+0x2f/0x40
 ? f2fs_write_checkpoint+0x28f/0x7d0 [f2fs]
 ? up_write+0x1e/0x80
 f2fs_write_checkpoint+0x2a9/0x7d0 [f2fs]
 ? mark_held_locks+0x5d/0x80
 ? _raw_spin_unlock_irq+0x27/0x50
 kill_f2fs_super+0x68/0x90 [f2fs]
 deactivate_locked_super+0x3d/0x70
 deactivate_super+0x40/0x60
 cleanup_mnt+0x39/0x70
 __cleanup_mnt+0x10/0x20
 task_work_run+0x81/0xa0
 exit_to_usermode_loop+0x59/0xa7
 do_fast_syscall_32+0x1f5/0x22c
 entry_SYSENTER_32+0x53/0x86
EIP: 0xb7f95c51
Code: c1 1e f7 ff ff 89 e5 8b 55 08 85 d2 8b 81 64 cd ff ff 74 02 89 02 5d c3 8b 0c 24 c3 8b 1c 24 c3 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76
EAX: 00000000 EBX: 0871ab90 ECX: bfb2cd00 EDX: 00000000
ESI: 00000000 EDI: 0871ab90 EBP: 0871ab90 ESP: bfb2cd7c
DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246
Modules linked in: f2fs(O) crc32_generic bnep rfcomm bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq pcbc joydev aesni_intel snd_seq_device aes_i586 snd_timer crypto_simd snd cryptd soundcore mac_hid serio_raw video i2c_piix4 parport_pc ppdev lp parport hid_generic psmouse usbhid hid e1000 [last unloaded: f2fs]
---[ end trace d423f83982cfcdc5 ]---

The reason is, different log headers using the same segment, once
one log's next block address is used by another log, it will cause
panic as above.

Main area: 24 segs, 24 secs 24 zones
  - COLD  data: 0, 0, 0
  - WARM  data: 1, 1, 1
  - HOT   data: 20, 20, 20
  - Dir   dnode: 22, 22, 22
  - File   dnode: 22, 22, 22
  - Indir nodes: 21, 21, 21

So this patch adds sanity check to detect such condition to avoid
this issue.

Signed-off-by: Chao Yu <yuchao0@huawei.com>

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>

Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/super.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 6539969a1c7f..06017138eab0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -2329,7 +2329,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
 	unsigned int segment_count_main;
 	unsigned int cp_pack_start_sum, cp_payload;
 	block_t user_block_count;
-	int i;
+	int i, j;
 
 	total = le32_to_cpu(raw_super->segment_count);
 	fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -2370,11 +2370,43 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
 		if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
 			le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg)
 			return 1;
+		for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) {
+			if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
+				le32_to_cpu(ckpt->cur_node_segno[j])) {
+				f2fs_msg(sbi->sb, KERN_ERR,
+					"Node segment (%u, %u) has the same "
+					"segno: %u", i, j,
+					le32_to_cpu(ckpt->cur_node_segno[i]));
+				return 1;
+			}
+		}
 	}
 	for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
 		if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs ||
 			le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg)
 			return 1;
+		for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) {
+			if (le32_to_cpu(ckpt->cur_data_segno[i]) ==
+				le32_to_cpu(ckpt->cur_data_segno[j])) {
+				f2fs_msg(sbi->sb, KERN_ERR,
+					"Data segment (%u, %u) has the same "
+					"segno: %u", i, j,
+					le32_to_cpu(ckpt->cur_data_segno[i]));
+				return 1;
+			}
+		}
+	}
+	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
+		for (j = i; j < NR_CURSEG_DATA_TYPE; j++) {
+			if (le32_to_cpu(ckpt->cur_node_segno[i]) ==
+				le32_to_cpu(ckpt->cur_data_segno[j])) {
+				f2fs_msg(sbi->sb, KERN_ERR,
+					"Data segment (%u) and Data segment (%u)"
+					" has the same segno: %u", i, j,
+					le32_to_cpu(ckpt->cur_node_segno[i]));
+				return 1;
+			}
+		}
 	}
 
 	sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
-- 
cgit v1.2.3


From 8722566b7870e1a0d631001c66bc11d590317a38 Mon Sep 17 00:00:00 2001
From: Sheng Yong <shengyong1@huawei.com>
Date: Fri, 12 Oct 2018 18:49:26 +0800
Subject: f2fs: cleanup dirty pages if recover failed

[ Upstream commit 26b5a079197c8cb6725565968b7fd3299bd1877b ]

During recover, we will try to create new dentries for inodes with
dentry_mark. But if the parent is missing (e.g. killed by fsck),
recover will break. But those recovered dirty pages are not cleanup.
This will hit f2fs_bug_on:

[   53.519566] F2FS-fs (loop0): Found nat_bits in checkpoint
[   53.539354] F2FS-fs (loop0): recover_inode: ino = 5, name = file, inline = 3
[   53.539402] F2FS-fs (loop0): recover_dentry: ino = 5, name = file, dir = 0, err = -2
[   53.545760] F2FS-fs (loop0): Cannot recover all fsync data errno=-2
[   53.546105] F2FS-fs (loop0): access invalid blkaddr:4294967295
[   53.546171] WARNING: CPU: 1 PID: 1798 at fs/f2fs/checkpoint.c:163 f2fs_is_valid_blkaddr+0x26c/0x320
[   53.546174] Modules linked in:
[   53.546183] CPU: 1 PID: 1798 Comm: mount Not tainted 4.19.0-rc2+ #1
[   53.546186] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
[   53.546191] RIP: 0010:f2fs_is_valid_blkaddr+0x26c/0x320
[   53.546195] Code: 85 bb 00 00 00 48 89 df 88 44 24 07 e8 ad a8 db ff 48 8b 3b 44 89 e1 48 c7 c2 40 03 72 a9 48 c7 c6 e0 01 72 a9 e8 84 3c ff ff <0f> 0b 0f b6 44 24 07 e9 8a 00 00 00 48 8d bf 38 01 00 00 e8 7c a8
[   53.546201] RSP: 0018:ffff88006c067768 EFLAGS: 00010282
[   53.546208] RAX: 0000000000000000 RBX: ffff880068844200 RCX: ffffffffa83e1a33
[   53.546211] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff88006d51e590
[   53.546215] RBP: 0000000000000005 R08: ffffed000daa3cb3 R09: ffffed000daa3cb3
[   53.546218] R10: 0000000000000001 R11: ffffed000daa3cb2 R12: 00000000ffffffff
[   53.546221] R13: ffff88006a1f8000 R14: 0000000000000200 R15: 0000000000000009
[   53.546226] FS:  00007fb2f3646840(0000) GS:ffff88006d500000(0000) knlGS:0000000000000000
[   53.546229] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   53.546234] CR2: 00007f0fd77f0008 CR3: 00000000687e6002 CR4: 00000000000206e0
[   53.546237] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   53.546240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   53.546242] Call Trace:
[   53.546248]  f2fs_submit_page_bio+0x95/0x740
[   53.546253]  read_node_page+0x161/0x1e0
[   53.546271]  ? truncate_node+0x650/0x650
[   53.546283]  ? add_to_page_cache_lru+0x12c/0x170
[   53.546288]  ? pagecache_get_page+0x262/0x2d0
[   53.546292]  __get_node_page+0x200/0x660
[   53.546302]  f2fs_update_inode_page+0x4a/0x160
[   53.546306]  f2fs_write_inode+0x86/0xb0
[   53.546317]  __writeback_single_inode+0x49c/0x620
[   53.546322]  writeback_single_inode+0xe4/0x1e0
[   53.546326]  sync_inode_metadata+0x93/0xd0
[   53.546330]  ? sync_inode+0x10/0x10
[   53.546342]  ? do_raw_spin_unlock+0xed/0x100
[   53.546347]  f2fs_sync_inode_meta+0xe0/0x130
[   53.546351]  f2fs_fill_super+0x287d/0x2d10
[   53.546367]  ? vsnprintf+0x742/0x7a0
[   53.546372]  ? f2fs_commit_super+0x180/0x180
[   53.546379]  ? up_write+0x20/0x40
[   53.546385]  ? set_blocksize+0x5f/0x140
[   53.546391]  ? f2fs_commit_super+0x180/0x180
[   53.546402]  mount_bdev+0x181/0x200
[   53.546406]  mount_fs+0x94/0x180
[   53.546411]  vfs_kern_mount+0x6c/0x1e0
[   53.546415]  do_mount+0xe5e/0x1510
[   53.546420]  ? fs_reclaim_release+0x9/0x30
[   53.546424]  ? copy_mount_string+0x20/0x20
[   53.546428]  ? fs_reclaim_acquire+0xd/0x30
[   53.546435]  ? __might_sleep+0x2c/0xc0
[   53.546440]  ? ___might_sleep+0x53/0x170
[   53.546453]  ? __might_fault+0x4c/0x60
[   53.546468]  ? _copy_from_user+0x95/0xa0
[   53.546474]  ? memdup_user+0x39/0x60
[   53.546478]  ksys_mount+0x88/0xb0
[   53.546482]  __x64_sys_mount+0x5d/0x70
[   53.546495]  do_syscall_64+0x65/0x130
[   53.546503]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   53.547639] ---[ end trace b804d1ea2fec893e ]---

So if recover fails, we need to drop all recovered data.

Signed-off-by: Sheng Yong <shengyong1@huawei.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/recovery.c | 32 +++++++++++++++++++++-----------
 fs/f2fs/super.c    | 15 ++++++++++++++-
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9a8579fb3a30..ae0e5f2e67b4 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -99,8 +99,12 @@ err_out:
 	return ERR_PTR(err);
 }
 
-static void del_fsync_inode(struct fsync_inode_entry *entry)
+static void del_fsync_inode(struct fsync_inode_entry *entry, int drop)
 {
+	if (drop) {
+		/* inode should not be recovered, drop it */
+		f2fs_inode_synced(entry->inode);
+	}
 	iput(entry->inode);
 	list_del(&entry->list);
 	kmem_cache_free(fsync_entry_slab, entry);
@@ -321,12 +325,12 @@ next:
 	return err;
 }
 
-static void destroy_fsync_dnodes(struct list_head *head)
+static void destroy_fsync_dnodes(struct list_head *head, int drop)
 {
 	struct fsync_inode_entry *entry, *tmp;
 
 	list_for_each_entry_safe(entry, tmp, head, list)
-		del_fsync_inode(entry);
+		del_fsync_inode(entry, drop);
 }
 
 static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
@@ -561,7 +565,7 @@ out:
 }
 
 static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
-						struct list_head *dir_list)
+		struct list_head *tmp_inode_list, struct list_head *dir_list)
 {
 	struct curseg_info *curseg;
 	struct page *page = NULL;
@@ -615,7 +619,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
 		}
 
 		if (entry->blkaddr == blkaddr)
-			del_fsync_inode(entry);
+			list_move_tail(&entry->list, tmp_inode_list);
 next:
 		/* check next segment */
 		blkaddr = next_blkaddr_of_node(page);
@@ -628,7 +632,7 @@ next:
 
 int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 {
-	struct list_head inode_list;
+	struct list_head inode_list, tmp_inode_list;
 	struct list_head dir_list;
 	int err;
 	int ret = 0;
@@ -659,6 +663,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	}
 
 	INIT_LIST_HEAD(&inode_list);
+	INIT_LIST_HEAD(&tmp_inode_list);
 	INIT_LIST_HEAD(&dir_list);
 
 	/* prevent checkpoint */
@@ -677,11 +682,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	need_writecp = true;
 
 	/* step #2: recover data */
-	err = recover_data(sbi, &inode_list, &dir_list);
+	err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list);
 	if (!err)
 		f2fs_bug_on(sbi, !list_empty(&inode_list));
+	else {
+		/* restore s_flags to let iput() trash data */
+		sbi->sb->s_flags = s_flags;
+	}
 skip:
-	destroy_fsync_dnodes(&inode_list);
+	destroy_fsync_dnodes(&inode_list, err);
+	destroy_fsync_dnodes(&tmp_inode_list, err);
 
 	/* truncate meta pages to be used by the recovery */
 	truncate_inode_pages_range(META_MAPPING(sbi),
@@ -690,13 +700,13 @@ skip:
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
 		truncate_inode_pages_final(META_MAPPING(sbi));
+	} else {
+		clear_sbi_flag(sbi, SBI_POR_DOING);
 	}
-
-	clear_sbi_flag(sbi, SBI_POR_DOING);
 	mutex_unlock(&sbi->cp_mutex);
 
 	/* let's drop all the directory inodes for clean checkpoint */
-	destroy_fsync_dnodes(&dir_list);
+	destroy_fsync_dnodes(&dir_list, err);
 
 	if (need_writecp) {
 		set_sbi_flag(sbi, SBI_IS_RECOVERED);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 06017138eab0..2264f27fd26d 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1885,6 +1885,19 @@ void f2fs_quota_off_umount(struct super_block *sb)
 	}
 }
 
+static void f2fs_truncate_quota_inode_pages(struct super_block *sb)
+{
+	struct quota_info *dqopt = sb_dqopt(sb);
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!dqopt->files[type])
+			continue;
+		f2fs_inode_synced(dqopt->files[type]);
+	}
+}
+
+
 static int f2fs_get_projid(struct inode *inode, kprojid_t *projid)
 {
 	*projid = F2FS_I(inode)->i_projid;
@@ -3131,10 +3144,10 @@ skip_recovery:
 
 free_meta:
 #ifdef CONFIG_QUOTA
+	f2fs_truncate_quota_inode_pages(sb);
 	if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb))
 		f2fs_quota_off_umount(sbi->sb);
 #endif
-	f2fs_sync_inode_meta(sbi);
 	/*
 	 * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes()
 	 * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg()
-- 
cgit v1.2.3


From e9603cffb1ca7699b22cec844d4092a7a2e0e416 Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Fri, 19 Oct 2018 01:58:22 -0500
Subject: cifs: fallback to older infolevels on findfirst queryinfo retry

[ Upstream commit 3b7960caceafdfc2cdfe2850487f8d091eb41144 ]

In cases where queryinfo fails, we have cases in cifs (vers=1.0)
where with backupuid mounts we retry the query info with findfirst.
This doesn't work to some NetApp servers which don't support
WindowsXP (and later) infolevel 261 (SMB_FIND_FILE_ID_FULL_DIR_INFO)
so in this case use other info levels (in this case it will usually
be level 257, SMB_FIND_FILE_DIRECTORY_INFO).

(Also fixes some indentation)

See kernel bugzilla 201435

Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/cifs/inode.c | 67 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 020f49c15b30..b59ebed4f615 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -780,43 +780,50 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	} else if ((rc == -EACCES) && backup_cred(cifs_sb) &&
 		   (strcmp(server->vals->version_string, SMB1_VERSION_STRING)
 		      == 0)) {
-			/*
-			 * For SMB2 and later the backup intent flag is already
-			 * sent if needed on open and there is no path based
-			 * FindFirst operation to use to retry with
-			 */
+		/*
+		 * For SMB2 and later the backup intent flag is already
+		 * sent if needed on open and there is no path based
+		 * FindFirst operation to use to retry with
+		 */
 
-			srchinf = kzalloc(sizeof(struct cifs_search_info),
-						GFP_KERNEL);
-			if (srchinf == NULL) {
-				rc = -ENOMEM;
-				goto cgii_exit;
-			}
+		srchinf = kzalloc(sizeof(struct cifs_search_info),
+					GFP_KERNEL);
+		if (srchinf == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 
-			srchinf->endOfSearch = false;
+		srchinf->endOfSearch = false;
+		if (tcon->unix_ext)
+			srchinf->info_level = SMB_FIND_FILE_UNIX;
+		else if ((tcon->ses->capabilities &
+			 tcon->ses->server->vals->cap_nt_find) == 0)
+			srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD;
+		else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
 			srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
+		else /* no srvino useful for fallback to some netapp */
+			srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO;
 
-			srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
-					CIFS_SEARCH_CLOSE_AT_END |
-					CIFS_SEARCH_BACKUP_SEARCH;
+		srchflgs = CIFS_SEARCH_CLOSE_ALWAYS |
+				CIFS_SEARCH_CLOSE_AT_END |
+				CIFS_SEARCH_BACKUP_SEARCH;
 
-			rc = CIFSFindFirst(xid, tcon, full_path,
-				cifs_sb, NULL, srchflgs, srchinf, false);
-			if (!rc) {
-				data =
-				(FILE_ALL_INFO *)srchinf->srch_entries_start;
+		rc = CIFSFindFirst(xid, tcon, full_path,
+			cifs_sb, NULL, srchflgs, srchinf, false);
+		if (!rc) {
+			data = (FILE_ALL_INFO *)srchinf->srch_entries_start;
 
-				cifs_dir_info_to_fattr(&fattr,
-				(FILE_DIRECTORY_INFO *)data, cifs_sb);
-				fattr.cf_uniqueid = le64_to_cpu(
-				((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
-				validinum = true;
+			cifs_dir_info_to_fattr(&fattr,
+			(FILE_DIRECTORY_INFO *)data, cifs_sb);
+			fattr.cf_uniqueid = le64_to_cpu(
+			((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId);
+			validinum = true;
 
-				cifs_buf_release(srchinf->ntwrk_buf_start);
-			}
-			kfree(srchinf);
-			if (rc)
-				goto cgii_exit;
+			cifs_buf_release(srchinf->ntwrk_buf_start);
+		}
+		kfree(srchinf);
+		if (rc)
+			goto cgii_exit;
 	} else
 		goto cgii_exit;
 
-- 
cgit v1.2.3


From 48b0309f85ae819da5bbe992b8769729f39c66c7 Mon Sep 17 00:00:00 2001
From: Chao Yu <yuchao0@huawei.com>
Date: Sat, 23 Feb 2019 09:48:27 +0800
Subject: f2fs: fix to dirty inode for i_mode recovery

[ Upstream commit ca597bddedd94906cd761d8be6a3ad21292725de ]

As Seulbae Kim reported in bugzilla:

https://bugzilla.kernel.org/show_bug.cgi?id=202637

We didn't recover permission field correctly after sudden power-cut,
the reason is in setattr we didn't add inode into global dirty list
once i_mode is changed, so latter checkpoint triggered by fsync will
not flush last i_mode into disk, result in this problem, fix it.

Reported-by: Seulbae Kim <seulbae@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/f2fs/file.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9fc076ca002d..b3f46e3bec17 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -770,7 +770,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = d_inode(dentry);
 	int err;
-	bool size_changed = false;
 
 	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
 		return -EIO;
@@ -830,8 +829,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 		down_write(&F2FS_I(inode)->i_sem);
 		F2FS_I(inode)->last_disk_size = i_size_read(inode);
 		up_write(&F2FS_I(inode)->i_sem);
-
-		size_changed = true;
 	}
 
 	__setattr_copy(inode, attr);
@@ -845,7 +842,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	/* file size may changed here */
-	f2fs_mark_inode_dirty_sync(inode, size_changed);
+	f2fs_mark_inode_dirty_sync(inode, true);
 
 	/* inode change will produce dirty node pages flushed by checkpoint */
 	f2fs_balance_fs(F2FS_I_SB(inode), true);
-- 
cgit v1.2.3


From 8092ecc306d81186a64cda42411121f4d35aaff4 Mon Sep 17 00:00:00 2001
From: Aurelien Aptel <aaptel@suse.com>
Date: Fri, 29 Mar 2019 10:49:12 +0100
Subject: CIFS: keep FileInfo handle live during oplock break

commit b98749cac4a695f084a5ff076f4510b23e353ecd upstream.

In the oplock break handler, writing pending changes from pages puts
the FileInfo handle. If the refcount reaches zero it closes the handle
and waits for any oplock break handler to return, thus causing a deadlock.

To prevent this situation:

* We add a wait flag to cifsFileInfo_put() to decide whether we should
  wait for running/pending oplock break handlers

* We keep an additionnal reference of the SMB FileInfo handle so that
  for the rest of the handler putting the handle won't close it.
  - The ref is bumped everytime we queue the handler via the
    cifs_queue_oplock_break() helper.
  - The ref is decremented at the end of the handler

This bug was triggered by xfstest 464.

Also important fix to address the various reports of
oops in smb2_push_mandatory_locks

Signed-off-by: Aurelien Aptel <aaptel@suse.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/cifsglob.h |  2 ++
 fs/cifs/file.c     | 30 +++++++++++++++++++++++++-----
 fs/cifs/misc.c     | 25 +++++++++++++++++++++++--
 fs/cifs/smb2misc.c |  6 +++---
 4 files changed, 53 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 80f33582059e..6f227cc781e5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1263,6 +1263,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
 }
 
 struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr);
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
 
 #define CIFS_CACHE_READ_FLG	1
@@ -1763,6 +1764,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock;
 #endif /* CONFIG_CIFS_ACL */
 
 void cifs_oplock_break(struct work_struct *work);
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile);
 
 extern const struct slow_work_ops cifs_oplock_break_ops;
 extern struct workqueue_struct *cifsiod_wq;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d847132ab027..d6b45682833b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -358,12 +358,30 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 	return cifs_file;
 }
 
-/*
- * Release a reference on the file private data. This may involve closing
- * the filehandle out on the server. Must be called without holding
- * tcon->open_file_lock and cifs_file->file_info_lock.
+/**
+ * cifsFileInfo_put - release a reference of file priv data
+ *
+ * Always potentially wait for oplock handler. See _cifsFileInfo_put().
  */
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
+{
+	_cifsFileInfo_put(cifs_file, true);
+}
+
+/**
+ * _cifsFileInfo_put - release a reference of file priv data
+ *
+ * This may involve closing the filehandle @cifs_file out on the
+ * server. Must be called without holding tcon->open_file_lock and
+ * cifs_file->file_info_lock.
+ *
+ * If @wait_for_oplock_handler is true and we are releasing the last
+ * reference, wait for any running oplock break handler of the file
+ * and cancel any pending one. If calling this function from the
+ * oplock break handler, you need to pass false.
+ *
+ */
+void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
 {
 	struct inode *inode = d_inode(cifs_file->dentry);
 	struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
@@ -411,7 +429,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 
 	spin_unlock(&tcon->open_file_lock);
 
-	oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
+	oplock_break_cancelled = wait_oplock_handler ?
+		cancel_work_sync(&cifs_file->oplock_break) : false;
 
 	if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
 		struct TCP_Server_Info *server = tcon->ses->server;
@@ -4170,6 +4189,7 @@ void cifs_oplock_break(struct work_struct *work)
 							     cinode);
 		cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
 	}
+	_cifsFileInfo_put(cfile, false /* do not wait for ourself */);
 	cifs_done_oplock_break(cinode);
 }
 
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 6926685e513c..facc94e159a1 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -490,8 +490,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
 					   &pCifsInode->flags);
 
-				queue_work(cifsoplockd_wq,
-					   &netfile->oplock_break);
+				cifs_queue_oplock_break(netfile);
 				netfile->oplock_break_cancelled = false;
 
 				spin_unlock(&tcon->open_file_lock);
@@ -588,6 +587,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode)
 	spin_unlock(&cinode->writers_lock);
 }
 
+/**
+ * cifs_queue_oplock_break - queue the oplock break handler for cfile
+ *
+ * This function is called from the demultiplex thread when it
+ * receives an oplock break for @cfile.
+ *
+ * Assumes the tcon->open_file_lock is held.
+ * Assumes cfile->file_info_lock is NOT held.
+ */
+void cifs_queue_oplock_break(struct cifsFileInfo *cfile)
+{
+	/*
+	 * Bump the handle refcount now while we hold the
+	 * open_file_lock to enforce the validity of it for the oplock
+	 * break handler. The matching put is done at the end of the
+	 * handler.
+	 */
+	cifsFileInfo_get(cfile);
+
+	queue_work(cifsoplockd_wq, &cfile->oplock_break);
+}
+
 void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
 {
 	clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 58700d2ba8cd..0a7ed2e3ad4f 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -555,7 +555,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
 			clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
 				  &cinode->flags);
 
-		queue_work(cifsoplockd_wq, &cfile->oplock_break);
+		cifs_queue_oplock_break(cfile);
 		kfree(lw);
 		return true;
 	}
@@ -719,8 +719,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 					   CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
 					   &cinode->flags);
 				spin_unlock(&cfile->file_info_lock);
-				queue_work(cifsoplockd_wq,
-					   &cfile->oplock_break);
+
+				cifs_queue_oplock_break(cfile);
 
 				spin_unlock(&tcon->open_file_lock);
 				spin_unlock(&cifs_tcp_ses_lock);
-- 
cgit v1.2.3


From 8fb89b43b65fcd35f15d982712904b96fc64c68a Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Date: Sat, 6 Apr 2019 15:47:38 +0800
Subject: cifs: Fix use-after-free in SMB2_write

commit 6a3eb3360667170988f8a6477f6686242061488a upstream.

There is a KASAN use-after-free:
BUG: KASAN: use-after-free in SMB2_write+0x1342/0x1580
Read of size 8 at addr ffff8880b6a8e450 by task ln/4196

Should not release the 'req' because it will use in the trace.

Fixes: eccb4422cf97 ("smb3: Add ftrace tracepoints for improved SMB3 debugging")

Signed-off-by: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org> 4.18+
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 71f32d983384..299d8e46f01f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3591,7 +3591,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst,
 			    &resp_buftype, flags, &rsp_iov);
-	cifs_small_buf_release(req);
 	rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
@@ -3609,6 +3608,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 				     io_parms->offset, *nbytes);
 	}
 
+	cifs_small_buf_release(req);
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
 }
-- 
cgit v1.2.3


From c69330a855ab4342d304f67f8c1e7d1fa2686bec Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Date: Sat, 6 Apr 2019 15:47:39 +0800
Subject: cifs: Fix use-after-free in SMB2_read

commit 088aaf17aa79300cab14dbee2569c58cfafd7d6e upstream.

There is a KASAN use-after-free:
BUG: KASAN: use-after-free in SMB2_read+0x1136/0x1190
Read of size 8 at addr ffff8880b4e45e50 by task ln/1009

Should not release the 'req' because it will use in the trace.

Fixes: eccb4422cf97 ("smb3: Add ftrace tracepoints for improved SMB3 debugging")

Signed-off-by: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org> 4.18+
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 299d8e46f01f..c6fd3acc5560 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3273,8 +3273,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 	rqst.rq_nvec = 1;
 
 	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
-	cifs_small_buf_release(req);
-
 	rsp = (struct smb2_read_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
@@ -3293,6 +3291,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 				    io_parms->tcon->tid, ses->Suid,
 				    io_parms->offset, io_parms->length);
 
+	cifs_small_buf_release(req);
+
 	*nbytes = le32_to_cpu(rsp->DataLength);
 	if ((*nbytes > CIFS_MAX_MSGSIZE) ||
 	    (*nbytes > io_parms->length)) {
-- 
cgit v1.2.3


From 2fcee5eaae6ee1c2ccbe629ffd46c6674a98824c Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Wed, 10 Apr 2019 07:47:22 +1000
Subject: cifs: fix handle leak in smb2_query_symlink()

commit e6d0fb7b34f264f72c33053558a360a6a734905e upstream.

If we enter smb2_query_symlink() for something that is not a symlink
and where the SMB2_open() would succeed we would never end up
closing this handle and would thus leak a handle on the server.

Fix this by immediately calling SMB2_close() on successfull open.

Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Steve French <stfrench@microsoft.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2ops.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index d4d7d61a6fe2..2001184afe70 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1906,6 +1906,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 
 	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov,
 		       &resp_buftype);
+	if (!rc)
+		SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
 	if (!rc || !err_iov.iov_base) {
 		rc = -ENOENT;
 		goto free_path;
-- 
cgit v1.2.3


From 6ff17bc5936e5fab33de8064dc0690f6c8c789ca Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 18 Apr 2019 17:50:52 -0700
Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm()
 and core dumping

commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream.

The core dumping code has always run without holding the mmap_sem for
writing, despite that is the only way to ensure that the entire vma
layout will not change from under it.  Only using some signal
serialization on the processes belonging to the mm is not nearly enough.
This was pointed out earlier.  For example in Hugh's post from Jul 2017:

  https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils

  "Not strictly relevant here, but a related note: I was very surprised
   to discover, only quite recently, how handle_mm_fault() may be called
   without down_read(mmap_sem) - when core dumping. That seems a
   misguided optimization to me, which would also be nice to correct"

In particular because the growsdown and growsup can move the
vm_start/vm_end the various loops the core dump does around the vma will
not be consistent if page faults can happen concurrently.

Pretty much all users calling mmget_not_zero()/get_task_mm() and then
taking the mmap_sem had the potential to introduce unexpected side
effects in the core dumping code.

Adding mmap_sem for writing around the ->core_dump invocation is a
viable long term fix, but it requires removing all copy user and page
faults and to replace them with get_dump_page() for all binary formats
which is not suitable as a short term fix.

For the time being this solution manually covers the places that can
confuse the core dump either by altering the vma layout or the vma flags
while it runs.  Once ->core_dump runs under mmap_sem for writing the
function mmget_still_valid() can be dropped.

Allowing mmap_sem protected sections to run in parallel with the
coredump provides some minor parallelism advantage to the swapoff code
(which seems to be safe enough by never mangling any vma field and can
keep doing swapins in parallel to the core dumping) and to some other
corner case.

In order to facilitate the backporting I added "Fixes: 86039bd3b4e6"
however the side effect of this same race condition in /proc/pid/mem
should be reproducible since before 2.6.12-rc2 so I couldn't add any
other "Fixes:" because there's no hash beyond the git genesis commit.

Because find_extend_vma() is the only location outside of the process
context that could modify the "mm" structures under mmap_sem for
reading, by adding the mmget_still_valid() check to it, all other cases
that take the mmap_sem for reading don't need the new check after
mmget_not_zero()/get_task_mm().  The expand_stack() in page fault
context also doesn't need the new check, because all tasks under core
dumping are frozen.

Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com
Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-by: Jann Horn <jannh@google.com>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Jann Horn <jannh@google.com>
Acked-by: Jason Gunthorpe <jgg@mellanox.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/task_mmu.c | 18 ++++++++++++++++++
 fs/userfaultfd.c   |  9 +++++++++
 2 files changed, 27 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index d76fe166f6ce..c5819baee35c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1138,6 +1138,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 					count = -EINTR;
 					goto out_mm;
 				}
+				/*
+				 * Avoid to modify vma->vm_flags
+				 * without locked ops while the
+				 * coredump reads the vm_flags.
+				 */
+				if (!mmget_still_valid(mm)) {
+					/*
+					 * Silently return "count"
+					 * like if get_task_mm()
+					 * failed. FIXME: should this
+					 * function have returned
+					 * -ESRCH if get_task_mm()
+					 * failed like if
+					 * get_proc_task() fails?
+					 */
+					up_write(&mm->mmap_sem);
+					goto out_mm;
+				}
 				for (vma = mm->mmap; vma; vma = vma->vm_next) {
 					vma->vm_flags &= ~VM_SOFTDIRTY;
 					vma_set_page_prot(vma);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d8b8323e80f4..aaca81b5e119 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -630,6 +630,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 
 		/* the various vma->vm_userfaultfd_ctx still points to it */
 		down_write(&mm->mmap_sem);
+		/* no task can run (and in turn coredump) yet */
+		VM_WARN_ON(!mmget_still_valid(mm));
 		for (vma = mm->mmap; vma; vma = vma->vm_next)
 			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
 				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -884,6 +886,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	 * taking the mmap_sem for writing.
 	 */
 	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto skip_mm;
 	prev = NULL;
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		cond_resched();
@@ -906,6 +910,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 		vma->vm_flags = new_flags;
 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 	}
+skip_mm:
 	up_write(&mm->mmap_sem);
 	mmput(mm);
 wakeup:
@@ -1334,6 +1339,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto out_unlock;
 	vma = find_vma_prev(mm, start, &prev);
 	if (!vma)
 		goto out_unlock;
@@ -1521,6 +1528,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto out_unlock;
 	vma = find_vma_prev(mm, start, &prev);
 	if (!vma)
 		goto out_unlock;
-- 
cgit v1.2.3


From 8766cc7d0d1d3de0bc0ede9cbf61532925b7f23f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 21 Feb 2019 11:17:34 -0500
Subject: ext4: fix some error pointer dereferences

[ Upstream commit 7159a986b4202343f6cca3bb8079ecace5816fd6 ]

We can't pass error pointers to brelse().

Fixes: fb265c9cb49e ("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ext4/xattr.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c0ba5206cd9d..006c277dc22e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
 		bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
 		if (IS_ERR(bh)) {
 			ret = PTR_ERR(bh);
+			bh = NULL;
 			goto out;
 		}
 
@@ -2907,6 +2908,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 			if (error == -EIO)
 				EXT4_ERROR_INODE(inode, "block %llu read error",
 						 EXT4_I(inode)->i_file_acl);
+			bh = NULL;
 			goto cleanup;
 		}
 		error = ext4_xattr_check_block(inode, bh);
@@ -3063,6 +3065,7 @@ ext4_xattr_block_cache_find(struct inode *inode,
 		if (IS_ERR(bh)) {
 			if (PTR_ERR(bh) == -ENOMEM)
 				return NULL;
+			bh = NULL;
 			EXT4_ERROR_INODE(inode, "block %lu read error",
 					 (unsigned long)ce->e_value);
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
-- 
cgit v1.2.3


From d5bf783a09a06c81ca4783054355f1d243e124e7 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <lsahlber@redhat.com>
Date: Tue, 23 Apr 2019 16:39:45 +1000
Subject: cifs: fix memory leak in SMB2_read

commit 05fd5c2c61732152a6bddc318aae62d7e436629b upstream.

Commit 088aaf17aa79300cab14dbee2569c58cfafd7d6e introduced a leak where
if SMB2_read() returned an error we would return without freeing the
request buffer.

Cc: Stable <stable@vger.kernel.org>
Signed-off-by: Ronnie Sahlberg <lsahlber@redhat.com>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index c6fd3acc5560..33afb637e6f8 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -3285,6 +3285,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 					    rc);
 		}
 		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+		cifs_small_buf_release(req);
 		return rc == -ENODATA ? 0 : rc;
 	} else
 		trace_smb3_read_done(xid, req->PersistentFileId,
-- 
cgit v1.2.3


From ee231063ff954dbd46d1c55bbb6519e60dd9c649 Mon Sep 17 00:00:00 2001
From: Frank Sorenson <sorenson@redhat.com>
Date: Tue, 16 Apr 2019 08:37:27 -0500
Subject: cifs: do not attempt cifs operation on smb2+ rename error

commit 652727bbe1b17993636346716ae5867627793647 upstream.

A path-based rename returning EBUSY will incorrectly try opening
the file with a cifs (NT Create AndX) operation on an smb2+ mount,
which causes the server to force a session close.

If the mount is smb2+, skip the fallback.

Signed-off-by: Frank Sorenson <sorenson@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Reviewed-by: Ronnie Sahlberg <lsahlber@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b59ebed4f615..1fadd314ae7f 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
 	if (rc == 0 || rc != -EBUSY)
 		goto do_rename_exit;
 
+	/* Don't fall back to using SMB on SMB 2+ mount */
+	if (server->vals->protocol_id != 0)
+		goto do_rename_exit;
+
 	/* open-file renames don't work across directories */
 	if (to_dentry->d_parent != from_dentry->d_parent)
 		goto do_rename_exit;
-- 
cgit v1.2.3


From cffeb9c84d20816a2173e3cfeca210c8bfa8e357 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Thu, 4 Apr 2019 23:59:25 +0200
Subject: tracing: Fix buffer_ref pipe ops

commit b987222654f84f7b4ca95b3a55eca784cb30235b upstream.

This fixes multiple issues in buffer_pipe_buf_ops:

 - The ->steal() handler must not return zero unless the pipe buffer has
   the only reference to the page. But generic_pipe_buf_steal() assumes
   that every reference to the pipe is tracked by the page's refcount,
   which isn't true for these buffers - buffer_pipe_buf_get(), which
   duplicates a buffer, doesn't touch the page's refcount.
   Fix it by using generic_pipe_buf_nosteal(), which refuses every
   attempted theft. It should be easy to actually support ->steal, but the
   only current users of pipe_buf_steal() are the virtio console and FUSE,
   and they also only use it as an optimization. So it's probably not worth
   the effort.
 - The ->get() and ->release() handlers can be invoked concurrently on pipe
   buffers backed by the same struct buffer_ref. Make them safe against
   concurrency by using refcount_t.
 - The pointers stored in ->private were only zeroed out when the last
   reference to the buffer_ref was dropped. As far as I know, this
   shouldn't be necessary anyway, but if we do it, let's always do it.

Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@vger.kernel.org
Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer")
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/splice.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 29e92b506394..c78e0e3ff6c4 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -333,8 +333,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
-static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
-				    struct pipe_buffer *buf)
+int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+			     struct pipe_buffer *buf)
 {
 	return 1;
 }
-- 
cgit v1.2.3


From 8d693ef0141c90dd94fe47be1727214bf3adb3dd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 15 Apr 2019 12:00:42 -0400
Subject: ceph: only use d_name directly when parent is locked

commit 1bcb344086f3ecf8d6705f6d708441baa823beb3 upstream.

Ben reported tripping the BUG_ON in create_request_message during some
performance testing. Analysis of the vmcore showed that the length of
the r_dentry->d_name string changed after we allocated the buffer, but
before we encoded it.

build_dentry_path returns pointers to d_name in the common case of
non-snapped dentries, but this optimization isn't safe unless the parent
directory is locked. When it isn't, have the code make a copy of the
d_name while holding the d_lock.

Cc: stable@vger.kernel.org
Reported-by: Ben England <bengland@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mds_client.c | 61 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index bc43c822426a..d87ca0bd1bcd 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1945,10 +1945,39 @@ retry:
 	return path;
 }
 
+/* Duplicate the dentry->d_name.name safely */
+static int clone_dentry_name(struct dentry *dentry, const char **ppath,
+			     int *ppathlen)
+{
+	u32 len;
+	char *name;
+
+retry:
+	len = READ_ONCE(dentry->d_name.len);
+	name = kmalloc(len + 1, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_name.len != len) {
+		spin_unlock(&dentry->d_lock);
+		kfree(name);
+		goto retry;
+	}
+	memcpy(name, dentry->d_name.name, len);
+	spin_unlock(&dentry->d_lock);
+
+	name[len] = '\0';
+	*ppath = name;
+	*ppathlen = len;
+	return 0;
+}
+
 static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 			     const char **ppath, int *ppathlen, u64 *pino,
-			     int *pfreepath)
+			     bool *pfreepath, bool parent_locked)
 {
+	int ret;
 	char *path;
 
 	rcu_read_lock();
@@ -1957,8 +1986,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 	if (dir && ceph_snap(dir) == CEPH_NOSNAP) {
 		*pino = ceph_ino(dir);
 		rcu_read_unlock();
-		*ppath = dentry->d_name.name;
-		*ppathlen = dentry->d_name.len;
+		if (parent_locked) {
+			*ppath = dentry->d_name.name;
+			*ppathlen = dentry->d_name.len;
+		} else {
+			ret = clone_dentry_name(dentry, ppath, ppathlen);
+			if (ret)
+				return ret;
+			*pfreepath = true;
+		}
 		return 0;
 	}
 	rcu_read_unlock();
@@ -1966,13 +2002,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	*ppath = path;
-	*pfreepath = 1;
+	*pfreepath = true;
 	return 0;
 }
 
 static int build_inode_path(struct inode *inode,
 			    const char **ppath, int *ppathlen, u64 *pino,
-			    int *pfreepath)
+			    bool *pfreepath)
 {
 	struct dentry *dentry;
 	char *path;
@@ -1988,7 +2024,7 @@ static int build_inode_path(struct inode *inode,
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 	*ppath = path;
-	*pfreepath = 1;
+	*pfreepath = true;
 	return 0;
 }
 
@@ -1999,7 +2035,7 @@ static int build_inode_path(struct inode *inode,
 static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 				  struct inode *rdiri, const char *rpath,
 				  u64 rino, const char **ppath, int *pathlen,
-				  u64 *ino, int *freepath)
+				  u64 *ino, bool *freepath, bool parent_locked)
 {
 	int r = 0;
 
@@ -2009,7 +2045,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
 		     ceph_snap(rinode));
 	} else if (rdentry) {
 		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
-					freepath);
+					freepath, parent_locked);
 		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
 		     *ppath);
 	} else if (rpath || rino) {
@@ -2035,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	const char *path2 = NULL;
 	u64 ino1 = 0, ino2 = 0;
 	int pathlen1 = 0, pathlen2 = 0;
-	int freepath1 = 0, freepath2 = 0;
+	bool freepath1 = false, freepath2 = false;
 	int len;
 	u16 releases;
 	void *p, *end;
@@ -2043,16 +2079,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 
 	ret = set_request_path_attr(req->r_inode, req->r_dentry,
 			      req->r_parent, req->r_path1, req->r_ino1.ino,
-			      &path1, &pathlen1, &ino1, &freepath1);
+			      &path1, &pathlen1, &ino1, &freepath1,
+			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
+					&req->r_req_flags));
 	if (ret < 0) {
 		msg = ERR_PTR(ret);
 		goto out;
 	}
 
+	/* If r_old_dentry is set, then assume that its parent is locked */
 	ret = set_request_path_attr(NULL, req->r_old_dentry,
 			      req->r_old_dentry_dir,
 			      req->r_path2, req->r_ino2.ino,
-			      &path2, &pathlen2, &ino2, &freepath2);
+			      &path2, &pathlen2, &ino2, &freepath2, true);
 	if (ret < 0) {
 		msg = ERR_PTR(ret);
 		goto out_free1;
-- 
cgit v1.2.3


From 246d2bf32da1df979b2bb0b4ca38b9b2b6ac2bf4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 17 Apr 2019 12:58:28 -0400
Subject: ceph: ensure d_name stability in ceph_dentry_hash()

commit 76a495d666e5043ffc315695f8241f5e94a98849 upstream.

Take the d_lock here to ensure that d_name doesn't change.

Cc: stable@vger.kernel.org
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/dir.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82928cea0209..7f3f64ba464f 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1470,6 +1470,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
 unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 {
 	struct ceph_inode_info *dci = ceph_inode(dir);
+	unsigned hash;
 
 	switch (dci->i_dir_layout.dl_dir_hash) {
 	case 0:	/* for backward compat */
@@ -1477,8 +1478,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn)
 		return dn->d_name.hash;
 
 	default:
-		return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+		spin_lock(&dn->d_lock);
+		hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
 				     dn->d_name.name, dn->d_name.len);
+		spin_unlock(&dn->d_lock);
+		return hash;
 	}
 }
 
-- 
cgit v1.2.3


From 950eec8126001c63ec6055d22d370d8ab833ed4f Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Thu, 18 Apr 2019 11:24:57 +0800
Subject: ceph: fix ci->i_head_snapc leak

commit 37659182bff1eeaaeadcfc8f853c6d2b6dbc3f47 upstream.

We missed two places that i_wrbuffer_ref_head, i_wr_ref, i_dirty_caps
and i_flushing_caps may change. When they are all zeros, we should free
i_head_snapc.

Cc: stable@vger.kernel.org
Link: https://tracker.ceph.com/issues/38224
Reported-and-tested-by: Luis Henriques <lhenriques@suse.com>
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/mds_client.c | 9 +++++++++
 fs/ceph/snap.c       | 7 ++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d87ca0bd1bcd..bfcf11c70bfa 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1290,6 +1290,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
 			ci->i_prealloc_cap_flush = NULL;
 		}
+
+               if (drop &&
+                  ci->i_wrbuffer_ref_head == 0 &&
+                  ci->i_wr_ref == 0 &&
+                  ci->i_dirty_caps == 0 &&
+                  ci->i_flushing_caps == 0) {
+                      ceph_put_snap_context(ci->i_head_snapc);
+                      ci->i_head_snapc = NULL;
+               }
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	while (!list_empty(&to_remove)) {
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index f74193da0e09..1f46b02f7314 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -568,7 +568,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 	old_snapc = NULL;
 
 update_snapc:
-	if (ci->i_head_snapc) {
+       if (ci->i_wrbuffer_ref_head == 0 &&
+           ci->i_wr_ref == 0 &&
+           ci->i_dirty_caps == 0 &&
+           ci->i_flushing_caps == 0) {
+               ci->i_head_snapc = NULL;
+       } else {
 		ci->i_head_snapc = ceph_get_snap_context(new_snapc);
 		dout(" new snapc is %p\n", new_snapc);
 	}
-- 
cgit v1.2.3


From b4d4b5e4b83979b8b0888295bda1751b4340747c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trondmy@gmail.com>
Date: Fri, 5 Apr 2019 08:54:37 -0700
Subject: nfsd: Don't release the callback slot unless it was actually held

commit e6abc8caa6deb14be2a206253f7e1c5e37e9515b upstream.

If there are multiple callbacks queued, waiting for the callback
slot when the callback gets shut down, then they all currently
end up acting as if they hold the slot, and call
nfsd4_cb_sequence_done() resulting in interesting side-effects.

In addition, the 'retry_nowait' path in nfsd4_cb_sequence_done()
causes a loop back to nfsd4_cb_prepare() without first freeing the
slot, which causes a deadlock when nfsd41_cb_get_slot() gets called
a second time.

This patch therefore adds a boolean to track whether or not the
callback did pick up the slot, so that it can do the right thing
in these 2 cases.

Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfsd/nfs4callback.c | 8 +++++++-
 fs/nfsd/state.h        | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 601bf33c26a0..ebbb0285addb 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -926,8 +926,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	if (minorversion) {
-		if (!nfsd41_cb_get_slot(clp, task))
+		if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task))
 			return;
+		cb->cb_holds_slot = true;
 	}
 	rpc_call_start(task);
 }
@@ -954,6 +955,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		return true;
 	}
 
+	if (!cb->cb_holds_slot)
+		goto need_restart;
+
 	switch (cb->cb_seq_status) {
 	case 0:
 		/*
@@ -992,6 +996,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 			cb->cb_seq_status);
 	}
 
+	cb->cb_holds_slot = false;
 	clear_bit(0, &clp->cl_cb_slot_busy);
 	rpc_wake_up_next(&clp->cl_cb_waitq);
 	dprintk("%s: freed slot, new seqid=%d\n", __func__,
@@ -1199,6 +1204,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_seq_status = 1;
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
+	cb->cb_holds_slot = false;
 }
 
 void nfsd4_run_cb(struct nfsd4_callback *cb)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0b15dac7e609..0f07ad6dc1ef 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -70,6 +70,7 @@ struct nfsd4_callback {
 	int cb_seq_status;
 	int cb_status;
 	bool cb_need_restart;
+	bool cb_holds_slot;
 };
 
 struct nfsd4_callback_ops {
-- 
cgit v1.2.3


From 4d476a00b3f98334fd8a680dc0d813f4d82a5f3d Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Thu, 25 Apr 2019 22:24:05 -0700
Subject: fs/proc/proc_sysctl.c: Fix a NULL pointer dereference

commit 89189557b47b35683a27c80ee78aef18248eefb4 upstream.

Syzkaller report this:

  sysctl could not get directory: /net//bridge -12
  kasan: CONFIG_KASAN_INLINE enabled
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] SMP KASAN PTI
  CPU: 1 PID: 7027 Comm: syz-executor.0 Tainted: G         C        5.1.0-rc3+ #8
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
  RIP: 0010:__write_once_size include/linux/compiler.h:220 [inline]
  RIP: 0010:__rb_change_child include/linux/rbtree_augmented.h:144 [inline]
  RIP: 0010:__rb_erase_augmented include/linux/rbtree_augmented.h:186 [inline]
  RIP: 0010:rb_erase+0x5f4/0x19f0 lib/rbtree.c:459
  Code: 00 0f 85 60 13 00 00 48 89 1a 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 f2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 75 0c 00 00 4d 85 ed 4c 89 2e 74 ce 4c 89 ea 48
  RSP: 0018:ffff8881bb507778 EFLAGS: 00010206
  RAX: dffffc0000000000 RBX: ffff8881f224b5b8 RCX: ffffffff818f3f6a
  RDX: 000000000000000a RSI: 0000000000000050 RDI: ffff8881f224b568
  RBP: 0000000000000000 R08: ffffed10376a0ef4 R09: ffffed10376a0ef4
  R10: 0000000000000001 R11: ffffed10376a0ef4 R12: ffff8881f224b558
  R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
  FS:  00007f3e7ce13700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fd60fbe9398 CR3: 00000001cb55c001 CR4: 00000000007606e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  PKRU: 55555554
  Call Trace:
   erase_entry fs/proc/proc_sysctl.c:178 [inline]
   erase_header+0xe3/0x160 fs/proc/proc_sysctl.c:207
   start_unregistering fs/proc/proc_sysctl.c:331 [inline]
   drop_sysctl_table+0x558/0x880 fs/proc/proc_sysctl.c:1631
   get_subdir fs/proc/proc_sysctl.c:1022 [inline]
   __register_sysctl_table+0xd65/0x1090 fs/proc/proc_sysctl.c:1335
   br_netfilter_init+0x68/0x1000 [br_netfilter]
   do_one_initcall+0xbc/0x47d init/main.c:901
   do_init_module+0x1b5/0x547 kernel/module.c:3456
   load_module+0x6405/0x8c10 kernel/module.c:3804
   __do_sys_finit_module+0x162/0x190 kernel/module.c:3898
   do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe
  Modules linked in: br_netfilter(+) backlight comedi(C) hid_sensor_hub max3100 ti_ads8688 udc_core fddi snd_mona leds_gpio rc_streamzap mtd pata_netcell nf_log_common rc_winfast udp_tunnel snd_usbmidi_lib snd_usb_toneport snd_usb_line6 snd_rawmidi snd_seq_device snd_hwdep videobuf2_v4l2 videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops rc_gadmei_rm008z 8250_of smm665 hid_tmff hid_saitek hwmon_vid rc_ati_tv_wonder_hd_600 rc_core pata_pdc202xx_old dn_rtmsg as3722 ad714x_i2c ad714x snd_soc_cs4265 hid_kensington panel_ilitek_ili9322 drm drm_panel_orientation_quirks ipack cdc_phonet usbcore phonet hid_jabra hid extcon_arizona can_dev industrialio_triggered_buffer kfifo_buf industrialio adm1031 i2c_mux_ltc4306 i2c_mux ipmi_msghandler mlxsw_core snd_soc_cs35l34 snd_soc_core snd_pcm_dmaengine snd_pcm snd_timer ac97_bus snd_compress snd soundcore gpio_da9055 uio ecdh_generic mdio_thunder of_mdio fixed_phy libphy mdio_cavium iptable_security iptable_raw iptable_mangle
   iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun joydev mousedev ppdev tpm kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ide_pci_generic piix aes_x86_64 crypto_simd cryptd ide_core glue_helper input_leds psmouse intel_agp intel_gtt serio_raw ata_generic i2c_piix4 agpgart pata_acpi parport_pc parport floppy rtc_cmos sch_fq_codel ip_tables x_tables sha1_ssse3 sha1_generic ipv6 [last unloaded: br_netfilter]
  Dumping ftrace buffer:
     (ftrace buffer empty)
  ---[ end trace 68741688d5fbfe85 ]---

commit 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer
dereference in put_links") forgot to handle start_unregistering() case,
while header->parent is NULL, it calls erase_header() and as seen in the
above syzkaller call trace, accessing &header->parent->root will trigger
a NULL pointer dereference.

As that commit explained, there is also no need to call
start_unregistering() if header->parent is NULL.

Link: http://lkml.kernel.org/r/20190409153622.28112-1-yuehaibing@huawei.com
Fixes: 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links")
Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Reported-by: Hulk Robot <hulkci@huawei.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/proc_sysctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d65390727541..7325baa8f9d4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -1626,9 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header)
 	if (--header->nreg)
 		return;
 
-	if (parent)
+	if (parent) {
 		put_links(header);
-	start_unregistering(header);
+		start_unregistering(header);
+	}
+
 	if (!--header->count)
 		kfree_rcu(header, rcu);
 
-- 
cgit v1.2.3


From 94ad68a6e5704b4475f4cc95dd541d21227e749b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sat, 30 Mar 2019 10:21:07 +0900
Subject: NFS: Forbid setting AF_INET6 to "struct sockaddr_in"->sin_family.

commit 7c2bd9a39845bfb6d72ddb55ce737650271f6f96 upstream.

syzbot is reporting uninitialized value at rpc_sockaddr2uaddr() [1]. This
is because syzbot is setting AF_INET6 to "struct sockaddr_in"->sin_family
(which is embedded into user-visible "struct nfs_mount_data" structure)
despite nfs23_validate_mount_data() cannot pass sizeof(struct sockaddr_in6)
bytes of AF_INET6 address to rpc_sockaddr2uaddr().

Since "struct nfs_mount_data" structure is user-visible, we can't change
"struct nfs_mount_data" to use "struct sockaddr_storage". Therefore,
assuming that everybody is using AF_INET family when passing address via
"struct nfs_mount_data"->addr, reject if its sin_family is not AF_INET.

[1] https://syzkaller.appspot.com/bug?id=599993614e7cbbf66bc2656a919ab2a95fb5d75c

Reported-by: syzbot <syzbot+047a11c361b872896a4f@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/super.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6b666d187907..6df9b85caf20 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2052,7 +2052,8 @@ static int nfs23_validate_mount_data(void *options,
 		memcpy(sap, &data->addr, sizeof(data->addr));
 		args->nfs_server.addrlen = sizeof(data->addr);
 		args->nfs_server.port = ntohs(data->addr.sin_port);
-		if (!nfs_verify_server_address(sap))
+		if (sap->sa_family != AF_INET ||
+		    !nfs_verify_server_address(sap))
 			goto out_no_address;
 
 		if (!(data->flags & NFS_MOUNT_TCP))
-- 
cgit v1.2.3


From 9101cbe70ef64c7f35fb75552005a3a696cc288e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 22 Nov 2018 16:44:07 +0100
Subject: aio: clear IOCB_HIPRI

commit 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 upstream.

No one is going to poll for aio (yet), so we must clear the HIPRI
flag, as we would otherwise send it down the poll queues, where no
one will be polling for completions.

Signed-off-by: Christoph Hellwig <hch@lst.de>

IOCB_HIPRI, not RWF_HIPRI.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 45d5ef8dd0a8..78aa249070b1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1438,8 +1438,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
 		ret = ioprio_check_cap(iocb->aio_reqprio);
 		if (ret) {
 			pr_debug("aio ioprio check cap error: %d\n", ret);
-			fput(req->ki_filp);
-			return ret;
+			goto out_fput;
 		}
 
 		req->ki_ioprio = iocb->aio_reqprio;
@@ -1448,7 +1447,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
 
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))
-		fput(req->ki_filp);
+		goto out_fput;
+
+	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
+	return 0;
+
+out_fput:
+	fput(req->ki_filp);
 	return ret;
 }
 
-- 
cgit v1.2.3


From b3373253f0bab538a7521537dfcb73e731b3d732 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 6 Nov 2018 14:27:13 -0700
Subject: aio: use assigned completion handler

commit bc9bff61624ac33b7c95861abea1af24ee7a94fc upstream.

We know this is a read/write request, but in preparation for
having different kinds of those, ensure that we call the assigned
handler instead of assuming it's aio_complete_rq().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 78aa249070b1..3df3fb0678e5 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1492,7 +1492,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 		ret = -EINTR;
 		/*FALLTHRU*/
 	default:
-		aio_complete_rw(req, ret, 0);
+		req->ki_complete(req, ret, 0);
 	}
 }
 
-- 
cgit v1.2.3


From 730198c889d85db78058cfb57c1b41c65f55c94e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Nov 2018 15:57:42 -0700
Subject: aio: separate out ring reservation from req allocation

commit 432c79978c33ecef91b1b04cea6936c20810da29 upstream.

This is in preparation for certain types of IO not needing a ring
reserveration.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 3df3fb0678e5..b9e0df08277b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -902,7 +902,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	local_irq_restore(flags);
 }
 
-static bool get_reqs_available(struct kioctx *ctx)
+static bool __get_reqs_available(struct kioctx *ctx)
 {
 	struct kioctx_cpu *kcpu;
 	bool ret = false;
@@ -994,6 +994,14 @@ static void user_refill_reqs_available(struct kioctx *ctx)
 	spin_unlock_irq(&ctx->completion_lock);
 }
 
+static bool get_reqs_available(struct kioctx *ctx)
+{
+	if (__get_reqs_available(ctx))
+		return true;
+	user_refill_reqs_available(ctx);
+	return __get_reqs_available(ctx);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -1002,24 +1010,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct aio_kiocb *req;
 
-	if (!get_reqs_available(ctx)) {
-		user_refill_reqs_available(ctx);
-		if (!get_reqs_available(ctx))
-			return NULL;
-	}
-
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
-		goto out_put;
+		return NULL;
 
 	percpu_ref_get(&ctx->reqs);
 	INIT_LIST_HEAD(&req->ki_list);
 	refcount_set(&req->ki_refcnt, 0);
 	req->ki_ctx = ctx;
 	return req;
-out_put:
-	put_reqs_available(ctx, 1);
-	return NULL;
 }
 
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
@@ -1813,9 +1812,13 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		return -EINVAL;
 	}
 
+	if (!get_reqs_available(ctx))
+		return -EAGAIN;
+
+	ret = -EAGAIN;
 	req = aio_get_req(ctx);
 	if (unlikely(!req))
-		return -EAGAIN;
+		goto out_put_reqs_available;
 
 	if (iocb.aio_flags & IOCB_FLAG_RESFD) {
 		/*
@@ -1878,11 +1881,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 	return 0;
 out_put_req:
-	put_reqs_available(ctx, 1);
 	percpu_ref_put(&ctx->reqs);
 	if (req->ki_eventfd)
 		eventfd_ctx_put(req->ki_eventfd);
 	kmem_cache_free(kiocb_cachep, req);
+out_put_reqs_available:
+	put_reqs_available(ctx, 1);
 	return ret;
 }
 
-- 
cgit v1.2.3


From ef529eead8cfc11c051b90d239e7137f7141ea94 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 4 Dec 2018 09:44:49 -0700
Subject: aio: don't zero entire aio_kiocb aio_get_req()

commit 2bc4ca9bb600cbe36941da2b2a67189fc4302a04 upstream.

It's 192 bytes, fairly substantial. Most items don't need to be cleared,
especially not upfront. Clear the ones we do need to clear, and leave
the other ones for setup when the iocb is prepared and submitted.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index b9e0df08277b..2547f17b4fef 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1010,14 +1010,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct aio_kiocb *req;
 
-	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
+	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
 	if (unlikely(!req))
 		return NULL;
 
 	percpu_ref_get(&ctx->reqs);
+	req->ki_ctx = ctx;
 	INIT_LIST_HEAD(&req->ki_list);
 	refcount_set(&req->ki_refcnt, 0);
-	req->ki_ctx = ctx;
+	req->ki_eventfd = NULL;
 	return req;
 }
 
@@ -1738,6 +1739,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
 	if (unlikely(!req->file))
 		return -EBADF;
 
+	req->head = NULL;
+	req->woken = false;
+	req->cancelled = false;
+
 	apt.pt._qproc = aio_poll_queue_proc;
 	apt.pt._key = req->events;
 	apt.iocb = aiocb;
-- 
cgit v1.2.3


From 4d677689742ab60d5be46e20708276368564427a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 24 Nov 2018 21:33:09 -0700
Subject: aio: use iocb_put() instead of open coding it

commit 71ebc6fef0f53459f37fb39e1466792232fa52ee upstream.

Replace the percpu_ref_put() + kmem_cache_free() with a call to
iocb_put() instead.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 2547f17b4fef..e2b63ab28ecc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1886,10 +1886,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 	return 0;
 out_put_req:
-	percpu_ref_put(&ctx->reqs);
 	if (req->ki_eventfd)
 		eventfd_ctx_put(req->ki_eventfd);
-	kmem_cache_free(kiocb_cachep, req);
+	iocb_put(req);
 out_put_reqs_available:
 	put_reqs_available(ctx, 1);
 	return ret;
-- 
cgit v1.2.3


From d384f8b855a573ea301fd7f5558cc64cb22107e6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 24 Nov 2018 14:46:14 -0700
Subject: aio: split out iocb copy from io_submit_one()

commit 88a6f18b950e2e4dce57d31daa151105f4f3dcff upstream.

In preparation of handing in iocbs in a different fashion as well. Also
make it clear that the iocb being passed in isn't modified, by marking
it const throughout.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 68 ++++++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index e2b63ab28ecc..6e1da220f04b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1416,7 +1416,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 	aio_complete(iocb, res, res2);
 }
 
-static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
+static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 {
 	int ret;
 
@@ -1457,7 +1457,7 @@ out_fput:
 	return ret;
 }
 
-static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
+static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
 		bool vectored, bool compat, struct iov_iter *iter)
 {
 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
@@ -1496,8 +1496,8 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 	}
 }
 
-static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
-		bool compat)
+static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
+			bool vectored, bool compat)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
@@ -1529,8 +1529,8 @@ out_fput:
 	return ret;
 }
 
-static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
-		bool compat)
+static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
+			 bool vectored, bool compat)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct iov_iter iter;
@@ -1585,7 +1585,8 @@ static void aio_fsync_work(struct work_struct *work)
 	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
 }
 
-static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
+static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
+		     bool datasync)
 {
 	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
 			iocb->aio_rw_flags))
@@ -1719,7 +1720,7 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	add_wait_queue(head, &pt->iocb->poll.wait);
 }
 
-static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
+static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 {
 	struct kioctx *ctx = aiocb->ki_ctx;
 	struct poll_iocb *req = &aiocb->poll;
@@ -1791,27 +1792,23 @@ out:
 	return 0;
 }
 
-static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 bool compat)
+static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
+			   struct iocb __user *user_iocb, bool compat)
 {
 	struct aio_kiocb *req;
-	struct iocb iocb;
 	ssize_t ret;
 
-	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
-		return -EFAULT;
-
 	/* enforce forwards compatibility on users */
-	if (unlikely(iocb.aio_reserved2)) {
+	if (unlikely(iocb->aio_reserved2)) {
 		pr_debug("EINVAL: reserve field set\n");
 		return -EINVAL;
 	}
 
 	/* prevent overflows */
 	if (unlikely(
-	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
-	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
-	    ((ssize_t)iocb.aio_nbytes < 0)
+	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
+	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
+	    ((ssize_t)iocb->aio_nbytes < 0)
 	   )) {
 		pr_debug("EINVAL: overflow check\n");
 		return -EINVAL;
@@ -1825,14 +1822,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	if (unlikely(!req))
 		goto out_put_reqs_available;
 
-	if (iocb.aio_flags & IOCB_FLAG_RESFD) {
+	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
 		 * instance of the file* now. The file descriptor must be
 		 * an eventfd() fd, and will be signaled for each completed
 		 * event using the eventfd_signal() function.
 		 */
-		req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
+		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
 			req->ki_eventfd = NULL;
@@ -1847,32 +1844,32 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	}
 
 	req->ki_user_iocb = user_iocb;
-	req->ki_user_data = iocb.aio_data;
+	req->ki_user_data = iocb->aio_data;
 
-	switch (iocb.aio_lio_opcode) {
+	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
-		ret = aio_read(&req->rw, &iocb, false, compat);
+		ret = aio_read(&req->rw, iocb, false, compat);
 		break;
 	case IOCB_CMD_PWRITE:
-		ret = aio_write(&req->rw, &iocb, false, compat);
+		ret = aio_write(&req->rw, iocb, false, compat);
 		break;
 	case IOCB_CMD_PREADV:
-		ret = aio_read(&req->rw, &iocb, true, compat);
+		ret = aio_read(&req->rw, iocb, true, compat);
 		break;
 	case IOCB_CMD_PWRITEV:
-		ret = aio_write(&req->rw, &iocb, true, compat);
+		ret = aio_write(&req->rw, iocb, true, compat);
 		break;
 	case IOCB_CMD_FSYNC:
-		ret = aio_fsync(&req->fsync, &iocb, false);
+		ret = aio_fsync(&req->fsync, iocb, false);
 		break;
 	case IOCB_CMD_FDSYNC:
-		ret = aio_fsync(&req->fsync, &iocb, true);
+		ret = aio_fsync(&req->fsync, iocb, true);
 		break;
 	case IOCB_CMD_POLL:
-		ret = aio_poll(req, &iocb);
+		ret = aio_poll(req, iocb);
 		break;
 	default:
-		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
+		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
 		ret = -EINVAL;
 		break;
 	}
@@ -1894,6 +1891,17 @@ out_put_reqs_available:
 	return ret;
 }
 
+static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
+			 bool compat)
+{
+	struct iocb iocb;
+
+	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
+		return -EFAULT;
+
+	return __io_submit_one(ctx, &iocb, user_iocb, compat);
+}
+
 /* sys_io_submit:
  *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
  *	the number of iocbs queued.  May return -EINVAL if the aio_context
-- 
cgit v1.2.3


From a812f7b68a3940e0369fd0fb24febec794a67623 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 20 Nov 2018 20:06:23 -0700
Subject: aio: abstract out io_event filler helper

commit 875736bb3f3ded168469f6a14df7a938416a99d5 upstream.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 6e1da220f04b..f6ce01ca6903 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1059,6 +1059,15 @@ static inline void iocb_put(struct aio_kiocb *iocb)
 	}
 }
 
+static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
+			   long res, long res2)
+{
+	ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
+	ev->data = iocb->ki_user_data;
+	ev->res = res;
+	ev->res2 = res2;
+}
+
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
@@ -1086,10 +1095,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
 
-	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
-	event->data = iocb->ki_user_data;
-	event->res = res;
-	event->res2 = res2;
+	aio_fill_event(event, iocb, res, res2);
 
 	kunmap_atomic(ev_page);
 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
-- 
cgit v1.2.3


From 2afa01cd9186974051b38b7d1f31bb2407e41e3a Mon Sep 17 00:00:00 2001
From: Mike Marshall <hubcap@omnibond.com>
Date: Tue, 5 Feb 2019 14:13:35 -0500
Subject: aio: initialize kiocb private in case any filesystems expect it.

commit ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 upstream.

A recent optimization had left private uninitialized.

Fixes: 2bc4ca9bb600 ("aio: don't zero entire aio_kiocb aio_get_req()")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index f6ce01ca6903..d74fc9e112ac 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1430,6 +1430,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	if (unlikely(!req->ki_filp))
 		return -EBADF;
 	req->ki_complete = aio_complete_rw;
+	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
 	req->ki_flags = iocb_flags(req->ki_filp);
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
-- 
cgit v1.2.3


From d6b2615f7d31d8e58b685d42dbafcc7dc1204bbd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 3 Mar 2019 14:23:33 -0800
Subject: aio: simplify - and fix - fget/fput for io_submit()

commit 84c4e1f89fefe70554da0ab33be72c9be7994379 upstream.

Al Viro root-caused a race where the IOCB_CMD_POLL handling of
fget/fput() could cause us to access the file pointer after it had
already been freed:

 "In more details - normally IOCB_CMD_POLL handling looks so:

   1) io_submit(2) allocates aio_kiocb instance and passes it to
      aio_poll()

   2) aio_poll() resolves the descriptor to struct file by req->file =
      fget(iocb->aio_fildes)

   3) aio_poll() sets ->woken to false and raises ->ki_refcnt of that
      aio_kiocb to 2 (bumps by 1, that is).

   4) aio_poll() calls vfs_poll(). After sanity checks (basically,
      "poll_wait() had been called and only once") it locks the queue.
      That's what the extra reference to iocb had been for - we know we
      can safely access it.

   5) With queue locked, we check if ->woken has already been set to
      true (by aio_poll_wake()) and, if it had been, we unlock the
      queue, drop a reference to aio_kiocb and bugger off - at that
      point it's a responsibility to aio_poll_wake() and the stuff
      called/scheduled by it. That code will drop the reference to file
      in req->file, along with the other reference to our aio_kiocb.

   6) otherwise, we see whether we need to wait. If we do, we unlock the
      queue, drop one reference to aio_kiocb and go away - eventual
      wakeup (or cancel) will deal with the reference to file and with
      the other reference to aio_kiocb

   7) otherwise we remove ourselves from waitqueue (still under the
      queue lock), so that wakeup won't get us. No async activity will
      be happening, so we can safely drop req->file and iocb ourselves.

  If wakeup happens while we are in vfs_poll(), we are fine - aio_kiocb
  won't get freed under us, so we can do all the checks and locking
  safely. And we don't touch ->file if we detect that case.

  However, vfs_poll() most certainly *does* touch the file it had been
  given. So wakeup coming while we are still in ->poll() might end up
  doing fput() on that file. That case is not too rare, and usually we
  are saved by the still present reference from descriptor table - that
  fput() is not the final one.

  But if another thread closes that descriptor right after our fget()
  and wakeup does happen before ->poll() returns, we are in trouble -
  final fput() done while we are in the middle of a method:

Al also wrote a patch to take an extra reference to the file descriptor
to fix this, but I instead suggested we just streamline the whole file
pointer handling by submit_io() so that the generic aio submission code
simply keeps the file pointer around until the aio has completed.

Fixes: bfe4037e722e ("aio: implement IOCB_CMD_POLL")
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: syzbot+503d4cc169fcec1cb18c@syzkaller.appspotmail.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 72 ++++++++++++++++++++++++++--------------------------------------
 1 file changed, 29 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index d74fc9e112ac..46229e663b57 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -161,9 +161,13 @@ struct kioctx {
 	unsigned		id;
 };
 
+/*
+ * First field must be the file pointer in all the
+ * iocb unions! See also 'struct kiocb' in <linux/fs.h>
+ */
 struct fsync_iocb {
-	struct work_struct	work;
 	struct file		*file;
+	struct work_struct	work;
 	bool			datasync;
 };
 
@@ -177,8 +181,15 @@ struct poll_iocb {
 	struct work_struct	work;
 };
 
+/*
+ * NOTE! Each of the iocb union members has the file pointer
+ * as the first entry in their struct definition. So you can
+ * access the file pointer through any of the sub-structs,
+ * or directly as just 'ki_filp' in this struct.
+ */
 struct aio_kiocb {
 	union {
+		struct file		*ki_filp;
 		struct kiocb		rw;
 		struct fsync_iocb	fsync;
 		struct poll_iocb	poll;
@@ -1054,6 +1065,8 @@ static inline void iocb_put(struct aio_kiocb *iocb)
 {
 	if (refcount_read(&iocb->ki_refcnt) == 0 ||
 	    refcount_dec_and_test(&iocb->ki_refcnt)) {
+		if (iocb->ki_filp)
+			fput(iocb->ki_filp);
 		percpu_ref_put(&iocb->ki_ctx->reqs);
 		kmem_cache_free(kiocb_cachep, iocb);
 	}
@@ -1418,7 +1431,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 		file_end_write(kiocb->ki_filp);
 	}
 
-	fput(kiocb->ki_filp);
 	aio_complete(iocb, res, res2);
 }
 
@@ -1426,9 +1438,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 {
 	int ret;
 
-	req->ki_filp = fget(iocb->aio_fildes);
-	if (unlikely(!req->ki_filp))
-		return -EBADF;
 	req->ki_complete = aio_complete_rw;
 	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
@@ -1445,7 +1454,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 		ret = ioprio_check_cap(iocb->aio_reqprio);
 		if (ret) {
 			pr_debug("aio ioprio check cap error: %d\n", ret);
-			goto out_fput;
+			return ret;
 		}
 
 		req->ki_ioprio = iocb->aio_reqprio;
@@ -1454,14 +1463,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))
-		goto out_fput;
+		return ret;
 
 	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
 	return 0;
-
-out_fput:
-	fput(req->ki_filp);
-	return ret;
 }
 
 static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
@@ -1515,24 +1520,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb,
 	if (ret)
 		return ret;
 	file = req->ki_filp;
-
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_READ)))
-		goto out_fput;
+		return -EBADF;
 	ret = -EINVAL;
 	if (unlikely(!file->f_op->read_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
 		aio_rw_done(req, call_read_iter(file, req, &iter));
 	kfree(iovec);
-out_fput:
-	if (unlikely(ret))
-		fput(file);
 	return ret;
 }
 
@@ -1549,16 +1549,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
 		return ret;
 	file = req->ki_filp;
 
-	ret = -EBADF;
 	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out_fput;
-	ret = -EINVAL;
+		return -EBADF;
 	if (unlikely(!file->f_op->write_iter))
-		goto out_fput;
+		return -EINVAL;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
 	if (ret)
-		goto out_fput;
+		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
 		/*
@@ -1576,9 +1574,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
 		aio_rw_done(req, call_write_iter(file, req, &iter));
 	}
 	kfree(iovec);
-out_fput:
-	if (unlikely(ret))
-		fput(file);
 	return ret;
 }
 
@@ -1588,7 +1583,6 @@ static void aio_fsync_work(struct work_struct *work)
 	int ret;
 
 	ret = vfs_fsync(req->file, req->datasync);
-	fput(req->file);
 	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
 }
 
@@ -1599,13 +1593,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
 			iocb->aio_rw_flags))
 		return -EINVAL;
 
-	req->file = fget(iocb->aio_fildes);
-	if (unlikely(!req->file))
-		return -EBADF;
-	if (unlikely(!req->file->f_op->fsync)) {
-		fput(req->file);
+	if (unlikely(!req->file->f_op->fsync))
 		return -EINVAL;
-	}
 
 	req->datasync = datasync;
 	INIT_WORK(&req->work, aio_fsync_work);
@@ -1615,10 +1604,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
 
 static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
 {
-	struct file *file = iocb->poll.file;
-
 	aio_complete(iocb, mangle_poll(mask), 0);
-	fput(file);
 }
 
 static void aio_poll_complete_work(struct work_struct *work)
@@ -1743,9 +1729,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 
 	INIT_WORK(&req->work, aio_poll_complete_work);
 	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
-	req->file = fget(iocb->aio_fildes);
-	if (unlikely(!req->file))
-		return -EBADF;
 
 	req->head = NULL;
 	req->woken = false;
@@ -1788,10 +1771,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	spin_unlock_irq(&ctx->ctx_lock);
 
 out:
-	if (unlikely(apt.error)) {
-		fput(req->file);
+	if (unlikely(apt.error))
 		return apt.error;
-	}
 
 	if (mask)
 		aio_poll_complete(aiocb, mask);
@@ -1829,6 +1810,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 	if (unlikely(!req))
 		goto out_put_reqs_available;
 
+	req->ki_filp = fget(iocb->aio_fildes);
+	ret = -EBADF;
+	if (unlikely(!req->ki_filp))
+		goto out_put_req;
+
 	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
 		/*
 		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
-- 
cgit v1.2.3


From c7f2525abfecf8a57a1417837b6a809df79b299e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 6 Mar 2019 20:22:54 -0500
Subject: pin iocb through aio.

commit b53119f13a04879c3bf502828d99d13726639ead upstream.

aio_poll() is not the only case that needs file pinned; worse, while
aio_read()/aio_write() can live without pinning iocb itself, the
proof is rather brittle and can easily break on later changes.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 46229e663b57..10e5a8f52dce 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1016,6 +1016,9 @@ static bool get_reqs_available(struct kioctx *ctx)
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
+ *
+ * The refcount is initialized to 2 - one for the async op completion,
+ * one for the synchronous code that does this.
  */
 static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 {
@@ -1028,7 +1031,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
 	percpu_ref_get(&ctx->reqs);
 	req->ki_ctx = ctx;
 	INIT_LIST_HEAD(&req->ki_list);
-	refcount_set(&req->ki_refcnt, 0);
+	refcount_set(&req->ki_refcnt, 2);
 	req->ki_eventfd = NULL;
 	return req;
 }
@@ -1061,15 +1064,18 @@ out:
 	return ret;
 }
 
+static inline void iocb_destroy(struct aio_kiocb *iocb)
+{
+	if (iocb->ki_filp)
+		fput(iocb->ki_filp);
+	percpu_ref_put(&iocb->ki_ctx->reqs);
+	kmem_cache_free(kiocb_cachep, iocb);
+}
+
 static inline void iocb_put(struct aio_kiocb *iocb)
 {
-	if (refcount_read(&iocb->ki_refcnt) == 0 ||
-	    refcount_dec_and_test(&iocb->ki_refcnt)) {
-		if (iocb->ki_filp)
-			fput(iocb->ki_filp);
-		percpu_ref_put(&iocb->ki_ctx->reqs);
-		kmem_cache_free(kiocb_cachep, iocb);
-	}
+	if (refcount_dec_and_test(&iocb->ki_refcnt))
+		iocb_destroy(iocb);
 }
 
 static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
@@ -1743,9 +1749,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	INIT_LIST_HEAD(&req->wait.entry);
 	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
 
-	/* one for removal from waitqueue, one for this function */
-	refcount_set(&aiocb->ki_refcnt, 2);
-
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
 	if (unlikely(!req->head)) {
 		/* we did not manage to set up a waitqueue, done */
@@ -1776,7 +1779,6 @@ out:
 
 	if (mask)
 		aio_poll_complete(aiocb, mask);
-	iocb_put(aiocb);
 	return 0;
 }
 
@@ -1867,18 +1869,21 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 		break;
 	}
 
+	/* Done with the synchronous reference */
+	iocb_put(req);
+
 	/*
 	 * If ret is 0, we'd either done aio_complete() ourselves or have
 	 * arranged for that to be done asynchronously.  Anything non-zero
 	 * means that we need to destroy req ourselves.
 	 */
-	if (ret)
-		goto out_put_req;
-	return 0;
+	if (!ret)
+		return 0;
+
 out_put_req:
 	if (req->ki_eventfd)
 		eventfd_ctx_put(req->ki_eventfd);
-	iocb_put(req);
+	iocb_destroy(req);
 out_put_reqs_available:
 	put_reqs_available(ctx, 1);
 	return ret;
-- 
cgit v1.2.3


From 592ea630b081a6c97ec56499b0e12f68fd2da2d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 11 Mar 2019 19:00:36 -0400
Subject: aio: fold lookup_kiocb() into its sole caller

commit 833f4154ed560232120bc475935ee1d6a20e159f upstream.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 10e5a8f52dce..cda193f6de76 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1992,24 +1992,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
 }
 #endif
 
-/* lookup_kiocb
- *	Finds a given iocb for cancellation.
- */
-static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
-{
-	struct aio_kiocb *kiocb;
-
-	assert_spin_locked(&ctx->ctx_lock);
-
-	/* TODO: use a hash or array, this sucks. */
-	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
-		if (kiocb->ki_user_iocb == iocb)
-			return kiocb;
-	}
-	return NULL;
-}
-
 /* sys_io_cancel:
  *	Attempts to cancel an iocb previously passed to io_submit.  If
  *	the operation is successfully cancelled, the resulting event is
@@ -2038,10 +2020,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->ctx_lock);
-	kiocb = lookup_kiocb(ctx, iocb);
-	if (kiocb) {
-		ret = kiocb->ki_cancel(&kiocb->rw);
-		list_del_init(&kiocb->ki_list);
+	/* TODO: use a hash or array, this sucks. */
+	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+		if (kiocb->ki_user_iocb == iocb) {
+			ret = kiocb->ki_cancel(&kiocb->rw);
+			list_del_init(&kiocb->ki_list);
+			break;
+		}
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
-- 
cgit v1.2.3


From c20202c51d2b6703a4e539235f892f34daabd791 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Mar 2019 19:43:45 -0500
Subject: aio: keep io_event in aio_kiocb

commit a9339b7855094ba11a97e8822ae038135e879e79 upstream.

We want to separate forming the resulting io_event from putting it
into the ring buffer.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index cda193f6de76..ec30f1bdac0c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -198,8 +198,7 @@ struct aio_kiocb {
 	struct kioctx		*ki_ctx;
 	kiocb_cancel_fn		*ki_cancel;
 
-	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
-	__u64			ki_user_data;	/* user's data for completion */
+	struct io_event		ki_res;
 
 	struct list_head	ki_list;	/* the aio core uses this
 						 * for cancellation */
@@ -1078,15 +1077,6 @@ static inline void iocb_put(struct aio_kiocb *iocb)
 		iocb_destroy(iocb);
 }
 
-static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb,
-			   long res, long res2)
-{
-	ev->obj = (u64)(unsigned long)iocb->ki_user_iocb;
-	ev->data = iocb->ki_user_data;
-	ev->res = res;
-	ev->res2 = res2;
-}
-
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
@@ -1098,6 +1088,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
+	iocb->ki_res.res = res;
+	iocb->ki_res.res2 = res2;
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
@@ -1114,14 +1106,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
 
-	aio_fill_event(event, iocb, res, res2);
+	*event = iocb->ki_res;
 
 	kunmap_atomic(ev_page);
 	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
 
-	pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
-		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
-		 res, res2);
+	pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb,
+		 (void __user *)(unsigned long)iocb->ki_res.obj,
+		 iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2);
 
 	/* after flagging the request as done, we
 	 * must never even look at it again
@@ -1838,8 +1830,10 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 		goto out_put_req;
 	}
 
-	req->ki_user_iocb = user_iocb;
-	req->ki_user_data = iocb->aio_data;
+	req->ki_res.obj = (u64)(unsigned long)user_iocb;
+	req->ki_res.data = iocb->aio_data;
+	req->ki_res.res = 0;
+	req->ki_res.res2 = 0;
 
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
@@ -2009,6 +2003,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	struct aio_kiocb *kiocb;
 	int ret = -EINVAL;
 	u32 key;
+	u64 obj = (u64)(unsigned long)iocb;
 
 	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
@@ -2022,7 +2017,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	spin_lock_irq(&ctx->ctx_lock);
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
-		if (kiocb->ki_user_iocb == iocb) {
+		if (kiocb->ki_res.obj == obj) {
 			ret = kiocb->ki_cancel(&kiocb->rw);
 			list_del_init(&kiocb->ki_list);
 			break;
-- 
cgit v1.2.3


From aab66dfb757aa5b211ec6b0c322b42f4ef5ab34f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Mar 2019 19:49:55 -0500
Subject: aio: store event at final iocb_put()

commit 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 upstream.

Instead of having aio_complete() set ->ki_res.{res,res2}, do that
explicitly in its callers, drop the reference (as aio_complete()
used to do) and delay the rest until the final iocb_put().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index ec30f1bdac0c..556ee620038f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1071,16 +1071,10 @@ static inline void iocb_destroy(struct aio_kiocb *iocb)
 	kmem_cache_free(kiocb_cachep, iocb);
 }
 
-static inline void iocb_put(struct aio_kiocb *iocb)
-{
-	if (refcount_dec_and_test(&iocb->ki_refcnt))
-		iocb_destroy(iocb);
-}
-
 /* aio_complete
  *	Called when the io request on the given iocb is complete.
  */
-static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
+static void aio_complete(struct aio_kiocb *iocb)
 {
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
@@ -1088,8 +1082,6 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 	unsigned tail, pos, head;
 	unsigned long	flags;
 
-	iocb->ki_res.res = res;
-	iocb->ki_res.res2 = res2;
 	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
@@ -1155,7 +1147,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
 
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
-	iocb_put(iocb);
+}
+
+static inline void iocb_put(struct aio_kiocb *iocb)
+{
+	if (refcount_dec_and_test(&iocb->ki_refcnt)) {
+		aio_complete(iocb);
+		iocb_destroy(iocb);
+	}
 }
 
 /* aio_read_events_ring
@@ -1429,7 +1428,9 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
 		file_end_write(kiocb->ki_filp);
 	}
 
-	aio_complete(iocb, res, res2);
+	iocb->ki_res.res = res;
+	iocb->ki_res.res2 = res2;
+	iocb_put(iocb);
 }
 
 static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
@@ -1577,11 +1578,10 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb,
 
 static void aio_fsync_work(struct work_struct *work)
 {
-	struct fsync_iocb *req = container_of(work, struct fsync_iocb, work);
-	int ret;
+	struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
 
-	ret = vfs_fsync(req->file, req->datasync);
-	aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0);
+	iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+	iocb_put(iocb);
 }
 
 static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
@@ -1602,7 +1602,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
 
 static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
 {
-	aio_complete(iocb, mangle_poll(mask), 0);
+	iocb->ki_res.res = mangle_poll(mask);
+	iocb_put(iocb);
 }
 
 static void aio_poll_complete_work(struct work_struct *work)
-- 
cgit v1.2.3


From e9e47779aaa7212ccb75f8d8d4d16ab188efb313 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 Mar 2019 21:45:41 -0500
Subject: Fix aio_poll() races

commit af5c72b1fc7a00aa484e90b0c4e0eeb582545634 upstream.

aio_poll() has to cope with several unpleasant problems:
	* requests that might stay around indefinitely need to
be made visible for io_cancel(2); that must not be done to
a request already completed, though.
	* in cases when ->poll() has placed us on a waitqueue,
wakeup might have happened (and request completed) before ->poll()
returns.
	* worse, in some early wakeup cases request might end
up re-added into the queue later - we can't treat "woken up and
currently not in the queue" as "it's not going to stick around
indefinitely"
	* ... moreover, ->poll() might have decided not to
put it on any queues to start with, and that needs to be distinguished
from the previous case
	* ->poll() might have tried to put us on more than one queue.
Only the first will succeed for aio poll, so we might end up missing
wakeups.  OTOH, we might very well notice that only after the
wakeup hits and request gets completed (all before ->poll() gets
around to the second poll_wait()).  In that case it's too late to
decide that we have an error.

req->woken was an attempt to deal with that.  Unfortunately, it was
broken.  What we need to keep track of is not that wakeup has happened -
the thing might come back after that.  It's that async reference is
already gone and won't come back, so we can't (and needn't) put the
request on the list of cancellables.

The easiest case is "request hadn't been put on any waitqueues"; we
can tell by seeing NULL apt.head, and in that case there won't be
anything async.  We should either complete the request ourselves
(if vfs_poll() reports anything of interest) or return an error.

In all other cases we get exclusion with wakeups by grabbing the
queue lock.

If request is currently on queue and we have something interesting
from vfs_poll(), we can steal it and complete the request ourselves.

If it's on queue and vfs_poll() has not reported anything interesting,
we either put it on the cancellable list, or, if we know that it
hadn't been put on all queues ->poll() wanted it on, we steal it and
return an error.

If it's _not_ on queue, it's either been already dealt with (in which
case we do nothing), or there's aio_poll_complete_work() about to be
executed.  In that case we either put it on the cancellable list,
or, if we know it hadn't been put on all queues ->poll() wanted it on,
simulate what cancel would've done.

It's a lot more convoluted than I'd like it to be.  Single-consumer APIs
suck, and unfortunately aio is not an exception...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/aio.c | 90 +++++++++++++++++++++++++++++-----------------------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 556ee620038f..911e23087dfb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -175,7 +175,7 @@ struct poll_iocb {
 	struct file		*file;
 	struct wait_queue_head	*head;
 	__poll_t		events;
-	bool			woken;
+	bool			done;
 	bool			cancelled;
 	struct wait_queue_entry	wait;
 	struct work_struct	work;
@@ -1600,12 +1600,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
 	return 0;
 }
 
-static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
-{
-	iocb->ki_res.res = mangle_poll(mask);
-	iocb_put(iocb);
-}
-
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1631,9 +1625,11 @@ static void aio_poll_complete_work(struct work_struct *work)
 		return;
 	}
 	list_del_init(&iocb->ki_list);
+	iocb->ki_res.res = mangle_poll(mask);
+	req->done = true;
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	aio_poll_complete(iocb, mask);
+	iocb_put(iocb);
 }
 
 /* assumes we are called with irqs disabled */
@@ -1661,31 +1657,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	__poll_t mask = key_to_poll(key);
 	unsigned long flags;
 
-	req->woken = true;
-
 	/* for instances that support it check for an event match first: */
-	if (mask) {
-		if (!(mask & req->events))
-			return 0;
+	if (mask && !(mask & req->events))
+		return 0;
 
+	list_del_init(&req->wait.entry);
+
+	if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
 		/*
 		 * Try to complete the iocb inline if we can. Use
 		 * irqsave/irqrestore because not all filesystems (e.g. fuse)
 		 * call this function with IRQs disabled and because IRQs
 		 * have to be disabled before ctx_lock is obtained.
 		 */
-		if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
-			list_del(&iocb->ki_list);
-			spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
-
-			list_del_init(&req->wait.entry);
-			aio_poll_complete(iocb, mask);
-			return 1;
-		}
+		list_del(&iocb->ki_list);
+		iocb->ki_res.res = mangle_poll(mask);
+		req->done = true;
+		spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
+		iocb_put(iocb);
+	} else {
+		schedule_work(&req->work);
 	}
-
-	list_del_init(&req->wait.entry);
-	schedule_work(&req->work);
 	return 1;
 }
 
@@ -1717,6 +1709,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	struct kioctx *ctx = aiocb->ki_ctx;
 	struct poll_iocb *req = &aiocb->poll;
 	struct aio_poll_table apt;
+	bool cancel = false;
 	__poll_t mask;
 
 	/* reject any unknown events outside the normal event mask. */
@@ -1730,7 +1723,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
 
 	req->head = NULL;
-	req->woken = false;
+	req->done = false;
 	req->cancelled = false;
 
 	apt.pt._qproc = aio_poll_queue_proc;
@@ -1743,36 +1736,33 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
 
 	mask = vfs_poll(req->file, &apt.pt) & req->events;
-	if (unlikely(!req->head)) {
-		/* we did not manage to set up a waitqueue, done */
-		goto out;
-	}
-
 	spin_lock_irq(&ctx->ctx_lock);
-	spin_lock(&req->head->lock);
-	if (req->woken) {
-		/* wake_up context handles the rest */
-		mask = 0;
+	if (likely(req->head)) {
+		spin_lock(&req->head->lock);
+		if (unlikely(list_empty(&req->wait.entry))) {
+			if (apt.error)
+				cancel = true;
+			apt.error = 0;
+			mask = 0;
+		}
+		if (mask || apt.error) {
+			list_del_init(&req->wait.entry);
+		} else if (cancel) {
+			WRITE_ONCE(req->cancelled, true);
+		} else if (!req->done) { /* actually waiting for an event */
+			list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
+			aiocb->ki_cancel = aio_poll_cancel;
+		}
+		spin_unlock(&req->head->lock);
+	}
+	if (mask) { /* no async, we'd stolen it */
+		aiocb->ki_res.res = mangle_poll(mask);
 		apt.error = 0;
-	} else if (mask || apt.error) {
-		/* if we get an error or a mask we are done */
-		WARN_ON_ONCE(list_empty(&req->wait.entry));
-		list_del_init(&req->wait.entry);
-	} else {
-		/* actually waiting for an event */
-		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
-		aiocb->ki_cancel = aio_poll_cancel;
 	}
-	spin_unlock(&req->head->lock);
 	spin_unlock_irq(&ctx->ctx_lock);
-
-out:
-	if (unlikely(apt.error))
-		return apt.error;
-
 	if (mask)
-		aio_poll_complete(aiocb, mask);
-	return 0;
+		iocb_put(aiocb);
+	return apt.error;
 }
 
 static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
-- 
cgit v1.2.3


From 0311ff82b70fa12e80d188635bff24029ec06ae1 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Fri, 5 Apr 2019 14:02:10 -0700
Subject: fs: prevent page refcount overflow in pipe_buf_get

commit 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb upstream.

Change pipe_buf_get() to return a bool indicating whether it succeeded
in raising the refcount of the page (if the thing in the pipe is a page).
This removes another mechanism for overflowing the page refcount.  All
callers converted to handle a failure.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Matthew Wilcox <willy@infradead.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/dev.c | 12 ++++++------
 fs/pipe.c     |  4 ++--
 fs/splice.c   | 12 ++++++++++--
 3 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index baaed4d05b22..249de20f752a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1989,10 +1989,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
 
 	ret = -EINVAL;
-	if (rem < len) {
-		pipe_unlock(pipe);
-		goto out;
-	}
+	if (rem < len)
+		goto out_free;
 
 	rem = len;
 	while (rem) {
@@ -2010,7 +2008,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 		} else {
-			pipe_buf_get(pipe, ibuf);
+			if (!pipe_buf_get(pipe, ibuf))
+				goto out_free;
+
 			*obuf = *ibuf;
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 			obuf->len = rem;
@@ -2033,11 +2033,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	ret = fuse_dev_do_write(fud, &cs, len);
 
 	pipe_lock(pipe);
+out_free:
 	for (idx = 0; idx < nbuf; idx++)
 		pipe_buf_release(pipe, &bufs[idx]);
 	pipe_unlock(pipe);
 
-out:
 	kvfree(bufs);
 	return ret;
 }
diff --git a/fs/pipe.c b/fs/pipe.c
index c51750ed4011..2a297bce381f 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -189,9 +189,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
  *	in the tee() system call, when we duplicate the buffers in one
  *	pipe into another.
  */
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-	get_page(buf->page);
+	return try_get_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
 
diff --git a/fs/splice.c b/fs/splice.c
index c78e0e3ff6c4..485e409ef841 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1584,7 +1584,11 @@ retry:
 			 * Get a reference to this pipe buffer,
 			 * so we can copy the contents over.
 			 */
-			pipe_buf_get(ipipe, ibuf);
+			if (!pipe_buf_get(ipipe, ibuf)) {
+				if (ret == 0)
+					ret = -EFAULT;
+				break;
+			}
 			*obuf = *ibuf;
 
 			/*
@@ -1658,7 +1662,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * Get a reference to this pipe buffer,
 		 * so we can copy the contents over.
 		 */
-		pipe_buf_get(ipipe, ibuf);
+		if (!pipe_buf_get(ipipe, ibuf)) {
+			if (ret == 0)
+				ret = -EFAULT;
+			break;
+		}
 
 		obuf = opipe->bufs + nbuf;
 		*obuf = *ibuf;
-- 
cgit v1.2.3


From b989a3e9d260e2b567f5ae501fdf445f626911c1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Thu, 21 Mar 2019 17:57:56 -0400
Subject: NFS: Fix a typo in nfs_init_timeout_values()

[ Upstream commit 5a698243930c441afccec04e4d5dc8febfd2b775 ]

Specifying a retrans=0 mount parameter to a NFS/TCP mount, is
inadvertently causing the NFS client to rewrite any specified
timeout parameter to the default of 60 seconds.

Fixes: a956beda19a6 ("NFS: Allow the mount option retrans=0")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Sasha Levin (Microsoft) <sashal@kernel.org>
---
 fs/nfs/client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 96d5f8135eb9..751ca65da8a3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -459,7 +459,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
 	case XPRT_TRANSPORT_RDMA:
 		if (retrans == NFS_UNSPEC_RETRANS)
 			to->to_retries = NFS_DEF_TCP_RETRANS;
-		if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
+		if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
 			to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
 		if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
 			to->to_initval = NFS_MAX_TCP_TIMEOUT;
-- 
cgit v1.2.3


From 97c4f3a8853c50968a40e183dabd2ec27feba191 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Mar 2019 01:38:58 +0000
Subject: ceph: fix use-after-free on symlink traversal

[ Upstream commit daf5cc27eed99afdea8d96e71b89ba41f5406ef6 ]

free the symlink body after the same RCU delay we have for freeing the
struct inode itself, so that traversal during RCU pathwalk wouldn't step
into freed memory.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Sasha Levin (Microsoft) <sashal@kernel.org>
---
 fs/ceph/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4055ab4d5c52..3e518c2ae2bf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
+	kfree(ci->i_symlink);
 	kmem_cache_free(ceph_inode_cachep, ci);
 }
 
@@ -561,7 +562,6 @@ void ceph_destroy_inode(struct inode *inode)
 		ceph_put_snap_realm(mdsc, realm);
 	}
 
-	kfree(ci->i_symlink);
 	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
 		frag = rb_entry(n, struct ceph_inode_frag, node);
 		rb_erase(n, &ci->i_fragtree);
-- 
cgit v1.2.3


From e22c11da0a8683d22011bbce18da493c079d67b3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Mar 2019 01:39:50 +0000
Subject: jffs2: fix use-after-free on symlink traversal

[ Upstream commit 4fdcfab5b5537c21891e22e65996d4d0dd8ab4ca ]

free the symlink body after the same RCU delay we have for freeing the
struct inode itself, so that traversal during RCU pathwalk wouldn't step
into freed memory.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/jffs2/readinode.c | 5 -----
 fs/jffs2/super.c     | 5 ++++-
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 389ea53ea487..bccfc40b3a74 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f)
 
 	jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL);
 
-	if (f->target) {
-		kfree(f->target);
-		f->target = NULL;
-	}
-
 	fds = f->dents;
 	while(fds) {
 		fd = fds;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index bb6ae387469f..05d892c79339 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
 static void jffs2_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
+	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+
+	kfree(f->target);
+	kmem_cache_free(jffs2_inode_cachep, f);
 }
 
 static void jffs2_destroy_inode(struct inode *inode)
-- 
cgit v1.2.3


From f0112b649525fda883ae77db7d0774cb4da47cb8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 26 Mar 2019 01:43:37 +0000
Subject: debugfs: fix use-after-free on symlink traversal

[ Upstream commit 93b919da64c15b90953f96a536e5e61df896ca57 ]

symlink body shouldn't be freed without an RCU delay.  Switch debugfs to
->destroy_inode() and use of call_rcu(); free both the inode and symlink
body in the callback.  Similar to solution for bpf, only here it's even
more obvious that ->evict_inode() can be dropped.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/debugfs/inode.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 41ef452c1fcf..e5126fad57c5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root)
 	return 0;
 }
 
-static void debugfs_evict_inode(struct inode *inode)
+static void debugfs_i_callback(struct rcu_head *head)
 {
-	truncate_inode_pages_final(&inode->i_data);
-	clear_inode(inode);
+	struct inode *inode = container_of(head, struct inode, i_rcu);
 	if (S_ISLNK(inode->i_mode))
 		kfree(inode->i_link);
+	free_inode_nonrcu(inode);
+}
+
+static void debugfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, debugfs_i_callback);
 }
 
 static const struct super_operations debugfs_super_operations = {
 	.statfs		= simple_statfs,
 	.remount_fs	= debugfs_remount,
 	.show_options	= debugfs_show_options,
-	.evict_inode	= debugfs_evict_inode,
+	.destroy_inode	= debugfs_destroy_inode,
 };
 
 static void debugfs_release_dentry(struct dentry *dentry)
-- 
cgit v1.2.3


From b51fdcbe45d1f09b5ecea5d2b796d37b8e6cbb4c Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Fri, 5 Apr 2019 18:39:06 -0700
Subject: hugetlbfs: fix memory leak for resv_map

[ Upstream commit 58b6e5e8f1addd44583d61b0a03c0f5519527e35 ]

When mknod is used to create a block special file in hugetlbfs, it will
allocate an inode and kmalloc a 'struct resv_map' via resv_map_alloc().
inode->i_mapping->private_data will point the newly allocated resv_map.
However, when the device special file is opened bd_acquire() will set
inode->i_mapping to bd_inode->i_mapping.  Thus the pointer to the
allocated resv_map is lost and the structure is leaked.

Programs to reproduce:
        mount -t hugetlbfs nodev hugetlbfs
        mknod hugetlbfs/dev b 0 0
        exec 30<> hugetlbfs/dev
        umount hugetlbfs/

resv_map structures are only needed for inodes which can have associated
page allocations.  To fix the leak, only allocate resv_map for those
inodes which could possibly be associated with page allocations.

Link: http://lkml.kernel.org/r/20190401213101.16476-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Yufen Yu <yuyufen@huawei.com>
Suggested-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/hugetlbfs/inode.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a7fa037b876b..a3a3d256fb0e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -741,11 +741,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 					umode_t mode, dev_t dev)
 {
 	struct inode *inode;
-	struct resv_map *resv_map;
+	struct resv_map *resv_map = NULL;
 
-	resv_map = resv_map_alloc();
-	if (!resv_map)
-		return NULL;
+	/*
+	 * Reserve maps are only needed for inodes that can have associated
+	 * page allocations.
+	 */
+	if (S_ISREG(mode) || S_ISLNK(mode)) {
+		resv_map = resv_map_alloc();
+		if (!resv_map)
+			return NULL;
+	}
 
 	inode = new_inode(sb);
 	if (inode) {
@@ -780,8 +786,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 			break;
 		}
 		lockdep_annotate_inode_mutex_key(inode);
-	} else
-		kref_put(&resv_map->refs, resv_map_release);
+	} else {
+		if (resv_map)
+			kref_put(&resv_map->refs, resv_map_release);
+	}
 
 	return inode;
 }
-- 
cgit v1.2.3


From 04b4d5f75ab04858a9b6dff94d74e4fdf0b9f194 Mon Sep 17 00:00:00 2001
From: Kirill Smelkov <kirr@nexedi.com>
Date: Tue, 26 Mar 2019 22:20:43 +0000
Subject: fs: stream_open - opener for stream-like files so that read and write
 can run simultaneously without deadlock

[ Upstream commit 10dce8af34226d90fa56746a934f8da5dcdba3df ]

Commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") added
locking for file.f_pos access and in particular made concurrent read and
write not possible - now both those functions take f_pos lock for the
whole run, and so if e.g. a read is blocked waiting for data, write will
deadlock waiting for that read to complete.

This caused regression for stream-like files where previously read and
write could run simultaneously, but after that patch could not do so
anymore. See e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes
to /proc/xen/xenbus") which fixes such regression for particular case of
/proc/xen/xenbus.

The patch that added f_pos lock in 2014 did so to guarantee POSIX thread
safety for read/write/lseek and added the locking to file descriptors of
all regular files. In 2014 that thread-safety problem was not new as it
was already discussed earlier in 2006.

However even though 2006'th version of Linus's patch was adding f_pos
locking "only for files that are marked seekable with FMODE_LSEEK (thus
avoiding the stream-like objects like pipes and sockets)", the 2014
version - the one that actually made it into the tree as 9c225f2655e3 -
is doing so irregardless of whether a file is seekable or not.

See

    https://lore.kernel.org/lkml/53022DB1.4070805@gmail.com/
    https://lwn.net/Articles/180387
    https://lwn.net/Articles/180396

for historic context.

The reason that it did so is, probably, that there are many files that
are marked non-seekable, but e.g. their read implementation actually
depends on knowing current position to correctly handle the read. Some
examples:

	kernel/power/user.c		snapshot_read
	fs/debugfs/file.c		u32_array_read
	fs/fuse/control.c		fuse_conn_waiting_read + ...
	drivers/hwmon/asus_atk0110.c	atk_debugfs_ggrp_read
	arch/s390/hypfs/inode.c		hypfs_read_iter
	...

Despite that, many nonseekable_open users implement read and write with
pure stream semantics - they don't depend on passed ppos at all. And for
those cases where read could wait for something inside, it creates a
situation similar to xenbus - the write could be never made to go until
read is done, and read is waiting for some, potentially external, event,
for potentially unbounded time -> deadlock.

Besides xenbus, there are 14 such places in the kernel that I've found
with semantic patch (see below):

	drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write()
	drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write()
	drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write()
	drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write()
	net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write()
	drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write()
	drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write()
	drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write()
	net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write()
	drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write()
	drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write()
	drivers/input/misc/uinput.c:400:1-17: ERROR: uinput_fops: .read() can deadlock .write()
	drivers/infiniband/core/user_mad.c:985:7-23: ERROR: umad_fops: .read() can deadlock .write()
	drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write()

In addition to the cases above another regression caused by f_pos
locking is that now FUSE filesystems that implement open with
FOPEN_NONSEEKABLE flag, can no longer implement bidirectional
stream-like files - for the same reason as above e.g. read can deadlock
write locking on file.f_pos in the kernel.

FUSE's FOPEN_NONSEEKABLE was added in 2008 in a7c1b990f715 ("fuse:
implement nonseekable open") to support OSSPD. OSSPD implements /dev/dsp
in userspace with FOPEN_NONSEEKABLE flag, with corresponding read and
write routines not depending on current position at all, and with both
read and write being potentially blocking operations:

See

    https://github.com/libfuse/osspd
    https://lwn.net/Articles/308445

    https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1406
    https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1438-L1477
    https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1479-L1510

Corresponding libfuse example/test also describes FOPEN_NONSEEKABLE as
"somewhat pipe-like files ..." with read handler not using offset.
However that test implements only read without write and cannot exercise
the deadlock scenario:

    https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L124-L131
    https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L146-L163
    https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L209-L216

I've actually hit the read vs write deadlock for real while implementing
my FUSE filesystem where there is /head/watch file, for which open
creates separate bidirectional socket-like stream in between filesystem
and its user with both read and write being later performed
simultaneously. And there it is semantically not easy to split the
stream into two separate read-only and write-only channels:

    https://lab.nexedi.com/kirr/wendelin.core/blob/f13aa600/wcfs/wcfs.go#L88-169

Let's fix this regression. The plan is:

1. We can't change nonseekable_open to include &~FMODE_ATOMIC_POS -
   doing so would break many in-kernel nonseekable_open users which
   actually use ppos in read/write handlers.

2. Add stream_open() to kernel to open stream-like non-seekable file
   descriptors. Read and write on such file descriptors would never use
   nor change ppos. And with that property on stream-like files read and
   write will be running without taking f_pos lock - i.e. read and write
   could be running simultaneously.

3. With semantic patch search and convert to stream_open all in-kernel
   nonseekable_open users for which read and write actually do not
   depend on ppos and where there is no other methods in file_operations
   which assume @offset access.

4. Add FOPEN_STREAM to fs/fuse/ and open in-kernel file-descriptors via
   steam_open if that bit is present in filesystem open reply.

   It was tempting to change fs/fuse/ open handler to use stream_open
   instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but
   grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE,
   and in particular GVFS which actually uses offset in its read and
   write handlers

	https://codesearch.debian.net/search?q=-%3Enonseekable+%3D
	https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080
	https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346
	https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481

   so if we would do such a change it will break a real user.

5. Add stream_open and FOPEN_STREAM handling to stable kernels starting
   from v3.14+ (the kernel where 9c225f2655 first appeared).

   This will allow to patch OSSPD and other FUSE filesystems that
   provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE
   in their open handler and this way avoid the deadlock on all kernel
   versions. This should work because fs/fuse/ ignores unknown open
   flags returned from a filesystem and so passing FOPEN_STREAM to a
   kernel that is not aware of this flag cannot hurt. In turn the kernel
   that is not aware of FOPEN_STREAM will be < v3.14 where just
   FOPEN_NONSEEKABLE is sufficient to implement streams without read vs
   write deadlock.

This patch adds stream_open, converts /proc/xen/xenbus to it and adds
semantic patch to automatically locate in-kernel places that are either
required to be converted due to read vs write deadlock, or that are just
safe to be converted because read and write do not use ppos and there
are no other funky methods in file_operations.

Regarding semantic patch I've verified each generated change manually -
that it is correct to convert - and each other nonseekable_open instance
left - that it is either not correct to convert there, or that it is not
converted due to current stream_open.cocci limitations.

The script also does not convert files that should be valid to convert,
but that currently have .llseek = noop_llseek or generic_file_llseek for
unknown reason despite file being opened with nonseekable_open (e.g.
drivers/input/mousedev.c)

Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Yongzhi Pan <panyongzhi@gmail.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Tejun Heo <tj@kernel.org>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Julia Lawall <Julia.Lawall@lip6.fr>
Cc: Nikolaus Rath <Nikolaus@rath.org>
Cc: Han-Wen Nienhuys <hanwen@google.com>
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/open.c       | 18 ++++++++++++++++++
 fs/read_write.c |  5 +++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index f1c2f855fd43..a00350018a47 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1215,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp)
 }
 
 EXPORT_SYMBOL(nonseekable_open);
+
+/*
+ * stream_open is used by subsystems that want stream-like file descriptors.
+ * Such file descriptors are not seekable and don't have notion of position
+ * (file.f_pos is always 0). Contrary to file descriptors of other regular
+ * files, .read() and .write() can run simultaneously.
+ *
+ * stream_open never fails and is marked to return int so that it could be
+ * directly used as file_operations.open .
+ */
+int stream_open(struct inode *inode, struct file *filp)
+{
+	filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
+	filp->f_mode |= FMODE_STREAM;
+	return 0;
+}
+
+EXPORT_SYMBOL(stream_open);
diff --git a/fs/read_write.c b/fs/read_write.c
index 562974a0616c..85fd7a8ee29e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 
 static inline loff_t file_pos_read(struct file *file)
 {
-	return file->f_pos;
+	return file->f_mode & FMODE_STREAM ? 0 : file->f_pos;
 }
 
 static inline void file_pos_write(struct file *file, loff_t pos)
 {
-	file->f_pos = pos;
+	if ((file->f_mode & FMODE_STREAM) == 0)
+		file->f_pos = pos;
 }
 
 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
-- 
cgit v1.2.3


From e361ccccdd51f45a2d54f6454192acda06cb2598 Mon Sep 17 00:00:00 2001
From: Andrea Parri <andrea.parri@amarulasolutions.com>
Date: Tue, 16 Apr 2019 14:17:11 +0200
Subject: kernfs: fix barrier usage in __kernfs_new_node()

commit 998267900cee901c5d1dfa029a6304d00acbc29f upstream.

smp_mb__before_atomic() can not be applied to atomic_set().  Remove the
barrier and rely on RELEASE synchronization.

Fixes: ba16b2846a8c6 ("kernfs: add an API to get kernfs node from inode number")
Cc: stable@vger.kernel.org
Signed-off-by: Andrea Parri <andrea.parri@amarulasolutions.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/kernfs/dir.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 4ca0b5c18192..853a69e493f5 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -650,11 +650,10 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	kn->id.generation = gen;
 
 	/*
-	 * set ino first. This barrier is paired with atomic_inc_not_zero in
+	 * set ino first. This RELEASE is paired with atomic_inc_not_zero in
 	 * kernfs_find_and_get_node_by_ino
 	 */
-	smp_mb__before_atomic();
-	atomic_set(&kn->count, 1);
+	atomic_set_release(&kn->count, 1);
 	atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
 	RB_CLEAR_NODE(&kn->rb);
 
-- 
cgit v1.2.3


From 58be7c109cea9ea619dde5ac504ef9e001a4c2c0 Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.dionne@auristor.com>
Date: Sat, 13 Apr 2019 08:37:37 +0100
Subject: afs: Unlock pages for __pagevec_release()

[ Upstream commit 21bd68f196ca91fc0f3d9bd1b32f6e530e8c1c88 ]

__pagevec_release() complains loudly if any page in the vector is still
locked.  The pages need to be locked for generic_error_remove_page(), but
that function doesn't actually unlock them.

Unlock the pages afterwards.

Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Jonathan Billings <jsbillin@umich.edu>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/afs/write.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 19c04caf3c01..e00461a6de9a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -253,6 +253,7 @@ static void afs_kill_pages(struct address_space *mapping,
 				first = page->index + 1;
 			lock_page(page);
 			generic_error_remove_page(mapping, page);
+			unlock_page(page);
 		}
 
 		__pagevec_release(&pv);
-- 
cgit v1.2.3


From 58db3813680ef3fd74dea8154124831043dd3860 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 13 May 2019 17:15:33 -0700
Subject: mm/huge_memory: fix vmf_insert_pfn_{pmd, pud}() crash, handle
 unaligned addresses

commit fce86ff5802bac3a7b19db171aa1949ef9caac31 upstream.

Starting with c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page
protection by insert_pfn_pmd()") vmf_insert_pfn_pmd() internally calls
pmdp_set_access_flags().  That helper enforces a pmd aligned @address
argument via VM_BUG_ON() assertion.

Update the implementation to take a 'struct vm_fault' argument directly
and apply the address alignment fixup internally to fix crash signatures
like:

    kernel BUG at arch/x86/mm/pgtable.c:515!
    invalid opcode: 0000 [#1] SMP NOPTI
    CPU: 51 PID: 43713 Comm: java Tainted: G           OE     4.19.35 #1
    [..]
    RIP: 0010:pmdp_set_access_flags+0x48/0x50
    [..]
    Call Trace:
     vmf_insert_pfn_pmd+0x198/0x350
     dax_iomap_fault+0xe82/0x1190
     ext4_dax_huge_fault+0x103/0x1f0
     ? __switch_to_asm+0x40/0x70
     __handle_mm_fault+0x3f6/0x1370
     ? __switch_to_asm+0x34/0x70
     ? __switch_to_asm+0x40/0x70
     handle_mm_fault+0xda/0x200
     __do_page_fault+0x249/0x4f0
     do_page_fault+0x32/0x110
     ? page_fault+0x8/0x30
     page_fault+0x1e/0x30

Link: http://lkml.kernel.org/r/155741946350.372037.11148198430068238140.stgit@dwillia2-desk3.amr.corp.intel.com
Fixes: c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()")
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reported-by: Piotr Balcer <piotr.balcer@intel.com>
Tested-by: Yan Ma <yan.ma@intel.com>
Tested-by: Pankaj Gupta <pagupta@redhat.com>
Reviewed-by: Matthew Wilcox <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Chandan Rajendra <chandan@linux.ibm.com>
Cc: Souptick Joarder <jrdr.linux@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dax.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 09fa70683c41..004c8ac1117c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1660,8 +1660,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		}
 
 		trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
-		result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
-					    write);
+		result = vmf_insert_pfn_pmd(vmf, pfn, write);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
@@ -1775,8 +1774,7 @@ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
 		break;
 #ifdef CONFIG_FS_DAX_PMD
 	case PE_SIZE_PMD:
-		ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			pfn, true);
+		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
 		break;
 #endif
 	default:
-- 
cgit v1.2.3


From a3ccc156f365a9615890b9fbf99d8a663f2c4b56 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Mon, 13 May 2019 17:19:41 -0700
Subject: hugetlb: use same fault hash key for shared and private mappings

commit 1b426bac66e6cc83c9f2d92b96e4e72acf43419a upstream.

hugetlb uses a fault mutex hash table to prevent page faults of the
same pages concurrently.  The key for shared and private mappings is
different.  Shared keys off address_space and file index.  Private keys
off mm and virtual address.  Consider a private mappings of a populated
hugetlbfs file.  A fault will map the page from the file and if needed
do a COW to map a writable page.

Hugetlbfs hole punch uses the fault mutex to prevent mappings of file
pages.  It uses the address_space file index key.  However, private
mappings will use a different key and could race with this code to map
the file page.  This causes problems (BUG) for the page cache remove
code as it expects the page to be unmapped.  A sample stack is:

page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
kernel BUG at mm/filemap.c:169!
...
RIP: 0010:unaccount_page_cache_page+0x1b8/0x200
...
Call Trace:
__delete_from_page_cache+0x39/0x220
delete_from_page_cache+0x45/0x70
remove_inode_hugepages+0x13c/0x380
? __add_to_page_cache_locked+0x162/0x380
hugetlbfs_fallocate+0x403/0x540
? _cond_resched+0x15/0x30
? __inode_security_revalidate+0x5d/0x70
? selinux_file_permission+0x100/0x130
vfs_fallocate+0x13f/0x270
ksys_fallocate+0x3c/0x80
__x64_sys_fallocate+0x1a/0x20
do_syscall_64+0x5b/0x180
entry_SYSCALL_64_after_hwframe+0x44/0xa9

There seems to be another potential COW issue/race with this approach
of different private and shared keys as noted in commit 8382d914ebf7
("mm, hugetlb: improve page-fault scalability").

Since every hugetlb mapping (even anon and private) is actually a file
mapping, just use the address_space index key for all mappings.  This
results in potentially more hash collisions.  However, this should not
be the common case.

Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com
Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5
Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/hugetlbfs/inode.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a3a3d256fb0e..7a24f91af29e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -426,9 +426,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 			u32 hash;
 
 			index = page->index;
-			hash = hugetlb_fault_mutex_hash(h, current->mm,
-							&pseudo_vma,
-							mapping, index, 0);
+			hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 			/*
@@ -625,8 +623,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		addr = index * hpage_size;
 
 		/* mutex taken here, fault path and hole punch */
-		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
-						index, addr);
+		hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
 		/* See if already present in mapping to avoid alloc/free */
-- 
cgit v1.2.3


From 3574bc98e2fe3f66090682dd91ebd9b96d0236f3 Mon Sep 17 00:00:00 2001
From: Shuning Zhang <sunny.s.zhang@oracle.com>
Date: Mon, 13 May 2019 17:15:56 -0700
Subject: ocfs2: fix ocfs2 read inode data panic in ocfs2_iget

commit e091eab028f9253eac5c04f9141bbc9d170acab3 upstream.

In some cases, ocfs2_iget() reads the data of inode, which has been
deleted for some reason.  That will make the system panic.  So We should
judge whether this inode has been deleted, and tell the caller that the
inode is a bad inode.

For example, the ocfs2 is used as the backed of nfs, and the client is
nfsv3.  This issue can be reproduced by the following steps.

on the nfs server side,
..../patha/pathb

Step 1: The process A was scheduled before calling the function fh_verify.

Step 2: The process B is removing the 'pathb', and just completed the call
to function dput.  Then the dentry of 'pathb' has been deleted from the
dcache, and all ancestors have been deleted also.  The relationship of
dentry and inode was deleted through the function hlist_del_init.  The
following is the call stack.
dentry_iput->hlist_del_init(&dentry->d_u.d_alias)

At this time, the inode is still in the dcache.

Step 3: The process A call the function ocfs2_get_dentry, which get the
inode from dcache.  Then the refcount of inode is 1.  The following is the
call stack.
nfsd3_proc_getacl->fh_verify->exportfs_decode_fh->fh_to_dentry(ocfs2_get_dentry)

Step 4: Dirty pages are flushed by bdi threads.  So the inode of 'patha'
is evicted, and this directory was deleted.  But the inode of 'pathb'
can't be evicted, because the refcount of the inode was 1.

Step 5: The process A keep running, and call the function
reconnect_path(in exportfs_decode_fh), which call function
ocfs2_get_parent of ocfs2.  Get the block number of parent
directory(patha) by the name of ...  Then read the data from disk by the
block number.  But this inode has been deleted, so the system panic.

Process A                                             Process B
1. in nfsd3_proc_getacl                   |
2.                                        |        dput
3. fh_to_dentry(ocfs2_get_dentry)         |
4. bdi flush dirty cache                  |
5. ocfs2_iget                             |

[283465.542049] OCFS2: ERROR (device sdp): ocfs2_validate_inode_block:
Invalid dinode #580640: OCFS2_VALID_FL not set

[283465.545490] Kernel panic - not syncing: OCFS2: (device sdp): panic forced
after error

[283465.546889] CPU: 5 PID: 12416 Comm: nfsd Tainted: G        W
4.1.12-124.18.6.el6uek.bug28762940v3.x86_64 #2
[283465.548382] Hardware name: VMware, Inc. VMware Virtual Platform/440BX
Desktop Reference Platform, BIOS 6.00 09/21/2015
[283465.549657]  0000000000000000 ffff8800a56fb7b8 ffffffff816e839c
ffffffffa0514758
[283465.550392]  000000000008dc20 ffff8800a56fb838 ffffffff816e62d3
0000000000000008
[283465.551056]  ffff880000000010 ffff8800a56fb848 ffff8800a56fb7e8
ffff88005df9f000
[283465.551710] Call Trace:
[283465.552516]  [<ffffffff816e839c>] dump_stack+0x63/0x81
[283465.553291]  [<ffffffff816e62d3>] panic+0xcb/0x21b
[283465.554037]  [<ffffffffa04e66b0>] ocfs2_handle_error+0xf0/0xf0 [ocfs2]
[283465.554882]  [<ffffffffa04e7737>] __ocfs2_error+0x67/0x70 [ocfs2]
[283465.555768]  [<ffffffffa049c0f9>] ocfs2_validate_inode_block+0x229/0x230
[ocfs2]
[283465.556683]  [<ffffffffa047bcbc>] ocfs2_read_blocks+0x46c/0x7b0 [ocfs2]
[283465.557408]  [<ffffffffa049bed0>] ? ocfs2_inode_cache_io_unlock+0x20/0x20
[ocfs2]
[283465.557973]  [<ffffffffa049f0eb>] ocfs2_read_inode_block_full+0x3b/0x60
[ocfs2]
[283465.558525]  [<ffffffffa049f5ba>] ocfs2_iget+0x4aa/0x880 [ocfs2]
[283465.559082]  [<ffffffffa049146e>] ocfs2_get_parent+0x9e/0x220 [ocfs2]
[283465.559622]  [<ffffffff81297c05>] reconnect_path+0xb5/0x300
[283465.560156]  [<ffffffff81297f46>] exportfs_decode_fh+0xf6/0x2b0
[283465.560708]  [<ffffffffa062faf0>] ? nfsd_proc_getattr+0xa0/0xa0 [nfsd]
[283465.561262]  [<ffffffff810a8196>] ? prepare_creds+0x26/0x110
[283465.561932]  [<ffffffffa0630860>] fh_verify+0x350/0x660 [nfsd]
[283465.562862]  [<ffffffffa0637804>] ? nfsd_cache_lookup+0x44/0x630 [nfsd]
[283465.563697]  [<ffffffffa063a8b9>] nfsd3_proc_getattr+0x69/0xf0 [nfsd]
[283465.564510]  [<ffffffffa062cf60>] nfsd_dispatch+0xe0/0x290 [nfsd]
[283465.565358]  [<ffffffffa05eb892>] ? svc_tcp_adjust_wspace+0x12/0x30
[sunrpc]
[283465.566272]  [<ffffffffa05ea652>] svc_process_common+0x412/0x6a0 [sunrpc]
[283465.567155]  [<ffffffffa05eaa03>] svc_process+0x123/0x210 [sunrpc]
[283465.568020]  [<ffffffffa062c90f>] nfsd+0xff/0x170 [nfsd]
[283465.568962]  [<ffffffffa062c810>] ? nfsd_destroy+0x80/0x80 [nfsd]
[283465.570112]  [<ffffffff810a622b>] kthread+0xcb/0xf0
[283465.571099]  [<ffffffff810a6160>] ? kthread_create_on_node+0x180/0x180
[283465.572114]  [<ffffffff816f11b8>] ret_from_fork+0x58/0x90
[283465.573156]  [<ffffffff810a6160>] ? kthread_create_on_node+0x180/0x180

Link: http://lkml.kernel.org/r/1554185919-3010-1-git-send-email-sunny.s.zhang@oracle.com
Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: piaojun <piaojun@huawei.com>
Cc: "Gang He" <ghe@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ocfs2/export.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 4bf8d5854b27..af2888d23de3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	u64 blkno;
 	struct dentry *parent;
 	struct inode *dir = d_inode(child);
+	int set;
 
 	trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
 			       (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
+	status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1);
+	if (status < 0) {
+		mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+		parent = ERR_PTR(status);
+		goto bail;
+	}
+
 	status = ocfs2_inode_lock(dir, NULL, 0);
 	if (status < 0) {
 		if (status != -ENOENT)
 			mlog_errno(status);
 		parent = ERR_PTR(status);
-		goto bail;
+		goto unlock_nfs_sync;
 	}
 
 	status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
@@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 		goto bail_unlock;
 	}
 
+	status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set);
+	if (status < 0) {
+		if (status == -EINVAL) {
+			status = -ESTALE;
+		} else
+			mlog(ML_ERROR, "test inode bit failed %d\n", status);
+		parent = ERR_PTR(status);
+		goto bail_unlock;
+	}
+
+	trace_ocfs2_get_dentry_test_bit(status, set);
+	if (!set) {
+		status = -ESTALE;
+		parent = ERR_PTR(status);
+		goto bail_unlock;
+	}
+
 	parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
 
 bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
 
+unlock_nfs_sync:
+	ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1);
+
 bail:
 	trace_ocfs2_get_parent_end(parent);
 
-- 
cgit v1.2.3


From 001fe0dab4eacd9e714b7c9de5f13af2bb3c3e1a Mon Sep 17 00:00:00 2001
From: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Date: Sat, 6 Apr 2019 18:57:40 -0400
Subject: jbd2: check superblock mapped prior to committing

commit 742b06b5628f2cd23cb51a034cb54dc33c6162c5 upstream.

We hit a BUG at fs/buffer.c:3057 if we detached the nbd device
before unmounting ext4 filesystem.

The typical chain of events leading to the BUG:
jbd2_write_superblock
  submit_bh
    submit_bh_wbc
      BUG_ON(!buffer_mapped(bh));

The block device is removed and all the pages are invalidated. JBD2
was trying to write journal superblock to the block device which is
no longer present.

Fix this by checking the journal superblock's buffer head prior to
submitting.

Reported-by: Eric Ren <renzhen@linux.alibaba.com>
Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/jbd2/journal.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 88f2a49338a1..9e618777c699 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1366,6 +1366,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
 	journal_superblock_t *sb = journal->j_superblock;
 	int ret;
 
+	/* Buffer got discarded which means block device got invalidated */
+	if (!buffer_mapped(bh))
+		return -EIO;
+
 	trace_jbd2_write_superblock(journal, write_flags);
 	if (!(journal->j_flags & JBD2_BARRIER))
 		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
-- 
cgit v1.2.3


From 71478ef67d7c39ba735c07ce3e39eae21ad8681a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 6 Apr 2019 18:33:06 -0400
Subject: ext4: make sanity check in mballoc more strict

commit 31562b954b60f02acb91b7349dc6432d3f8c3c5f upstream.

The sanity check in mb_find_extent() only checked that returned extent
does not extend past blocksize * 8, however it should not extend past
EXT4_CLUSTERS_PER_GROUP(sb). This can happen when clusters_per_group <
blocksize * 8 and the tail of the bitmap is not properly filled by 1s
which happened e.g. when ancient kernels have grown the filesystem.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e29fce2fbf25..cc229f3357f7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1539,7 +1539,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
 		ex->fe_len += 1 << order;
 	}
 
-	if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
+	if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
 		/* Should never happen! (but apparently sometimes does?!?) */
 		WARN_ON(1);
 		ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
-- 
cgit v1.2.3


From f0f805f8b9e72be1d77bc1cc6f85eab0c3fd637a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 10 Apr 2019 00:37:36 -0400
Subject: ext4: ignore e_value_offs for xattrs with value-in-ea-inode

commit e5d01196c0428a206f307e9ee5f6842964098ff0 upstream.

In other places in fs/ext4/xattr.c, if e_value_inum is non-zero, the
code ignores the value in e_value_offs.  The e_value_offs *should* be
zero, but we shouldn't depend upon it, since it might not be true in a
corrupted/fuzzed file system.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202897
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202877
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 006c277dc22e..f73fc90e5daa 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1700,7 +1700,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 
 	/* No failures allowed past this point. */
 
-	if (!s->not_found && here->e_value_size && here->e_value_offs) {
+	if (!s->not_found && here->e_value_size && !here->e_value_inum) {
 		/* Remove the old value. */
 		void *first_val = s->base + min_offs;
 		size_t offs = le16_to_cpu(here->e_value_offs);
-- 
cgit v1.2.3


From b12a8d80a46e8ab02eb9124a36562a93bbd7224b Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Thu, 25 Apr 2019 11:44:15 -0400
Subject: ext4: avoid drop reference to iloc.bh twice

commit 8c380ab4b7b59c0c602743810be1b712514eaebc upstream.

The reference to iloc.bh has been dropped in ext4_mark_iloc_dirty.
However, the reference is dropped again if error occurs during
ext4_handle_dirty_metadata, which may result in use-after-free bugs.

Fixes: fb265c9cb49e("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases")
Signed-off-by: Pan Bian <bianpan2016@163.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/resize.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e7ae26e36c9c..4d5c0fc9d23a 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -874,6 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
 	if (unlikely(err)) {
 		ext4_std_error(sb, err);
+		iloc.bh = NULL;
 		goto errout;
 	}
 	brelse(dind);
-- 
cgit v1.2.3


From 2a18c9c76718f77a08177f1fc20007fab2fdafdd Mon Sep 17 00:00:00 2001
From: Barret Rhoden <brho@google.com>
Date: Thu, 25 Apr 2019 11:55:50 -0400
Subject: ext4: fix use-after-free race with debug_want_extra_isize

commit 7bc04c5c2cc467c5b40f2b03ba08da174a0d5fa7 upstream.

When remounting with debug_want_extra_isize, we were not performing the
same checks that we do during a normal mount.  That allowed us to set a
value for s_want_extra_isize that reached outside the s_inode_size.

Fixes: e2b911c53584 ("ext4: clean up feature test macros with predicate functions")
Reported-by: syzbot+f584efa0ac7213c226b7@syzkaller.appspotmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Barret Rhoden <brho@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 58 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index abba7ece78e9..a2437d352e09 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3514,6 +3514,37 @@ int ext4_calculate_overhead(struct super_block *sb)
 	return 0;
 }
 
+static void ext4_clamp_want_extra_isize(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+
+	/* determine the minimum size of new large inodes, if present */
+	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
+	    sbi->s_want_extra_isize == 0) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						     EXT4_GOOD_OLD_INODE_SIZE;
+		if (ext4_has_feature_extra_isize(sb)) {
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_want_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_want_extra_isize);
+			if (sbi->s_want_extra_isize <
+			    le16_to_cpu(es->s_min_extra_isize))
+				sbi->s_want_extra_isize =
+					le16_to_cpu(es->s_min_extra_isize);
+		}
+	}
+	/* Check if enough inode space is available */
+	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
+							sbi->s_inode_size) {
+		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+						       EXT4_GOOD_OLD_INODE_SIZE;
+		ext4_msg(sb, KERN_INFO,
+			 "required extra inode space not available");
+	}
+}
+
 static void ext4_set_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
@@ -4388,30 +4419,7 @@ no_journal:
 	} else if (ret)
 		goto failed_mount4a;
 
-	/* determine the minimum size of new large inodes, if present */
-	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
-	    sbi->s_want_extra_isize == 0) {
-		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-						     EXT4_GOOD_OLD_INODE_SIZE;
-		if (ext4_has_feature_extra_isize(sb)) {
-			if (sbi->s_want_extra_isize <
-			    le16_to_cpu(es->s_want_extra_isize))
-				sbi->s_want_extra_isize =
-					le16_to_cpu(es->s_want_extra_isize);
-			if (sbi->s_want_extra_isize <
-			    le16_to_cpu(es->s_min_extra_isize))
-				sbi->s_want_extra_isize =
-					le16_to_cpu(es->s_min_extra_isize);
-		}
-	}
-	/* Check if enough inode space is available */
-	if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
-							sbi->s_inode_size) {
-		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
-						       EXT4_GOOD_OLD_INODE_SIZE;
-		ext4_msg(sb, KERN_INFO, "required extra inode space not"
-			 "available");
-	}
+	ext4_clamp_want_extra_isize(sb);
 
 	ext4_set_resv_clusters(sb);
 
@@ -5197,6 +5205,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
+	ext4_clamp_want_extra_isize(sb);
+
 	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
 	    test_opt(sb, JOURNAL_CHECKSUM)) {
 		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
-- 
cgit v1.2.3


From f795247578aaf398d2af4de902264d194c12222f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 25 Apr 2019 13:06:18 -0400
Subject: ext4: actually request zeroing of inode table after grow

commit 310a997fd74de778b9a4848a64be9cda9f18764a upstream.

It is never possible, that number of block groups decreases,
since only online grow is supported.

But after a growing occured, we have to zero inode tables
for just created new block groups.

Fixes: 19c5246d2516 ("ext4: add new online resize interface")
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5f24fdc140ad..53d57cdf3c4d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -977,7 +977,7 @@ mext_out:
 		if (err == 0)
 			err = err2;
 		mnt_drop_write_file(filp);
-		if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+		if (!err && (o_group < EXT4_SB(sb)->s_groups_count) &&
 		    ext4_has_group_desc_csum(sb) &&
 		    test_opt(sb, INIT_INODE_TABLE))
 			err = ext4_register_li_request(sb, o_group);
-- 
cgit v1.2.3


From 45123ae534e042e4e21d7e5882f2f22c92f35f46 Mon Sep 17 00:00:00 2001
From: Debabrata Banerjee <dbanerje@akamai.com>
Date: Tue, 30 Apr 2019 23:08:15 -0400
Subject: ext4: fix ext4_show_options for file systems w/o journal

commit 50b29d8f033a7c88c5bc011abc2068b1691ab755 upstream.

Instead of removing EXT4_MOUNT_JOURNAL_CHECKSUM from s_def_mount_opt as
I assume was intended, all other options were blown away leading to
_ext4_show_options() output being incorrect.

Fixes: 1e381f60dad9 ("ext4: do not allow journal_opts for fs w/o journal")
Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a2437d352e09..cac6ea956659 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4270,7 +4270,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				 "data=, fs mounted w/o journal");
 			goto failed_mount_wq;
 		}
-		sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
+		sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
 		clear_opt(sb, JOURNAL_CHECKSUM);
 		clear_opt(sb, DATA_FLAGS);
 		sbi->s_journal = NULL;
-- 
cgit v1.2.3


From d8925a1fee71acb0fe1496a5fa80bda4c978d257 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 12 Mar 2019 17:10:40 +0800
Subject: btrfs: Check the first key and level for cached extent buffer

commit 448de471cd4cab0cedd15770082567a69a784a11 upstream.

[BUG]
When reading a file from a fuzzed image, kernel can panic like:

  BTRFS warning (device loop0): csum failed root 5 ino 270 off 0 csum 0x98f94189 expected csum 0x00000000 mirror 1
  assertion failed: !memcmp_extent_buffer(b, &disk_key, offsetof(struct btrfs_leaf, items[0].key), sizeof(disk_key)), file: fs/btrfs/ctree.c, line: 2544
  ------------[ cut here ]------------
  kernel BUG at fs/btrfs/ctree.h:3500!
  invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
  RIP: 0010:btrfs_search_slot.cold.24+0x61/0x63 [btrfs]
  Call Trace:
   btrfs_lookup_csum+0x52/0x150 [btrfs]
   __btrfs_lookup_bio_sums+0x209/0x640 [btrfs]
   btrfs_submit_bio_hook+0x103/0x170 [btrfs]
   submit_one_bio+0x59/0x80 [btrfs]
   extent_read_full_page+0x58/0x80 [btrfs]
   generic_file_read_iter+0x2f6/0x9d0
   __vfs_read+0x14d/0x1a0
   vfs_read+0x8d/0x140
   ksys_read+0x52/0xc0
   do_syscall_64+0x60/0x210
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

[CAUSE]
The fuzzed image has a corrupted leaf whose first key doesn't match its
parent:

  checksum tree key (CSUM_TREE ROOT_ITEM 0)
  node 29741056 level 1 items 14 free 107 generation 19 owner CSUM_TREE
  fs uuid 3381d111-94a3-4ac7-8f39-611bbbdab7e6
  chunk uuid 9af1c3c7-2af5-488b-8553-530bd515f14c
  	...
          key (EXTENT_CSUM EXTENT_CSUM 79691776) block 29761536 gen 19

  leaf 29761536 items 1 free space 1726 generation 19 owner CSUM_TREE
  leaf 29761536 flags 0x1(WRITTEN) backref revision 1
  fs uuid 3381d111-94a3-4ac7-8f39-611bbbdab7e6
  chunk uuid 9af1c3c7-2af5-488b-8553-530bd515f14c
          item 0 key (EXTENT_CSUM EXTENT_CSUM 8798638964736) itemoff 1751 itemsize 2244
                  range start 8798638964736 end 8798641262592 length 2297856

When reading the above tree block, we have extent_buffer->refs = 2 in
the context:

- initial one from __alloc_extent_buffer()
  alloc_extent_buffer()
  |- __alloc_extent_buffer()
     |- atomic_set(&eb->refs, 1)

- one being added to fs_info->buffer_radix
  alloc_extent_buffer()
  |- check_buffer_tree_ref()
     |- atomic_inc(&eb->refs)

So if even we call free_extent_buffer() in read_tree_block or other
similar situation, we only decrease the refs by 1, it doesn't reach 0
and won't be freed right now.

The staled eb and its corrupted content will still be kept cached.

Furthermore, we have several extra cases where we either don't do first
key check or the check is not proper for all callers:

- scrub
  We just don't have first key in this context.

- shared tree block
  One tree block can be shared by several snapshot/subvolume trees.
  In that case, the first key check for one subvolume doesn't apply to
  another.

So for the above reasons, a corrupted extent buffer can sneak into the
buffer cache.

[FIX]
Call verify_level_key in read_block_for_search to do another
verification. For that purpose the function is exported.

Due to above reasons, although we can free corrupted extent buffer from
cache, we still need the check in read_block_for_search(), for scrub and
shared tree blocks.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=202755
Link: https://bugzilla.kernel.org/show_bug.cgi?id=202757
Link: https://bugzilla.kernel.org/show_bug.cgi?id=202759
Link: https://bugzilla.kernel.org/show_bug.cgi?id=202761
Link: https://bugzilla.kernel.org/show_bug.cgi?id=202767
Link: https://bugzilla.kernel.org/show_bug.cgi?id=202769
Reported-by: Yoon Jungyeon <jungyeon@gatech.edu>
CC: stable@vger.kernel.org # 4.19+
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/ctree.c   | 10 ++++++++++
 fs/btrfs/disk-io.c | 10 +++++-----
 fs/btrfs/disk-io.h |  3 +++
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 48ac8b7c43a5..79ac1ebabaf7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2436,6 +2436,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
 	if (tmp) {
 		/* first we do an atomic uptodate check */
 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			/*
+			 * Do extra check for first_key, eb can be stale due to
+			 * being cached, read from scrub, or have multiple
+			 * parents (shared tree blocks).
+			 */
+			if (btrfs_verify_level_key(fs_info, tmp,
+					parent_level - 1, &first_key, gen)) {
+				free_extent_buffer(tmp);
+				return -EUCLEAN;
+			}
 			*eb_ret = tmp;
 			return 0;
 		}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b4f61a3d560a..d67dbbd9e6ed 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -408,9 +408,9 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-static int verify_level_key(struct btrfs_fs_info *fs_info,
-			    struct extent_buffer *eb, int level,
-			    struct btrfs_key *first_key, u64 parent_transid)
+int btrfs_verify_level_key(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *eb, int level,
+			   struct btrfs_key *first_key, u64 parent_transid)
 {
 	int found_level;
 	struct btrfs_key found_key;
@@ -487,8 +487,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
 			if (verify_parent_transid(io_tree, eb,
 						   parent_transid, 0))
 				ret = -EIO;
-			else if (verify_level_key(fs_info, eb, level,
-						  first_key, parent_transid))
+			else if (btrfs_verify_level_key(fs_info, eb, level,
+						first_key, parent_transid))
 				ret = -EUCLEAN;
 			else
 				break;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4cccba22640f..7a4a60f26dbf 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,6 +39,9 @@ static inline u64 btrfs_sb_offset(int mirror)
 struct btrfs_device;
 struct btrfs_fs_devices;
 
+int btrfs_verify_level_key(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *eb, int level,
+			   struct btrfs_key *first_key, u64 parent_transid);
 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
 				      u64 parent_transid, int level,
 				      struct btrfs_key *first_key);
-- 
cgit v1.2.3


From 87dcf0c61985e4e6b44d9a95de54373438eb31cd Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 14 Mar 2019 09:52:35 +0200
Subject: btrfs: Correctly free extent buffer in case
 btree_read_extent_buffer_pages fails

commit 537f38f019fa0b762dbb4c0fc95d7fcce9db8e2d upstream.

If a an eb fails to be read for whatever reason - it's corrupted on disk
and parent transid/key validations fail or IO for eb pages fail then
this buffer must be removed from the buffer cache. Currently the code
calls free_extent_buffer if an error occurs. Unfortunately this doesn't
achieve the desired behavior since btrfs_find_create_tree_block returns
with eb->refs == 2.

On the other hand free_extent_buffer will only decrement the refs once
leaving it added to the buffer cache radix tree.  This enables later
code to look up the buffer from the cache and utilize it potentially
leading to a crash.

The correct way to free the buffer is call free_extent_buffer_stale.
This function will correctly call atomic_dec explicitly for the buffer
and subsequently call release_extent_buffer which will decrement the
final reference thus correctly remove the invalid buffer from buffer
cache. This change affects only newly allocated buffers since they have
eb->refs == 2.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=202755
Reported-by: Jungyeon <jungyeon@gatech.edu>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/disk-io.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d67dbbd9e6ed..b2dc613ebed2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -995,13 +995,18 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = fs_info->btree_inode;
+	int ret;
 
 	buf = btrfs_find_create_tree_block(fs_info, bytenr);
 	if (IS_ERR(buf))
 		return;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, WAIT_NONE, 0);
-	free_extent_buffer(buf);
+
+	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf,
+			WAIT_NONE, 0);
+	if (ret < 0)
+		free_extent_buffer_stale(buf);
+	else
+		free_extent_buffer(buf);
 }
 
 int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
@@ -1021,12 +1026,12 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
 	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
 				       mirror_num);
 	if (ret) {
-		free_extent_buffer(buf);
+		free_extent_buffer_stale(buf);
 		return ret;
 	}
 
 	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
-		free_extent_buffer(buf);
+		free_extent_buffer_stale(buf);
 		return -EIO;
 	} else if (extent_buffer_uptodate(buf)) {
 		*eb = buf;
@@ -1080,7 +1085,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
 	ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
 					     level, first_key);
 	if (ret) {
-		free_extent_buffer(buf);
+		free_extent_buffer_stale(buf);
 		return ERR_PTR(ret);
 	}
 	return buf;
-- 
cgit v1.2.3


From 8b13bb911f0c0c77d41e5ddc41ad3c127c356b8a Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Mon, 25 Mar 2019 14:31:21 +0200
Subject: btrfs: Honour FITRIM range constraints during free space trim

commit c2d1b3aae33605a61cbab445d8ae1c708ccd2698 upstream.

Up until now trimming the freespace was done irrespective of what the
arguments of the FITRIM ioctl were. For example fstrim's -o/-l arguments
will be entirely ignored. Fix it by correctly handling those paramter.
This requires breaking if the found freespace extent is after the end of
the passed range as well as completing trim after trimming
fstrim_range::len bytes.

Fixes: 499f377f49f0 ("btrfs: iterate over unused chunk space in FITRIM")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0db7785cede..809c2c307c64 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10789,9 +10789,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
  * held back allocations.
  */
 static int btrfs_trim_free_extents(struct btrfs_device *device,
-				   u64 minlen, u64 *trimmed)
+				   struct fstrim_range *range, u64 *trimmed)
 {
-	u64 start = 0, len = 0;
+	u64 start = range->start, len = 0;
 	int ret;
 
 	*trimmed = 0;
@@ -10834,8 +10834,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 		if (!trans)
 			up_read(&fs_info->commit_root_sem);
 
-		ret = find_free_dev_extent_start(trans, device, minlen, start,
-						 &start, &len);
+		ret = find_free_dev_extent_start(trans, device, range->minlen,
+						 start, &start, &len);
 		if (trans) {
 			up_read(&fs_info->commit_root_sem);
 			btrfs_put_transaction(trans);
@@ -10848,6 +10848,16 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 			break;
 		}
 
+		/* If we are out of the passed range break */
+		if (start > range->start + range->len - 1) {
+			mutex_unlock(&fs_info->chunk_mutex);
+			ret = 0;
+			break;
+		}
+
+		start = max(range->start, start);
+		len = min(range->len, len);
+
 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
 		mutex_unlock(&fs_info->chunk_mutex);
 
@@ -10857,6 +10867,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 		start += len;
 		*trimmed += bytes;
 
+		/* We've trimmed enough */
+		if (*trimmed >= range->len)
+			break;
+
 		if (fatal_signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
@@ -10940,8 +10954,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	devices = &fs_info->fs_devices->devices;
 	list_for_each_entry(device, devices, dev_list) {
-		ret = btrfs_trim_free_extents(device, range->minlen,
-					      &group_trimmed);
+		ret = btrfs_trim_free_extents(device, range, &group_trimmed);
 		if (ret) {
 			dev_failed++;
 			dev_ret = ret;
-- 
cgit v1.2.3


From 74ca0a7671ccbaf5ed93e3d601c645d3f9335ad7 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Apr 2019 09:29:36 +0100
Subject: Btrfs: send, flush dellaloc in order to avoid data loss

commit 9f89d5de8631c7930898a601b6612e271aa2261c upstream.

When we set a subvolume to read-only mode we do not flush dellaloc for any
of its inodes (except if the filesystem is mounted with -o flushoncommit),
since it does not affect correctness for any subsequent operations - except
for a future send operation. The send operation will not be able to see the
delalloc data since the respective file extent items, inode item updates,
backreferences, etc, have not hit yet the subvolume and extent trees.

Effectively this means data loss, since the send stream will not contain
any data from existing delalloc. Another problem from this is that if the
writeback starts and finishes while the send operation is in progress, we
have the subvolume tree being being modified concurrently which can result
in send failing unexpectedly with EIO or hitting runtime errors, assertion
failures or hitting BUG_ONs, etc.

Simple reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ btrfs subvolume create /mnt/sv
  $ xfs_io -f -c "pwrite -S 0xea 0 108K" /mnt/sv/foo

  $ btrfs property set /mnt/sv ro true
  $ btrfs send -f /tmp/send.stream /mnt/sv

  $ od -t x1 -A d /mnt/sv/foo
  0000000 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea
  *
  0110592

  $ umount /mnt
  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt

  $ btrfs receive -f /tmp/send.stream /mnt
  $ echo $?
  0
  $ od -t x1 -A d /mnt/sv/foo
  0000000
  # ---> empty file

Since this a problem that affects send only, fix it in send by flushing
dellaloc for all the roots used by the send operation before send starts
to process the commit roots.

This is a problem that affects send since it was introduced (commit
31db9f7c23fbf7 ("Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive"))
but backporting it to older kernels has some dependencies:

- For kernels between 3.19 and 4.20, it depends on commit 3cd24c698004d2
  ("btrfs: use tagged writepage to mitigate livelock of snapshot") because
  the function btrfs_start_delalloc_snapshot() does not exist before that
  commit. So one has to either pick that commit or replace the calls to
  btrfs_start_delalloc_snapshot() in this patch with calls to
  btrfs_start_delalloc_inodes().

- For kernels older than 3.19 it also requires commit e5fa8f865b3324
  ("Btrfs: ensure send always works on roots without orphans") because
  it depends on the function ensure_commit_roots_uptodate() which that
  commits introduced.

- No dependencies for 5.0+ kernels.

A test case for fstests follows soon.

CC: stable@vger.kernel.org # 3.19+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/send.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 84cb6e5ef36c..635e419f2a2d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6583,6 +6583,38 @@ commit_trans:
 	return btrfs_commit_transaction(trans);
 }
 
+/*
+ * Make sure any existing dellaloc is flushed for any root used by a send
+ * operation so that we do not miss any data and we do not race with writeback
+ * finishing and changing a tree while send is using the tree. This could
+ * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
+ * a send operation then uses the subvolume.
+ * After flushing delalloc ensure_commit_roots_uptodate() must be called.
+ */
+static int flush_delalloc_roots(struct send_ctx *sctx)
+{
+	struct btrfs_root *root = sctx->parent_root;
+	int ret;
+	int i;
+
+	if (root) {
+		ret = btrfs_start_delalloc_snapshot(root);
+		if (ret)
+			return ret;
+		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+	}
+
+	for (i = 0; i < sctx->clone_roots_cnt; i++) {
+		root = sctx->clone_roots[i].root;
+		ret = btrfs_start_delalloc_snapshot(root);
+		if (ret)
+			return ret;
+		btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
+	}
+
+	return 0;
+}
+
 static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
 {
 	spin_lock(&root->root_item_lock);
@@ -6807,6 +6839,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
 			NULL);
 	sort_clone_roots = 1;
 
+	ret = flush_delalloc_roots(sctx);
+	if (ret)
+		goto out;
+
 	ret = ensure_commit_roots_uptodate(sctx);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From 0388d45afc5097c8141303d356b3bc1aaad80509 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 15 Apr 2019 14:50:51 +0100
Subject: Btrfs: do not start a transaction during fiemap

commit 03628cdbc64db6262e50d0357960a4e9562676a1 upstream.

During fiemap, for regular extents (non inline) we need to check if they
are shared and if they are, set the shared bit. Checking if an extent is
shared requires checking the delayed references of the currently running
transaction, since some reference might have not yet hit the extent tree
and be only in the in-memory delayed references.

However we were using a transaction join for this, which creates a new
transaction when there is no transaction currently running. That means
that two more potential failures can happen: creating the transaction and
committing it. Further, if no write activity is currently happening in the
system, and fiemap calls keep being done, we end up creating and
committing transactions that do nothing.

In some extreme cases this can result in the commit of the transaction
created by fiemap to fail with ENOSPC when updating the root item of a
subvolume tree because a join does not reserve any space, leading to a
trace like the following:

 heisenberg kernel: ------------[ cut here ]------------
 heisenberg kernel: BTRFS: Transaction aborted (error -28)
 heisenberg kernel: WARNING: CPU: 0 PID: 7137 at fs/btrfs/root-tree.c:136 btrfs_update_root+0x22b/0x320 [btrfs]
(...)
 heisenberg kernel: CPU: 0 PID: 7137 Comm: btrfs-transacti Not tainted 4.19.0-4-amd64 #1 Debian 4.19.28-2
 heisenberg kernel: Hardware name: FUJITSU LIFEBOOK U757/FJNB2A5, BIOS Version 1.21 03/19/2018
 heisenberg kernel: RIP: 0010:btrfs_update_root+0x22b/0x320 [btrfs]
(...)
 heisenberg kernel: RSP: 0018:ffffb5448828bd40 EFLAGS: 00010286
 heisenberg kernel: RAX: 0000000000000000 RBX: ffff8ed56bccef50 RCX: 0000000000000006
 heisenberg kernel: RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8ed6bda166a0
 heisenberg kernel: RBP: 00000000ffffffe4 R08: 00000000000003df R09: 0000000000000007
 heisenberg kernel: R10: 0000000000000000 R11: 0000000000000001 R12: ffff8ed63396a078
 heisenberg kernel: R13: ffff8ed092d7c800 R14: ffff8ed64f5db028 R15: ffff8ed6bd03d068
 heisenberg kernel: FS:  0000000000000000(0000) GS:ffff8ed6bda00000(0000) knlGS:0000000000000000
 heisenberg kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 heisenberg kernel: CR2: 00007f46f75f8000 CR3: 0000000310a0a002 CR4: 00000000003606f0
 heisenberg kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 heisenberg kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 heisenberg kernel: Call Trace:
 heisenberg kernel:  commit_fs_roots+0x166/0x1d0 [btrfs]
 heisenberg kernel:  ? _cond_resched+0x15/0x30
 heisenberg kernel:  ? btrfs_run_delayed_refs+0xac/0x180 [btrfs]
 heisenberg kernel:  btrfs_commit_transaction+0x2bd/0x870 [btrfs]
 heisenberg kernel:  ? start_transaction+0x9d/0x3f0 [btrfs]
 heisenberg kernel:  transaction_kthread+0x147/0x180 [btrfs]
 heisenberg kernel:  ? btrfs_cleanup_transaction+0x530/0x530 [btrfs]
 heisenberg kernel:  kthread+0x112/0x130
 heisenberg kernel:  ? kthread_bind+0x30/0x30
 heisenberg kernel:  ret_from_fork+0x35/0x40
 heisenberg kernel: ---[ end trace 05de912e30e012d9 ]---

Since fiemap (and btrfs_check_shared()) is a read-only operation, do not do
a transaction join to avoid the overhead of creating a new transaction (if
there is currently no running transaction) and introducing a potential
point of failure when the new transaction gets committed, instead use a
transaction attach to grab a handle for the currently running transaction
if any.

Reported-by: Christoph Anton Mitterer <calestyo@scientia.net>
Link: https://lore.kernel.org/linux-btrfs/b2a668d7124f1d3e410367f587926f622b3f03a4.camel@scientia.net/
Fixes: afce772e87c36c ("btrfs: fix check_shared for fiemap ioctl")
CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/backref.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1574a2..73e715d1a24e 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1452,8 +1452,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
  * callers (such as fiemap) which want to know whether the extent is
  * shared but do not need a ref count.
  *
- * This attempts to allocate a transaction in order to account for
- * delayed refs, but continues on even when the alloc fails.
+ * This attempts to attach to the running transaction in order to account for
+ * delayed refs, but continues on even when no running transaction exists.
  *
  * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
  */
@@ -1476,13 +1476,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 	tmp = ulist_alloc(GFP_NOFS);
 	roots = ulist_alloc(GFP_NOFS);
 	if (!tmp || !roots) {
-		ulist_free(tmp);
-		ulist_free(roots);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
-	trans = btrfs_join_transaction(root);
+	trans = btrfs_attach_transaction(root);
 	if (IS_ERR(trans)) {
+		if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) {
+			ret = PTR_ERR(trans);
+			goto out;
+		}
 		trans = NULL;
 		down_read(&fs_info->commit_root_sem);
 	} else {
@@ -1515,6 +1518,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
 	} else {
 		up_read(&fs_info->commit_root_sem);
 	}
+out:
 	ulist_free(tmp);
 	ulist_free(roots);
 	return ret;
-- 
cgit v1.2.3


From 8a8f671b3dadf878967d917682563e9f0028da53 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 17 Apr 2019 11:30:30 +0100
Subject: Btrfs: do not start a transaction at iterate_extent_inodes()

commit bfc61c36260ca990937539cd648ede3cd749bc10 upstream.

When finding out which inodes have references on a particular extent, done
by backref.c:iterate_extent_inodes(), from the BTRFS_IOC_LOGICAL_INO (both
v1 and v2) ioctl and from scrub we use the transaction join API to grab a
reference on the currently running transaction, since in order to give
accurate results we need to inspect the delayed references of the currently
running transaction.

However, if there is currently no running transaction, the join operation
will create a new transaction. This is inefficient as the transaction will
eventually be committed, doing unnecessary IO and introducing a potential
point of failure that will lead to a transaction abort due to -ENOSPC, as
recently reported [1].

That's because the join, creates the transaction but does not reserve any
space, so when attempting to update the root item of the root passed to
btrfs_join_transaction(), during the transaction commit, we can end up
failling with -ENOSPC. Users of a join operation are supposed to actually
do some filesystem changes and reserve space by some means, which is not
the case of iterate_extent_inodes(), it is a read-only operation for all
contextes from which it is called.

The reported [1] -ENOSPC failure stack trace is the following:

 heisenberg kernel: ------------[ cut here ]------------
 heisenberg kernel: BTRFS: Transaction aborted (error -28)
 heisenberg kernel: WARNING: CPU: 0 PID: 7137 at fs/btrfs/root-tree.c:136 btrfs_update_root+0x22b/0x320 [btrfs]
(...)
 heisenberg kernel: CPU: 0 PID: 7137 Comm: btrfs-transacti Not tainted 4.19.0-4-amd64 #1 Debian 4.19.28-2
 heisenberg kernel: Hardware name: FUJITSU LIFEBOOK U757/FJNB2A5, BIOS Version 1.21 03/19/2018
 heisenberg kernel: RIP: 0010:btrfs_update_root+0x22b/0x320 [btrfs]
(...)
 heisenberg kernel: RSP: 0018:ffffb5448828bd40 EFLAGS: 00010286
 heisenberg kernel: RAX: 0000000000000000 RBX: ffff8ed56bccef50 RCX: 0000000000000006
 heisenberg kernel: RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8ed6bda166a0
 heisenberg kernel: RBP: 00000000ffffffe4 R08: 00000000000003df R09: 0000000000000007
 heisenberg kernel: R10: 0000000000000000 R11: 0000000000000001 R12: ffff8ed63396a078
 heisenberg kernel: R13: ffff8ed092d7c800 R14: ffff8ed64f5db028 R15: ffff8ed6bd03d068
 heisenberg kernel: FS:  0000000000000000(0000) GS:ffff8ed6bda00000(0000) knlGS:0000000000000000
 heisenberg kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 heisenberg kernel: CR2: 00007f46f75f8000 CR3: 0000000310a0a002 CR4: 00000000003606f0
 heisenberg kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 heisenberg kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 heisenberg kernel: Call Trace:
 heisenberg kernel:  commit_fs_roots+0x166/0x1d0 [btrfs]
 heisenberg kernel:  ? _cond_resched+0x15/0x30
 heisenberg kernel:  ? btrfs_run_delayed_refs+0xac/0x180 [btrfs]
 heisenberg kernel:  btrfs_commit_transaction+0x2bd/0x870 [btrfs]
 heisenberg kernel:  ? start_transaction+0x9d/0x3f0 [btrfs]
 heisenberg kernel:  transaction_kthread+0x147/0x180 [btrfs]
 heisenberg kernel:  ? btrfs_cleanup_transaction+0x530/0x530 [btrfs]
 heisenberg kernel:  kthread+0x112/0x130
 heisenberg kernel:  ? kthread_bind+0x30/0x30
 heisenberg kernel:  ret_from_fork+0x35/0x40
 heisenberg kernel: ---[ end trace 05de912e30e012d9 ]---

So fix that by using the attach API, which does not create a transaction
when there is currently no running transaction.

[1] https://lore.kernel.org/linux-btrfs/b2a668d7124f1d3e410367f587926f622b3f03a4.camel@scientia.net/

Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/backref.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 73e715d1a24e..2a4f52c7be22 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1908,13 +1908,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 			extent_item_objectid);
 
 	if (!search_commit_root) {
-		trans = btrfs_join_transaction(fs_info->extent_root);
-		if (IS_ERR(trans))
-			return PTR_ERR(trans);
+		trans = btrfs_attach_transaction(fs_info->extent_root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) != -ENOENT &&
+			    PTR_ERR(trans) != -EROFS)
+				return PTR_ERR(trans);
+			trans = NULL;
+		}
+	}
+
+	if (trans)
 		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-	} else {
+	else
 		down_read(&fs_info->commit_root_sem);
-	}
 
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
 				   tree_mod_seq_elem.seq, &refs,
@@ -1947,7 +1953,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 
 	free_leaf_list(refs);
 out:
-	if (!search_commit_root) {
+	if (trans) {
 		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 		btrfs_end_transaction(trans);
 	} else {
-- 
cgit v1.2.3


From 986d3453bee4e932b76261679e65881021eb2a8b Mon Sep 17 00:00:00 2001
From: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Date: Fri, 17 May 2019 14:31:44 -0700
Subject: fs/writeback.c: use rcu_barrier() to wait for inflight wb switches
 going into workqueue when umount

commit ec084de929e419e51bcdafaafe567d9e7d0273b7 upstream.

synchronize_rcu() didn't wait for call_rcu() callbacks, so inode wb
switch may not go to the workqueue after synchronize_rcu().  Thus
previous scheduled switches was not finished even flushing the
workqueue, which will cause a NULL pointer dereferenced followed below.

  VFS: Busy inodes after unmount of vdd. Self-destruct in 5 seconds.  Have a nice day...
  BUG: unable to handle kernel NULL pointer dereference at 0000000000000278
    evict+0xb3/0x180
    iput+0x1b0/0x230
    inode_switch_wbs_work_fn+0x3c0/0x6a0
    worker_thread+0x4e/0x490
    ? process_one_work+0x410/0x410
    kthread+0xe6/0x100
    ret_from_fork+0x39/0x50

Replace the synchronize_rcu() call with a rcu_barrier() to wait for all
pending callbacks to finish.  And inc isw_nr_in_flight after call_rcu()
in inode_switch_wbs() to make more sense.

Link: http://lkml.kernel.org/r/20190429024108.54150-1-jiufei.xue@linux.alibaba.com
Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Acked-by: Tejun Heo <tj@kernel.org>
Suggested-by: Tejun Heo <tj@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fs-writeback.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 82ce6d4f7e31..9544e2f8b79f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -530,8 +530,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 
 	isw->inode = inode;
 
-	atomic_inc(&isw_nr_in_flight);
-
 	/*
 	 * In addition to synchronizing among switchers, I_WB_SWITCH tells
 	 * the RCU protected stat update paths to grab the i_page
@@ -539,6 +537,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+
+	atomic_inc(&isw_nr_in_flight);
+
 	goto out_unlock;
 
 out_free:
@@ -908,7 +909,11 @@ restart:
 void cgroup_writeback_umount(void)
 {
 	if (atomic_read(&isw_nr_in_flight)) {
-		synchronize_rcu();
+		/*
+		 * Use rcu_barrier() to wait for all pending callbacks to
+		 * ensure that all in-flight wb switches are in the workqueue.
+		 */
+		rcu_barrier();
 		flush_workqueue(isw_wq);
 	}
 }
-- 
cgit v1.2.3


From 25d010f4e0ece1ddf0d8d57942c0b0f1568fe498 Mon Sep 17 00:00:00 2001
From: Sriram Rajagopalan <sriramr@arista.com>
Date: Fri, 10 May 2019 19:28:06 -0400
Subject: ext4: zero out the unused memory region in the extent tree block

commit 592acbf16821288ecdc4192c47e3774a4c48bb64 upstream.

This commit zeroes out the unused memory region in the buffer_head
corresponding to the extent metablock after writing the extent header
and the corresponding extent node entries.

This is done to prevent random uninitialized data from getting into
the filesystem when the extent block is synced.

This fixes CVE-2019-11833.

Signed-off-by: Sriram Rajagopalan <sriramr@arista.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/extents.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 72a361d5ef74..45aea792d22a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1035,6 +1035,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	__le32 border;
 	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
 	int err = 0;
+	size_t ext_size = 0;
 
 	/* make decision: where to split? */
 	/* FIXME: now decision is simplest: at current extent */
@@ -1126,6 +1127,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		le16_add_cpu(&neh->eh_entries, m);
 	}
 
+	/* zero out unused area in the extent block */
+	ext_size = sizeof(struct ext4_extent_header) +
+		sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
+	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
 	ext4_extent_block_csum_set(inode, neh);
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
@@ -1205,6 +1210,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 				sizeof(struct ext4_extent_idx) * m);
 			le16_add_cpu(&neh->eh_entries, m);
 		}
+		/* zero out unused area in the extent block */
+		ext_size = sizeof(struct ext4_extent_header) +
+		   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
+		memset(bh->b_data + ext_size, 0,
+			inode->i_sb->s_blocksize - ext_size);
 		ext4_extent_block_csum_set(inode, neh);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
@@ -1270,6 +1280,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock, goal = 0;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 	int err = 0;
+	size_t ext_size = 0;
 
 	/* Try to prepend new index to old one */
 	if (ext_depth(inode))
@@ -1295,9 +1306,11 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 		goto out;
 	}
 
+	ext_size = sizeof(EXT4_I(inode)->i_data);
 	/* move top-level index/leaf into new block */
-	memmove(bh->b_data, EXT4_I(inode)->i_data,
-		sizeof(EXT4_I(inode)->i_data));
+	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
+	/* zero out unused area in the extent block */
+	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
 
 	/* set size of new block */
 	neh = ext_block_hdr(bh);
-- 
cgit v1.2.3


From 0db24122bd7f06e9a7ae24dbda6411b477f5b9dd Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Fri, 10 May 2019 21:45:33 -0400
Subject: ext4: fix data corruption caused by overlapping unaligned and aligned
 IO

commit 57a0da28ced8707cb9f79f071a016b9d005caf5a upstream.

Unaligned AIO must be serialized because the zeroing of partial blocks
of unaligned AIO can result in data corruption in case it's overlapping
another in flight IO.

Currently we wait for all unwritten extents before we submit unaligned
AIO which protects data in case of unaligned AIO is following overlapping
IO. However if a unaligned AIO is followed by overlapping aligned AIO we
can still end up corrupting data.

To fix this, we must make sure that the unaligned AIO is the only IO in
flight by waiting for unwritten extents conversion not just before the
IO submission, but right after it as well.

This problem can be reproduced by xfstest generic/538

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/file.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 98ec11f69cd4..2c5baa5e8291 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -264,6 +264,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	ret = __generic_file_write_iter(iocb, from);
+	/*
+	 * Unaligned direct AIO must be the only IO in flight. Otherwise
+	 * overlapping aligned IO after unaligned might result in data
+	 * corruption.
+	 */
+	if (ret == -EIOCBQUEUED && unaligned_aio)
+		ext4_unwritten_wait(inode);
 	inode_unlock(inode);
 
 	if (ret > 0)
-- 
cgit v1.2.3


From c19db366c0a809027289c470051fba31fbddbe76 Mon Sep 17 00:00:00 2001
From: Sahitya Tummala <stummala@codeaurora.org>
Date: Fri, 10 May 2019 22:00:33 -0400
Subject: ext4: fix use-after-free in dx_release()

commit 08fc98a4d6424af66eb3ac4e2cedd2fc927ed436 upstream.

The buffer_head (frames[0].bh) and it's corresping page can be
potentially free'd once brelse() is done inside the for loop
but before the for loop exits in dx_release(). It can be free'd
in another context, when the page cache is flushed via
drop_caches_sysctl_handler(). This results into below data abort
when accessing info->indirect_levels in dx_release().

Unable to handle kernel paging request at virtual address ffffffc17ac3e01e
Call trace:
 dx_release+0x70/0x90
 ext4_htree_fill_tree+0x2d4/0x300
 ext4_readdir+0x244/0x6f8
 iterate_dir+0xbc/0x160
 SyS_getdents64+0x94/0x174

Signed-off-by: Sahitya Tummala <stummala@codeaurora.org>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/namei.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4f8de2b9e87e..4c5aa5df6573 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -871,12 +871,15 @@ static void dx_release(struct dx_frame *frames)
 {
 	struct dx_root_info *info;
 	int i;
+	unsigned int indirect_levels;
 
 	if (frames[0].bh == NULL)
 		return;
 
 	info = &((struct dx_root *)frames[0].bh->b_data)->info;
-	for (i = 0; i <= info->indirect_levels; i++) {
+	/* save local copy, "info" may be freed after brelse() */
+	indirect_levels = info->indirect_levels;
+	for (i = 0; i <= indirect_levels; i++) {
 		if (frames[i].bh == NULL)
 			break;
 		brelse(frames[i].bh);
-- 
cgit v1.2.3


From 316063bf7d11c6a6feb8aba51987cf23181544aa Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 17 May 2019 17:37:18 -0400
Subject: ext4: avoid panic during forced reboot due to aborted journal

commit 2c1d0e3631e5732dba98ef49ac0bec1388776793 upstream.

Handling of aborted journal is a special code path different from
standard ext4_error() one and it can call panic() as well. Commit
1dc1097ff60e ("ext4: avoid panic during forced reboot") forgot to update
this path so fix that omission.

Fixes: 1dc1097ff60e ("ext4: avoid panic during forced reboot")
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org # 5.1
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cac6ea956659..07d73fb22b60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -698,7 +698,7 @@ void __ext4_abort(struct super_block *sb, const char *function,
 			jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 		save_error_info(sb, function, line);
 	}
-	if (test_opt(sb, ERRORS_PANIC)) {
+	if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 		if (EXT4_SB(sb)->s_journal &&
 		  !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 			return;
-- 
cgit v1.2.3


From 5b8567682489dbbb0742c9fc3f6711bb6a01f540 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmail.com>
Date: Fri, 10 May 2019 21:15:47 -0400
Subject: jbd2: fix potential double free

commit 0d52154bb0a700abb459a2cbce0a30fc2549b67e upstream.

When failing from creating cache jbd2_inode_cache, we will destroy the
previously created cache jbd2_handle_cache twice.  This patch fixes
this by moving each cache initialization/destruction to its own
separate, individual function.

Signed-off-by: Chengguang Xu <cgxu519@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/jbd2/journal.c     | 49 +++++++++++++++++++++++++++++++------------------
 fs/jbd2/revoke.c      | 32 ++++++++++++++++++++------------
 fs/jbd2/transaction.c |  8 +++++---
 3 files changed, 56 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e618777c699..e9cf88f0bc29 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2389,22 +2389,19 @@ static struct kmem_cache *jbd2_journal_head_cache;
 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
 #endif
 
-static int jbd2_journal_init_journal_head_cache(void)
+static int __init jbd2_journal_init_journal_head_cache(void)
 {
-	int retval;
-
-	J_ASSERT(jbd2_journal_head_cache == NULL);
+	J_ASSERT(!jbd2_journal_head_cache);
 	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
 				sizeof(struct journal_head),
 				0,		/* offset */
 				SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
 				NULL);		/* ctor */
-	retval = 0;
 	if (!jbd2_journal_head_cache) {
-		retval = -ENOMEM;
 		printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
+		return -ENOMEM;
 	}
-	return retval;
+	return 0;
 }
 
 static void jbd2_journal_destroy_journal_head_cache(void)
@@ -2650,28 +2647,38 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void)
 
 struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
 
+static int __init jbd2_journal_init_inode_cache(void)
+{
+	J_ASSERT(!jbd2_inode_cache);
+	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
+	if (!jbd2_inode_cache) {
+		pr_emerg("JBD2: failed to create inode cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
 static int __init jbd2_journal_init_handle_cache(void)
 {
+	J_ASSERT(!jbd2_handle_cache);
 	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
-	if (jbd2_handle_cache == NULL) {
+	if (!jbd2_handle_cache) {
 		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
 		return -ENOMEM;
 	}
-	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
-	if (jbd2_inode_cache == NULL) {
-		printk(KERN_EMERG "JBD2: failed to create inode cache\n");
-		kmem_cache_destroy(jbd2_handle_cache);
-		return -ENOMEM;
-	}
 	return 0;
 }
 
+static void jbd2_journal_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(jbd2_inode_cache);
+	jbd2_inode_cache = NULL;
+}
+
 static void jbd2_journal_destroy_handle_cache(void)
 {
 	kmem_cache_destroy(jbd2_handle_cache);
 	jbd2_handle_cache = NULL;
-	kmem_cache_destroy(jbd2_inode_cache);
-	jbd2_inode_cache = NULL;
 }
 
 /*
@@ -2682,11 +2689,15 @@ static int __init journal_init_caches(void)
 {
 	int ret;
 
-	ret = jbd2_journal_init_revoke_caches();
+	ret = jbd2_journal_init_revoke_record_cache();
+	if (ret == 0)
+		ret = jbd2_journal_init_revoke_table_cache();
 	if (ret == 0)
 		ret = jbd2_journal_init_journal_head_cache();
 	if (ret == 0)
 		ret = jbd2_journal_init_handle_cache();
+	if (ret == 0)
+		ret = jbd2_journal_init_inode_cache();
 	if (ret == 0)
 		ret = jbd2_journal_init_transaction_cache();
 	return ret;
@@ -2694,9 +2705,11 @@ static int __init journal_init_caches(void)
 
 static void jbd2_journal_destroy_caches(void)
 {
-	jbd2_journal_destroy_revoke_caches();
+	jbd2_journal_destroy_revoke_record_cache();
+	jbd2_journal_destroy_revoke_table_cache();
 	jbd2_journal_destroy_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_inode_cache();
 	jbd2_journal_destroy_transaction_cache();
 	jbd2_journal_destroy_slabs();
 }
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a1143e57a718..69b9bc329964 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -178,33 +178,41 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
 	return NULL;
 }
 
-void jbd2_journal_destroy_revoke_caches(void)
+void jbd2_journal_destroy_revoke_record_cache(void)
 {
 	kmem_cache_destroy(jbd2_revoke_record_cache);
 	jbd2_revoke_record_cache = NULL;
+}
+
+void jbd2_journal_destroy_revoke_table_cache(void)
+{
 	kmem_cache_destroy(jbd2_revoke_table_cache);
 	jbd2_revoke_table_cache = NULL;
 }
 
-int __init jbd2_journal_init_revoke_caches(void)
+int __init jbd2_journal_init_revoke_record_cache(void)
 {
 	J_ASSERT(!jbd2_revoke_record_cache);
-	J_ASSERT(!jbd2_revoke_table_cache);
-
 	jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
 					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
-	if (!jbd2_revoke_record_cache)
-		goto record_cache_failure;
 
+	if (!jbd2_revoke_record_cache) {
+		pr_emerg("JBD2: failed to create revoke_record cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+int __init jbd2_journal_init_revoke_table_cache(void)
+{
+	J_ASSERT(!jbd2_revoke_table_cache);
 	jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
 					     SLAB_TEMPORARY);
-	if (!jbd2_revoke_table_cache)
-		goto table_cache_failure;
-	return 0;
-table_cache_failure:
-	jbd2_journal_destroy_revoke_caches();
-record_cache_failure:
+	if (!jbd2_revoke_table_cache) {
+		pr_emerg("JBD2: failed to create revoke_table cache\n");
 		return -ENOMEM;
+	}
+	return 0;
 }
 
 static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 914e725c82c4..e20a6703531f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -42,9 +42,11 @@ int __init jbd2_journal_init_transaction_cache(void)
 					0,
 					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
 					NULL);
-	if (transaction_cache)
-		return 0;
-	return -ENOMEM;
+	if (!transaction_cache) {
+		pr_emerg("JBD2: failed to create transaction cache\n");
+		return -ENOMEM;
+	}
+	return 0;
 }
 
 void jbd2_journal_destroy_transaction_cache(void)
-- 
cgit v1.2.3


From f4bf101be366d1075767f21a10b1bf587ce2a52f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 18 Oct 2018 11:17:42 -0700
Subject: pstore: Centralize init/exit routines

commit cb095afd44768bf495894b9ad063bd078e4bb201 upstream.

In preparation for having additional actions during init/exit, this moves
the init/exit into platform.c, centralizing the logic to make call outs
to the fs init/exit.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/inode.c    | 11 ++---------
 fs/pstore/internal.h |  5 +++--
 fs/pstore/platform.c | 23 +++++++++++++++++++++++
 3 files changed, 28 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5fcb845b9fec..8cf2218b46a7 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -482,12 +482,10 @@ static struct file_system_type pstore_fs_type = {
 	.kill_sb	= pstore_kill_sb,
 };
 
-static int __init init_pstore_fs(void)
+int __init pstore_init_fs(void)
 {
 	int err;
 
-	pstore_choose_compression();
-
 	/* Create a convenient mount point for people to access pstore */
 	err = sysfs_create_mount_point(fs_kobj, "pstore");
 	if (err)
@@ -500,14 +498,9 @@ static int __init init_pstore_fs(void)
 out:
 	return err;
 }
-module_init(init_pstore_fs)
 
-static void __exit exit_pstore_fs(void)
+void __exit pstore_exit_fs(void)
 {
 	unregister_filesystem(&pstore_fs_type);
 	sysfs_remove_mount_point(fs_kobj, "pstore");
 }
-module_exit(exit_pstore_fs)
-
-MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
-MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index fb767e28aeb2..7062ea4bc57c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -37,7 +37,8 @@ extern bool	pstore_is_mounted(void);
 extern void	pstore_record_init(struct pstore_record *record,
 				   struct pstore_info *psi);
 
-/* Called during module_init() */
-extern void __init pstore_choose_compression(void);
+/* Called during pstore init/exit. */
+int __init	pstore_init_fs(void);
+void __exit	pstore_exit_fs(void);
 
 #endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 15e99d5a681d..d61e26812af6 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -780,8 +780,31 @@ void __init pstore_choose_compression(void)
 	}
 }
 
+static int __init pstore_init(void)
+{
+	int ret;
+
+	pstore_choose_compression();
+
+	ret = pstore_init_fs();
+	if (ret)
+		return ret;
+
+	return 0;
+}
+module_init(pstore_init)
+
+static void __exit pstore_exit(void)
+{
+	pstore_exit_fs();
+}
+module_exit(pstore_exit)
+
 module_param(compress, charp, 0444);
 MODULE_PARM_DESC(compress, "Pstore compression to use");
 
 module_param(backend, charp, 0444);
 MODULE_PARM_DESC(backend, "Pstore backend to use");
+
+MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From fea8b84765a12c5c2aef32602381d9ffcc6d3507 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Wed, 17 Oct 2018 03:13:55 -0700
Subject: pstore: Allocate compression during late_initcall()

commit 416031653eb55f844e3547fb8f8576399a800da0 upstream.

ramoops's call of pstore_register() was recently moved to run during
late_initcall() because the crypto backend may not have been ready during
postcore_initcall(). This meant early-boot crash dumps were not getting
caught by pstore any more.

Instead, lets allow calls to pstore_register() earlier, and once crypto
is ready we can initialize the compression.

Reported-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Tested-by: Sai Prakash Ranjan <saiprakash.ranjan@codeaurora.org>
Fixes: cb3bee0369bc ("pstore: Use crypto compress API")
[kees: trivial rebase]
Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/platform.c | 10 +++++++++-
 fs/pstore/ram.c      |  2 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index d61e26812af6..578f178a695f 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -786,13 +786,21 @@ static int __init pstore_init(void)
 
 	pstore_choose_compression();
 
+	/*
+	 * Check if any pstore backends registered earlier but did not
+	 * initialize compression because crypto was not ready. If so,
+	 * initialize compression now.
+	 */
+	if (psinfo && !tfm)
+		allocate_buf_for_compression();
+
 	ret = pstore_init_fs();
 	if (ret)
 		return ret;
 
 	return 0;
 }
-module_init(pstore_init)
+late_initcall(pstore_init);
 
 static void __exit pstore_exit(void)
 {
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index eb67bb7f04de..44ed6b193d2e 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -956,7 +956,7 @@ static int __init ramoops_init(void)
 
 	return ret;
 }
-late_initcall(ramoops_init);
+postcore_initcall(ramoops_init);
 
 static void __exit ramoops_exit(void)
 {
-- 
cgit v1.2.3


From 953e826e8d0f24904241e7cd3633a54af511b42a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 17 Oct 2018 14:00:12 -0700
Subject: pstore: Refactor compression initialization

commit 95047b0519c17a28e09df5f38750f5354e3db4c4 upstream.

This refactors compression initialization slightly to better handle
getting potentially called twice (via early pstore_register() calls
and later pstore_init()) and improves the comments and reporting to be
more verbose.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Guenter Roeck <groeck@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/platform.c | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 578f178a695f..b821054ca3ed 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -274,36 +274,56 @@ static int pstore_decompress(void *in, void *out,
 
 static void allocate_buf_for_compression(void)
 {
+	struct crypto_comp *ctx;
+	int size;
+	char *buf;
+
+	/* Skip if not built-in or compression backend not selected yet. */
 	if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
 		return;
 
+	/* Skip if no pstore backend yet or compression init already done. */
+	if (!psinfo || tfm)
+		return;
+
 	if (!crypto_has_comp(zbackend->name, 0, 0)) {
-		pr_err("No %s compression\n", zbackend->name);
+		pr_err("Unknown compression: %s\n", zbackend->name);
 		return;
 	}
 
-	big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize);
-	if (big_oops_buf_sz <= 0)
+	size = zbackend->zbufsize(psinfo->bufsize);
+	if (size <= 0) {
+		pr_err("Invalid compression size for %s: %d\n",
+		       zbackend->name, size);
 		return;
+	}
 
-	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
-	if (!big_oops_buf) {
-		pr_err("allocate compression buffer error!\n");
+	buf = kmalloc(size, GFP_KERNEL);
+	if (!buf) {
+		pr_err("Failed %d byte compression buffer allocation for: %s\n",
+		       size, zbackend->name);
 		return;
 	}
 
-	tfm = crypto_alloc_comp(zbackend->name, 0, 0);
-	if (IS_ERR_OR_NULL(tfm)) {
-		kfree(big_oops_buf);
-		big_oops_buf = NULL;
-		pr_err("crypto_alloc_comp() failed!\n");
+	ctx = crypto_alloc_comp(zbackend->name, 0, 0);
+	if (IS_ERR_OR_NULL(ctx)) {
+		kfree(buf);
+		pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
+		       PTR_ERR(ctx));
 		return;
 	}
+
+	/* A non-NULL big_oops_buf indicates compression is available. */
+	tfm = ctx;
+	big_oops_buf_sz = size;
+	big_oops_buf = buf;
+
+	pr_info("Using compression: %s\n", zbackend->name);
 }
 
 static void free_buf_for_compression(void)
 {
-	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm))
+	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm)
 		crypto_free_comp(tfm);
 	kfree(big_oops_buf);
 	big_oops_buf = NULL;
@@ -774,7 +794,6 @@ void __init pstore_choose_compression(void)
 	for (step = zbackends; step->name; step++) {
 		if (!strcmp(compress, step->name)) {
 			zbackend = step;
-			pr_info("using %s compression\n", zbackend->name);
 			return;
 		}
 	}
@@ -791,8 +810,7 @@ static int __init pstore_init(void)
 	 * initialize compression because crypto was not ready. If so,
 	 * initialize compression now.
 	 */
-	if (psinfo && !tfm)
-		allocate_buf_for_compression();
+	allocate_buf_for_compression();
 
 	ret = pstore_init_fs();
 	if (ret)
-- 
cgit v1.2.3


From 6172ae55a187378b091282c7a5b4895635e63371 Mon Sep 17 00:00:00 2001
From: "zhangyi (F)" <yi.zhang@huawei.com>
Date: Thu, 21 Feb 2019 11:29:10 -0500
Subject: ext4: fix compile error when using BUFFER_TRACE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit ddccb6dbe780d68133191477571cb7c69e17bb8c upstream.

Fix compile error below when using BUFFER_TRACE.

fs/ext4/inode.c: In function ‘ext4_expand_extra_isize’:
fs/ext4/inode.c:5979:19: error: request for member ‘bh’ in something not a structure or union
  BUFFER_TRACE(iloc.bh, "get_write_access");

Fixes: c03b45b853f58 ("ext4, project: expand inode extra size if possible")
Signed-off-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c43c5b92229..18884f39610f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5971,7 +5971,7 @@ int ext4_expand_extra_isize(struct inode *inode,
 
 	ext4_write_lock_xattr(inode, &no_expand);
 
-	BUFFER_TRACE(iloc.bh, "get_write_access");
+	BUFFER_TRACE(iloc->bh, "get_write_access");
 	error = ext4_journal_get_write_access(handle, iloc->bh);
 	if (error) {
 		brelse(iloc->bh);
-- 
cgit v1.2.3


From e8816d3bc5956a3d4d0f00651138a8a3b36a07bf Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@dilger.ca>
Date: Thu, 14 Feb 2019 17:52:18 -0500
Subject: ext4: don't update s_rev_level if not required

commit c9e716eb9b3455a83ed7c5f5a81256a3da779a95 upstream.

Don't update the superblock s_rev_level during mount if it isn't
actually necessary, only if superblock features are being set by
the kernel.  This was originally added for ext3 since it always
set the INCOMPAT_RECOVER and HAS_JOURNAL features during mount,
but this is not needed since no journal mode was added to ext4.

That will allow Geert to mount his 20-year-old ext2 rev 0.0 m68k
filesystem, as a testament of the backward compatibility of ext4.

Fixes: 0390131ba84f ("ext4: Allow ext4 to run without a journal")
Signed-off-by: Andreas Dilger <adilger@dilger.ca>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/ext4.h  | 6 +++++-
 fs/ext4/inode.c | 1 -
 fs/ext4/super.c | 1 -
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2ddf7833350d..1ee51d3a978a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1670,6 +1670,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_INCOMPAT_INLINE_DATA	0x8000 /* data in inode */
 #define EXT4_FEATURE_INCOMPAT_ENCRYPT		0x10000
 
+extern void ext4_update_dynamic_rev(struct super_block *sb);
+
 #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
 static inline bool ext4_has_feature_##name(struct super_block *sb) \
 { \
@@ -1678,6 +1680,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+	ext4_update_dynamic_rev(sb); \
 	EXT4_SB(sb)->s_es->s_feature_compat |= \
 		cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
 } \
@@ -1695,6 +1698,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+	ext4_update_dynamic_rev(sb); \
 	EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
 		cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
 } \
@@ -1712,6 +1716,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \
 } \
 static inline void ext4_set_feature_##name(struct super_block *sb) \
 { \
+	ext4_update_dynamic_rev(sb); \
 	EXT4_SB(sb)->s_es->s_feature_incompat |= \
 		cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
 } \
@@ -2679,7 +2684,6 @@ do {									\
 
 #endif
 
-extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
 extern int ext4_update_rocompat_feature(handle_t *handle,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 18884f39610f..67e8aa35197e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5320,7 +5320,6 @@ static int ext4_do_update_inode(handle_t *handle,
 		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
 		if (err)
 			goto out_brelse;
-		ext4_update_dynamic_rev(sb);
 		ext4_set_feature_large_file(sb);
 		ext4_handle_sync(handle);
 		err = ext4_handle_dirty_super(handle, sb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 07d73fb22b60..a270391228af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2259,7 +2259,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	ext4_update_tstamp(es, s_mtime);
-	ext4_update_dynamic_rev(sb);
 	if (sbi->s_journal)
 		ext4_set_feature_journal_needs_recovery(sb);
 
-- 
cgit v1.2.3


From 7928396df91eab24047ab5d8fb105adfe9f50daa Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 19 Apr 2019 14:55:12 -0400
Subject: proc: prevent changes to overridden credentials

commit 35a196bef449b5824033865b963ed9a43fb8c730 upstream.

Prevent userspace from changing the the /proc/PID/attr values if the
task's credentials are currently overriden.  This not only makes sense
conceptually, it also prevents some really bizarre error cases caused
when trying to commit credentials to a task with overridden
credentials.

Cc: <stable@vger.kernel.org>
Reported-by: "chengjian (D)" <cj.chengjian@huawei.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
Acked-by: John Johansen <john.johansen@canonical.com>
Acked-by: James Morris <james.morris@microsoft.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/proc/base.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 81d77b15b347..f999e8bd3771 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2542,6 +2542,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 		rcu_read_unlock();
 		return -EACCES;
 	}
+	/* Prevent changes to overridden credentials. */
+	if (current_cred() != current_real_cred()) {
+		rcu_read_unlock();
+		return -EBUSY;
+	}
 	rcu_read_unlock();
 
 	if (count > PAGE_SIZE)
-- 
cgit v1.2.3


From c939121b5435e920d2cec3c664163eb81d41b233 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Mar 2019 22:23:19 -0400
Subject: dcache: sort the freeing-without-RCU-delay mess for good.

commit 5467a68cbf6884c9a9d91e2a89140afb1839c835 upstream.

For lockless accesses to dentries we don't have pinned we rely
(among other things) upon having an RCU delay between dropping
the last reference and actually freeing the memory.

On the other hand, for things like pipes and sockets we neither
do that kind of lockless access, nor want to deal with the
overhead of an RCU delay every time a socket gets closed.

So delay was made optional - setting DCACHE_RCUACCESS in ->d_flags
made sure it would happen.  We tried to avoid setting it unless
we knew we need it.  Unfortunately, that had led to recurring
class of bugs, in which we missed the need to set it.

We only really need it for dentries that are created by
d_alloc_pseudo(), so let's not bother with trying to be smart -
just make having an RCU delay the default.  The ones that do
*not* get it set the replacement flag (DCACHE_NORCU) and we'd
better use that sparingly.  d_alloc_pseudo() is the only
such user right now.

FWIW, the race that finally prompted that switch had been
between __lock_parent() of immediate subdirectory of what's
currently the root of a disconnected tree (e.g. from
open-by-handle in progress) racing with d_splice_alias()
elsewhere picking another alias for the same inode, either
on outright corrupted fs image, or (in case of open-by-handle
on NFS) that subdirectory having been just moved on server.
It's not easy to hit, so the sky is not falling, but that's
not the first race on similar missed cases and the logics
for settinf DCACHE_RCUACCESS has gotten ridiculously
convoluted.

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/dcache.c | 24 +++++++++++++-----------
 fs/nsfs.c   |  3 +--
 2 files changed, 14 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index cb515f183482..6e0022326afe 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -344,7 +344,7 @@ static void dentry_free(struct dentry *dentry)
 		}
 	}
 	/* if dentry was never visible to RCU, immediate free is OK */
-	if (!(dentry->d_flags & DCACHE_RCUACCESS))
+	if (dentry->d_flags & DCACHE_NORCU)
 		__d_free(&dentry->d_u.d_rcu);
 	else
 		call_rcu(&dentry->d_u.d_rcu, __d_free);
@@ -1694,7 +1694,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	struct dentry *dentry = __d_alloc(parent->d_sb, name);
 	if (!dentry)
 		return NULL;
-	dentry->d_flags |= DCACHE_RCUACCESS;
 	spin_lock(&parent->d_lock);
 	/*
 	 * don't need child lock because it is not subject
@@ -1719,7 +1718,7 @@ struct dentry *d_alloc_cursor(struct dentry * parent)
 {
 	struct dentry *dentry = d_alloc_anon(parent->d_sb);
 	if (dentry) {
-		dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR;
+		dentry->d_flags |= DCACHE_DENTRY_CURSOR;
 		dentry->d_parent = dget(parent);
 	}
 	return dentry;
@@ -1732,10 +1731,17 @@ struct dentry *d_alloc_cursor(struct dentry * parent)
  *
  * For a filesystem that just pins its dentries in memory and never
  * performs lookups at all, return an unhashed IS_ROOT dentry.
+ * This is used for pipes, sockets et.al. - the stuff that should
+ * never be anyone's children or parents.  Unlike all other
+ * dentries, these will not have RCU delay between dropping the
+ * last reference and freeing them.
  */
 struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
 {
-	return __d_alloc(sb, name);
+	struct dentry *dentry = __d_alloc(sb, name);
+	if (likely(dentry))
+		dentry->d_flags |= DCACHE_NORCU;
+	return dentry;
 }
 EXPORT_SYMBOL(d_alloc_pseudo);
 
@@ -1899,12 +1905,10 @@ struct dentry *d_make_root(struct inode *root_inode)
 
 	if (root_inode) {
 		res = d_alloc_anon(root_inode->i_sb);
-		if (res) {
-			res->d_flags |= DCACHE_RCUACCESS;
+		if (res)
 			d_instantiate(res, root_inode);
-		} else {
+		else
 			iput(root_inode);
-		}
 	}
 	return res;
 }
@@ -2769,9 +2773,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 		copy_name(dentry, target);
 		target->d_hash.pprev = NULL;
 		dentry->d_parent->d_lockref.count++;
-		if (dentry == old_parent)
-			dentry->d_flags |= DCACHE_RCUACCESS;
-		else
+		if (dentry != old_parent) /* wasn't IS_ROOT */
 			WARN_ON(!--old_parent->d_lockref.count);
 	} else {
 		target->d_parent = old_parent;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 60702d677bd4..30d150a4f0c6 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -85,13 +85,12 @@ slow:
 	inode->i_fop = &ns_file_operations;
 	inode->i_private = ns;
 
-	dentry = d_alloc_pseudo(mnt->mnt_sb, &empty_name);
+	dentry = d_alloc_anon(mnt->mnt_sb);
 	if (!dentry) {
 		iput(inode);
 		return ERR_PTR(-ENOMEM);
 	}
 	d_instantiate(dentry, inode);
-	dentry->d_flags |= DCACHE_RCUACCESS;
 	dentry->d_fsdata = (void *)ns->ops;
 	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
 	if (d) {
-- 
cgit v1.2.3


From 939db6fdbea6a84ef8f574a2e7f3a71a8b9507e5 Mon Sep 17 00:00:00 2001
From: Christoph Probst <kernel@probst.it>
Date: Tue, 7 May 2019 17:16:40 +0200
Subject: cifs: fix strcat buffer overflow and reduce raciness in
 smb21_set_oplock_level()

commit 6a54b2e002c9d00b398d35724c79f9fe0d9b38fb upstream.

Change strcat to strncpy in the "None" case to fix a buffer overflow
when cinode->oplock is reset to 0 by another thread accessing the same
cinode. It is never valid to append "None" to any other message.

Consolidate multiple writes to cinode->oplock to reduce raciness.

Signed-off-by: Christoph Probst <kernel@probst.it>
Reviewed-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2ops.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 2001184afe70..0ccf8f9b63a2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -2348,26 +2348,28 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
 		       unsigned int epoch, bool *purge_cache)
 {
 	char message[5] = {0};
+	unsigned int new_oplock = 0;
 
 	oplock &= 0xFF;
 	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
 		return;
 
-	cinode->oplock = 0;
 	if (oplock & SMB2_LEASE_READ_CACHING_HE) {
-		cinode->oplock |= CIFS_CACHE_READ_FLG;
+		new_oplock |= CIFS_CACHE_READ_FLG;
 		strcat(message, "R");
 	}
 	if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) {
-		cinode->oplock |= CIFS_CACHE_HANDLE_FLG;
+		new_oplock |= CIFS_CACHE_HANDLE_FLG;
 		strcat(message, "H");
 	}
 	if (oplock & SMB2_LEASE_WRITE_CACHING_HE) {
-		cinode->oplock |= CIFS_CACHE_WRITE_FLG;
+		new_oplock |= CIFS_CACHE_WRITE_FLG;
 		strcat(message, "W");
 	}
-	if (!cinode->oplock)
-		strcat(message, "None");
+	if (!new_oplock)
+		strncpy(message, "None", sizeof(message));
+
+	cinode->oplock = new_oplock;
 	cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
 		 &cinode->vfs_inode);
 }
-- 
cgit v1.2.3


From d3dd6057d2d61e0963a9be9cb80343add5457279 Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Date: Mon, 6 May 2019 11:57:03 +0800
Subject: NFS4: Fix v4.0 client state corruption when mount

commit f02f3755dbd14fb935d24b14650fff9ba92243b8 upstream.

stat command with soft mount never return after server is stopped.

When alloc a new client, the state of the client will be set to
NFS4CLNT_LEASE_EXPIRED.

When the server is stopped, the state manager will work, and accord
the state to recover. But the state is NFS4CLNT_LEASE_EXPIRED, it
will drain the slot table and lead other task to wait queue, until
the client recovered. Then the stat command is hung.

When discover server trunking, the client will renew the lease,
but check the client state, it lead the client state corruption.

So, we need to call state manager to recover it when detect server
ip trunking.

Signed-off-by: ZhangXiaoxu <zhangxiaoxu5@huawei.com>
Cc: stable@vger.kernel.org
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4state.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index d2f645d34eb1..3ba2087469ac 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -159,6 +159,10 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
 		/* Sustain the lease, even if it's empty.  If the clientid4
 		 * goes stale it's of no use for trunking discovery. */
 		nfs4_schedule_state_renewal(*result);
+
+		/* If the client state need to recover, do it. */
+		if (clp->cl_state)
+			nfs4_schedule_state_manager(clp);
 	}
 out:
 	return status;
-- 
cgit v1.2.3


From 04f34b76368f19f4ade154a0190b664e91bfa0df Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Tue, 7 May 2019 13:41:49 -0400
Subject: PNFS fallback to MDS if no deviceid found

commit b1029c9bc078a6f1515f55dd993b507dcc7e3440 upstream.

If we fail to find a good deviceid while trying to pnfs instead of
propogating an error back fallback to doing IO to the MDS. Currently,
code with fals the IO with EINVAL.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Fixes: 8d40b0f14846f ("NFS filelayout:call GETDEVICEINFO after pnfs_layout_process completes"
Cc: stable@vger.kernel.org # v4.11+
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/filelayout/filelayout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d175724ff566..2478a69da0f0 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -904,7 +904,7 @@ fl_pnfs_update_layout(struct inode *ino,
 	status = filelayout_check_deviceid(lo, fl, gfp_flags);
 	if (status) {
 		pnfs_put_lseg(lseg);
-		lseg = ERR_PTR(status);
+		lseg = NULL;
 	}
 out:
 	return lseg;
-- 
cgit v1.2.3


From a452f733f93e943d991fd98da1fda20e4cec77d1 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Wed, 24 Apr 2019 17:05:06 +0200
Subject: fuse: fix writepages on 32bit

commit 9de5be06d0a89ca97b5ab902694d42dfd2bb77d2 upstream.

Writepage requests were cropped to i_size & 0xffffffff, which meant that
mmaped writes to any file larger than 4G might be silently discarded.

Fix by storing the file size in a properly sized variable (loff_t instead
of size_t).

Reported-by: Antonio SJ Musumeci <trapexit@spawn.link>
Fixes: 6eaf4782eb09 ("fuse: writepages: crop secondary requests")
Cc: <stable@vger.kernel.org> # v3.13
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bd500c3b7858..0ada6e91cc38 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1526,7 +1526,7 @@ __acquires(fc->lock)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	size_t crop = i_size_read(inode);
+	loff_t crop = i_size_read(inode);
 	struct fuse_req *req;
 
 	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
-- 
cgit v1.2.3


From 979d2433b873cde6d2d360644754d16fe9c5481b Mon Sep 17 00:00:00 2001
From: Liu Bo <bo.liu@linux.alibaba.com>
Date: Thu, 18 Apr 2019 04:04:41 +0800
Subject: fuse: honor RLIMIT_FSIZE in fuse_file_fallocate

commit 0cbade024ba501313da3b7e5dd2a188a6bc491b5 upstream.

fstests generic/228 reported this failure that fuse fallocate does not
honor what 'ulimit -f' has set.

This adds the necessary inode_newsize_ok() check.

Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com>
Fixes: 05ba1f082300 ("fuse: add FALLOCATE operation")
Cc: <stable@vger.kernel.org> # v3.5
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/file.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0ada6e91cc38..21fd510183c3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2975,6 +2975,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		}
 	}
 
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    offset + length > i_size_read(inode)) {
+		err = inode_newsize_ok(inode, offset + length);
+		if (err)
+			return err;
+	}
+
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
-- 
cgit v1.2.3


From a9676c96e7e06f3f90e9f2e7413b949dfc4d2df5 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 22 Jan 2019 07:01:39 +0200
Subject: ovl: fix missing upper fs freeze protection on copy up for ioctl

commit 3428030da004a1128cbdcf93dc03e16f184d845b upstream.

Generalize the helper ovl_open_maybe_copy_up() and use it to copy up file
with data before FS_IOC_SETFLAGS ioctl.

The FS_IOC_SETFLAGS ioctl is a bit of an odd ball in vfs, which probably
caused the confusion.  File may be open O_RDONLY, but ioctl modifies the
file.  VFS does not call mnt_want_write_file() nor lock inode mutex, but
fs-specific code for FS_IOC_SETFLAGS does.  So ovl_ioctl() calls
mnt_want_write_file() for the overlay file, and fs-specific code calls
mnt_want_write_file() for upper fs file, but there was no call for
ovl_want_write() for copy up duration which prevents overlayfs from copying
up on a frozen upper fs.

Fixes: dab5ca8fd9dd ("ovl: add lsattr/chattr support")
Cc: <stable@vger.kernel.org> # v4.19
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/overlayfs/copy_up.c   | 6 +++---
 fs/overlayfs/file.c      | 5 ++---
 fs/overlayfs/overlayfs.h | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 75eeee08d848..ffc73600216b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -878,14 +878,14 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
 	return true;
 }
 
-int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
+int ovl_maybe_copy_up(struct dentry *dentry, int flags)
 {
 	int err = 0;
 
-	if (ovl_open_need_copy_up(dentry, file_flags)) {
+	if (ovl_open_need_copy_up(dentry, flags)) {
 		err = ovl_want_write(dentry);
 		if (!err) {
-			err = ovl_copy_up_flags(dentry, file_flags);
+			err = ovl_copy_up_flags(dentry, flags);
 			ovl_drop_write(dentry);
 		}
 	}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 986313da0c88..0c810f20f778 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -116,11 +116,10 @@ static int ovl_real_fdget(const struct file *file, struct fd *real)
 
 static int ovl_open(struct inode *inode, struct file *file)
 {
-	struct dentry *dentry = file_dentry(file);
 	struct file *realfile;
 	int err;
 
-	err = ovl_open_maybe_copy_up(dentry, file->f_flags);
+	err = ovl_maybe_copy_up(file_dentry(file), file->f_flags);
 	if (err)
 		return err;
 
@@ -390,7 +389,7 @@ static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (ret)
 			return ret;
 
-		ret = ovl_copy_up_with_data(file_dentry(file));
+		ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY);
 		if (!ret) {
 			ret = ovl_real_ioctl(file, cmd, arg);
 
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index d9c16ceebfe7..80fb66426760 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -411,7 +411,7 @@ extern const struct file_operations ovl_file_operations;
 int ovl_copy_up(struct dentry *dentry);
 int ovl_copy_up_with_data(struct dentry *dentry);
 int ovl_copy_up_flags(struct dentry *dentry, int flags);
-int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
+int ovl_maybe_copy_up(struct dentry *dentry, int flags);
 int ovl_copy_xattr(struct dentry *old, struct dentry *new);
 int ovl_set_attr(struct dentry *upper, struct kstat *stat);
 struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
-- 
cgit v1.2.3


From 77ca91441696be994644adf53233766994b60317 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 7 May 2019 09:20:54 -0400
Subject: ceph: flush dirty inodes before proceeding with remount

commit 00abf69dd24f4444d185982379c5cc3bb7b6d1fc upstream.

xfstest generic/452 was triggering a "Busy inodes after umount" warning.
ceph was allowing the mount to go read-only without first flushing out
dirty inodes in the cache. Ensure we sync out the filesystem before
allowing a remount to proceed.

Cc: stable@vger.kernel.org
Link: http://tracker.ceph.com/issues/39571
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ceph/super.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index eab1359d0553..c5cf46e43f2e 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -819,6 +819,12 @@ static void ceph_umount_begin(struct super_block *sb)
 	return;
 }
 
+static int ceph_remount(struct super_block *sb, int *flags, char *data)
+{
+	sync_filesystem(sb);
+	return 0;
+}
+
 static const struct super_operations ceph_super_ops = {
 	.alloc_inode	= ceph_alloc_inode,
 	.destroy_inode	= ceph_destroy_inode,
@@ -826,6 +832,7 @@ static const struct super_operations ceph_super_ops = {
 	.drop_inode	= ceph_drop_inode,
 	.sync_fs        = ceph_sync_fs,
 	.put_super	= ceph_put_super,
+	.remount_fs	= ceph_remount,
 	.show_options   = ceph_show_options,
 	.statfs		= ceph_statfs,
 	.umount_begin   = ceph_umount_begin,
-- 
cgit v1.2.3


From fea685000caf196f676fc5a813599f63634c2c60 Mon Sep 17 00:00:00 2001
From: Kirill Smelkov <kirr@nexedi.com>
Date: Wed, 24 Apr 2019 07:13:57 +0000
Subject: fuse: Add FOPEN_STREAM to use stream_open()

commit bbd84f33652f852ce5992d65db4d020aba21f882 upstream.

Starting from commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per
POSIX") files opened even via nonseekable_open gate read and write via lock
and do not allow them to be run simultaneously. This can create read vs
write deadlock if a filesystem is trying to implement a socket-like file
which is intended to be simultaneously used for both read and write from
filesystem client.  See commit 10dce8af3422 ("fs: stream_open - opener for
stream-like files so that read and write can run simultaneously without
deadlock") for details and e.g. commit 581d21a2d02a ("xenbus: fix deadlock
on writes to /proc/xen/xenbus") for a similar deadlock example on
/proc/xen/xenbus.

To avoid such deadlock it was tempting to adjust fuse_finish_open to use
stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags,
but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE,
and in particular GVFS which actually uses offset in its read and write
handlers

	https://codesearch.debian.net/search?q=-%3Enonseekable+%3D
	https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080
	https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346
	https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481

so if we would do such a change it will break a real user.

Add another flag (FOPEN_STREAM) for filesystem servers to indicate that the
opened handler is having stream-like semantics; does not use file position
and thus the kernel is free to issue simultaneous read and write request on
opened file handle.

This patch together with stream_open() should be added to stable kernels
starting from v3.14+. This will allow to patch OSSPD and other FUSE
filesystems that provide stream-like files to return FOPEN_STREAM |
FOPEN_NONSEEKABLE in open handler and this way avoid the deadlock on all
kernel versions. This should work because fuse_finish_open ignores unknown
open flags returned from a filesystem and so passing FOPEN_STREAM to a
kernel that is not aware of this flag cannot hurt. In turn the kernel that
is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE
is sufficient to implement streams without read vs write deadlock.

Cc: stable@vger.kernel.org # v3.14+
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 21fd510183c3..59e8bb72dc14 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -179,7 +179,9 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 		file->f_op = &fuse_direct_io_file_operations;
 	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 		invalidate_inode_pages2(inode->i_mapping);
-	if (ff->open_flags & FOPEN_NONSEEKABLE)
+	if (ff->open_flags & FOPEN_STREAM)
+		stream_open(inode, file);
+	else if (ff->open_flags & FOPEN_NONSEEKABLE)
 		nonseekable_open(inode, file);
 	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
 		struct fuse_inode *fi = get_fuse_inode(inode);
-- 
cgit v1.2.3


From a06fdd99a3393032d1c145ffb991c4cd916a8ba4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 May 2019 22:46:11 -0400
Subject: ufs: fix braino in ufs_get_inode_gid() for solaris UFS flavour

[ Upstream commit 4e9036042fedaffcd868d7f7aa948756c48c637d ]

To choose whether to pick the GID from the old (16bit) or new (32bit)
field, we should check if the old gid field is set to 0xffff.  Mainline
checks the old *UID* field instead - cut'n'paste from the corresponding
code in ufs_get_inode_uid().

Fixes: 252e211e90ce
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/ufs/util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 1fd3011ea623..7fd4802222b8 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode)
 	case UFS_UID_44BSD:
 		return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid);
 	case UFS_UID_EFT:
-		if (inode->ui_u1.oldids.ui_suid == 0xFFFF)
+		if (inode->ui_u1.oldids.ui_sgid == 0xFFFF)
 			return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid);
 		/* Fall through */
 	default:
-- 
cgit v1.2.3


From 71e430fd593b2627baaa67e808ff27131b1418e0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 May 2019 23:35:28 -0400
Subject: ext4: do not delete unlinked inode from orphan list on failed
 truncate

commit ee0ed02ca93ef1ecf8963ad96638795d55af2c14 upstream.

It is possible that unlinked inode enters ext4_setattr() (e.g. if
somebody calls ftruncate(2) on unlinked but still open file). In such
case we should not delete the inode from the orphan list if truncate
fails. Note that this is mostly a theoretical concern as filesystem is
corrupted if we reach this path anyway but let's be consistent in our
orphan handling.

Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 67e8aa35197e..82dee8addb4b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5596,7 +5596,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			up_write(&EXT4_I(inode)->i_data_sem);
 			ext4_journal_stop(handle);
 			if (error) {
-				if (orphan)
+				if (orphan && inode->i_nlink)
 					ext4_orphan_del(NULL, inode);
 				goto err_out;
 			}
-- 
cgit v1.2.3


From 5220582c427bb377c8c4b28ed911f545e7da859d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 May 2019 23:07:08 -0400
Subject: ext4: wait for outstanding dio during truncate in nojournal mode

commit 82a25b027ca48d7ef197295846b352345853dfa8 upstream.

We didn't wait for outstanding direct IO during truncate in nojournal
mode (as we skip orphan handling in that case). This can lead to fs
corruption or stale data exposure if truncate ends up freeing blocks
and these get reallocated before direct IO finishes. Fix the condition
determining whether the wait is necessary.

CC: stable@vger.kernel.org
Fixes: 1c9114f9c0f1 ("ext4: serialize unlocked dio reads with truncate")
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/ext4/inode.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 82dee8addb4b..05dc5a4ba481 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5601,20 +5601,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 			}
 		}
-		if (!shrink)
+		if (!shrink) {
 			pagecache_isize_extended(inode, oldsize, inode->i_size);
-
-		/*
-		 * Blocks are going to be removed from the inode. Wait
-		 * for dio in flight.  Temporarily disable
-		 * dioread_nolock to prevent livelock.
-		 */
-		if (orphan) {
-			if (!ext4_should_journal_data(inode)) {
-				inode_dio_wait(inode);
-			} else
-				ext4_wait_for_tail_page_commit(inode);
+		} else {
+			/*
+			 * Blocks are going to be removed from the inode. Wait
+			 * for dio in flight.
+			 */
+			inode_dio_wait(inode);
 		}
+		if (orphan && ext4_should_journal_data(inode))
+			ext4_wait_for_tail_page_commit(inode);
 		down_write(&EXT4_I(inode)->i_mmap_sem);
 
 		rc = ext4_break_layouts(inode);
-- 
cgit v1.2.3


From 70d33cce97f01c79f2aa951684cf2116ec9ba1c5 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Sat, 16 Mar 2019 09:13:06 +0900
Subject: f2fs: Fix use of number of devices

commit 0916878da355650d7e77104a7ac0fa1784eca852 upstream.

For a single device mount using a zoned block device, the zone
information for the device is stored in the sbi->devs single entry
array and sbi->s_ndevs is set to 1. This differs from a single device
mount using a regular block device which does not allocate sbi->devs
and sets sbi->s_ndevs to 0.

However, sbi->s_devs == 0 condition is used throughout the code to
differentiate a single device mount from a multi-device mount where
sbi->s_ndevs is always larger than 1. This results in problems with
single zoned block device volumes as these are treated as multi-device
mounts but do not have the start_blk and end_blk information set. One
of the problem observed is skipping of zone discard issuing resulting in
write commands being issued to full zones or unaligned to a zone write
pointer.

Fix this problem by simply treating the cases sbi->s_ndevs == 0 (single
regular block device mount) and sbi->s_ndevs == 1 (single zoned block
device mount) in the same manner. This is done by introducing the
helper function f2fs_is_multi_device() and using this helper in place
of direct tests of sbi->s_ndevs value, improving code readability.

Fixes: 7bb3a371d199 ("f2fs: Fix zoned block device support")
Cc: <stable@vger.kernel.org>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/f2fs/data.c    | 17 +++++++++++------
 fs/f2fs/f2fs.h    | 13 ++++++++++++-
 fs/f2fs/file.c    |  2 +-
 fs/f2fs/gc.c      |  2 +-
 fs/f2fs/segment.c | 13 +++++++------
 5 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 08314fb42652..4d02e76b648a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -197,12 +197,14 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
 	struct block_device *bdev = sbi->sb->s_bdev;
 	int i;
 
-	for (i = 0; i < sbi->s_ndevs; i++) {
-		if (FDEV(i).start_blk <= blk_addr &&
-					FDEV(i).end_blk >= blk_addr) {
-			blk_addr -= FDEV(i).start_blk;
-			bdev = FDEV(i).bdev;
-			break;
+	if (f2fs_is_multi_device(sbi)) {
+		for (i = 0; i < sbi->s_ndevs; i++) {
+			if (FDEV(i).start_blk <= blk_addr &&
+			    FDEV(i).end_blk >= blk_addr) {
+				blk_addr -= FDEV(i).start_blk;
+				bdev = FDEV(i).bdev;
+				break;
+			}
 		}
 	}
 	if (bio) {
@@ -216,6 +218,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
 {
 	int i;
 
+	if (!f2fs_is_multi_device(sbi))
+		return 0;
+
 	for (i = 0; i < sbi->s_ndevs; i++)
 		if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
 			return i;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1f5d5f62bb77..a4b6eacf22ea 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1336,6 +1336,17 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
 }
 #endif
 
+/*
+ * Test if the mounted volume is a multi-device volume.
+ *   - For a single regular disk volume, sbi->s_ndevs is 0.
+ *   - For a single zoned disk volume, sbi->s_ndevs is 1.
+ *   - For a multi-device volume, sbi->s_ndevs is always 2 or more.
+ */
+static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi)
+{
+	return sbi->s_ndevs > 1;
+}
+
 /* For write statistics. Suppose sector size is 512 bytes,
  * and the return value is in kbytes. s is of struct f2fs_sb_info.
  */
@@ -3455,7 +3466,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw)
 {
 	return (f2fs_post_read_required(inode) ||
 			(rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
-			F2FS_I_SB(inode)->s_ndevs);
+			f2fs_is_multi_device(F2FS_I_SB(inode)));
 }
 
 #ifdef CONFIG_F2FS_FAULT_INJECTION
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b3f46e3bec17..8d1eb8dec605 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2539,7 +2539,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
 							sizeof(range)))
 		return -EFAULT;
 
-	if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num ||
+	if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num ||
 			sbi->segs_per_sec != 1) {
 		f2fs_msg(sbi->sb, KERN_WARNING,
 			"Can't flush %u in %d for segs_per_sec %u != 1\n",
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5c8d00422237..d44b57a363ff 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1256,7 +1256,7 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
 	sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
 
 	/* give warm/cold data area from slower device */
-	if (sbi->s_ndevs && sbi->segs_per_sec == 1)
+	if (f2fs_is_multi_device(sbi) && sbi->segs_per_sec == 1)
 		SIT_I(sbi)->last_victim[ALLOC_NEXT] =
 				GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index ac038563273d..03fa2c4d3d79 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -574,7 +574,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino)
 	int ret = 0;
 	int i;
 
-	if (!sbi->s_ndevs)
+	if (!f2fs_is_multi_device(sbi))
 		return __submit_flush_wait(sbi, sbi->sb->s_bdev);
 
 	for (i = 0; i < sbi->s_ndevs; i++) {
@@ -640,7 +640,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
 		return ret;
 	}
 
-	if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) {
+	if (atomic_inc_return(&fcc->issing_flush) == 1 ||
+	    f2fs_is_multi_device(sbi)) {
 		ret = submit_flush_wait(sbi, ino);
 		atomic_dec(&fcc->issing_flush);
 
@@ -746,7 +747,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
 {
 	int ret = 0, i;
 
-	if (!sbi->s_ndevs)
+	if (!f2fs_is_multi_device(sbi))
 		return 0;
 
 	for (i = 1; i < sbi->s_ndevs; i++) {
@@ -1289,7 +1290,7 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
 
 	trace_f2fs_queue_discard(bdev, blkstart, blklen);
 
-	if (sbi->s_ndevs) {
+	if (f2fs_is_multi_device(sbi)) {
 		int devi = f2fs_target_device_index(sbi, blkstart);
 
 		blkstart -= FDEV(devi).start_blk;
@@ -1638,7 +1639,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
 	block_t lblkstart = blkstart;
 	int devi = 0;
 
-	if (sbi->s_ndevs) {
+	if (f2fs_is_multi_device(sbi)) {
 		devi = f2fs_target_device_index(sbi, blkstart);
 		blkstart -= FDEV(devi).start_blk;
 	}
@@ -2971,7 +2972,7 @@ static void update_device_state(struct f2fs_io_info *fio)
 	struct f2fs_sb_info *sbi = fio->sbi;
 	unsigned int devidx;
 
-	if (!sbi->s_ndevs)
+	if (!f2fs_is_multi_device(sbi))
 		return;
 
 	devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
-- 
cgit v1.2.3


From fdc78eedc54d3d4eb0c3cd747226e5e9948fa860 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Fri, 17 May 2019 19:18:43 +0100
Subject: gfs2: Fix sign extension bug in gfs2_update_stats

commit 5a5ec83d6ac974b12085cd99b196795f14079037 upstream.

Commit 4d207133e9c3 changed the types of the statistic values in struct
gfs2_lkstats from s64 to u64.  Because of that, what should be a signed
value in gfs2_update_stats turned into an unsigned value.  When shifted
right, we end up with a large positive value instead of a small negative
value, which results in an incorrect variance estimate.

Fixes: 4d207133e9c3 ("gfs2: Make statistics unsigned, suitable for use with do_div()")
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Cc: stable@vger.kernel.org # v4.4+
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/gfs2/lock_dlm.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index ac7caa267ed6..62edf8f5615f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -31,9 +31,10 @@
  * @delta is the difference between the current rtt sample and the
  * running average srtt. We add 1/8 of that to the srtt in order to
  * update the current srtt estimate. The variance estimate is a bit
- * more complicated. We subtract the abs value of the @delta from
- * the current variance estimate and add 1/4 of that to the running
- * total.
+ * more complicated. We subtract the current variance estimate from
+ * the abs value of the @delta and add 1/4 of that to the running
+ * total.  That's equivalent to 3/4 of the current variance
+ * estimate plus 1/4 of the abs of @delta.
  *
  * Note that the index points at the array entry containing the smoothed
  * mean value, and the variance is always in the following entry
@@ -49,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
 	s64 delta = sample - s->stats[index];
 	s->stats[index] += (delta >> 3);
 	index++;
-	s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
+	s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2;
 }
 
 /**
-- 
cgit v1.2.3


From ce21e6586eece6502da23a9c275a60f2fdf8b8b8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 3 May 2019 11:10:06 -0400
Subject: btrfs: don't double unlock on error in btrfs_punch_hole

commit 8fca955057b9c58467d1b231e43f19c4cf26ae8c upstream.

If we have an error writing out a delalloc range in
btrfs_punch_hole_lock_range we'll unlock the inode and then goto
out_only_mutex, where we will again unlock the inode.  This is bad,
don't do this.

Fixes: f27451f22996 ("Btrfs: add support for fallocate's zero range operation")
CC: stable@vger.kernel.org # 4.19+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/file.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ca4902c66dc4..ed67aa91914c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2565,10 +2565,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
 					  &cached_state);
-	if (ret) {
-		inode_unlock(inode);
+	if (ret)
 		goto out_only_mutex;
-	}
 
 	path = btrfs_alloc_path();
 	if (!path) {
-- 
cgit v1.2.3


From 7ec747c811ab16deab4eb85ee2d67cb2f89cc384 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 29 Apr 2019 13:08:14 +0100
Subject: Btrfs: do not abort transaction at btrfs_update_root() after failure
 to COW path

commit 72bd2323ec87722c115a5906bc6a1b31d11e8f54 upstream.

Currently when we fail to COW a path at btrfs_update_root() we end up
always aborting the transaction. However all the current callers of
btrfs_update_root() are able to deal with errors returned from it, many do
end up aborting the transaction themselves (directly or not, such as the
transaction commit path), other BUG_ON() or just gracefully cancel whatever
they were doing.

When syncing the fsync log, we call btrfs_update_root() through
tree-log.c:update_log_root(), and if it returns an -ENOSPC error, the log
sync code does not abort the transaction, instead it gracefully handles
the error and returns -EAGAIN to the fsync handler, so that it falls back
to a transaction commit. Any other error different from -ENOSPC, makes the
log sync code abort the transaction.

So remove the transaction abort from btrfs_update_log() when we fail to
COW a path to update the root item, so that if an -ENOSPC failure happens
we avoid aborting the current transaction and have a chance of the fsync
succeeding after falling back to a transaction commit.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203413
Fixes: 79787eaab46121 ("btrfs: replace many BUG_ONs with proper error handling")
Cc: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/root-tree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 65bda0682928..fb205ebeca39 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -132,10 +132,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		return -ENOMEM;
 
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, ret);
+	if (ret < 0)
 		goto out;
-	}
 
 	if (ret != 0) {
 		btrfs_print_leaf(path->nodes[0]);
-- 
cgit v1.2.3


From 4f9a774dda97c0a07ae1e475182f29889b109cba Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 May 2019 16:43:51 +0100
Subject: Btrfs: avoid fallback to transaction commit during fsync of files
 with holes

commit ebb929060aeb162417b4c1307e63daee47b208d9 upstream.

When we are doing a full fsync (bit BTRFS_INODE_NEEDS_FULL_SYNC set) of a
file that has holes and has file extent items spanning two or more leafs,
we can end up falling to back to a full transaction commit due to a logic
bug that leads to failure to insert a duplicate file extent item that is
meant to represent a hole between the last file extent item of a leaf and
the first file extent item in the next leaf. The failure (EEXIST error)
leads to a transaction commit (as most errors when logging an inode do).

For example, we have the two following leafs:

Leaf N:

  -----------------------------------------------
  | ..., ..., ..., (257, FILE_EXTENT_ITEM, 64K) |
  -----------------------------------------------
  The file extent item at the end of leaf N has a length of 4Kb,
  representing the file range from 64K to 68K - 1.

Leaf N + 1:

  -----------------------------------------------
  | (257, FILE_EXTENT_ITEM, 72K), ..., ..., ... |
  -----------------------------------------------
  The file extent item at the first slot of leaf N + 1 has a length of
  4Kb too, representing the file range from 72K to 76K - 1.

During the full fsync path, when we are at tree-log.c:copy_items() with
leaf N as a parameter, after processing the last file extent item, that
represents the extent at offset 64K, we take a look at the first file
extent item at the next leaf (leaf N + 1), and notice there's a 4K hole
between the two extents, and therefore we insert a file extent item
representing that hole, starting at file offset 68K and ending at offset
72K - 1. However we don't update the value of *last_extent, which is used
to represent the end offset (plus 1, non-inclusive end) of the last file
extent item inserted in the log, so it stays with a value of 68K and not
with a value of 72K.

Then, when copy_items() is called for leaf N + 1, because the value of
*last_extent is smaller then the offset of the first extent item in the
leaf (68K < 72K), we look at the last file extent item in the previous
leaf (leaf N) and see it there's a 4K gap between it and our first file
extent item (again, 68K < 72K), so we decide to insert a file extent item
representing the hole, starting at file offset 68K and ending at offset
72K - 1, this insertion will fail with -EEXIST being returned from
btrfs_insert_file_extent() because we already inserted a file extent item
representing a hole for this offset (68K) in the previous call to
copy_items(), when processing leaf N.

The -EEXIST error gets propagated to the fsync callback, btrfs_sync_file(),
which falls back to a full transaction commit.

Fix this by adjusting *last_extent after inserting a hole when we had to
look at the next leaf.

Fixes: 4ee3fad34a9c ("Btrfs: fix fsync after hole punching when using no-holes feature")
Cc: stable@vger.kernel.org # 4.14+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2f4f0958e5f2..75051d36dc1a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4121,6 +4121,7 @@ fill_holes:
 							       *last_extent, 0,
 							       0, len, 0, len,
 							       0, 0, 0);
+				*last_extent += len;
 			}
 		}
 	}
-- 
cgit v1.2.3


From 92f907d7d63b178da6d01a6870dea7831e6abf3e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 6 May 2019 16:44:02 +0100
Subject: Btrfs: fix race between ranged fsync and writeback of adjacent ranges

commit 0c713cbab6200b0ab6473b50435e450a6e1de85d upstream.

When we do a full fsync (the bit BTRFS_INODE_NEEDS_FULL_SYNC is set in the
inode) that happens to be ranged, which happens during a msync() or writes
for files opened with O_SYNC for example, we can end up with a corrupt log,
due to different file extent items representing ranges that overlap with
each other, or hit some assertion failures.

When doing a ranged fsync we only flush delalloc and wait for ordered
exents within that range. If while we are logging items from our inode
ordered extents for adjacent ranges complete, we end up in a race that can
make us insert the file extent items that overlap with others we logged
previously and the assertion failures.

For example, if tree-log.c:copy_items() receives a leaf that has the
following file extents items, all with a length of 4K and therefore there
is an implicit hole in the range 68K to 72K - 1:

  (257 EXTENT_ITEM 64K), (257 EXTENT_ITEM 72K), (257 EXTENT_ITEM 76K), ...

It copies them to the log tree. However due to the need to detect implicit
holes, it may release the path, in order to look at the previous leaf to
detect an implicit hole, and then later it will search again in the tree
for the first file extent item key, with the goal of locking again the
leaf (which might have changed due to concurrent changes to other inodes).

However when it locks again the leaf containing the first key, the key
corresponding to the extent at offset 72K may not be there anymore since
there is an ordered extent for that range that is finishing (that is,
somewhere in the middle of btrfs_finish_ordered_io()), and it just
removed the file extent item but has not yet replaced it with a new file
extent item, so the part of copy_items() that does hole detection will
decide that there is a hole in the range starting from 68K to 76K - 1,
and therefore insert a file extent item to represent that hole, having
a key offset of 68K. After that we now have a log tree with 2 different
extent items that have overlapping ranges:

 1) The file extent item copied before copy_items() released the path,
    which has a key offset of 72K and a length of 4K, representing the
    file range 72K to 76K - 1.

 2) And a file extent item representing a hole that has a key offset of
    68K and a length of 8K, representing the range 68K to 76K - 1. This
    item was inserted after releasing the path, and overlaps with the
    extent item inserted before.

The overlapping extent items can cause all sorts of unpredictable and
incorrect behaviour, either when replayed or if a fast (non full) fsync
happens later, which can trigger a BUG_ON() when calling
btrfs_set_item_key_safe() through __btrfs_drop_extents(), producing a
trace like the following:

  [61666.783269] ------------[ cut here ]------------
  [61666.783943] kernel BUG at fs/btrfs/ctree.c:3182!
  [61666.784644] invalid opcode: 0000 [#1] PREEMPT SMP
  (...)
  [61666.786253] task: ffff880117b88c40 task.stack: ffffc90008168000
  [61666.786253] RIP: 0010:btrfs_set_item_key_safe+0x7c/0xd2 [btrfs]
  [61666.786253] RSP: 0018:ffffc9000816b958 EFLAGS: 00010246
  [61666.786253] RAX: 0000000000000000 RBX: 000000000000000f RCX: 0000000000030000
  [61666.786253] RDX: 0000000000000000 RSI: ffffc9000816ba4f RDI: ffffc9000816b937
  [61666.786253] RBP: ffffc9000816b998 R08: ffff88011dae2428 R09: 0000000000001000
  [61666.786253] R10: 0000160000000000 R11: 6db6db6db6db6db7 R12: ffff88011dae2418
  [61666.786253] R13: ffffc9000816ba4f R14: ffff8801e10c4118 R15: ffff8801e715c000
  [61666.786253] FS:  00007f6060a18700(0000) GS:ffff88023f5c0000(0000) knlGS:0000000000000000
  [61666.786253] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [61666.786253] CR2: 00007f6060a28000 CR3: 0000000213e69000 CR4: 00000000000006e0
  [61666.786253] Call Trace:
  [61666.786253]  __btrfs_drop_extents+0x5e3/0xaad [btrfs]
  [61666.786253]  ? time_hardirqs_on+0x9/0x14
  [61666.786253]  btrfs_log_changed_extents+0x294/0x4e0 [btrfs]
  [61666.786253]  ? release_extent_buffer+0x38/0xb4 [btrfs]
  [61666.786253]  btrfs_log_inode+0xb6e/0xcdc [btrfs]
  [61666.786253]  ? lock_acquire+0x131/0x1c5
  [61666.786253]  ? btrfs_log_inode_parent+0xee/0x659 [btrfs]
  [61666.786253]  ? arch_local_irq_save+0x9/0xc
  [61666.786253]  ? btrfs_log_inode_parent+0x1f5/0x659 [btrfs]
  [61666.786253]  btrfs_log_inode_parent+0x223/0x659 [btrfs]
  [61666.786253]  ? arch_local_irq_save+0x9/0xc
  [61666.786253]  ? lockref_get_not_zero+0x2c/0x34
  [61666.786253]  ? rcu_read_unlock+0x3e/0x5d
  [61666.786253]  btrfs_log_dentry_safe+0x60/0x7b [btrfs]
  [61666.786253]  btrfs_sync_file+0x317/0x42c [btrfs]
  [61666.786253]  vfs_fsync_range+0x8c/0x9e
  [61666.786253]  SyS_msync+0x13c/0x1c9
  [61666.786253]  entry_SYSCALL_64_fastpath+0x18/0xad

A sample of a corrupt log tree leaf with overlapping extents I got from
running btrfs/072:

      item 14 key (295 108 200704) itemoff 2599 itemsize 53
              extent data disk bytenr 0 nr 0
              extent data offset 0 nr 458752 ram 458752
      item 15 key (295 108 659456) itemoff 2546 itemsize 53
              extent data disk bytenr 4343541760 nr 770048
              extent data offset 606208 nr 163840 ram 770048
      item 16 key (295 108 663552) itemoff 2493 itemsize 53
              extent data disk bytenr 4343541760 nr 770048
              extent data offset 610304 nr 155648 ram 770048
      item 17 key (295 108 819200) itemoff 2440 itemsize 53
              extent data disk bytenr 4334788608 nr 4096
              extent data offset 0 nr 4096 ram 4096

The file extent item at offset 659456 (item 15) ends at offset 823296
(659456 + 163840) while the next file extent item (item 16) starts at
offset 663552.

Another different problem that the race can trigger is a failure in the
assertions at tree-log.c:copy_items(), which expect that the first file
extent item key we found before releasing the path exists after we have
released path and that the last key we found before releasing the path
also exists after releasing the path:

  $ cat -n fs/btrfs/tree-log.c
  4080          if (need_find_last_extent) {
  4081                  /* btrfs_prev_leaf could return 1 without releasing the path */
  4082                  btrfs_release_path(src_path);
  4083                  ret = btrfs_search_slot(NULL, inode->root, &first_key,
  4084                                  src_path, 0, 0);
  4085                  if (ret < 0)
  4086                          return ret;
  4087                  ASSERT(ret == 0);
  (...)
  4103                  if (i >= btrfs_header_nritems(src_path->nodes[0])) {
  4104                          ret = btrfs_next_leaf(inode->root, src_path);
  4105                          if (ret < 0)
  4106                                  return ret;
  4107                          ASSERT(ret == 0);
  4108                          src = src_path->nodes[0];
  4109                          i = 0;
  4110                          need_find_last_extent = true;
  4111                  }
  (...)

The second assertion implicitly expects that the last key before the path
release still exists, because the surrounding while loop only stops after
we have found that key. When this assertion fails it produces a stack like
this:

  [139590.037075] assertion failed: ret == 0, file: fs/btrfs/tree-log.c, line: 4107
  [139590.037406] ------------[ cut here ]------------
  [139590.037707] kernel BUG at fs/btrfs/ctree.h:3546!
  [139590.038034] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI
  [139590.038340] CPU: 1 PID: 31841 Comm: fsstress Tainted: G        W         5.0.0-btrfs-next-46 #1
  (...)
  [139590.039354] RIP: 0010:assfail.constprop.24+0x18/0x1a [btrfs]
  (...)
  [139590.040397] RSP: 0018:ffffa27f48f2b9b0 EFLAGS: 00010282
  [139590.040730] RAX: 0000000000000041 RBX: ffff897c635d92c8 RCX: 0000000000000000
  [139590.041105] RDX: 0000000000000000 RSI: ffff897d36a96868 RDI: ffff897d36a96868
  [139590.041470] RBP: ffff897d1b9a0708 R08: 0000000000000000 R09: 0000000000000000
  [139590.041815] R10: 0000000000000008 R11: 0000000000000000 R12: 0000000000000013
  [139590.042159] R13: 0000000000000227 R14: ffff897cffcbba88 R15: 0000000000000001
  [139590.042501] FS:  00007f2efc8dee80(0000) GS:ffff897d36a80000(0000) knlGS:0000000000000000
  [139590.042847] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [139590.043199] CR2: 00007f8c064935e0 CR3: 0000000232252002 CR4: 00000000003606e0
  [139590.043547] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [139590.043899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [139590.044250] Call Trace:
  [139590.044631]  copy_items+0xa3f/0x1000 [btrfs]
  [139590.045009]  ? generic_bin_search.constprop.32+0x61/0x200 [btrfs]
  [139590.045396]  btrfs_log_inode+0x7b3/0xd70 [btrfs]
  [139590.045773]  btrfs_log_inode_parent+0x2b3/0xce0 [btrfs]
  [139590.046143]  ? do_raw_spin_unlock+0x49/0xc0
  [139590.046510]  btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
  [139590.046872]  btrfs_sync_file+0x3b6/0x440 [btrfs]
  [139590.047243]  btrfs_file_write_iter+0x45b/0x5c0 [btrfs]
  [139590.047592]  __vfs_write+0x129/0x1c0
  [139590.047932]  vfs_write+0xc2/0x1b0
  [139590.048270]  ksys_write+0x55/0xc0
  [139590.048608]  do_syscall_64+0x60/0x1b0
  [139590.048946]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [139590.049287] RIP: 0033:0x7f2efc4be190
  (...)
  [139590.050342] RSP: 002b:00007ffe743243a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
  [139590.050701] RAX: ffffffffffffffda RBX: 0000000000008d58 RCX: 00007f2efc4be190
  [139590.051067] RDX: 0000000000008d58 RSI: 00005567eca0f370 RDI: 0000000000000003
  [139590.051459] RBP: 0000000000000024 R08: 0000000000000003 R09: 0000000000008d60
  [139590.051863] R10: 0000000000000078 R11: 0000000000000246 R12: 0000000000000003
  [139590.052252] R13: 00000000003d3507 R14: 00005567eca0f370 R15: 0000000000000000
  (...)
  [139590.055128] ---[ end trace 193f35d0215cdeeb ]---

So fix this race between a full ranged fsync and writeback of adjacent
ranges by flushing all delalloc and waiting for all ordered extents to
complete before logging the inode. This is the simplest way to solve the
problem because currently the full fsync path does not deal with ranges
at all (it assumes a full range from 0 to LLONG_MAX) and it always needs
to look at adjacent ranges for hole detection. For use cases of ranged
fsyncs this can make a few fsyncs slower but on the other hand it can
make some following fsyncs to other ranges do less work or no need to do
anything at all. A full fsync is rare anyway and happens only once after
loading/creating an inode and once after less common operations such as a
shrinking truncate.

This is an issue that exists for a long time, and was often triggered by
generic/127, because it does mmap'ed writes and msync (which triggers a
ranged fsync). Adding support for the tree checker to detect overlapping
extents (next patch in the series) and trigger a WARN() when such cases
are found, and then calling btrfs_check_leaf_full() at the end of
btrfs_insert_file_extent() made the issue much easier to detect. Running
btrfs/072 with that change to the tree checker and making fsstress open
files always with O_SYNC made it much easier to trigger the issue (as
triggering it with generic/127 is very rare).

CC: stable@vger.kernel.org # 3.16+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/file.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ed67aa91914c..de4d3baec161 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2058,6 +2058,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	int ret = 0, err;
 	u64 len;
 
+	/*
+	 * If the inode needs a full sync, make sure we use a full range to
+	 * avoid log tree corruption, due to hole detection racing with ordered
+	 * extent completion for adjacent ranges, and assertion failures during
+	 * hole detection.
+	 */
+	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+		     &BTRFS_I(inode)->runtime_flags)) {
+		start = 0;
+		end = LLONG_MAX;
+	}
+
 	/*
 	 * The range length can be represented by u64, we have to do the typecasts
 	 * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
-- 
cgit v1.2.3


From 946ad2ecef611c86166fbf3eeaf6d4d12e3d4305 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <tobin@kernel.org>
Date: Mon, 13 May 2019 13:39:11 +1000
Subject: btrfs: sysfs: Fix error path kobject memory leak

commit 450ff8348808a89cc27436771aa05c2b90c0eef1 upstream.

If a call to kobject_init_and_add() fails we must call kobject_put()
otherwise we leak memory.

Calling kobject_put() when kobject_init_and_add() fails drops the
refcount back to 0 and calls the ktype release method (which in turn
calls the percpu destroy and kfree).

Add call to kobject_put() in the error path of call to
kobject_init_and_add().

Cc: stable@vger.kernel.org # v4.4+
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Tobin C. Harding <tobin@kernel.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 809c2c307c64..6d155d7d439d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3911,8 +3911,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
 				    info->space_info_kobj, "%s",
 				    alloc_name(space_info->flags));
 	if (ret) {
-		percpu_counter_destroy(&space_info->total_bytes_pinned);
-		kfree(space_info);
+		kobject_put(&space_info->kobj);
 		return ret;
 	}
 
-- 
cgit v1.2.3


From 94e1f96667b4d4cdc3a06592f109422d088eb352 Mon Sep 17 00:00:00 2001
From: "Tobin C. Harding" <tobin@kernel.org>
Date: Mon, 13 May 2019 13:39:12 +1000
Subject: btrfs: sysfs: don't leak memory when failing add fsid

commit e32773357d5cc271b1d23550b3ed026eb5c2a468 upstream.

A failed call to kobject_init_and_add() must be followed by a call to
kobject_put().  Currently in the error path when adding fs_devices we
are missing this call.  This could be fixed by calling
btrfs_sysfs_remove_fsid() if btrfs_sysfs_add_fsid() returns an error or
by adding a call to kobject_put() directly in btrfs_sysfs_add_fsid().
Here we choose the second option because it prevents the slightly
unusual error path handling requirements of kobject from leaking out
into btrfs functions.

Add a call to kobject_put() in the error path of kobject_add_and_init().
This causes the release method to be called if kobject_init_and_add()
fails.  open_tree() is the function that calls btrfs_sysfs_add_fsid()
and the error code in this function is already written with the
assumption that the release method is called during the error path of
open_tree() (as seen by the call to btrfs_sysfs_remove_fsid() under the
fail_fsdev_sysfs label).

Cc: stable@vger.kernel.org # v4.4+
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Tobin C. Harding <tobin@kernel.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/sysfs.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3717c864ba23..aefb0169d46d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -811,7 +811,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 	fs_devs->fsid_kobj.kset = btrfs_kset;
 	error = kobject_init_and_add(&fs_devs->fsid_kobj,
 				&btrfs_ktype, parent, "%pU", fs_devs->fsid);
-	return error;
+	if (error) {
+		kobject_put(&fs_devs->fsid_kobj);
+		return error;
+	}
+
+	return 0;
 }
 
 int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
-- 
cgit v1.2.3


From e1eed6928b3ee66a1ac63c7d3eebc63066d59730 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <olga.kornievskaia@gmail.com>
Date: Fri, 11 Jan 2019 19:04:44 -0500
Subject: NFSv4.2 fix unnecessary retry in nfs4_copy_file_range

commit 45ac486ecf2dc998e25cf32f0cabf2deaad875be upstream.

Currently nfs42_proc_copy_file_range() can not return EAGAIN.

Fixes: e4648aa4f98a ("NFS recover from destination server reboot for copies")
Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Cc: Yu Xu <xuyu@linux.alibaba.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4file.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4288a6ecaf75..31dac3f2033f 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    size_t count, unsigned int flags)
 {
-	ssize_t ret;
-
 	if (file_inode(file_in) == file_inode(file_out))
 		return -EINVAL;
-retry:
-	ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
-	if (ret == -EAGAIN)
-		goto retry;
-	return ret;
+	return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
 }
 
 static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
-- 
cgit v1.2.3


From cc1afc1050a94be73f85c81889941f1fa7cda0e4 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Thu, 11 Apr 2019 14:34:18 -0400
Subject: NFSv4.1 fix incorrect return value in copy_file_range

commit 0769663b4f580566ef6cdf366f3073dbe8022c39 upstream.

According to the NFSv4.2 spec if the input and output file is the
same file, operation should fail with EINVAL. However, linux
copy_file_range() system call has no such restrictions. Therefore,
in such case let's return EOPNOTSUPP and allow VFS to fallback
to doing do_splice_direct(). Also when copy_file_range is called
on an NFSv4.0 or 4.1 mount (ie., a server that doesn't support
COPY functionality), we also need to return EOPNOTSUPP and
fallback to a regular copy.

Fixes xfstest generic/075, generic/091, generic/112, generic/263
for all NFSv4.x versions.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Yu Xu <xuyu@linux.alibaba.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs42proc.c | 3 ---
 fs/nfs/nfs4file.c  | 4 +++-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index fed06fd9998d..94f98e190e63 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
 	};
 	ssize_t err, err2;
 
-	if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY))
-		return -EOPNOTSUPP;
-
 	src_lock = nfs_get_lock_context(nfs_file_open_context(src));
 	if (IS_ERR(src_lock))
 		return PTR_ERR(src_lock);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 31dac3f2033f..134858507268 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    size_t count, unsigned int flags)
 {
+	if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY))
+		return -EOPNOTSUPP;
 	if (file_inode(file_in) == file_inode(file_out))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
 }
 
-- 
cgit v1.2.3


From 9c0339dd381db5b06afd377b97c7713d9157cae9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Wed, 16 Jan 2019 11:00:57 -0500
Subject: btrfs: honor path->skip_locking in backref code

commit 38e3eebff643db725633657d1d87a3be019d1018 upstream.

Qgroups will do the old roots lookup at delayed ref time, which could be
while walking down the extent root while running a delayed ref.  This
should be fine, except we specifically lock eb's in the backref walking
code irrespective of path->skip_locking, which deadlocks the system.
Fix up the backref code to honor path->skip_locking, nobody will be
modifying the commit_root when we're searching so it's completely safe
to do.

This happens since fb235dc06fac ("btrfs: qgroup: Move half of the qgroup
accounting time out of commit trans"), kernel may lockup with quota
enabled.

There is one backref trace triggered by snapshot dropping along with
write operation in the source subvolume.  The example can be reliably
reproduced:

  btrfs-cleaner   D    0  4062      2 0x80000000
  Call Trace:
   schedule+0x32/0x90
   btrfs_tree_read_lock+0x93/0x130 [btrfs]
   find_parent_nodes+0x29b/0x1170 [btrfs]
   btrfs_find_all_roots_safe+0xa8/0x120 [btrfs]
   btrfs_find_all_roots+0x57/0x70 [btrfs]
   btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs]
   btrfs_qgroup_trace_leaf_items+0x10b/0x140 [btrfs]
   btrfs_qgroup_trace_subtree+0xc8/0xe0 [btrfs]
   do_walk_down+0x541/0x5e3 [btrfs]
   walk_down_tree+0xab/0xe7 [btrfs]
   btrfs_drop_snapshot+0x356/0x71a [btrfs]
   btrfs_clean_one_deleted_snapshot+0xb8/0xf0 [btrfs]
   cleaner_kthread+0x12b/0x160 [btrfs]
   kthread+0x112/0x130
   ret_from_fork+0x27/0x50

When dropping snapshots with qgroup enabled, we will trigger backref
walk.

However such backref walk at that timing is pretty dangerous, as if one
of the parent nodes get WRITE locked by other thread, we could cause a
dead lock.

For example:

           FS 260     FS 261 (Dropped)
            node A        node B
           /      \      /      \
       node C      node D      node E
      /   \         /  \        /     \
  leaf F|leaf G|leaf H|leaf I|leaf J|leaf K

The lock sequence would be:

      Thread A (cleaner)             |       Thread B (other writer)
-----------------------------------------------------------------------
write_lock(B)                        |
write_lock(D)                        |
^^^ called by walk_down_tree()       |
                                     |       write_lock(A)
                                     |       write_lock(D) << Stall
read_lock(H) << for backref walk     |
read_lock(D) << lock owner is        |
                the same thread A    |
                so read lock is OK   |
read_lock(A) << Stall                |

So thread A hold write lock D, and needs read lock A to unlock.
While thread B holds write lock A, while needs lock D to unlock.

This will cause a deadlock.

This is not only limited to snapshot dropping case.  As the backref
walk, even only happens on commit trees, is breaking the normal top-down
locking order, makes it deadlock prone.

Fixes: fb235dc06fac ("btrfs: qgroup: Move half of the qgroup accounting time out of commit trans")
CC: stable@vger.kernel.org # 4.14+
Reported-and-tested-by: David Sterba <dsterba@suse.com>
Reported-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
[ rebase to latest branch and fix lock assert bug in btrfs/007 ]
[ backport to linux-4.19.y branch, solve minor conflicts ]
Signed-off-by: Qu Wenruo <wqu@suse.com>
[ copy logs and deadlock analysis from Qu's patch ]
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/backref.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2a4f52c7be22..ac6c383d6314 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -710,7 +710,7 @@ out:
  * read tree blocks and add keys where required.
  */
 static int add_missing_keys(struct btrfs_fs_info *fs_info,
-			    struct preftrees *preftrees)
+			    struct preftrees *preftrees, bool lock)
 {
 	struct prelim_ref *ref;
 	struct extent_buffer *eb;
@@ -735,12 +735,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
 			free_extent_buffer(eb);
 			return -EIO;
 		}
-		btrfs_tree_read_lock(eb);
+		if (lock)
+			btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
 		else
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
-		btrfs_tree_read_unlock(eb);
+		if (lock)
+			btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 		prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL);
 		cond_resched();
@@ -1225,7 +1227,7 @@ again:
 
 	btrfs_release_path(path);
 
-	ret = add_missing_keys(fs_info, &preftrees);
+	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
 	if (ret)
 		goto out;
 
@@ -1286,11 +1288,14 @@ again:
 					ret = -EIO;
 					goto out;
 				}
-				btrfs_tree_read_lock(eb);
-				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				if (!path->skip_locking) {
+					btrfs_tree_read_lock(eb);
+					btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie, ignore_offset);
-				btrfs_tree_read_unlock_blocking(eb);
+				if (!path->skip_locking)
+					btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
-- 
cgit v1.2.3


From 86c43c40fe05c78eae5b6a7d3665220457920224 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Thu, 28 Mar 2019 17:38:29 +0200
Subject: ovl: relax WARN_ON() for overlapping layers use case

commit acf3062a7e1ccf67c6f7e7c28671a6708fde63b0 upstream.

This nasty little syzbot repro:
https://syzkaller.appspot.com/x/repro.syz?x=12c7a94f400000

Creates overlay mounts where the same directory is both in upper and lower
layers. Simplified example:

  mkdir foo work
  mount -t overlay none foo -o"lowerdir=.,upperdir=foo,workdir=work"

The repro runs several threads in parallel that attempt to chdir into foo
and attempt to symlink/rename/exec/mkdir the file bar.

The repro hits a WARN_ON() I placed in ovl_instantiate(), which suggests
that an overlay inode already exists in cache and is hashed by the pointer
of the real upper dentry that ovl_create_real() has just created. At the
point of the WARN_ON(), for overlay dir inode lock is held and upper dir
inode lock, so at first, I did not see how this was possible.

On a closer look, I see that after ovl_create_real(), because of the
overlapping upper and lower layers, a lookup by another thread can find the
file foo/bar that was just created in upper layer, at overlay path
foo/foo/bar and hash the an overlay inode with the new real dentry as lower
dentry. This is possible because the overlay directory foo/foo is not
locked and the upper dentry foo/bar is in dcache, so ovl_lookup() can find
it without taking upper dir inode shared lock.

Overlapping layers is considered a wrong setup which would result in
unexpected behavior, but it shouldn't crash the kernel and it shouldn't
trigger WARN_ON() either, so relax this WARN_ON() and leave a pr_warn()
instead to cover all cases of failure to get an overlay inode.

The error returned from failure to insert new inode to cache with
inode_insert5() was changed to -EEXIST, to distinguish from the error
-ENOMEM returned on failure to get/allocate inode with iget5_locked().

Reported-by: syzbot+9c69c282adc4edd2b540@syzkaller.appspotmail.com
Fixes: 01b39dcc9568 ("ovl: use inode_insert5() to hash a newly...")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/overlayfs/dir.c   | 2 +-
 fs/overlayfs/inode.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index b2aadd3e1fec..336f04da80ed 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -260,7 +260,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
 		 * hashed directory inode aliases.
 		 */
 		inode = ovl_get_inode(dentry->d_sb, &oip);
-		if (WARN_ON(IS_ERR(inode)))
+		if (IS_ERR(inode))
 			return PTR_ERR(inode);
 	} else {
 		WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 3b7ed5d2279c..b48273e846ad 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -832,7 +832,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
 	int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
 	bool is_dir, metacopy = false;
 	unsigned long ino = 0;
-	int err = -ENOMEM;
+	int err = oip->newinode ? -EEXIST : -ENOMEM;
 
 	if (!realinode)
 		realinode = d_inode(lowerdentry);
@@ -917,6 +917,7 @@ out:
 	return inode;
 
 out_err:
+	pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err);
 	inode = ERR_PTR(err);
 	goto out;
 }
-- 
cgit v1.2.3


From 7c2bcb3cca0339b73f6e1883c14f1166f233c8d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Apr 2019 21:04:13 -0400
Subject: acct_on(): don't mess with freeze protection

commit 9419a3191dcb27f24478d288abaab697228d28e6 upstream.

What happens there is that we are replacing file->path.mnt of
a file we'd just opened with a clone and we need the write
count contribution to be transferred from original mount to
new one.  That's it.  We do *NOT* want any kind of freeze
protection for the duration of switchover.

IOW, we should just use __mnt_{want,drop}_write() for that
switchover; no need to bother with mnt_{want,drop}_write()
there.

Tested-by: Amir Goldstein <amir73il@gmail.com>
Reported-by: syzbot+2a73a6ea9507b7112141@syzkaller.appspotmail.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/internal.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/internal.h b/fs/internal.h
index d410186bc369..d109665b9e50 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -80,9 +80,7 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-extern int __mnt_want_write(struct vfsmount *);
 extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write(struct vfsmount *);
 extern void __mnt_drop_write_file(struct file *);
 
 /*
-- 
cgit v1.2.3


From 06a67c0f4abb34784cc296fc3d242fcd03459ca2 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.com>
Date: Wed, 29 May 2019 19:25:44 +0200
Subject: Revert "btrfs: Honour FITRIM range constraints during free space
 trim"

This reverts commit 8b13bb911f0c0c77d41e5ddc41ad3c127c356b8a.

There is currently no corresponding patch in master due to additional
changes that would be significantly different from plain revert in the
respective stable branch.

The range argument was not handled correctly and could cause trim to
overlap allocated areas or reach beyond the end of the device. The
address space that fitrim normally operates on is in logical
coordinates, while the discards are done on the physical device extents.
This distinction cannot be made with the current ioctl interface and
caused the confusion.

The bug depends on the layout of block groups and does not always
happen. The whole-fs trim (run by default by the fstrim tool) is not
affected.

Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/extent-tree.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6d155d7d439d..0cc800d22a08 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10788,9 +10788,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
  * held back allocations.
  */
 static int btrfs_trim_free_extents(struct btrfs_device *device,
-				   struct fstrim_range *range, u64 *trimmed)
+				   u64 minlen, u64 *trimmed)
 {
-	u64 start = range->start, len = 0;
+	u64 start = 0, len = 0;
 	int ret;
 
 	*trimmed = 0;
@@ -10833,8 +10833,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 		if (!trans)
 			up_read(&fs_info->commit_root_sem);
 
-		ret = find_free_dev_extent_start(trans, device, range->minlen,
-						 start, &start, &len);
+		ret = find_free_dev_extent_start(trans, device, minlen, start,
+						 &start, &len);
 		if (trans) {
 			up_read(&fs_info->commit_root_sem);
 			btrfs_put_transaction(trans);
@@ -10847,16 +10847,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 			break;
 		}
 
-		/* If we are out of the passed range break */
-		if (start > range->start + range->len - 1) {
-			mutex_unlock(&fs_info->chunk_mutex);
-			ret = 0;
-			break;
-		}
-
-		start = max(range->start, start);
-		len = min(range->len, len);
-
 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
 		mutex_unlock(&fs_info->chunk_mutex);
 
@@ -10866,10 +10856,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 		start += len;
 		*trimmed += bytes;
 
-		/* We've trimmed enough */
-		if (*trimmed >= range->len)
-			break;
-
 		if (fatal_signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
@@ -10953,7 +10939,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	devices = &fs_info->fs_devices->devices;
 	list_for_each_entry(device, devices, dev_list) {
-		ret = btrfs_trim_free_extents(device, range, &group_trimmed);
+		ret = btrfs_trim_free_extents(device, range->minlen,
+					      &group_trimmed);
 		if (ret) {
 			dev_failed++;
 			dev_ret = ret;
-- 
cgit v1.2.3


From bac8520892812dada1ac93f27d96317470d24b1f Mon Sep 17 00:00:00 2001
From: Ross Lagerwall <ross.lagerwall@citrix.com>
Date: Wed, 27 Mar 2019 17:09:17 +0000
Subject: gfs2: Fix lru_count going negative

[ Upstream commit 7881ef3f33bb80f459ea6020d1e021fc524a6348 ]

Under certain conditions, lru_count may drop below zero resulting in
a large amount of log spam like this:

vmscan: shrink_slab: gfs2_dump_glock+0x3b0/0x630 [gfs2] \
    negative objects to delete nr=-1

This happens as follows:
1) A glock is moved from lru_list to the dispose list and lru_count is
   decremented.
2) The dispose function calls cond_resched() and drops the lru lock.
3) Another thread takes the lru lock and tries to add the same glock to
   lru_list, checking if the glock is on an lru list.
4) It is on a list (actually the dispose list) and so it avoids
   incrementing lru_count.
5) The glock is moved to lru_list.
5) The original thread doesn't dispose it because it has been re-added
   to the lru list but the lru_count has still decreased by one.

Fix by checking if the LRU flag is set on the glock rather than checking
if the glock is on some list and rearrange the code so that the LRU flag
is added/removed precisely when the glock is added/removed from lru_list.

Signed-off-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/gfs2/glock.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9d566e62684c..775256141e9f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -183,15 +183,19 @@ static int demote_ok(const struct gfs2_glock *gl)
 
 void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
 {
+	if (!(gl->gl_ops->go_flags & GLOF_LRU))
+		return;
+
 	spin_lock(&lru_lock);
 
-	if (!list_empty(&gl->gl_lru))
-		list_del_init(&gl->gl_lru);
-	else
+	list_del(&gl->gl_lru);
+	list_add_tail(&gl->gl_lru, &lru_list);
+
+	if (!test_bit(GLF_LRU, &gl->gl_flags)) {
+		set_bit(GLF_LRU, &gl->gl_flags);
 		atomic_inc(&lru_count);
+	}
 
-	list_add_tail(&gl->gl_lru, &lru_list);
-	set_bit(GLF_LRU, &gl->gl_flags);
 	spin_unlock(&lru_lock);
 }
 
@@ -201,7 +205,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
 		return;
 
 	spin_lock(&lru_lock);
-	if (!list_empty(&gl->gl_lru)) {
+	if (test_bit(GLF_LRU, &gl->gl_flags)) {
 		list_del_init(&gl->gl_lru);
 		atomic_dec(&lru_count);
 		clear_bit(GLF_LRU, &gl->gl_flags);
@@ -1158,8 +1162,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
 			fast_path = 1;
 	}
-	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) &&
-	    (glops->go_flags & GLOF_LRU))
+	if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
 		gfs2_glock_add_to_lru(gl);
 
 	trace_gfs2_glock_queue(gh, 0);
@@ -1455,6 +1458,7 @@ __acquires(&lru_lock)
 		if (!spin_trylock(&gl->gl_lockref.lock)) {
 add_back_to_lru:
 			list_add(&gl->gl_lru, &lru_list);
+			set_bit(GLF_LRU, &gl->gl_flags);
 			atomic_inc(&lru_count);
 			continue;
 		}
@@ -1462,7 +1466,6 @@ add_back_to_lru:
 			spin_unlock(&gl->gl_lockref.lock);
 			goto add_back_to_lru;
 		}
-		clear_bit(GLF_LRU, &gl->gl_flags);
 		gl->gl_lockref.count++;
 		if (demote_ok(gl))
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1497,6 +1500,7 @@ static long gfs2_scan_glock_lru(int nr)
 		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
+			clear_bit(GLF_LRU, &gl->gl_flags);
 			freed++;
 			continue;
 		}
-- 
cgit v1.2.3


From 36296b0034ae720d50206cf6a3e729780d59cf6b Mon Sep 17 00:00:00 2001
From: Roberto Bergantinos Corpas <rbergant@redhat.com>
Date: Thu, 25 Apr 2019 15:36:51 +0200
Subject: NFS: make nfs_match_client killable

[ Upstream commit 950a578c6128c2886e295b9c7ecb0b6b22fcc92b ]

    Actually we don't do anything with return value from
    nfs_wait_client_init_complete in nfs_match_client, as a
    consequence if we get a fatal signal and client is not
    fully initialised, we'll loop to "again" label

    This has been proven to cause soft lockups on some scenarios
    (no-carrier but configured network interfaces)

Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/client.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 751ca65da8a3..846d45cb1a3c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 	struct nfs_client *clp;
 	const struct sockaddr *sap = data->addr;
 	struct nfs_net *nn = net_generic(data->net, nfs_net_id);
+	int error;
 
 again:
 	list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
@@ -302,8 +303,10 @@ again:
 		if (clp->cl_cons_state > NFS_CS_READY) {
 			refcount_inc(&clp->cl_count);
 			spin_unlock(&nn->nfs_client_lock);
-			nfs_wait_client_init_complete(clp);
+			error = nfs_wait_client_init_complete(clp);
 			nfs_put_client(clp);
+			if (error < 0)
+				return ERR_PTR(error);
 			spin_lock(&nn->nfs_client_lock);
 			goto again;
 		}
@@ -413,6 +416,8 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
 		clp = nfs_match_client(cl_init);
 		if (clp) {
 			spin_unlock(&nn->nfs_client_lock);
+			if (IS_ERR(clp))
+				return clp;
 			if (new)
 				new->rpc_ops->free_client(new);
 			return nfs_found_client(cl_init, clp);
-- 
cgit v1.2.3


From c4b51dbcccfcd305b5bb3e108c908274ad1e442c Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruenba@redhat.com>
Date: Thu, 4 Apr 2019 21:11:11 +0100
Subject: gfs2: Fix occasional glock use-after-free

[ Upstream commit 9287c6452d2b1f24ea8e84bd3cf6f3c6f267f712 ]

This patch has to do with the life cycle of glocks and buffers.  When
gfs2 metadata or journaled data is queued to be written, a gfs2_bufdata
object is assigned to track the buffer, and that is queued to various
lists, including the glock's gl_ail_list to indicate it's on the active
items list.  Once the page associated with the buffer has been written,
it is removed from the ail list, but its life isn't over until a revoke
has been successfully written.

So after the block is written, its bufdata object is moved from the
glock's gl_ail_list to a file-system-wide list of pending revokes,
sd_log_le_revoke.  At that point the glock still needs to track how many
revokes it contributed to that list (in gl_revokes) so that things like
glock go_sync can ensure all the metadata has been not only written, but
also revoked before the glock is granted to a different node.  This is
to guarantee journal replay doesn't replay the block once the glock has
been granted to another node.

Ross Lagerwall recently discovered a race in which an inode could be
evicted, and its glock freed after its ail list had been synced, but
while it still had unwritten revokes on the sd_log_le_revoke list.  The
evict decremented the glock reference count to zero, which allowed the
glock to be freed.  After the revoke was written, function
revoke_lo_after_commit tried to adjust the glock's gl_revokes counter
and clear its GLF_LFLUSH flag, at which time it referenced the freed
glock.

This patch fixes the problem by incrementing the glock reference count
in gfs2_add_revoke when the glock's first bufdata object is moved from
the glock to the global revokes list. Later, when the glock's last such
bufdata object is freed, the reference count is decremented. This
guarantees that whichever process finishes last (the revoke writing or
the evict) will properly free the glock, and neither will reference the
glock after it has been freed.

Reported-by: Ross Lagerwall <ross.lagerwall@citrix.com>
Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/gfs2/glock.c | 1 +
 fs/gfs2/log.c   | 3 ++-
 fs/gfs2/lops.c  | 6 ++++--
 3 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 775256141e9f..ccdd8c821abd 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -140,6 +140,7 @@ void gfs2_glock_free(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 
+	BUG_ON(atomic_read(&gl->gl_revokes));
 	rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
 	smp_mb();
 	wake_up_glock(gl);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ee20ea42e7b5..cd85092723de 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -604,7 +604,8 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	bd->bd_bh = NULL;
 	bd->bd_ops = &gfs2_revoke_lops;
 	sdp->sd_log_num_revoke++;
-	atomic_inc(&gl->gl_revokes);
+	if (atomic_inc_return(&gl->gl_revokes) == 1)
+		gfs2_glock_hold(gl);
 	set_bit(GLF_LFLUSH, &gl->gl_flags);
 	list_add(&bd->bd_list, &sdp->sd_log_le_revoke);
 }
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index f2567f958d00..8f99b395d7bf 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -662,8 +662,10 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 		bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
 		list_del_init(&bd->bd_list);
 		gl = bd->bd_gl;
-		atomic_dec(&gl->gl_revokes);
-		clear_bit(GLF_LFLUSH, &gl->gl_flags);
+		if (atomic_dec_return(&gl->gl_revokes) == 0) {
+			clear_bit(GLF_LFLUSH, &gl->gl_flags);
+			gfs2_glock_queue_put(gl);
+		}
 		kmem_cache_free(gfs2_bufdata_cachep, bd);
 	}
 }
-- 
cgit v1.2.3


From 1084fc9afbe382150899d3c58afe52861a08f484 Mon Sep 17 00:00:00 2001
From: Robbie Ko <robbieko@synology.com>
Date: Tue, 26 Mar 2019 11:56:11 +0800
Subject: Btrfs: fix data bytes_may_use underflow with fallocate due to failed
 quota reserve

[ Upstream commit 39ad317315887c2cb9a4347a93a8859326ddf136 ]

When doing fallocate, we first add the range to the reserve_list and
then reserve the quota.  If quota reservation fails, we'll release all
reserved parts of reserve_list.

However, cur_offset is not updated to indicate that this range is
already been inserted into the list.  Therefore, the same range is freed
twice.  Once at list_for_each_entry loop, and once at the end of the
function.  This will result in WARN_ON on bytes_may_use when we free the
remaining space.

At the end, under the 'out' label we have a call to:

   btrfs_free_reserved_data_space(inode, data_reserved, alloc_start, alloc_end - cur_offset);

The start offset, third argument, should be cur_offset.

Everything from alloc_start to cur_offset was freed by the
list_for_each_entry_safe_loop.

Fixes: 18513091af94 ("btrfs: update btrfs_space_info's bytes_may_use timely")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index de4d3baec161..e24c0a69ff5d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3161,6 +3161,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 			ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
 					cur_offset, last_byte - cur_offset);
 			if (ret < 0) {
+				cur_offset = last_byte;
 				free_extent_map(em);
 				break;
 			}
@@ -3210,7 +3211,7 @@ out:
 	/* Let go of our reservation. */
 	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
 		btrfs_free_reserved_data_space(inode, data_reserved,
-				alloc_start, alloc_end - cur_offset);
+				cur_offset, alloc_end - cur_offset);
 	extent_changeset_free(data_reserved);
 	return ret;
 }
-- 
cgit v1.2.3


From 431cbaec1287772d66baea828eec23546e5c40dc Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Mon, 25 Feb 2019 11:14:45 -0500
Subject: btrfs: fix panic during relocation after ENOSPC before writeback
 happens

[ Upstream commit ff612ba7849964b1898fd3ccd1f56941129c6aab ]

We've been seeing the following sporadically throughout our fleet

panic: kernel BUG at fs/btrfs/relocation.c:4584!
netversion: 5.0-0
Backtrace:
 #0 [ffffc90003adb880] machine_kexec at ffffffff81041da8
 #1 [ffffc90003adb8c8] __crash_kexec at ffffffff8110396c
 #2 [ffffc90003adb988] crash_kexec at ffffffff811048ad
 #3 [ffffc90003adb9a0] oops_end at ffffffff8101c19a
 #4 [ffffc90003adb9c0] do_trap at ffffffff81019114
 #5 [ffffc90003adba00] do_error_trap at ffffffff810195d0
 #6 [ffffc90003adbab0] invalid_op at ffffffff81a00a9b
    [exception RIP: btrfs_reloc_cow_block+692]
    RIP: ffffffff8143b614  RSP: ffffc90003adbb68  RFLAGS: 00010246
    RAX: fffffffffffffff7  RBX: ffff8806b9c32000  RCX: ffff8806aad00690
    RDX: ffff880850b295e0  RSI: ffff8806b9c32000  RDI: ffff88084f205bd0
    RBP: ffff880849415000   R8: ffffc90003adbbe0   R9: ffff88085ac90000
    R10: ffff8805f7369140  R11: 0000000000000000  R12: ffff880850b295e0
    R13: ffff88084f205bd0  R14: 0000000000000000  R15: 0000000000000000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #7 [ffffc90003adbbb0] __btrfs_cow_block at ffffffff813bf1cd
 #8 [ffffc90003adbc28] btrfs_cow_block at ffffffff813bf4b3
 #9 [ffffc90003adbc78] btrfs_search_slot at ffffffff813c2e6c

The way relocation moves data extents is by creating a reloc inode and
preallocating extents in this inode and then copying the data into these
preallocated extents.  Once we've done this for all of our extents,
we'll write out these dirty pages, which marks the extent written, and
goes into btrfs_reloc_cow_block().  From here we get our current
reloc_control, which _should_ match the reloc_control for the current
block group we're relocating.

However if we get an ENOSPC in this path at some point we'll bail out,
never initiating writeback on this inode.  Not a huge deal, unless we
happen to be doing relocation on a different block group, and this block
group is now rc->stage == UPDATE_DATA_PTRS.  This trips the BUG_ON() in
btrfs_reloc_cow_block(), because we expect to be done modifying the data
inode.  We are in fact done modifying the metadata for the data inode
we're currently using, but not the one from the failed block group, and
thus we BUG_ON().

(This happens when writeback finishes for extents from the previous
group, when we are at btrfs_finish_ordered_io() which updates the data
reloc tree (inode item, drops/adds extent items, etc).)

Fix this by writing out the reloc data inode always, and then breaking
out of the loop after that point to keep from tripping this BUG_ON()
later.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
[ add note from Filipe ]
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/relocation.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0526b6c473c7..5d57ed629345 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4289,27 +4289,36 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
 		mutex_lock(&fs_info->cleaner_mutex);
 		ret = relocate_block_group(rc);
 		mutex_unlock(&fs_info->cleaner_mutex);
-		if (ret < 0) {
+		if (ret < 0)
 			err = ret;
-			goto out;
-		}
-
-		if (rc->extents_found == 0)
-			break;
-
-		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
 
+		/*
+		 * We may have gotten ENOSPC after we already dirtied some
+		 * extents.  If writeout happens while we're relocating a
+		 * different block group we could end up hitting the
+		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
+		 * btrfs_reloc_cow_block.  Make sure we write everything out
+		 * properly so we don't trip over this problem, and then break
+		 * out of the loop if we hit an error.
+		 */
 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
 						       (u64)-1);
-			if (ret) {
+			if (ret)
 				err = ret;
-				goto out;
-			}
 			invalidate_mapping_pages(rc->data_inode->i_mapping,
 						 0, -1);
 			rc->stage = UPDATE_DATA_PTRS;
 		}
+
+		if (err < 0)
+			goto out;
+
+		if (rc->extents_found == 0)
+			break;
+
+		btrfs_info(fs_info, "found %llu extents", rc->extents_found);
+
 	}
 
 	WARN_ON(rc->block_group->pinned > 0);
-- 
cgit v1.2.3


From bd3d8f4cb956980c5fa08b043919a5e056cf8a41 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Tue, 26 Feb 2019 16:33:56 +0800
Subject: btrfs: Don't panic when we can't find a root key

[ Upstream commit 7ac1e464c4d473b517bb784f30d40da1f842482e ]

When we failed to find a root key in btrfs_update_root(), we just panic.

That's definitely not cool, fix it by outputting an unique error
message, aborting current transaction and return -EUCLEAN. This should
not normally happen as the root has been used by the callers in some
way.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/btrfs/root-tree.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index fb205ebeca39..3228d3b3084a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -135,11 +135,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret < 0)
 		goto out;
 
-	if (ret != 0) {
-		btrfs_print_leaf(path->nodes[0]);
-		btrfs_crit(fs_info, "unable to update root key %llu %u %llu",
-			   key->objectid, key->type, key->offset);
-		BUG_ON(1);
+	if (ret > 0) {
+		btrfs_crit(fs_info,
+			"unable to find root key (%llu %u %llu) in tree %llu",
+			key->objectid, key->type, key->offset,
+			root->root_key.objectid);
+		ret = -EUCLEAN;
+		btrfs_abort_transaction(trans, ret);
+		goto out;
 	}
 
 	l = path->nodes[0];
-- 
cgit v1.2.3


From 65ec64f28a882096d9976963e2ccefd85085fb1a Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Fri, 15 Feb 2019 20:27:11 +0800
Subject: chardev: add additional check for minor range overlap

[ Upstream commit de36e16d1557a0b6eb328bc3516359a12ba5c25c ]

Current overlap checking cannot correctly handle
a case which is baseminor < existing baseminor &&
baseminor + minorct > existing baseminor + minorct.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/char_dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index a279c58fe360..8a63cfa29005 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -159,6 +159,12 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 			ret = -EBUSY;
 			goto out;
 		}
+
+		if (new_min < old_min && new_max > old_max) {
+			ret = -EBUSY;
+			goto out;
+		}
+
 	}
 
 	cd->next = *cp;
-- 
cgit v1.2.3


From 26433652f0e429b7a641eb32927b683819ca5eb0 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Thu, 9 May 2019 07:25:21 -0400
Subject: NFS: Fix a double unlock from nfs_match,get_client

[ Upstream commit c260121a97a3e4df6536edbc2f26e166eff370ce ]

Now that nfs_match_client drops the nfs_client_lock, we should be
careful
to always return it in the same condition: locked.

Fixes: 950a578c6128 ("NFS: make nfs_match_client killable")
Reported-by: syzbot+228a82b263b5da91883d@syzkaller.appspotmail.com
Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 fs/nfs/client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 846d45cb1a3c..c092661147b3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -305,9 +305,9 @@ again:
 			spin_unlock(&nn->nfs_client_lock);
 			error = nfs_wait_client_init_complete(clp);
 			nfs_put_client(clp);
+			spin_lock(&nn->nfs_client_lock);
 			if (error < 0)
 				return ERR_PTR(error);
-			spin_lock(&nn->nfs_client_lock);
 			goto again;
 		}
 
-- 
cgit v1.2.3


From 7301bbeae98f265dbd3ba5571b27bdd39fbf4797 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 15 May 2019 16:02:47 +0100
Subject: Btrfs: fix wrong ctime and mtime of a directory after log replay

commit 5338e43abbab13791144d37fd8846847062351c6 upstream.

When replaying a log that contains a new file or directory name that needs
to be added to its parent directory, we end up updating the mtime and the
ctime of the parent directory to the current time after we have set their
values to the correct ones (set at fsync time), efectivelly losing them.

Sample reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ mkdir /mnt/dir
  $ touch /mnt/dir/file

  # fsync of the directory is optional, not needed
  $ xfs_io -c fsync /mnt/dir
  $ xfs_io -c fsync /mnt/dir/file

  $ stat -c %Y /mnt/dir
  1557856079

  <power failure>

  $ sleep 3
  $ mount /dev/sdb /mnt
  $ stat -c %Y /mnt/dir
  1557856082

    --> should have been 1557856079, the mtime is updated to the current
        time when replaying the log

Fix this by not updating the mtime and ctime to the current time at
btrfs_add_link() when we are replaying a log tree.

This could be triggered by my recent fsync fuzz tester for fstests, for
which an fstests patch exists titled "fstests: generic, fsync fuzz tester
with fsstress".

Fixes: e02119d5a7b43 ("Btrfs: Add a write ahead tree log to optimize synchronous operations")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/inode.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59f361f7d0c1..c1cd3fe2b295 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6426,8 +6426,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
 			   name_len * 2);
 	inode_inc_iversion(&parent_inode->vfs_inode);
-	parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
-		current_time(&parent_inode->vfs_inode);
+	/*
+	 * If we are replaying a log tree, we do not want to update the mtime
+	 * and ctime of the parent directory with the current time, since the
+	 * log replay procedure is responsible for setting them to their correct
+	 * values (the ones it had when the fsync was done).
+	 */
+	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+		struct timespec64 now = current_time(&parent_inode->vfs_inode);
+
+		parent_inode->vfs_inode.i_mtime = now;
+		parent_inode->vfs_inode.i_ctime = now;
+	}
 	ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
 	if (ret)
 		btrfs_abort_transaction(trans, ret);
-- 
cgit v1.2.3


From 37fe038328a20980f45ba7a37c9fa64708a374e7 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 15 May 2019 16:03:17 +0100
Subject: Btrfs: fix race updating log root item during fsync

commit 06989c799f04810f6876900d4760c0edda369cf7 upstream.

When syncing the log, the final phase of a fsync operation, we need to
either create a log root's item or update the existing item in the log
tree of log roots, and that depends on the current value of the log
root's log_transid - if it's 1 we need to create the log root item,
otherwise it must exist already and we update it. Since there is no
synchronization between updating the log_transid and checking it for
deciding whether the log root's item needs to be created or updated, we
end up with a tiny race window that results in attempts to update the
item to fail because the item was not yet created:

              CPU 1                                    CPU 2

  btrfs_sync_log()

    lock root->log_mutex

    set log root's log_transid to 1

    unlock root->log_mutex

                                               btrfs_sync_log()

                                                 lock root->log_mutex

                                                 sets log root's
                                                 log_transid to 2

                                                 unlock root->log_mutex

    update_log_root()

      sees log root's log_transid
      with a value of 2

        calls btrfs_update_root(),
        which fails with -EUCLEAN
        and causes transaction abort

Until recently the race lead to a BUG_ON at btrfs_update_root(), but after
the recent commit 7ac1e464c4d47 ("btrfs: Don't panic when we can't find a
root key") we just abort the current transaction.

A sample trace of the BUG_ON() on a SLE12 kernel:

  ------------[ cut here ]------------
  kernel BUG at ../fs/btrfs/root-tree.c:157!
  Oops: Exception in kernel mode, sig: 5 [#1]
  SMP NR_CPUS=2048 NUMA pSeries
  (...)
  Supported: Yes, External
  CPU: 78 PID: 76303 Comm: rtas_errd Tainted: G                 X 4.4.156-94.57-default #1
  task: c00000ffa906d010 ti: c00000ff42b08000 task.ti: c00000ff42b08000
  NIP: d000000036ae5cdc LR: d000000036ae5cd8 CTR: 0000000000000000
  REGS: c00000ff42b0b860 TRAP: 0700   Tainted: G                 X  (4.4.156-94.57-default)
  MSR: 8000000002029033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 22444484  XER: 20000000
  CFAR: d000000036aba66c SOFTE: 1
  GPR00: d000000036ae5cd8 c00000ff42b0bae0 d000000036bda220 0000000000000054
  GPR04: 0000000000000001 0000000000000000 c00007ffff8d37c8 0000000000000000
  GPR08: c000000000e19c00 0000000000000000 0000000000000000 3736343438312079
  GPR12: 3930373337303434 c000000007a3a800 00000000007fffff 0000000000000023
  GPR16: c00000ffa9d26028 c00000ffa9d261f8 0000000000000010 c00000ffa9d2ab28
  GPR20: c00000ff42b0bc48 0000000000000001 c00000ff9f0d9888 0000000000000001
  GPR24: c00000ffa9d26000 c00000ffa9d261e8 c00000ffa9d2a800 c00000ff9f0d9888
  GPR28: c00000ffa9d26028 c00000ffa9d2aa98 0000000000000001 c00000ffa98f5b20
  NIP [d000000036ae5cdc] btrfs_update_root+0x25c/0x4e0 [btrfs]
  LR [d000000036ae5cd8] btrfs_update_root+0x258/0x4e0 [btrfs]
  Call Trace:
  [c00000ff42b0bae0] [d000000036ae5cd8] btrfs_update_root+0x258/0x4e0 [btrfs] (unreliable)
  [c00000ff42b0bba0] [d000000036b53610] btrfs_sync_log+0x2d0/0xc60 [btrfs]
  [c00000ff42b0bce0] [d000000036b1785c] btrfs_sync_file+0x44c/0x4e0 [btrfs]
  [c00000ff42b0bd80] [c00000000032e300] vfs_fsync_range+0x70/0x120
  [c00000ff42b0bdd0] [c00000000032e44c] do_fsync+0x5c/0xb0
  [c00000ff42b0be10] [c00000000032e8dc] SyS_fdatasync+0x2c/0x40
  [c00000ff42b0be30] [c000000000009488] system_call+0x3c/0x100
  Instruction dump:
  7f43d378 4bffebb9 60000000 88d90008 3d220000 e8b90000 3b390009 e87a01f0
  e8898e08 e8f90000 4bfd48e5 60000000 <0fe00000> e95b0060 39200004 394a0ea0
  ---[ end trace 8f2dc8f919cabab8 ]---

So fix this by doing the check of log_transid and updating or creating the
log root's item while holding the root's log_mutex.

Fixes: 7237f1833601d ("Btrfs: fix tree logs parallel sync")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 75051d36dc1a..f79855cee88b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3037,6 +3037,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
+	/*
+	 * Update or create log root item under the root's log_mutex to prevent
+	 * races with concurrent log syncs that can lead to failure to update
+	 * log root item because it was not created yet.
+	 */
+	ret = update_log_root(trans, log);
 	/*
 	 * IO has been started, blocks of the log tree have WRITTEN flag set
 	 * in their headers. new modifications of the log will be written to
@@ -3056,8 +3062,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	mutex_unlock(&log_root_tree->log_mutex);
 
-	ret = update_log_root(trans, log);
-
 	mutex_lock(&log_root_tree->log_mutex);
 	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
 		/* atomic_dec_and_test implies a barrier */
-- 
cgit v1.2.3


From a81071110d25a83e749b2f58f0e337ac6782c12c Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 16 May 2019 15:48:55 +0100
Subject: Btrfs: fix fsync not persisting changed attributes of a directory

commit 60d9f50308e5df19bc18c2fefab0eba4a843900a upstream.

While logging an inode we follow its ancestors and for each one we mark
it as logged in the current transaction, even if we have not logged it.
As a consequence if we change an attribute of an ancestor, such as the
UID or GID for example, and then explicitly fsync it, we end up not
logging the inode at all despite returning success to user space, which
results in the attribute being lost if a power failure happens after
the fsync.

Sample reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt

  $ mkdir /mnt/dir
  $ chown 6007:6007 /mnt/dir

  $ sync

  $ chown 9003:9003 /mnt/dir
  $ touch /mnt/dir/file
  $ xfs_io -c fsync /mnt/dir/file

  # fsync our directory after fsync'ing the new file, should persist the
  # new values for the uid and gid.
  $ xfs_io -c fsync /mnt/dir

  <power failure>

  $ mount /dev/sdb /mnt
  $ stat -c %u:%g /mnt/dir
  6007:6007

    --> should be 9003:9003, the uid and gid were not persisted, despite
        the explicit fsync on the directory prior to the power failure

Fix this by not updating the logged_trans field of ancestor inodes when
logging an inode, since we have not logged them. Let only future calls to
btrfs_log_inode() to mark inodes as logged.

This could be triggered by my recent fsync fuzz tester for fstests, for
which an fstests patch exists titled "fstests: generic, fsync fuzz tester
with fsstress".

Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes")
CC: stable@vger.kernel.org # 4.4+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/tree-log.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f79855cee88b..0d5840d20efc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5312,7 +5312,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	struct dentry *old_parent = NULL;
-	struct btrfs_inode *orig_inode = inode;
 
 	/*
 	 * for regular files, if its inode is already on disk, we don't
@@ -5332,16 +5331,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 	}
 
 	while (1) {
-		/*
-		 * If we are logging a directory then we start with our inode,
-		 * not our parent's inode, so we need to skip setting the
-		 * logged_trans so that further down in the log code we don't
-		 * think this inode has already been logged.
-		 */
-		if (inode != orig_inode)
-			inode->logged_trans = trans->transid;
-		smp_mb();
-
 		if (btrfs_must_commit_transaction(trans, inode)) {
 			ret = 1;
 			break;
@@ -6070,7 +6059,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 	 * if this directory was already logged any new
 	 * names for this file/dir will get recorded
 	 */
-	smp_mb();
 	if (dir->logged_trans == trans->transid)
 		return;
 
-- 
cgit v1.2.3


From 8a652fd142c38d03f6e83a054cff40f2e6878beb Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 20 May 2019 09:55:42 +0100
Subject: Btrfs: incremental send, fix file corruption when no-holes feature is
 enabled

commit 6b1f72e5b82a5c2a4da4d1ebb8cc01913ddbea21 upstream.

When using the no-holes feature, if we have a file with prealloc extents
with a start offset beyond the file's eof, doing an incremental send can
cause corruption of the file due to incorrect hole detection. Such case
requires that the prealloc extent(s) exist in both the parent and send
snapshots, and that a hole is punched into the file that covers all its
extents that do not cross the eof boundary.

Example reproducer:

  $ mkfs.btrfs -f -O no-holes /dev/sdb
  $ mount /dev/sdb /mnt/sdb

  $ xfs_io -f -c "pwrite -S 0xab 0 500K" /mnt/sdb/foobar
  $ xfs_io -c "falloc -k 1200K 800K" /mnt/sdb/foobar

  $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/base

  $ btrfs send -f /tmp/base.snap /mnt/sdb/base

  $ xfs_io -c "fpunch 0 500K" /mnt/sdb/foobar

  $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/incr

  $ btrfs send -p /mnt/sdb/base -f /tmp/incr.snap /mnt/sdb/incr

  $ md5sum /mnt/sdb/incr/foobar
  816df6f64deba63b029ca19d880ee10a   /mnt/sdb/incr/foobar

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt/sdc

  $ btrfs receive -f /tmp/base.snap /mnt/sdc
  $ btrfs receive -f /tmp/incr.snap /mnt/sdc

  $ md5sum /mnt/sdc/incr/foobar
  cf2ef71f4a9e90c2f6013ba3b2257ed2   /mnt/sdc/incr/foobar

    --> Different checksum, because the prealloc extent beyond the
        file's eof confused the hole detection code and it assumed
        a hole starting at offset 0 and ending at the offset of the
        prealloc extent (1200Kb) instead of ending at the offset
        500Kb (the file's size).

Fix this by ensuring we never cross the file's size when issuing the
write operations for a hole.

Fixes: 16e7549f045d33 ("Btrfs: incompatible format change to remove hole extents")
CC: stable@vger.kernel.org # 3.14+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/btrfs/send.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 635e419f2a2d..258392b75048 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5021,6 +5021,12 @@ static int send_hole(struct send_ctx *sctx, u64 end)
 	if (offset >= sctx->cur_inode_size)
 		return 0;
 
+	/*
+	 * Don't go beyond the inode's i_size due to prealloc extents that start
+	 * after the i_size.
+	 */
+	end = min_t(u64, end, sctx->cur_inode_size);
+
 	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
 		return send_update_extent(sctx, offset, end - offset);
 
-- 
cgit v1.2.3


From 32d57c0c063cdab6639da074e8b0f28517862e41 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 17 May 2019 09:12:33 +0100
Subject: cifs: fix memory leak of pneg_inbuf on -EOPNOTSUPP ioctl case

commit 210782038b54ec8e9059a3c12d6f6ae173efa3a9 upstream.

Currently in the case where SMB2_ioctl returns the -EOPNOTSUPP error
there is a memory leak of pneg_inbuf. Fix this by returning via
the out_free_inbuf exit path that will perform the relevant kfree.

Addresses-Coverity: ("Resource leak")
Fixes: 969ae8e8d4ee ("cifs: Accept validate negotiate if server return NT_STATUS_NOT_SUPPORTED")
CC: Stable <stable@vger.kernel.org> # v5.1+
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/smb2pdu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 33afb637e6f8..c181f1621e1a 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -887,7 +887,8 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 		 * not supported error. Client should accept it.
 		 */
 		cifs_dbg(VFS, "Server does not support validate negotiate\n");
-		return 0;
+		rc = 0;
+		goto out_free_inbuf;
 	} else if (rc != 0) {
 		cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
 		rc = -EIO;
-- 
cgit v1.2.3


From 297a251062c0d616e5d6644d4a041532c77d601f Mon Sep 17 00:00:00 2001
From: Roberto Bergantinos Corpas <rbergant@redhat.com>
Date: Tue, 28 May 2019 09:38:14 +0200
Subject: CIFS: cifs_read_allocate_pages: don't iterate through whole page
 array on ENOMEM

commit 31fad7d41e73731f05b8053d17078638cf850fa6 upstream.

 In cifs_read_allocate_pages, in case of ENOMEM, we go through
whole rdata->pages array but we have failed the allocation before
nr_pages, therefore we may end up calling put_page with NULL
pointer, causing oops

Signed-off-by: Roberto Bergantinos Corpas <rbergant@redhat.com>
Acked-by: Pavel Shilovsky <pshilov@microsoft.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
CC: Stable <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/cifs/file.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d6b45682833b..23cee91ed442 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2988,7 +2988,9 @@ cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages)
 	}
 
 	if (rc) {
-		for (i = 0; i < nr_pages; i++) {
+		unsigned int nr_page_failed = i;
+
+		for (i = 0; i < nr_page_failed; i++) {
 			put_page(rdata->pages[i]);
 			rdata->pages[i] = NULL;
 		}
-- 
cgit v1.2.3


From 873041930dab5e03e9ef6f6adbee9f0a3e9dc034 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Mon, 20 May 2019 10:33:07 -0400
Subject: Revert "lockd: Show pid of lockd for remote locks"

commit 141731d15d6eb2fd9aaefbf9b935ce86ae243074 upstream.

This reverts most of commit b8eee0e90f97 ("lockd: Show pid of lockd for
remote locks"), which caused remote locks to not be differentiated between
remote processes for NLM.

We retain the fixup for setting the client's fl_pid to a negative value.

Fixes: b8eee0e90f97 ("lockd: Show pid of lockd for remote locks")
Cc: stable@vger.kernel.org

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Reviewed-by: XueWei Zhang <xueweiz@google.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/lockd/xdr.c  | 4 ++--
 fs/lockd/xdr4.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9846f7e95282..7147e4aebecc 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -127,7 +127,7 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 
 	locks_init_lock(fl);
 	fl->fl_owner = current->files;
-	fl->fl_pid   = current->tgid;
+	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	start = ntohl(*p++);
@@ -269,7 +269,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
 	lock->svid = ~(u32) 0;
-	lock->fl.fl_pid = current->tgid;
+	lock->fl.fl_pid = (pid_t)lock->svid;
 
 	if (!(p = nlm_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 70154f376695..7ed9edf9aed4 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -119,7 +119,7 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 
 	locks_init_lock(fl);
 	fl->fl_owner = current->files;
-	fl->fl_pid   = current->tgid;
+	fl->fl_pid   = (pid_t)lock->svid;
 	fl->fl_flags = FL_POSIX;
 	fl->fl_type  = F_RDLCK;		/* as good as anything else */
 	p = xdr_decode_hyper(p, &start);
@@ -266,7 +266,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
 	memset(lock, 0, sizeof(*lock));
 	locks_init_lock(&lock->fl);
 	lock->svid = ~(u32) 0;
-	lock->fl.fl_pid = current->tgid;
+	lock->fl.fl_pid = (pid_t)lock->svid;
 
 	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
 	 || !(p = xdr_decode_string_inplace(p, &lock->caller,
-- 
cgit v1.2.3


From ea0327b47754467e0190212dac1ec578262b8e70 Mon Sep 17 00:00:00 2001
From: Yihao Wu <wuyihao@linux.alibaba.com>
Date: Wed, 22 May 2019 01:57:10 +0800
Subject: NFSv4.1: Again fix a race where CB_NOTIFY_LOCK fails to wake a waiter

commit 52b042ab9948cc367b61f9ca9c18603aa7813c3a upstream.

Commit b7dbcc0e433f "NFSv4.1: Fix a race where CB_NOTIFY_LOCK fails to wake a waiter"
found this bug. However it didn't fix it.

This commit replaces schedule_timeout() with wait_woken() and
default_wake_function() with woken_wake_function() in function
nfs4_retry_setlk() and nfs4_wake_lock_waiter(). wait_woken() uses
memory barriers in its implementation to avoid potential race condition
when putting a process into sleeping state and then waking it up.

Fixes: a1d617d8f134 ("nfs: allow blocking locks to be awoken by lock callbacks")
Cc: stable@vger.kernel.org #4.9+
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 580e37bc3fe2..06d9548a14d0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6850,7 +6850,6 @@ struct nfs4_lock_waiter {
 	struct task_struct	*task;
 	struct inode		*inode;
 	struct nfs_lowner	*owner;
-	bool			notified;
 };
 
 static int
@@ -6872,13 +6871,13 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo
 		/* Make sure it's for the right inode */
 		if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
 			return 0;
-
-		waiter->notified = true;
 	}
 
 	/* override "private" so we can use default_wake_function */
 	wait->private = waiter->task;
-	ret = autoremove_wake_function(wait, mode, flags, key);
+	ret = woken_wake_function(wait, mode, flags, key);
+	if (ret)
+		list_del_init(&wait->entry);
 	wait->private = waiter;
 	return ret;
 }
@@ -6887,7 +6886,6 @@ static int
 nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
 	int status = -ERESTARTSYS;
-	unsigned long flags;
 	struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
 	struct nfs_server *server = NFS_SERVER(state->inode);
 	struct nfs_client *clp = server->nfs_client;
@@ -6897,8 +6895,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 				    .s_dev = server->s_dev };
 	struct nfs4_lock_waiter waiter = { .task  = current,
 					   .inode = state->inode,
-					   .owner = &owner,
-					   .notified = false };
+					   .owner = &owner};
 	wait_queue_entry_t wait;
 
 	/* Don't bother with waitqueue if we don't expect a callback */
@@ -6911,21 +6908,14 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 	add_wait_queue(q, &wait);
 
 	while(!signalled()) {
-		waiter.notified = false;
 		status = nfs4_proc_setlk(state, cmd, request);
 		if ((status != -EAGAIN) || IS_SETLK(cmd))
 			break;
 
 		status = -ERESTARTSYS;
-		spin_lock_irqsave(&q->lock, flags);
-		if (waiter.notified) {
-			spin_unlock_irqrestore(&q->lock, flags);
-			continue;
-		}
-		set_current_state(TASK_INTERRUPTIBLE);
-		spin_unlock_irqrestore(&q->lock, flags);
-
-		freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT);
+		freezer_do_not_count();
+		wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
+		freezer_count();
 	}
 
 	finish_wait(q, &wait);
-- 
cgit v1.2.3


From 56e3f73e838ae6315a17b70c99d1fe8bab3aca2d Mon Sep 17 00:00:00 2001
From: Yihao Wu <wuyihao@linux.alibaba.com>
Date: Mon, 13 May 2019 14:58:22 +0800
Subject: NFSv4.1: Fix bug only first CB_NOTIFY_LOCK is handled

commit ba851a39c9703f09684a541885ed176f8fb7c868 upstream.

When a waiter is waked by CB_NOTIFY_LOCK, it will retry
nfs4_proc_setlk(). The waiter may fail to nfs4_proc_setlk() and sleep
again. However, the waiter is already removed from clp->cl_lock_waitq
when handling CB_NOTIFY_LOCK in nfs4_wake_lock_waiter(). So any
subsequent CB_NOTIFY_LOCK won't wake this waiter anymore. We should
put the waiter back to clp->cl_lock_waitq before retrying.

Cc: stable@vger.kernel.org #4.9+
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/nfs/nfs4proc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 06d9548a14d0..53cf8599a46e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6905,20 +6905,22 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 	init_wait(&wait);
 	wait.private = &waiter;
 	wait.func = nfs4_wake_lock_waiter;
-	add_wait_queue(q, &wait);
 
 	while(!signalled()) {
+		add_wait_queue(q, &wait);
 		status = nfs4_proc_setlk(state, cmd, request);
-		if ((status != -EAGAIN) || IS_SETLK(cmd))
+		if ((status != -EAGAIN) || IS_SETLK(cmd)) {
+			finish_wait(q, &wait);
 			break;
+		}
 
 		status = -ERESTARTSYS;
 		freezer_do_not_count();
 		wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT);
 		freezer_count();
+		finish_wait(q, &wait);
 	}
 
-	finish_wait(q, &wait);
 	return status;
 }
 #else /* !CONFIG_NFS_V4_1 */
-- 
cgit v1.2.3


From a3b8b4ad6db7eea67a67eb88cc5f6e101f89dbc1 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 27 May 2019 11:42:07 +0200
Subject: fuse: fallocate: fix return with locked inode

commit 35d6fcbb7c3e296a52136347346a698a35af3fda upstream.

Do the proper cleanup in case the size check fails.

Tested with xfstests:generic/228

Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Fixes: 0cbade024ba5 ("fuse: honor RLIMIT_FSIZE in fuse_file_fallocate")
Cc: Liu Bo <bo.liu@linux.alibaba.com>
Cc: <stable@vger.kernel.org> # v3.5
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 59e8bb72dc14..9a22aa580fe7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2981,7 +2981,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	    offset + length > i_size_read(inode)) {
 		err = inode_newsize_ok(inode, offset + length);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	if (!(mode & FALLOC_FL_KEEP_SIZE))
-- 
cgit v1.2.3


From c63ce7166dafd82c679873107ec649ff9f768d1e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 1 Nov 2018 14:08:07 -0700
Subject: pstore: Remove needless lock during console writes

commit b77fa617a2ff4d6beccad3d3d4b3a1f2d10368aa upstream.

Since the console writer does not use the preallocated crash dump buffer
any more, there is no reason to perform locking around it.

Fixes: 70ad35db3321 ("pstore: Convert console write to use ->write_buf")
Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/platform.c | 29 ++++++-----------------------
 1 file changed, 6 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b821054ca3ed..805283b5385d 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -476,31 +476,14 @@ static void pstore_unregister_kmsg(void)
 #ifdef CONFIG_PSTORE_CONSOLE
 static void pstore_console_write(struct console *con, const char *s, unsigned c)
 {
-	const char *e = s + c;
+	struct pstore_record record;
 
-	while (s < e) {
-		struct pstore_record record;
-		unsigned long flags;
-
-		pstore_record_init(&record, psinfo);
-		record.type = PSTORE_TYPE_CONSOLE;
-
-		if (c > psinfo->bufsize)
-			c = psinfo->bufsize;
+	pstore_record_init(&record, psinfo);
+	record.type = PSTORE_TYPE_CONSOLE;
 
-		if (oops_in_progress) {
-			if (!spin_trylock_irqsave(&psinfo->buf_lock, flags))
-				break;
-		} else {
-			spin_lock_irqsave(&psinfo->buf_lock, flags);
-		}
-		record.buf = (char *)s;
-		record.size = c;
-		psinfo->write(&record);
-		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
-		s += c;
-		c = e - s;
-	}
+	record.buf = (char *)s;
+	record.size = c;
+	psinfo->write(&record);
 }
 
 static struct console pstore_console = {
-- 
cgit v1.2.3


From d4128a1b580ca949e829fd919c2579dcaa9138d4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 30 Nov 2018 14:36:58 -0800
Subject: pstore: Convert buf_lock to semaphore

commit ea84b580b95521644429cc6748b6c2bf27c8b0f3 upstream.

Instead of running with interrupts disabled, use a semaphore. This should
make it easier for backends that may need to sleep (e.g. EFI) when
performing a write:

|BUG: sleeping function called from invalid context at kernel/sched/completion.c:99
|in_atomic(): 1, irqs_disabled(): 1, pid: 2236, name: sig-xstate-bum
|Preemption disabled at:
|[<ffffffff99d60512>] pstore_dump+0x72/0x330
|CPU: 26 PID: 2236 Comm: sig-xstate-bum Tainted: G      D           4.20.0-rc3 #45
|Call Trace:
| dump_stack+0x4f/0x6a
| ___might_sleep.cold.91+0xd3/0xe4
| __might_sleep+0x50/0x90
| wait_for_completion+0x32/0x130
| virt_efi_query_variable_info+0x14e/0x160
| efi_query_variable_store+0x51/0x1a0
| efivar_entry_set_safe+0xa3/0x1b0
| efi_pstore_write+0x109/0x140
| pstore_dump+0x11c/0x330
| kmsg_dump+0xa4/0xd0
| oops_exit+0x22/0x30
...

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Fixes: 21b3ddd39fee ("efi: Don't use spinlocks for efi vars")
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/platform.c | 44 +++++++++++++++++++++++---------------------
 fs/pstore/ram.c      |  1 -
 2 files changed, 23 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 805283b5385d..b2d63bd05e74 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -124,26 +124,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
 	}
 }
 
-bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+/*
+ * Should pstore_dump() wait for a concurrent pstore_dump()? If
+ * not, the current pstore_dump() will report a failure to dump
+ * and return.
+ */
+static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
 {
-	/*
-	 * In case of NMI path, pstore shouldn't be blocked
-	 * regardless of reason.
-	 */
+	/* In NMI path, pstore shouldn't block regardless of reason. */
 	if (in_nmi())
 		return true;
 
 	switch (reason) {
 	/* In panic case, other cpus are stopped by smp_send_stop(). */
 	case KMSG_DUMP_PANIC:
-	/* Emergency restart shouldn't be blocked by spin lock. */
+	/* Emergency restart shouldn't be blocked. */
 	case KMSG_DUMP_EMERG:
 		return true;
 	default:
 		return false;
 	}
 }
-EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
 
 #if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
 static int zbufsize_deflate(size_t size)
@@ -378,23 +379,23 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 	unsigned long	total = 0;
 	const char	*why;
 	unsigned int	part = 1;
-	unsigned long	flags = 0;
-	int		is_locked;
 	int		ret;
 
 	why = get_reason_str(reason);
 
-	if (pstore_cannot_block_path(reason)) {
-		is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
-		if (!is_locked) {
-			pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
-				       , in_nmi() ? "NMI" : why);
+	if (down_trylock(&psinfo->buf_lock)) {
+		/* Failed to acquire lock: give up if we cannot wait. */
+		if (pstore_cannot_wait(reason)) {
+			pr_err("dump skipped in %s path: may corrupt error record\n",
+				in_nmi() ? "NMI" : why);
+			return;
+		}
+		if (down_interruptible(&psinfo->buf_lock)) {
+			pr_err("could not grab semaphore?!\n");
 			return;
 		}
-	} else {
-		spin_lock_irqsave(&psinfo->buf_lock, flags);
-		is_locked = 1;
 	}
+
 	oopscount++;
 	while (total < kmsg_bytes) {
 		char *dst;
@@ -411,7 +412,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		record.part = part;
 		record.buf = psinfo->buf;
 
-		if (big_oops_buf && is_locked) {
+		if (big_oops_buf) {
 			dst = big_oops_buf;
 			dst_size = big_oops_buf_sz;
 		} else {
@@ -429,7 +430,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 					  dst_size, &dump_size))
 			break;
 
-		if (big_oops_buf && is_locked) {
+		if (big_oops_buf) {
 			zipped_len = pstore_compress(dst, psinfo->buf,
 						header_size + dump_size,
 						psinfo->bufsize);
@@ -452,8 +453,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 		total += record.size;
 		part++;
 	}
-	if (is_locked)
-		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
+
+	up(&psinfo->buf_lock);
 }
 
 static struct kmsg_dumper pstore_dumper = {
@@ -572,6 +573,7 @@ int pstore_register(struct pstore_info *psi)
 		psi->write_user = pstore_write_user_compat;
 	psinfo = psi;
 	mutex_init(&psinfo->read_mutex);
+	sema_init(&psinfo->buf_lock, 1);
 	spin_unlock(&pstore_lock);
 
 	if (owner && !try_module_get(owner)) {
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 44ed6b193d2e..77050b472c49 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -814,7 +814,6 @@ static int ramoops_probe(struct platform_device *pdev)
 		err = -ENOMEM;
 		goto fail_clear;
 	}
-	spin_lock_init(&cxt->pstore.buf_lock);
 
 	cxt->pstore.flags = PSTORE_FLAGS_DMESG;
 	if (cxt->console_size)
-- 
cgit v1.2.3


From aa73a3b205a4043ed3146445f28f09536dadf67c Mon Sep 17 00:00:00 2001
From: Pi-Hsun Shih <pihsun@chromium.org>
Date: Mon, 20 May 2019 14:51:19 +0800
Subject: pstore: Set tfm to NULL on free_buf_for_compression

commit a9fb94a99bb515d8720ba8440ce3aba84aec80f8 upstream.

Set tfm to NULL on free_buf_for_compression() after crypto_free_comp().

This avoid a use-after-free when allocate_buf_for_compression()
and free_buf_for_compression() are called twice. Although
free_buf_for_compression() freed the tfm, allocate_buf_for_compression()
won't reinitialize the tfm since the tfm pointer is not NULL.

Fixes: 95047b0519c1 ("pstore: Refactor compression initialization")
Signed-off-by: Pi-Hsun Shih <pihsun@chromium.org>
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/platform.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b2d63bd05e74..3212cd36e2e7 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -324,8 +324,10 @@ static void allocate_buf_for_compression(void)
 
 static void free_buf_for_compression(void)
 {
-	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm)
+	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
 		crypto_free_comp(tfm);
+		tfm = NULL;
+	}
 	kfree(big_oops_buf);
 	big_oops_buf = NULL;
 	big_oops_buf_sz = 0;
-- 
cgit v1.2.3


From f4d0227ff170b7ec365f75d435705b28295f08bd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 30 May 2019 23:37:29 -0700
Subject: pstore/ram: Run without kernel crash dump region

commit 8880fa32c557600f5f624084152668ed3c2ea51e upstream.

The ram pstore backend has always had the crash dumper frontend enabled
unconditionally. However, it was possible to effectively disable it
by setting a record_size=0. All the machinery would run (storing dumps
to the temporary crash buffer), but 0 bytes would ultimately get stored
due to there being no przs allocated for dumps. Commit 89d328f637b9
("pstore/ram: Correctly calculate usable PRZ bytes"), however, assumed
that there would always be at least one allocated dprz for calculating
the size of the temporary crash buffer. This was, of course, not the
case when record_size=0, and would lead to a NULL deref trying to find
the dprz buffer size:

BUG: unable to handle kernel NULL pointer dereference at (null)
...
IP: ramoops_probe+0x285/0x37e (fs/pstore/ram.c:808)

        cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;

Instead, we need to only enable the frontends based on the success of the
prz initialization and only take the needed actions when those zones are
available. (This also fixes a possible error in detecting if the ftrace
frontend should be enabled.)

Reported-and-tested-by: Yaro Slav <yaro330@gmail.com>
Fixes: 89d328f637b9 ("pstore/ram: Correctly calculate usable PRZ bytes")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/pstore/platform.c |  3 ++-
 fs/pstore/ram.c      | 36 +++++++++++++++++++++++-------------
 2 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 3212cd36e2e7..4bae3f4fe829 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -583,7 +583,8 @@ int pstore_register(struct pstore_info *psi)
 		return -EINVAL;
 	}
 
-	allocate_buf_for_compression();
+	if (psi->flags & PSTORE_FLAGS_DMESG)
+		allocate_buf_for_compression();
 
 	if (pstore_is_mounted())
 		pstore_get_records(0);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 77050b472c49..316c16463b20 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -803,26 +803,36 @@ static int ramoops_probe(struct platform_device *pdev)
 
 	cxt->pstore.data = cxt;
 	/*
-	 * Since bufsize is only used for dmesg crash dumps, it
-	 * must match the size of the dprz record (after PRZ header
-	 * and ECC bytes have been accounted for).
+	 * Prepare frontend flags based on which areas are initialized.
+	 * For ramoops_init_przs() cases, the "max count" variable tells
+	 * if there are regions present. For ramoops_init_prz() cases,
+	 * the single region size is how to check.
 	 */
-	cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
-	cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
-	if (!cxt->pstore.buf) {
-		pr_err("cannot allocate pstore crash dump buffer\n");
-		err = -ENOMEM;
-		goto fail_clear;
-	}
-
-	cxt->pstore.flags = PSTORE_FLAGS_DMESG;
+	cxt->pstore.flags = 0;
+	if (cxt->max_dump_cnt)
+		cxt->pstore.flags |= PSTORE_FLAGS_DMESG;
 	if (cxt->console_size)
 		cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
-	if (cxt->ftrace_size)
+	if (cxt->max_ftrace_cnt)
 		cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
 	if (cxt->pmsg_size)
 		cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
 
+	/*
+	 * Since bufsize is only used for dmesg crash dumps, it
+	 * must match the size of the dprz record (after PRZ header
+	 * and ECC bytes have been accounted for).
+	 */
+	if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) {
+		cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
+		cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
+		if (!cxt->pstore.buf) {
+			pr_err("cannot allocate pstore crash dump buffer\n");
+			err = -ENOMEM;
+			goto fail_clear;
+		}
+	}
+
 	err = pstore_register(&cxt->pstore);
 	if (err) {
 		pr_err("registering with pstore failed\n");
-- 
cgit v1.2.3