From a6136922d905d103b4edb88c5f3dd3ccf3ce51fe Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 11 Dec 2018 12:37:49 -0500 Subject: aio: fix spectre gadget in lookup_ioctx commit a538e3ff9dabcdf6c3f477a373c629213d1c3066 upstream. Matthew pointed out that the ioctx_table is susceptible to spectre v1, because the index can be controlled by an attacker. The below patch should mitigate the attack for all of the aio system calls. Cc: stable@vger.kernel.org Reported-by: Matthew Wilcox Reported-by: Dan Carpenter Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 04c4d6218978..44551d96eaa4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -45,6 +45,7 @@ #include #include +#include #include "internal.h" @@ -1038,6 +1039,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; + id = array_index_nospec(id, table->nr); ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { if (percpu_ref_tryget_live(&ctx->users)) -- cgit v1.2.3 From 498a6e6be0de3cd8ccccce6628f0ea1dc4d8f159 Mon Sep 17 00:00:00 2001 From: Piotr Jaroszynski Date: Fri, 14 Dec 2018 14:17:14 -0800 Subject: fs/iomap.c: get/put the page in iomap_page_create/release() commit 61c6de667263184125d5ca75e894fcad632b0dd3 upstream. migrate_page_move_mapping() expects pages with private data set to have a page_count elevated by 1. This is what used to happen for xfs through the buffer_heads code before the switch to iomap in commit 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads"). Not having the count elevated causes move_pages() to fail on memory mapped files coming from xfs. Make iomap compatible with the migrate_page_move_mapping() assumption by elevating the page count as part of iomap_page_create() and lowering it in iomap_page_release(). It causes the move_pages() syscall to misbehave on memory mapped files from xfs. It does not not move any pages, which I suppose is "just" a perf issue, but it also ends up returning a positive number which is out of spec for the syscall. Talking to Michal Hocko, it sounds like returning positive numbers might be a necessary update to move_pages() anyway though (https://lkml.kernel.org/r/20181116114955.GJ14706@dhcp22.suse.cz). I only hit this in tests that verify that move_pages() actually moved the pages. The test also got confused by the positive return from move_pages() (it got treated as a success as positive numbers were not expected and not handled) making it a bit harder to track down what's going on. Link: http://lkml.kernel.org/r/20181115184140.1388751-1-pjaroszynski@nvidia.com Fixes: 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads") Signed-off-by: Piotr Jaroszynski Reviewed-by: Christoph Hellwig Cc: William Kucharski Cc: Darrick J. Wong Cc: Brian Foster Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/iomap.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..37da7a61a6c5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -117,6 +117,12 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -133,6 +139,7 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); + put_page(page); kfree(iop); } -- cgit v1.2.3 From d41c49daf25953490fe3a0832fa6bb0a6739ab0c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 14 Dec 2018 14:17:17 -0800 Subject: userfaultfd: check VM_MAYWRITE was set after verifying the uffd is registered commit 01e881f5a1fca4677e82733061868c6d6ea05ca7 upstream. Calling UFFDIO_UNREGISTER on virtual ranges not yet registered in uffd could trigger an harmless false positive WARN_ON. Check the vma is already registered before checking VM_MAYWRITE to shut off the false positive warning. Link: http://lkml.kernel.org/r/20181206212028.18726-2-aarcange@redhat.com Cc: Fixes: 29ec90660d68 ("userfaultfd: shmem/hugetlbfs: only allow to register VM_MAYWRITE vmas") Signed-off-by: Andrea Arcangeli Reported-by: syzbot+06c7092e7d71218a2c16@syzkaller.appspotmail.com Acked-by: Mike Rapoport Acked-by: Hugh Dickins Acked-by: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/userfaultfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cd58939dc977..7a85e609fc27 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1566,7 +1566,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!vma_can_userfault(vma)); - WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); /* * Nothing to do: this vma is already registered into this @@ -1575,6 +1574,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (!vma->vm_userfaultfd_ctx.ctx) goto skip; + WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); + if (vma->vm_start > start) start = vma->vm_start; vma_end = min(end, vma->vm_end); -- cgit v1.2.3 From 3faf68a42f97b5ca3aa78a0e93ec4d3ff9971ce7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 5 Nov 2018 07:50:10 +0200 Subject: ovl: fix decode of dir file handle with multi lower layers commit 155b8a0492a90a4c6e22f046a3568b92a6bc48da upstream. When decoding a lower file handle, we first call ovl_check_origin_fh() with connected=false to get any real lower dentry for overlay inode cache lookup. If the real dentry is a disconnected dir dentry, ovl_check_origin_fh() is called again with connected=true to get a connected real dentry and find the lower layer the real dentry belongs to. If the first call returned a connected real dentry, we use it to lookup an overlay connected dentry, but the first ovl_check_origin_fh() call with connected=false did not check that the found dentry is under the root of the layer (see ovl_acceptable()), it only checked that the found dentry super block matches the uuid of the lower file handle. In case there are multiple lower layers on the same fs and the found dentry is not from the top most lower layer, using the layer index returned from the first ovl_check_origin_fh() is wrong and we end up failing to decode the file handle. Fix this by always calling ovl_check_origin_fh() with connected=true if we got a directory dentry in the first call. Fixes: 8b58924ad55c ("ovl: lookup in inode cache first when decoding...") Cc: # v4.17 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/export.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 8fa37cd7818a..54e5d17d7f3e 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -754,9 +754,8 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out; } - /* Otherwise, get a connected non-upper dir or disconnected non-dir */ - if (d_is_dir(origin.dentry) && - (origin.dentry->d_flags & DCACHE_DISCONNECTED)) { + /* Find origin.dentry again with ovl_acceptable() layer check */ + if (d_is_dir(origin.dentry)) { dput(origin.dentry); origin.dentry = NULL; err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack); @@ -769,6 +768,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb, goto out_err; } + /* Get a connected non-upper dir or disconnected non-dir */ dentry = ovl_get_dentry(sb, NULL, &origin, index); out: -- cgit v1.2.3 From 2a335229946ec7e04fdd9477e63a08650a7a9999 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 14 Nov 2018 16:01:34 +0200 Subject: ovl: fix missing override creds in link of a metacopy upper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 91ff20f34e94424e586f57f4f593beae16504f86 upstream. Theodore Ts'o reported a v4.19 regression with docker-dropbox: https://marc.info/?l=linux-fsdevel&m=154070089431116&w=2 "I was rebuilding my dropbox Docker container, and it failed in 4.19 with the following error: ... dpkg: error: error creating new backup file \ '/var/lib/dpkg/status-old': Invalid cross-device link" The problem did not reproduce with metacopy feature disabled. The error was caused by insufficient credentials to set "trusted.overlay.redirect" xattr on link of a metacopy file. Reproducer: echo Y > /sys/module/overlay/parameters/redirect_dir echo Y > /sys/module/overlay/parameters/metacopy cd /tmp mkdir l u w m chmod 777 l u touch l/foo ln l/foo l/link chmod 666 l/foo mount -t overlay none -olowerdir=l,upperdir=u,workdir=w m su fsgqa ln m/foo m/bar [ 21.455823] overlayfs: failed to set redirect (-1) ln: failed to create hard link 'm/bar' => 'm/foo':\ Invalid cross-device link Reported-by: Theodore Y. Ts'o Reported-by: Maciej Zięba Fixes: 4120fe64dce4 ("ovl: Set redirect on upper inode when it is linked") Cc: # v4.19 Signed-off-by: Amir Goldstein Acked-by: Vivek Goyal Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/dir.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 3bbde0a9f48f..b2aadd3e1fec 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -652,6 +652,18 @@ static int ovl_symlink(struct inode *dir, struct dentry *dentry, return ovl_create_object(dentry, S_IFLNK, 0, link); } +static int ovl_set_link_redirect(struct dentry *dentry) +{ + const struct cred *old_cred; + int err; + + old_cred = ovl_override_creds(dentry->d_sb); + err = ovl_set_redirect(dentry, false); + revert_creds(old_cred); + + return err; +} + static int ovl_link(struct dentry *old, struct inode *newdir, struct dentry *new) { @@ -672,7 +684,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, goto out_drop_write; if (ovl_is_metacopy_dentry(old)) { - err = ovl_set_redirect(old, false); + err = ovl_set_link_redirect(old); if (err) goto out_drop_write; } -- cgit v1.2.3 From d9267e136044854e72c3196179cac63ba4fa4254 Mon Sep 17 00:00:00 2001 From: Chad Austin Date: Mon, 10 Dec 2018 10:54:52 -0800 Subject: fuse: continue to send FUSE_RELEASEDIR when FUSE_OPEN returns ENOSYS commit 2e64ff154ce6ce9a8dc0f9556463916efa6ff460 upstream. When FUSE_OPEN returns ENOSYS, the no_open bit is set on the connection. Because the FUSE_RELEASE and FUSE_RELEASEDIR paths share code, this incorrectly caused the FUSE_RELEASEDIR request to be dropped and never sent to userspace. Pass an isdir bool to distinguish between FUSE_RELEASE and FUSE_RELEASEDIR inside of fuse_file_put. Fixes: 7678ac50615d ("fuse: support clients that don't implement 'open'") Cc: # v3.14 Signed-off-by: Chad Austin Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dir.c | 2 +- fs/fuse/file.c | 21 +++++++++++---------- fs/fuse/fuse_i.h | 2 +- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0979609d6eba..82a13221775e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1439,7 +1439,7 @@ static int fuse_dir_open(struct inode *inode, struct file *file) static int fuse_dir_release(struct inode *inode, struct file *file) { - fuse_release_common(file, FUSE_RELEASEDIR); + fuse_release_common(file, true); return 0; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a0ffed34b85d..fbd6978479cb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -87,12 +87,12 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) iput(req->misc.release.inode); } -static void fuse_file_put(struct fuse_file *ff, bool sync) +static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open) { + if (ff->fc->no_open && !isdir) { /* * Drop the release request when client does not * implement 'open' @@ -245,10 +245,11 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) req->in.args[0].value = inarg; } -void fuse_release_common(struct file *file, int opcode) +void fuse_release_common(struct file *file, bool isdir) { struct fuse_file *ff = file->private_data; struct fuse_req *req = ff->reserved_req; + int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(ff, file->f_flags, opcode); @@ -270,7 +271,7 @@ void fuse_release_common(struct file *file, int opcode) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL); + fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -286,7 +287,7 @@ static int fuse_release(struct inode *inode, struct file *file) if (fc->writeback_cache) write_inode_now(inode, 1); - fuse_release_common(file, FUSE_RELEASE); + fuse_release_common(file, false); /* return value is ignored by VFS */ return 0; @@ -300,7 +301,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags) * iput(NULL) is a no-op and since the refcount is 1 and everything's * synchronous, we are fine with not doing igrab() here" */ - fuse_file_put(ff, true); + fuse_file_put(ff, true, false); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -805,7 +806,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) put_page(page); } if (req->ff) - fuse_file_put(req->ff, false); + fuse_file_put(req->ff, false, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1459,7 +1460,7 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) __free_page(req->pages[i]); if (req->ff) - fuse_file_put(req->ff, false); + fuse_file_put(req->ff, false, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1616,7 +1617,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fc, fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, 0); + fuse_file_put(ff, false, false); return err; } @@ -1930,7 +1931,7 @@ static int fuse_writepages(struct address_space *mapping, err = 0; } if (data.ff) - fuse_file_put(data.ff, false); + fuse_file_put(data.ff, false, false); kfree(data.orig_pages); out: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f78e9614bb5f..cec8b8e74969 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -749,7 +749,7 @@ void fuse_sync_release(struct fuse_file *ff, int flags); /** * Send RELEASE or RELEASEDIR request */ -void fuse_release_common(struct file *file, int opcode); +void fuse_release_common(struct file *file, bool isdir); /** * Send FSYNC or FSYNCDIR request -- cgit v1.2.3 From de956d40781191903c7eb8e6843b0cb506ab1f43 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 27 Nov 2018 19:31:30 +0000 Subject: nfs: don't dirty kernel pages read by direct-io [ Upstream commit ad3cba223ac02dc769c3bbe88efe277bbb457566 ] When we use direct_IO with an NFS backing store, we can trigger a WARNING in __set_page_dirty(), as below, since we're dirtying the page unnecessarily in nfs_direct_read_completion(). To fix, replicate the logic in commit 53cbf3b157a0 ("fs: direct-io: don't dirtying pages for ITER_BVEC/ITER_KVEC direct read"). Other filesystems that implement direct_IO handle this; most use blockdev_direct_IO(). ceph and cifs have similar logic. mount 127.0.0.1:/export /nfs dd if=/dev/zero of=/nfs/image bs=1M count=200 losetup --direct-io=on -f /nfs/image mkfs.btrfs /dev/loop0 mount -t btrfs /dev/loop0 /mnt/ kernel: WARNING: CPU: 0 PID: 8067 at fs/buffer.c:580 __set_page_dirty+0xaf/0xd0 kernel: Modules linked in: loop(E) nfsv3(E) rpcsec_gss_krb5(E) nfsv4(E) dns_resolver(E) nfs(E) fscache(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) fuse(E) tun(E) ip6t_rpfilter(E) ipt_REJECT(E) nf_ kernel: snd_seq(E) snd_seq_device(E) snd_pcm(E) video(E) snd_timer(E) snd(E) soundcore(E) ip_tables(E) xfs(E) libcrc32c(E) sd_mod(E) sr_mod(E) cdrom(E) ata_generic(E) pata_acpi(E) crc32c_intel(E) ahci(E) li kernel: CPU: 0 PID: 8067 Comm: kworker/0:2 Tainted: G E 4.20.0-rc1.master.20181111.ol7.x86_64 #1 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 kernel: Workqueue: nfsiod rpc_async_release [sunrpc] kernel: RIP: 0010:__set_page_dirty+0xaf/0xd0 kernel: Code: c3 48 8b 02 f6 c4 04 74 d4 48 89 df e8 ba 05 f7 ff 48 89 c6 eb cb 48 8b 43 08 a8 01 75 1f 48 89 d8 48 8b 00 a8 04 74 02 eb 87 <0f> 0b eb 83 48 83 e8 01 eb 9f 48 83 ea 01 0f 1f 00 eb 8b 48 83 e8 kernel: RSP: 0000:ffffc1c8825b7d78 EFLAGS: 00013046 kernel: RAX: 000fffffc0020089 RBX: fffff2b603308b80 RCX: 0000000000000001 kernel: RDX: 0000000000000001 RSI: ffff9d11478115c8 RDI: ffff9d11478115d0 kernel: RBP: ffffc1c8825b7da0 R08: 0000646f6973666e R09: 8080808080808080 kernel: R10: 0000000000000001 R11: 0000000000000000 R12: ffff9d11478115d0 kernel: R13: ffff9d11478115c8 R14: 0000000000003246 R15: 0000000000000001 kernel: FS: 0000000000000000(0000) GS:ffff9d115ba00000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 00007f408686f640 CR3: 0000000104d8e004 CR4: 00000000000606f0 kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 kernel: Call Trace: kernel: __set_page_dirty_buffers+0xb6/0x110 kernel: set_page_dirty+0x52/0xb0 kernel: nfs_direct_read_completion+0xc4/0x120 [nfs] kernel: nfs_pgio_release+0x10/0x20 [nfs] kernel: rpc_free_task+0x30/0x70 [sunrpc] kernel: rpc_async_release+0x12/0x20 [sunrpc] kernel: process_one_work+0x174/0x390 kernel: worker_thread+0x4f/0x3e0 kernel: kthread+0x102/0x140 kernel: ? drain_workqueue+0x130/0x130 kernel: ? kthread_stop+0x110/0x110 kernel: ret_from_fork+0x35/0x40 kernel: ---[ end trace 01341980905412c9 ]--- Signed-off-by: Dave Kleikamp Signed-off-by: Santosh Shilimkar [forward-ported to v4.20] Signed-off-by: Calum Mackay Reviewed-by: Dave Kleikamp Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/direct.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aa12c3063bae..33824a0a57bf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -98,8 +98,11 @@ struct nfs_direct_req { struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ struct work_struct work; int flags; + /* for write */ #define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + /* for read */ +#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ struct nfs_writeverf verf; /* unstable write verifier */ }; @@ -412,7 +415,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); struct page *page = req->wb_page; - if (!PageCompound(page) && bytes < hdr->good_bytes) + if (!PageCompound(page) && bytes < hdr->good_bytes && + (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY)) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); @@ -587,6 +591,9 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + if (iter_is_iovec(iter)) + dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; -- cgit v1.2.3 From b5a8028c25f3f3c3bbbe09646fe331a570189cbf Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 3 Nov 2018 15:02:44 -0500 Subject: cifs: In Kconfig CONFIG_CIFS_POSIX needs depends on legacy (insecure cifs) [ Upstream commit 6e785302dad32228819d8066e5376acd15d0e6ba ] Missing a dependency. Shouldn't show cifs posix extensions in Kconfig if CONFIG_CIFS_ALLOW_INSECURE_DIALECTS (ie SMB1 protocol) is disabled. Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky Signed-off-by: Sasha Levin --- fs/cifs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index abcd78e332fe..85dadb93c992 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -133,7 +133,7 @@ config CIFS_XATTR config CIFS_POSIX bool "CIFS POSIX Extensions" - depends on CIFS_XATTR + depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR help Enabling this option will cause the cifs client to attempt to negotiate a newer dialect with servers, such as Samba 3.0.5 -- cgit v1.2.3 From b4c7c826709b7d882ec9b264d5032e887e6bd720 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 31 Oct 2018 10:06:08 -0700 Subject: Btrfs: fix missing delayed iputs on unmount [ Upstream commit d6fd0ae25c6495674dc5a41a8d16bc8e0073276d ] There's a race between close_ctree() and cleaner_kthread(). close_ctree() sets btrfs_fs_closing(), and the cleaner stops when it sees it set, but this is racy; the cleaner might have already checked the bit and could be cleaning stuff. In particular, if it deletes unused block groups, it will create delayed iputs for the free space cache inodes. As of "btrfs: don't run delayed_iputs in commit", we're no longer running delayed iputs after a commit. Therefore, if the cleaner creates more delayed iputs after delayed iputs are run in btrfs_commit_super(), we will leak inodes on unmount and get a busy inode crash from the VFS. Fix it by parking the cleaner before we actually close anything. Then, any remaining delayed iputs will always be handled in btrfs_commit_super(). This also ensures that the commit in close_ctree() is really the last commit, so we can get rid of the commit in cleaner_kthread(). The fstest/generic/475 followed by 476 can trigger a crash that manifests as a slab corruption caused by accessing the freed kthread structure by a wake up function. Sample trace: [ 5657.077612] BUG: unable to handle kernel NULL pointer dereference at 00000000000000cc [ 5657.079432] PGD 1c57a067 P4D 1c57a067 PUD da10067 PMD 0 [ 5657.080661] Oops: 0000 [#1] PREEMPT SMP [ 5657.081592] CPU: 1 PID: 5157 Comm: fsstress Tainted: G W 4.19.0-rc8-default+ #323 [ 5657.083703] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 5657.086577] RIP: 0010:shrink_page_list+0x2f9/0xe90 [ 5657.091937] RSP: 0018:ffffb5c745c8f728 EFLAGS: 00010287 [ 5657.092953] RAX: 0000000000000074 RBX: ffffb5c745c8f830 RCX: 0000000000000000 [ 5657.094590] RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff9a8747fdf3d0 [ 5657.095987] RBP: ffffb5c745c8f9e0 R08: 0000000000000000 R09: 0000000000000000 [ 5657.097159] R10: ffff9a8747fdf5e8 R11: 0000000000000000 R12: ffffb5c745c8f788 [ 5657.098513] R13: ffff9a877f6ff2c0 R14: ffff9a877f6ff2c8 R15: dead000000000200 [ 5657.099689] FS: 00007f948d853b80(0000) GS:ffff9a877d600000(0000) knlGS:0000000000000000 [ 5657.101032] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 5657.101953] CR2: 00000000000000cc CR3: 00000000684bd000 CR4: 00000000000006e0 [ 5657.103159] Call Trace: [ 5657.103776] shrink_inactive_list+0x194/0x410 [ 5657.104671] shrink_node_memcg.constprop.84+0x39a/0x6a0 [ 5657.105750] shrink_node+0x62/0x1c0 [ 5657.106529] try_to_free_pages+0x1a4/0x500 [ 5657.107408] __alloc_pages_slowpath+0x2c9/0xb20 [ 5657.108418] __alloc_pages_nodemask+0x268/0x2b0 [ 5657.109348] kmalloc_large_node+0x37/0x90 [ 5657.110205] __kmalloc_node+0x236/0x310 [ 5657.111014] kvmalloc_node+0x3e/0x70 Fixes: 30928e9baac2 ("btrfs: don't run delayed_iputs in commit") Signed-off-by: Omar Sandoval Reviewed-by: David Sterba [ add trace ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/disk-io.c | 51 +++++++++++++++------------------------------------ 1 file changed, 15 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 834a3f5ef642..d4a7f7ca4145 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1656,9 +1656,8 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_fs_info *fs_info = root->fs_info; int again; - struct btrfs_trans_handle *trans; - do { + while (1) { again = 0; /* Make the cleaner go to sleep early. */ @@ -1707,42 +1706,16 @@ static int cleaner_kthread(void *arg) */ btrfs_delete_unused_bgs(fs_info); sleep: + if (kthread_should_park()) + kthread_parkme(); + if (kthread_should_stop()) + return 0; if (!again) { set_current_state(TASK_INTERRUPTIBLE); - if (!kthread_should_stop()) - schedule(); + schedule(); __set_current_state(TASK_RUNNING); } - } while (!kthread_should_stop()); - - /* - * Transaction kthread is stopped before us and wakes us up. - * However we might have started a new transaction and COWed some - * tree blocks when deleting unused block groups for example. So - * make sure we commit the transaction we started to have a clean - * shutdown when evicting the btree inode - if it has dirty pages - * when we do the final iput() on it, eviction will trigger a - * writeback for it which will fail with null pointer dereferences - * since work queues and other resources were already released and - * destroyed by the time the iput/eviction/writeback is made. - */ - trans = btrfs_attach_transaction(root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) - btrfs_err(fs_info, - "cleaner transaction attach returned %ld", - PTR_ERR(trans)); - } else { - int ret; - - ret = btrfs_commit_transaction(trans); - if (ret) - btrfs_err(fs_info, - "cleaner open transaction commit returned %d", - ret); } - - return 0; } static int transaction_kthread(void *arg) @@ -3923,6 +3896,13 @@ void close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + /* + * We don't want the cleaner to start new transactions, add more delayed + * iputs, etc. while we're closing. We can't use kthread_stop() yet + * because that frees the task_struct, and the transaction kthread might + * still try to wake up the cleaner. + */ + kthread_park(fs_info->cleaner_kthread); /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -3950,9 +3930,8 @@ void close_ctree(struct btrfs_fs_info *fs_info) if (!sb_rdonly(fs_info->sb)) { /* - * If the cleaner thread is stopped and there are - * block groups queued for removal, the deletion will be - * skipped when we quit the cleaner thread. + * The cleaner kthread is stopped, so do one final pass over + * unused block groups. */ btrfs_delete_unused_bgs(fs_info); -- cgit v1.2.3 From 38d072a4a71738d95a2b9e47a4bbd091d7e55af3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 20 Dec 2018 23:23:24 +1100 Subject: iomap: Revert "fs/iomap.c: get/put the page in iomap_page_create/release()" [ Upstream commit a837eca2412051628c0529768c9bc4f3580b040e ] This reverts commit 61c6de667263184125d5ca75e894fcad632b0dd3. The reverted commit added page reference counting to iomap page structures that are used to track block size < page size state. This was supposed to align the code with page migration page accounting assumptions, but what it has done instead is break XFS filesystems. Every fstests run I've done on sub-page block size XFS filesystems has since picking up this commit 2 days ago has failed with bad page state errors such as: # ./run_check.sh "-m rmapbt=1,reflink=1 -i sparse=1 -b size=1k" "generic/038" .... SECTION -- xfs FSTYP -- xfs (debug) PLATFORM -- Linux/x86_64 test1 4.20.0-rc6-dgc+ MKFS_OPTIONS -- -f -m rmapbt=1,reflink=1 -i sparse=1 -b size=1k /dev/sdc MOUNT_OPTIONS -- /dev/sdc /mnt/scratch generic/038 454s ... run fstests generic/038 at 2018-12-20 18:43:05 XFS (sdc): Unmounting Filesystem XFS (sdc): Mounting V5 Filesystem XFS (sdc): Ending clean mount BUG: Bad page state in process kswapd0 pfn:3a7fa page:ffffea0000ccbeb0 count:0 mapcount:0 mapping:ffff88800d9b6360 index:0x1 flags: 0xfffffc0000000() raw: 000fffffc0000000 dead000000000100 dead000000000200 ffff88800d9b6360 raw: 0000000000000001 0000000000000000 00000000ffffffff page dumped because: non-NULL mapping CPU: 0 PID: 676 Comm: kswapd0 Not tainted 4.20.0-rc6-dgc+ #915 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014 Call Trace: dump_stack+0x67/0x90 bad_page.cold.116+0x8a/0xbd free_pcppages_bulk+0x4bf/0x6a0 free_unref_page_list+0x10f/0x1f0 shrink_page_list+0x49d/0xf50 shrink_inactive_list+0x19d/0x3b0 shrink_node_memcg.constprop.77+0x398/0x690 ? shrink_slab.constprop.81+0x278/0x3f0 shrink_node+0x7a/0x2f0 kswapd+0x34b/0x6d0 ? node_reclaim+0x240/0x240 kthread+0x11f/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x24/0x30 Disabling lock debugging due to kernel taint .... The failures are from anyway that frees pages and empties the per-cpu page magazines, so it's not a predictable failure or an easy to debug failure. generic/038 is a reliable reproducer of this problem - it has a 9 in 10 failure rate on one of my test machines. Failure on other machines have been at random points in fstests runs but every run has ended up tripping this problem. Hence generic/038 was used to bisect the failure because it was the most reliable failure. It is too close to the 4.20 release (not to mention holidays) to try to diagnose, fix and test the underlying cause of the problem, so reverting the commit is the only option we have right now. The revert has been tested against a current tot 4.20-rc7+ kernel across multiple machines running sub-page block size XFs filesystems and none of the bad page state failures have been seen. Signed-off-by: Dave Chinner Cc: Piotr Jaroszynski Cc: Christoph Hellwig Cc: William Kucharski Cc: Darrick J. Wong Cc: Brian Foster Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/iomap.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 37da7a61a6c5..ec15cf2ec696 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -117,12 +117,6 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); - - /* - * migrate_page_move_mapping() assumes that pages with private data have - * their count elevated by 1. - */ - get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -139,7 +133,6 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); - put_page(page); kfree(iop); } -- cgit v1.2.3 From 9c5ccadb7b42ee124d4009212a2b737c98590da7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Jul 2018 17:51:20 +0200 Subject: Revert "vfs: Allow userns root to call mknod on owned filesystems." commit 94f82008ce30e2624537d240d64ce718255e0b80 upstream. This reverts commit 55956b59df336f6738da916dbb520b6e37df9fbd. commit 55956b59df33 ("vfs: Allow userns root to call mknod on owned filesystems.") enabled mknod() in user namespaces for userns root if CAP_MKNOD is available. However, these device nodes are useless since any filesystem mounted from a non-initial user namespace will set the SB_I_NODEV flag on the filesystem. Now, when a device node s created in a non-initial user namespace a call to open() on said device node will fail due to: bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } The problem with this is that as of the aforementioned commit mknod() creates partially functional device nodes in non-initial user namespaces. In particular, it has the consequence that as of the aforementioned commit open() will be more privileged with respect to device nodes than mknod(). Before it was the other way around. Specifically, if mknod() succeeded then it was transparent for any userspace application that a fatal error must have occured when open() failed. All of this breaks multiple userspace workloads and a widespread assumption about how to handle mknod(). Basically, all container runtimes and systemd live by the slogan "ask for forgiveness not permission" when running user namespace workloads. For mknod() the assumption is that if the syscall succeeds the device nodes are useable irrespective of whether it succeeds in a non-initial user namespace or not. This logic was chosen explicitly to allow for the glorious day when mknod() will actually be able to create fully functional device nodes in user namespaces. A specific problem people are already running into when running 4.18 rc kernels are failing systemd services. For any distro that is run in a container systemd services started with the PrivateDevices= property set will fail to start since the device nodes in question cannot be opened (cf. the arguments in [1]). Full disclosure, Seth made the very sound argument that it is already possible to end up with partially functional device nodes. Any filesystem mounted with MS_NODEV set will allow mknod() to succeed but will not allow open() to succeed. The difference to the case here is that the MS_NODEV case is transparent to userspace since it is an explicitly set mount option while the SB_I_NODEV case is an implicit property enforced by the kernel and hence opaque to userspace. [1]: https://github.com/systemd/systemd/pull/9483 Signed-off-by: Christian Brauner Cc: "Eric W. Biederman" Cc: Seth Forshee Cc: Serge Hallyn Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 0cab6494978c..914178cdbe94 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3701,8 +3701,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && - !ns_capable(dentry->d_sb->s_user_ns, CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) -- cgit v1.2.3 From 0736458856868c920079f42a1e881d271e201390 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 7 Nov 2018 23:04:43 +0100 Subject: ubifs: Handle re-linking of inodes correctly while recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e58725d51fa8da9133f3f1c54170aa2e43056b91 upstream. UBIFS's recovery code strictly assumes that a deleted inode will never come back, therefore it removes all data which belongs to that inode as soon it faces an inode with link count 0 in the replay list. Before O_TMPFILE this assumption was perfectly fine. With O_TMPFILE it can lead to data loss upon a power-cut. Consider a journal with entries like: 0: inode X (nlink = 0) /* O_TMPFILE was created */ 1: data for inode X /* Someone writes to the temp file */ 2: inode X (nlink = 0) /* inode was changed, xattr, chmod, … */ 3: inode X (nlink = 1) /* inode was re-linked via linkat() */ Upon replay of entry #2 UBIFS will drop all data that belongs to inode X, this will lead to an empty file after mounting. As solution for this problem, scan the replay list for a re-link entry before dropping data. Fixes: 474b93704f32 ("ubifs: Implement O_TMPFILE") Cc: stable@vger.kernel.org Cc: Russell Senior Cc: Rafał Miłecki Reported-by: Russell Senior Reported-by: Rafał Miłecki Tested-by: Rafał Miłecki Signed-off-by: Richard Weinberger Signed-off-by: Greg Kroah-Hartman --- fs/ubifs/replay.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 4844538eb926..c6f9b2225387 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -209,6 +209,38 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) return ubifs_tnc_remove_range(c, &min_key, &max_key); } +/** + * inode_still_linked - check whether inode in question will be re-linked. + * @c: UBIFS file-system description object + * @rino: replay entry to test + * + * O_TMPFILE files can be re-linked, this means link count goes from 0 to 1. + * This case needs special care, otherwise all references to the inode will + * be removed upon the first replay entry of an inode with link count 0 + * is found. + */ +static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino) +{ + struct replay_entry *r; + + ubifs_assert(c, rino->deletion); + ubifs_assert(c, key_type(c, &rino->key) == UBIFS_INO_KEY); + + /* + * Find the most recent entry for the inode behind @rino and check + * whether it is a deletion. + */ + list_for_each_entry_reverse(r, &c->replay_list, list) { + ubifs_assert(c, r->sqnum >= rino->sqnum); + if (key_inum(c, &r->key) == key_inum(c, &rino->key)) + return r->deletion == 0; + + } + + ubifs_assert(c, 0); + return false; +} + /** * apply_replay_entry - apply a replay entry to the TNC. * @c: UBIFS file-system description object @@ -236,6 +268,11 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { ino_t inum = key_inum(c, &r->key); + if (inode_still_linked(c, r)) { + err = 0; + break; + } + err = ubifs_tnc_remove_ino(c, inum); break; } -- cgit v1.2.3 From 6bb41321166fe7db834fd7137b596d4312e38273 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 13 Dec 2018 15:20:52 -0800 Subject: proc/sysctl: don't return ENOMEM on lookup when a table is unregistering commit ea5751ccd665a2fd1b24f9af81f6167f0718c5f6 upstream. proc_sys_lookup can fail with ENOMEM instead of ENOENT when the corresponding sysctl table is being unregistered. In our case we see this upon opening /proc/sys/net/*/conf files while network interfaces are being deleted, which confuses our configuration daemon. The problem was successfully reproduced and this fix tested on v4.9.122 and v4.20-rc6. v2: return ERR_PTRs in all cases when proc_sys_make_inode fails instead of mixing them with NULL. Thanks Al Viro for the feedback. Fixes: ace0c791e6c3 ("proc/sysctl: Don't grab i_lock under sysctl_lock.") Cc: stable@vger.kernel.org Signed-off-by: Ivan Delalande Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/proc/proc_sysctl.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 89921a0d2ebb..4d598a399bbf 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -464,7 +464,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode = new_inode(sb); if (!inode) - goto out; + return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); @@ -474,8 +474,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (unlikely(head->unregistering)) { spin_unlock(&sysctl_lock); iput(inode); - inode = NULL; - goto out; + return ERR_PTR(-ENOENT); } ei->sysctl = head; ei->sysctl_entry = table; @@ -500,7 +499,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, if (root->set_ownership) root->set_ownership(head, table, &inode->i_uid, &inode->i_gid); -out: return inode; } @@ -549,10 +547,11 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, goto out; } - err = ERR_PTR(-ENOMEM); inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); - if (!inode) + if (IS_ERR(inode)) { + err = ERR_CAST(inode); goto out; + } d_set_d_op(dentry, &proc_sys_dentry_operations); err = d_splice_alias(inode, dentry); @@ -685,7 +684,7 @@ static bool proc_sys_fill_cache(struct file *file, if (d_in_lookup(child)) { struct dentry *res; inode = proc_sys_make_inode(dir->d_sb, head, table); - if (!inode) { + if (IS_ERR(inode)) { d_lookup_done(child); dput(child); return false; -- cgit v1.2.3 From b878c8a7f08f0c225b6a46ba1ac867e9c5d17807 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 25 Nov 2018 17:20:31 -0500 Subject: ext4: add ext4_sb_bread() to disambiguate ENOMEM cases commit fb265c9cb49e2074ddcdd4de99728aefdd3b3592 upstream. Today, when sb_bread() returns NULL, this can either be because of an I/O error or because the system failed to allocate the buffer. Since it's an old interface, changing would require changing many call sites. So instead we create our own ext4_sb_bread(), which also allows us to set the REQ_META flag. Also fixed a problem in the xattr code where a NULL return in a function could also mean that the xattr was not found, which could lead to the wrong error getting returned to userspace. Fixes: ac27a0ec112a ("ext4: initial copy of files from ext3") Cc: stable@kernel.org # 2.6.19 Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 2 ++ fs/ext4/migrate.c | 36 +++++++++++++------------- fs/ext4/resize.c | 72 ++++++++++++++++++++++++++-------------------------- fs/ext4/super.c | 23 +++++++++++++++++ fs/ext4/xattr.c | 76 ++++++++++++++++++++++++++----------------------------- 5 files changed, 115 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5cfb1e2f6a5b..05eddfbd3404 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2542,6 +2542,8 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern struct buffer_head *ext4_sb_bread(struct super_block *sb, + sector_t block, int op_flags); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 61a9d1927817..a98bfca9c463 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -116,9 +116,9 @@ static int update_ind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -145,9 +145,9 @@ static int update_dind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -175,9 +175,9 @@ static int update_tind_extent_range(handle_t *handle, struct inode *inode, int i, retval = 0; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, pblock); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, pblock, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); i_data = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -224,9 +224,9 @@ static int free_dind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -254,9 +254,9 @@ static int free_tind_blocks(handle_t *handle, struct buffer_head *bh; unsigned long max_entries = inode->i_sb->s_blocksize >> 2; - bh = sb_bread(inode->i_sb, le32_to_cpu(i_data)); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, le32_to_cpu(i_data), 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); tmp_idata = (__le32 *)bh->b_data; for (i = 0; i < max_entries; i++) { @@ -382,9 +382,9 @@ static int free_ext_idx(handle_t *handle, struct inode *inode, struct ext4_extent_header *eh; block = ext4_idx_pblock(ix); - bh = sb_bread(inode->i_sb, block); - if (!bh) - return -EIO; + bh = ext4_sb_bread(inode->i_sb, block, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); eh = (struct ext4_extent_header *)bh->b_data; if (eh->eh_depth != 0) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a5efee34415f..87350b642681 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -127,10 +127,12 @@ static int verify_group_input(struct super_block *sb, else if (free_blocks_count < 0) ext4_warning(sb, "Bad blocks count %u", input->blocks_count); - else if (!(bh = sb_bread(sb, end - 1))) + else if (IS_ERR(bh = ext4_sb_bread(sb, end - 1, 0))) { + err = PTR_ERR(bh); + bh = NULL; ext4_warning(sb, "Cannot read last block (%llu)", end - 1); - else if (outside(input->block_bitmap, start, end)) + } else if (outside(input->block_bitmap, start, end)) ext4_warning(sb, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) @@ -781,11 +783,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, struct ext4_super_block *es = EXT4_SB(sb)->s_es; unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; - struct buffer_head **o_group_desc, **n_group_desc; - struct buffer_head *dind; - struct buffer_head *gdb_bh; + struct buffer_head **o_group_desc, **n_group_desc = NULL; + struct buffer_head *dind = NULL; + struct buffer_head *gdb_bh = NULL; int gdbackups; - struct ext4_iloc iloc; + struct ext4_iloc iloc = { .bh = NULL }; __le32 *data; int err; @@ -794,21 +796,22 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); gdbackups = verify_reserved_gdb(sb, group, gdb_bh); if (gdbackups < 0) { err = gdbackups; - goto exit_bh; + goto errout; } data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; - goto exit_bh; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; + goto errout; } data = (__le32 *)dind->b_data; @@ -816,18 +819,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, ext4_warning(sb, "new group %u GDT block %llu not reserved", group, gdblock); err = -EINVAL; - goto exit_dind; + goto errout; } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) - goto exit_dind; + goto errout; BUFFER_TRACE(dind, "get_write_access"); err = ext4_journal_get_write_access(handle, dind); @@ -837,7 +840,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, /* ext4_reserve_inode_write() gets a reference on the iloc */ err = ext4_reserve_inode_write(handle, inode, &iloc); if (unlikely(err)) - goto exit_dind; + goto errout; n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), @@ -846,7 +849,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = -ENOMEM; ext4_warning(sb, "not enough memory for %lu groups", gdb_num + 1); - goto exit_inode; + goto errout; } /* @@ -862,7 +865,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, dind); if (unlikely(err)) { ext4_std_error(sb, err); - goto exit_inode; + goto errout; } inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> (9 - EXT4_SB(sb)->s_cluster_bits); @@ -871,8 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); - iloc.bh = NULL; - goto exit_inode; + goto errout; } brelse(dind); @@ -888,15 +890,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_super(handle, sb); if (err) ext4_std_error(sb, err); - return err; - -exit_inode: +errout: kvfree(n_group_desc); brelse(iloc.bh); -exit_dind: brelse(dind); -exit_bh: brelse(gdb_bh); ext4_debug("leaving with error %d\n", err); @@ -916,9 +914,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb, gdblock = ext4_meta_bg_first_block_no(sb, group) + ext4_bg_has_super(sb, group); - gdb_bh = sb_bread(sb, gdblock); - if (!gdb_bh) - return -EIO; + gdb_bh = ext4_sb_bread(sb, gdblock, 0); + if (IS_ERR(gdb_bh)) + return PTR_ERR(gdb_bh); n_group_desc = ext4_kvmalloc((gdb_num + 1) * sizeof(struct buffer_head *), GFP_NOFS); @@ -975,9 +973,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, return -ENOMEM; data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK; - dind = sb_bread(sb, le32_to_cpu(*data)); - if (!dind) { - err = -EIO; + dind = ext4_sb_bread(sb, le32_to_cpu(*data), 0); + if (IS_ERR(dind)) { + err = PTR_ERR(dind); + dind = NULL; goto exit_free; } @@ -996,9 +995,10 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, err = -EINVAL; goto exit_bh; } - primary[res] = sb_bread(sb, blk); - if (!primary[res]) { - err = -EIO; + primary[res] = ext4_sb_bread(sb, blk, 0); + if (IS_ERR(primary[res])) { + err = PTR_ERR(primary[res]); + primary[res] = NULL; goto exit_bh; } gdbackups = verify_reserved_gdb(sb, group, primary[res]); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a149df1c6a1..804ce1e5a1e7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -140,6 +140,29 @@ MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +/* + * This works like sb_bread() except it uses ERR_PTR for error + * returns. Currently with sb_bread it's impossible to distinguish + * between ENOMEM and EIO situations (since both result in a NULL + * return. + */ +struct buffer_head * +ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags) +{ + struct buffer_head *bh = sb_getblk(sb, block); + + if (bh == NULL) + return ERR_PTR(-ENOMEM); + if (buffer_uptodate(bh)) + return bh; + ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + put_bh(bh); + return ERR_PTR(-EIO); +} + static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 4380c8630539..52856d98afd3 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -522,14 +522,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); - error = -ENODATA; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); @@ -696,26 +695,23 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); - error = 0; if (!EXT4_I(inode)->i_file_acl) - goto cleanup; + return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) - goto cleanup; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); - error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); - + error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, + buffer_size); cleanup: brelse(bh); - return error; } @@ -830,9 +826,9 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - ret = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); goto out; } @@ -1825,16 +1821,15 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ - bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bs->bh) - goto cleanup; + bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bs->bh)) + return PTR_ERR(bs->bh); ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) - goto cleanup; + return error; /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); @@ -1843,13 +1838,10 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) - goto cleanup; + return error; bs->s.not_found = error; } - error = 0; - -cleanup: - return error; + return 0; } static int @@ -2278,9 +2270,9 @@ static struct buffer_head *ext4_xattr_get_block(struct inode *inode) if (!EXT4_I(inode)->i_file_acl) return NULL; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) - return ERR_PTR(-EIO); + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) + return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2750,10 +2742,11 @@ retry: if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - error = -EIO; - if (!bh) + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); goto cleanup; + } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); @@ -2907,11 +2900,12 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, } if (EXT4_I(inode)->i_file_acl) { - bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); - if (!bh) { - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); - error = -EIO; + bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + if (error == -EIO) + EXT4_ERROR_INODE(inode, "block %llu read error", + EXT4_I(inode)->i_file_acl); goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3064,8 +3058,10 @@ ext4_xattr_block_cache_find(struct inode *inode, while (ce) { struct buffer_head *bh; - bh = sb_bread(inode->i_sb, ce->e_value); - if (!bh) { + bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); + if (IS_ERR(bh)) { + if (PTR_ERR(bh) == -ENOMEM) + return NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { -- cgit v1.2.3 From 0a1c177dd90348342ccf918423d91aebb1e32bfb Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Mon, 3 Dec 2018 23:28:02 -0500 Subject: ext4: fix possible use after free in ext4_quota_enable commit 61157b24e60fb3cd1f85f2c76a7b1d628f970144 upstream. The function frees qf_inode via iput but then pass qf_inode to lockdep_set_quota_inode on the failure path. This may result in a use-after-free bug. The patch frees df_inode only when it is never used. Fixes: daf647d2dd5 ("ext4: add lockdep annotations for i_data_sem") Cc: stable@kernel.org # 4.6 Reviewed-by: Jan Kara Signed-off-by: Pan Bian Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 804ce1e5a1e7..98b60657e1dd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5712,9 +5712,9 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_enable(qf_inode, type, format_id, flags); - iput(qf_inode); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); + iput(qf_inode); return err; } -- cgit v1.2.3 From 0d078853b87a929b9e37055932a7c873647327cb Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Tue, 4 Dec 2018 00:06:53 -0500 Subject: ext4: missing unlock/put_page() in ext4_try_to_write_inline_data() commit 132d00becb31e88469334e1e62751c81345280e0 upstream. In case of error, ext4_try_to_write_inline_data() should unlock and release the page it holds. Fixes: f19d5870cbf7 ("ext4: add normal write support for inline data") Cc: stable@kernel.org # 3.8 Signed-off-by: Maurizio Lombardi Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9c4bac18cc6c..27373d88b5f0 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -705,8 +705,11 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (!PageUptodate(page)) { ret = ext4_read_inline_page(inode, page); - if (ret < 0) + if (ret < 0) { + unlock_page(page); + put_page(page); goto out_up_read; + } } ret = 1; -- cgit v1.2.3 From 11bb168baef2dc9f613e021749b6f22966ae0959 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?ruippan=20=28=E6=BD=98=E7=9D=BF=29?= Date: Tue, 4 Dec 2018 01:04:12 -0500 Subject: ext4: fix EXT4_IOC_GROUP_ADD ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e647e29196b7f802f8242c39ecb7cc937f5ef217 upstream. Commit e2b911c53584 ("ext4: clean up feature test macros with predicate functions") broke the EXT4_IOC_GROUP_ADD ioctl. This was not noticed since only very old versions of resize2fs (before e2fsprogs 1.42) use this ioctl. However, using a new kernel with an enterprise Linux userspace will cause attempts to use online resize to fail with "No reserved GDT blocks". Fixes: e2b911c53584 ("ext4: clean up feature test macros with predicate...") Cc: stable@kernel.org # v4.4 Signed-off-by: Theodore Ts'o Signed-off-by: ruippan (潘睿) Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 87350b642681..bc8ee0c498cc 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1631,7 +1631,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (ext4_has_feature_resize_inode(sb) || + if (!ext4_has_feature_resize_inode(sb) || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); -- cgit v1.2.3 From 6633fcb231a0864ca5c961a7664b137474ca3779 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Dec 2018 12:28:13 -0500 Subject: ext4: include terminating u32 in size of xattr entries when expanding inodes commit a805622a757b6d7f65def4141d29317d8e37b8a1 upstream. In ext4_expand_extra_isize_ea(), we calculate the total size of the xattr header, plus the xattr entries so we know how much of the beginning part of the xattrs to move when expanding the inode extra size. We need to include the terminating u32 at the end of the xattr entries, or else if there is uninitialized, non-zero bytes after the xattr entries and before the xattr values, the list of xattr entries won't be properly terminated. Reported-by: Steve Graham Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 52856d98afd3..8ebf0967424f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2725,7 +2725,7 @@ retry: base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; - total_ino = sizeof(struct ext4_xattr_ibody_header); + total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) -- cgit v1.2.3 From 263663888d2f00421887e251d93ebc52650a37ab Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Dec 2018 12:29:13 -0500 Subject: ext4: avoid declaring fs inconsistent due to invalid file handles commit 8a363970d1dc38c4ec4ad575c862f776f468d057 upstream. If we receive a file handle, either from NFS or open_by_handle_at(2), and it points at an inode which has not been initialized, and the file system has metadata checksums enabled, we shouldn't try to get the inode, discover the checksum is invalid, and then declare the file system as being inconsistent. This can be reproduced by creating a test file system via "mke2fs -t ext4 -O metadata_csum /tmp/foo.img 8M", mounting it, cd'ing into that directory, and then running the following program. #define _GNU_SOURCE #include struct handle { struct file_handle fh; unsigned char fid[MAX_HANDLE_SZ]; }; int main(int argc, char **argv) { struct handle h = {{8, 1 }, { 12, }}; open_by_handle_at(AT_FDCWD, &h.fh, O_RDONLY); return 0; } Google-Bug-Id: 120690101 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 15 +++++++++++++-- fs/ext4/ialloc.c | 2 +- fs/ext4/inode.c | 54 +++++++++++++++++++++++++++++++++++++----------------- fs/ext4/ioctl.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/ext4/resize.c | 5 +++-- fs/ext4/super.c | 19 +++++-------------- fs/ext4/xattr.c | 5 +++-- 8 files changed, 65 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 05eddfbd3404..032cf9b92665 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2459,8 +2459,19 @@ int do_journal_get_write_access(handle_t *handle, #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 -extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); +typedef enum { + EXT4_IGET_NORMAL = 0, + EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ + EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */ +} ext4_iget_flags; + +extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line); + +#define ext4_iget(sb, ino, flags) \ + __ext4_iget((sb), (ino), (flags), __func__, __LINE__) + extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2addcb8730e1..091a18a51c99 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1225,7 +1225,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) if (!ext4_test_bit(bit, bitmap_bh->b_data)) goto bad_orphan; - inode = ext4_iget(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(sb, "couldn't read orphan inode %lu (err %d)", diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 244531d3065a..bcf08801f935 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4786,7 +4786,9 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode) return inode_peek_iversion(inode); } -struct inode *ext4_iget(struct super_block *sb, unsigned long ino) +struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + ext4_iget_flags flags, const char *function, + unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; @@ -4800,6 +4802,18 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) gid_t i_gid; projid_t i_projid; + if (((flags & EXT4_IGET_NORMAL) && + (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + (ino < EXT4_ROOT_INO) || + (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + if (flags & EXT4_IGET_HANDLE) + return ERR_PTR(-ESTALE); + __ext4_error(sb, function, line, + "inode #%lu: comm %s: iget: illegal inode #", + ino, current->comm); + return ERR_PTR(-EFSCORRUPTED); + } + inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); @@ -4815,18 +4829,26 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) raw_inode = ext4_raw_inode(&iloc); if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { - EXT4_ERROR_INODE(inode, "root inode unallocated"); + ext4_error_inode(inode, function, line, 0, + "iget: root inode unallocated"); ret = -EFSCORRUPTED; goto bad_inode; } + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; + goto bad_inode; + } + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { - EXT4_ERROR_INODE(inode, - "bad extra_isize %u (inode size %u)", + ext4_error_inode(inode, function, line, 0, + "iget: bad extra_isize %u " + "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; @@ -4848,7 +4870,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { - EXT4_ERROR_INODE(inode, "checksum invalid"); + ext4_error_inode(inode, function, line, 0, + "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4905,7 +4928,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { - EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ext4_error_inode(inode, function, line, 0, + "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4981,7 +5005,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = 0; if (ei->i_file_acl && !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { - EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", + ext4_error_inode(inode, function, line, 0, + "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; @@ -5009,8 +5034,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - EXT4_ERROR_INODE(inode, - "immutable or append flags not allowed on symlinks"); + ext4_error_inode(inode, function, line, 0, + "iget: immutable or append flags " + "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } @@ -5040,7 +5066,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) make_bad_inode(inode); } else { ret = -EFSCORRUPTED; - EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); + ext4_error_inode(inode, function, line, 0, + "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } brelse(iloc.bh); @@ -5054,13 +5081,6 @@ bad_inode: return ERR_PTR(ret); } -struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) -{ - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-EFSCORRUPTED); - return ext4_iget(sb, ino); -} - static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0edee31913d1..d37dafa1d133 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -125,7 +125,7 @@ static long swap_inode_boot_loader(struct super_block *sb, !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) return -EPERM; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); + inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ffa25753e929..4f8de2b9e87e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1571,7 +1571,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EFSCORRUPTED); } - inode = ext4_iget_normal(dir->i_sb, ino); + inode = ext4_iget(dir->i_sb, ino, EXT4_IGET_NORMAL); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1613,7 +1613,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EFSCORRUPTED); } - return d_obtain_alias(ext4_iget_normal(child->d_sb, ino)); + return d_obtain_alias(ext4_iget(child->d_sb, ino, EXT4_IGET_NORMAL)); } /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bc8ee0c498cc..48421de803b7 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1637,7 +1637,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) "No reserved GDT blocks, can't resize"); return -EPERM; } - inode = ext4_iget(sb, EXT4_RESIZE_INO); + inode = ext4_iget(sb, EXT4_RESIZE_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(inode); @@ -1965,7 +1965,8 @@ retry: } if (!resize_inode) - resize_inode = ext4_iget(sb, EXT4_RESIZE_INO); + resize_inode = ext4_iget(sb, EXT4_RESIZE_INO, + EXT4_IGET_SPECIAL); if (IS_ERR(resize_inode)) { ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 98b60657e1dd..2908a212a7a4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1173,20 +1173,11 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, { struct inode *inode; - if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-ESTALE); - if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) - return ERR_PTR(-ESTALE); - - /* iget isn't really right if the inode is currently unallocated!! - * - * ext4_read_inode will return a bad_inode if the inode had been - * deleted, so we should be safe. - * + /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget_normal(sb, ino); + inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { @@ -4350,7 +4341,7 @@ no_journal: * so we can safely mount the rest of the filesystem now. */ - root = ext4_iget(sb, EXT4_ROOT_INO); + root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); @@ -4620,7 +4611,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ - journal_inode = ext4_iget(sb, journal_inum); + journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; @@ -5702,7 +5693,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, if (!qf_inums[type]) return -EPERM; - qf_inode = ext4_iget(sb, qf_inums[type]); + qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]); return PTR_ERR(qf_inode); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8ebf0967424f..c0ba5206cd9d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -384,7 +384,7 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, struct inode *inode; int err; - inode = ext4_iget(parent->i_sb, ea_ino); + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, @@ -1486,7 +1486,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, } while (ce) { - ea_inode = ext4_iget(inode->i_sb, ce->e_value); + ea_inode = ext4_iget(inode->i_sb, ce->e_value, + EXT4_IGET_NORMAL); if (!IS_ERR(ea_inode) && !is_bad_inode(ea_inode) && (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) && -- cgit v1.2.3 From bf2fd1f970405aeae0dc7cbf42d1ba82e7bf6b70 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Dec 2018 14:07:58 -0500 Subject: ext4: force inode writes when nfsd calls commit_metadata() commit fde872682e175743e0c3ef939c89e3c6008a1529 upstream. Some time back, nfsd switched from calling vfs_fsync() to using a new commit_metadata() hook in export_operations(). If the file system did not provide a commit_metadata() hook, it fell back to using sync_inode_metadata(). Unfortunately doesn't work on all file systems. In particular, it doesn't work on ext4 due to how the inode gets journalled --- the VFS writeback code will not always call ext4_write_inode(). So we need to provide our own ext4_nfs_commit_metdata() method which calls ext4_write_inode() directly. Google-Bug-Id: 121195940 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2908a212a7a4..ee0f30852835 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1202,6 +1202,16 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +static int ext4_nfs_commit_metadata(struct inode *inode) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL + }; + + trace_ext4_nfs_commit_metadata(inode); + return ext4_write_inode(inode, &wbc); +} + /* * Try to release metadata pages (indirect blocks, directories) which are * mapped via the block device. Since these pages could have journal heads @@ -1406,6 +1416,7 @@ static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, + .commit_metadata = ext4_nfs_commit_metadata, }; enum { -- cgit v1.2.3 From 0cb4f6559087e99c14f129bc1332c40e84fd0559 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Dec 2018 14:36:58 -0500 Subject: ext4: check for shutdown and r/o file system in ext4_write_inode() commit 18f2c4fcebf2582f96cbd5f2238f4f354a0e4847 upstream. If the file system has been shut down or is read-only, then ext4_write_inode() needs to bail out early. Also use jbd2_complete_transaction() instead of ext4_force_commit() so we only force a commit if it is needed. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bcf08801f935..36abbdafb26e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5369,9 +5369,13 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || + sb_rdonly(inode->i_sb)) return 0; + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); @@ -5387,7 +5391,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; - err = ext4_force_commit(inode->i_sb); + err = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; -- cgit v1.2.3 From 28867a52e48d989a4c0f8476223a5d76918041f5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sun, 11 Nov 2018 22:22:17 +0800 Subject: btrfs: dev-replace: go back to suspended state if target device is missing commit 0d228ece59a35a9b9e8ff0d40653234a6d90f61e upstream. At the time of forced unmount we place the running replace to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state, so when the system comes back and expect the target device is missing. Then let the replace state continue to be in BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state instead of BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED as there isn't any matching scrub running as part of replace. Fixes: e93c89c1aaaa ("Btrfs: add new sources for device replace code") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/dev-replace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 981434764bb9..8eb9307b6394 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -887,6 +887,8 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) "cannot continue dev_replace, tgtdev is missing"); btrfs_info(fs_info, "you may cancel the operation after 'mount -o degraded'"); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; btrfs_dev_replace_write_unlock(dev_replace); return 0; } -- cgit v1.2.3 From c1f90eb019714c51f191f2c1abbfda7ebfaf7c35 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sun, 11 Nov 2018 22:22:18 +0800 Subject: btrfs: dev-replace: go back to suspend state if another EXCL_OP is running commit 05c49e6bc1e8866ecfd674ebeeb58cdbff9145c2 upstream. In a secnario where balance and replace co-exists as below, - start balance - pause balance - start replace - reboot and when system restarts, balance resumes first. Then the replace is attempted to restart but will fail as the EXCL_OP lock is already held by the balance. If so place the replace state back to BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED state. Fixes: 010a47bde9420 ("btrfs: add proper safety check before resuming dev-replace") CC: stable@vger.kernel.org # 4.18+ Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/dev-replace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 8eb9307b6394..b2b283e48439 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -900,6 +900,10 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * dev-replace to start anyway. */ if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + btrfs_dev_replace_write_lock(dev_replace); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; + btrfs_dev_replace_write_unlock(dev_replace); btrfs_info(fs_info, "cannot resume dev-replace, other exclusive operation running"); return 0; -- cgit v1.2.3 From 7708a83090bab6e6c7348b0dae4f47681921ca1c Mon Sep 17 00:00:00 2001 From: Lu Fengqi Date: Thu, 29 Nov 2018 17:31:32 +0800 Subject: btrfs: skip file_extent generation check for free_space_inode in run_delalloc_nocow commit 27a7ff554e8d349627a90bda275c527b7348adae upstream. The test case btrfs/001 with inode_cache mount option will encounter the following warning: WARNING: CPU: 1 PID: 23700 at fs/btrfs/inode.c:956 cow_file_range.isra.19+0x32b/0x430 [btrfs] CPU: 1 PID: 23700 Comm: btrfs Kdump: loaded Tainted: G W O 4.20.0-rc4-custom+ #30 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:cow_file_range.isra.19+0x32b/0x430 [btrfs] Call Trace: ? free_extent_buffer+0x46/0x90 [btrfs] run_delalloc_nocow+0x455/0x900 [btrfs] btrfs_run_delalloc_range+0x1a7/0x360 [btrfs] writepage_delalloc+0xf9/0x150 [btrfs] __extent_writepage+0x125/0x3e0 [btrfs] extent_write_cache_pages+0x1b6/0x3e0 [btrfs] ? __wake_up_common_lock+0x63/0xc0 extent_writepages+0x50/0x80 [btrfs] do_writepages+0x41/0xd0 ? __filemap_fdatawrite_range+0x9e/0xf0 __filemap_fdatawrite_range+0xbe/0xf0 btrfs_fdatawrite_range+0x1b/0x50 [btrfs] __btrfs_write_out_cache+0x42c/0x480 [btrfs] btrfs_write_out_ino_cache+0x84/0xd0 [btrfs] btrfs_save_ino_cache+0x551/0x660 [btrfs] commit_fs_roots+0xc5/0x190 [btrfs] btrfs_commit_transaction+0x2bf/0x8d0 [btrfs] btrfs_mksubvol+0x48d/0x4d0 [btrfs] btrfs_ioctl_snap_create_transid+0x170/0x180 [btrfs] btrfs_ioctl_snap_create_v2+0x124/0x180 [btrfs] btrfs_ioctl+0x123f/0x3030 [btrfs] The file extent generation of the free space inode is equal to the last snapshot of the file root, so the inode will be passed to cow_file_rage. But the inode was created and its extents were preallocated in btrfs_save_ino_cache, there are no cow copies on disk. The preallocated extent is not yet in the extent tree, and btrfs_cross_ref_exist will ignore the -ENOENT returned by check_committed_ref, so we can directly write the inode to the disk. Fixes: 78d4295b1eee ("btrfs: lift some btrfs_cross_ref_exist checks in nocow path") CC: stable@vger.kernel.org # 4.18+ Reviewed-by: Filipe Manana Signed-off-by: Lu Fengqi Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7158b5b77c9d..8baef9e9160e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1373,7 +1373,8 @@ next_slot: * Do the same check as in btrfs_cross_ref_exist but * without the unnecessary search. */ - if (btrfs_file_extent_generation(leaf, fi) <= + if (!nolock && + btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out_check; if (extent_type == BTRFS_FILE_EXTENT_REG && !force) -- cgit v1.2.3 From 10b04210aabf4488930fac768994ff7d1d359ac3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 28 Nov 2018 14:54:28 +0000 Subject: Btrfs: fix fsync of files with multiple hard links in new directories commit 41bd60676923822de1df2c50b3f9a10171f4338a upstream. The log tree has a long standing problem that when a file is fsync'ed we only check for new ancestors, created in the current transaction, by following only the hard link for which the fsync was issued. We follow the ancestors using the VFS' dget_parent() API. This means that if we create a new link for a file in a directory that is new (or in an any other new ancestor directory) and then fsync the file using an old hard link, we end up not logging the new ancestor, and on log replay that new hard link and ancestor do not exist. In some cases, involving renames, the file will not exist at all. Example: mkfs.btrfs -f /dev/sdb mount /dev/sdb /mnt mkdir /mnt/A touch /mnt/foo ln /mnt/foo /mnt/A/bar xfs_io -c fsync /mnt/foo In this example after log replay only the hard link named 'foo' exists and directory A does not exist, which is unexpected. In other major linux filesystems, such as ext4, xfs and f2fs for example, both hard links exist and so does directory A after mounting again the filesystem. Checking if any new ancestors are new and need to be logged was added in 2009 by commit 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes"), however only for the ancestors of the hard link (dentry) for which the fsync was issued, instead of checking for all ancestors for all of the inode's hard links. So fix this by tracking the id of the last transaction where a hard link was created for an inode and then on fsync fallback to a full transaction commit when an inode has more than one hard link and at least one new hard link was created in the current transaction. This is the simplest solution since this is not a common use case (adding frequently hard links for which there's an ancestor created in the current transaction and then fsync the file). In case it ever becomes a common use case, a solution that consists of iterating the fs/subvol btree for each hard link and check if any ancestor is new, could be implemented. This solves many unexpected scenarios reported by Jayashree Mohan and Vijay Chidambaram, and for which there is a new test case for fstests under review. Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes") CC: stable@vger.kernel.org # 4.4+ Reported-by: Vijay Chidambaram Reported-by: Jayashree Mohan Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/btrfs_inode.h | 6 ++++++ fs/btrfs/inode.c | 17 +++++++++++++++++ fs/btrfs/tree-log.c | 16 ++++++++++++++++ 3 files changed, 39 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 1343ac57b438..7177d1d33584 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -146,6 +146,12 @@ struct btrfs_inode { */ u64 last_unlink_trans; + /* + * Track the transaction id of the last transaction used to create a + * hard link for the inode. This is used by the log tree (fsync). + */ + u64 last_link_trans; + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8baef9e9160e..14c85e61134d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3689,6 +3689,21 @@ cache_index: * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + /* + * Similar reasoning for last_link_trans, needs to be set otherwise + * for a case like the following: + * + * mkdir A + * touch foo + * ln foo A/bar + * echo 2 > /proc/sys/vm/drop_caches + * fsync foo + * + * + * Would result in link bar and directory A not existing after the power + * failure. + */ + BTRFS_I(inode)->last_link_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || @@ -6647,6 +6662,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; } + BTRFS_I(inode)->last_link_trans = trans->transid; d_instantiate(dentry, inode); ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, true, NULL); @@ -9175,6 +9191,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; + ei->last_link_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 16ecb76fa53c..0805f8c5e72d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5781,6 +5781,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + /* + * If a new hard link was added to the inode in the current transaction + * and its link count is now greater than 1, we need to fallback to a + * transaction commit, otherwise we can end up not logging all its new + * parents for all the hard links. Here just from the dentry used to + * fsync, we can not visit the ancestor inodes for all the other hard + * links to figure out if any is new, so we fallback to a transaction + * commit (instead of adding a lot of complexity of scanning a btree, + * since this scenario is not a common use case). + */ + if (inode->vfs_inode.i_nlink > 1 && + inode->last_link_trans > last_committed) { + ret = -EMLINK; + goto end_trans; + } + while (1) { if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; -- cgit v1.2.3 From 6911b074a0055f9b350d85fde530f61abbef2171 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Nov 2018 11:52:14 -0500 Subject: btrfs: run delayed items before dropping the snapshot commit 0568e82dbe2510fc1fa664f58e5c997d3f1e649e upstream. With my delayed refs patches in place we started seeing a large amount of aborts in __btrfs_free_extent: BTRFS error (device sdb1): unable to find ref byte nr 91947008 parent 0 root 35964 owner 1 offset 0 Call Trace: ? btrfs_merge_delayed_refs+0xaf/0x340 __btrfs_run_delayed_refs+0x6ea/0xfc0 ? btrfs_set_path_blocking+0x31/0x60 btrfs_run_delayed_refs+0xeb/0x180 btrfs_commit_transaction+0x179/0x7f0 ? btrfs_check_space_for_delayed_refs+0x30/0x50 ? should_end_transaction.isra.19+0xe/0x40 btrfs_drop_snapshot+0x41c/0x7c0 btrfs_clean_one_deleted_snapshot+0xb5/0xd0 cleaner_kthread+0xf6/0x120 kthread+0xf8/0x130 ? btree_invalidatepage+0x90/0x90 ? kthread_bind+0x10/0x10 ret_from_fork+0x35/0x40 This was because btrfs_drop_snapshot depends on the root not being modified while it's dropping the snapshot. It will unlock the root node (and really every node) as it walks down the tree, only to re-lock it when it needs to do something. This is a problem because if we modify the tree we could cow a block in our path, which frees our reference to that block. Then once we get back to that shared block we'll free our reference to it again, and get ENOENT when trying to lookup our extent reference to that block in __btrfs_free_extent. This is ultimately happening because we have delayed items left to be processed for our deleted snapshot _after_ all of the inodes are closed for the snapshot. We only run the delayed inode item if we're deleting the inode, and even then we do not run the delayed insertions or delayed removals. These can be run at any point after our final inode does its last iput, which is what triggers the snapshot deletion. We can end up with the snapshot deletion happening and then have the delayed items run on that file system, resulting in the above problem. This problem has existed forever, however my patches made it much easier to hit as I wake up the cleaner much more often to deal with delayed iputs, which made us more likely to start the snapshot dropping work before the transaction commits, which is when the delayed items would generally be run. Before, generally speaking, we would run the delayed items, commit the transaction, and wakeup the cleaner thread to start deleting snapshots, which means we were less likely to hit this problem. You could still hit it if you had multiple snapshots to be deleted and ended up with lots of delayed items, but it was definitely harder. Fix for now by simply running all the delayed items before starting to drop the snapshot. We could make this smarter in the future by making the delayed items per-root, and then simply drop any delayed items for roots that we are going to delete. But for now just a quick and easy solution is the safest. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 51e41e53d4ae..a16760b410b1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8911,6 +8911,10 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_free; } + err = btrfs_run_delayed_items(trans); + if (err) + goto out_end_trans; + if (block_rsv) trans->block_rsv = block_rsv; -- cgit v1.2.3 From 9eec74b48477df26e65994115eecf7380374c49b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Dec 2018 10:19:45 +0000 Subject: Btrfs: send, fix race with transaction commits that create snapshots commit be6821f82c3cc36e026f5afd10249988852b35ea upstream. If we create a snapshot of a snapshot currently being used by a send operation, we can end up with send failing unexpectedly (returning -ENOENT error to user space for example). The following diagram shows how this happens. CPU 1 CPU2 CPU3 btrfs_ioctl_send() (...) create_snapshot() -> creates snapshot of a root used by the send task btrfs_commit_transaction() create_pending_snapshot() __get_inode_info() btrfs_search_slot() btrfs_search_slot_get_root() down_read commit_root_sem get reference on eb of the commit root -> eb with bytenr == X up_read commit_root_sem btrfs_cow_block(root node) btrfs_free_tree_block() -> creates delayed ref to free the extent btrfs_run_delayed_refs() -> runs the delayed ref, adds extent to fs_info->pinned_extents btrfs_finish_extent_commit() unpin_extent_range() -> marks extent as free in the free space cache transaction commit finishes btrfs_start_transaction() (...) btrfs_cow_block() btrfs_alloc_tree_block() btrfs_reserve_extent() -> allocates extent at bytenr == X btrfs_init_new_buffer(bytenr X) btrfs_find_create_tree_block() alloc_extent_buffer(bytenr X) find_extent_buffer(bytenr X) -> returns existing eb, which the send task got (...) -> modifies content of the eb with bytenr == X -> uses an eb that now belongs to some other tree and no more matches the commit root of the snapshot, resuts will be unpredictable The consequences of this race can be various, and can lead to searches in the commit root performed by the send task failing unexpectedly (unable to find inode items, returning -ENOENT to user space, for example) or not failing because an inode item with the same number was added to the tree that reused the metadata extent, in which case send can behave incorrectly in the worst case or just fail later for some reason. Fix this by performing a copy of the commit root's extent buffer when doing a search in the context of a send operation. CC: stable@vger.kernel.org # 4.4.x: 1fc28d8e2e9: Btrfs: move get root out of btrfs_search_slot to a helper CC: stable@vger.kernel.org # 4.4.x: f9ddfd0592a: Btrfs: remove unused check of skip_locking CC: stable@vger.kernel.org # 4.4.x Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 089b46c4d97f..fa18520529f3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2624,14 +2624,27 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, root_lock = BTRFS_READ_LOCK; if (p->search_commit_root) { - /* The commit roots are read only so we always do read locks */ - if (p->need_commit_sem) + /* + * The commit roots are read only so we always do read locks, + * and we always must hold the commit_root_sem when doing + * searches on them, the only exception is send where we don't + * want to block transaction commits for a long time, so + * we need to clone the commit root in order to avoid races + * with transaction commits that create a snapshot of one of + * the roots used by a send operation. + */ + if (p->need_commit_sem) { down_read(&fs_info->commit_root_sem); - b = root->commit_root; - extent_buffer_get(b); - level = btrfs_header_level(b); - if (p->need_commit_sem) + b = btrfs_clone_extent_buffer(root->commit_root); up_read(&fs_info->commit_root_sem); + if (!b) + return ERR_PTR(-ENOMEM); + + } else { + b = root->commit_root; + extent_buffer_get(b); + } + level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when * p->search_commit_root = 1. @@ -2757,6 +2770,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto done; + } while (b) { level = btrfs_header_level(b); -- cgit v1.2.3 From c555772c2a4e1042a12ea27413e93a89d01b04b7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 5 Jan 2019 11:45:08 -0800 Subject: dax: Don't access a freed inode commit 55e56f06ed71d9441f3abd5b1d3c1a870812b3fe upstream. After we drop the i_pages lock, the inode can be freed at any time. The get_unlocked_entry() code has no choice but to reacquire the lock, so it can't be used here. Create a new wait_entry_unlocked() which takes care not to acquire the lock or dereference the address_space in any way. Fixes: c2a7d2a11552 ("filesystem-dax: Introduce dax_lock_mapping_entry()") Cc: Signed-off-by: Matthew Wilcox Reviewed-by: Jan Kara Signed-off-by: Dan Williams Signed-off-by: Sasha Levin --- fs/dax.c | 69 ++++++++++++++++++++++++++++++---------------------------------- 1 file changed, 32 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 3a2682a6c832..415605fafaeb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -229,8 +229,8 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, * * Must be called with the i_pages lock held. */ -static void *__get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp, bool (*wait_fn)(void)) +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) { void *entry, **slot; struct wait_exceptional_entry_queue ewait; @@ -240,8 +240,6 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, ewait.wait.func = wake_exceptional_entry_func; for (;;) { - bool revalidate; - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || @@ -256,30 +254,39 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xa_unlock_irq(&mapping->i_pages); - revalidate = wait_fn(); + schedule(); finish_wait(wq, &ewait.wait); xa_lock_irq(&mapping->i_pages); - if (revalidate) { - put_unlocked_mapping_entry(mapping, index, entry); - return ERR_PTR(-EAGAIN); - } } } -static bool entry_wait(void) +/* + * The only thing keeping the address space around is the i_pages lock + * (it's cycled in clear_inode() after removing the entries from i_pages) + * After we call xas_unlock_irq(), we cannot touch xas->xa. + */ +static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index, + void ***slotp, void *entry) { + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq; + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); + prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); + xa_unlock_irq(&mapping->i_pages); schedule(); + finish_wait(wq, &ewait.wait); + /* - * Never return an ERR_PTR() from - * __get_unlocked_mapping_entry(), just keep looping. + * Entry lock waits are exclusive. Wake up the next waiter since + * we aren't sure we will acquire the entry lock and thus wake + * the next waiter up on unlock. */ - return false; -} - -static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) -{ - return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &ewait.key); } static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) @@ -398,19 +405,6 @@ static struct page *dax_busy_page(void *entry) return NULL; } -static bool entry_wait_revalidate(void) -{ - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - - /* - * Tell __get_unlocked_mapping_entry() to take a break, we need - * to revalidate page->mapping after dropping locks - */ - return true; -} - bool dax_lock_mapping_entry(struct page *page) { pgoff_t index; @@ -446,14 +440,15 @@ bool dax_lock_mapping_entry(struct page *page) } index = page->index; - entry = __get_unlocked_mapping_entry(mapping, index, &slot, - entry_wait_revalidate); + entry = __radix_tree_lookup(&mapping->i_pages, index, + NULL, &slot); if (!entry) { xa_unlock_irq(&mapping->i_pages); break; - } else if (IS_ERR(entry)) { - xa_unlock_irq(&mapping->i_pages); - WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); + } else if (slot_locked(mapping, slot)) { + rcu_read_unlock(); + wait_entry_unlocked(mapping, index, &slot, entry); + rcu_read_lock(); continue; } lock_slot(mapping, slot); -- cgit v1.2.3 From 9621ea6b9c4c8268d715785759678b401ff36509 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 5 Jan 2019 11:45:13 -0800 Subject: dax: Use non-exclusive wait in wait_entry_unlocked() commit d8a706414af4827fc0b4b1c0c631c607351938b9 upstream. get_unlocked_entry() uses an exclusive wait because it is guaranteed to eventually obtain the lock and follow on with an unlock+wakeup cycle. The wait_entry_unlocked() path does not have the same guarantee. Rather than open-code an extra wakeup, just switch to a non-exclusive wait. Cc: Matthew Wilcox Reported-by: Linus Torvalds Reviewed-by: Jan Kara Signed-off-by: Dan Williams Signed-off-by: Sasha Levin --- fs/dax.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 415605fafaeb..09fa70683c41 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -275,18 +275,16 @@ static void wait_entry_unlocked(struct address_space *mapping, pgoff_t index, ewait.wait.func = wake_exceptional_entry_func; wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); - prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); + /* + * Unlike get_unlocked_entry() there is no guarantee that this + * path ever successfully retrieves an unlocked entry before an + * inode dies. Perform a non-exclusive wait in case this path + * never successfully performs its own wake up. + */ + prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xa_unlock_irq(&mapping->i_pages); schedule(); finish_wait(wq, &ewait.wait); - - /* - * Entry lock waits are exclusive. Wake up the next waiter since - * we aren't sure we will acquire the entry lock and thus wake - * the next waiter up on unlock. - */ - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &ewait.key); } static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -- cgit v1.2.3 From ce5b0057f7687fab11c23b586b94766b1fc40e96 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 22 Nov 2018 18:58:46 +0800 Subject: f2fs: read page index before freeing commit 0ea295dd853e0879a9a30ab61f923c26be35b902 upstream. The function truncate_node frees the page with f2fs_put_page. However, the page index is read after that. So, the patch reads the index before freeing the page. Fixes: bf39c00a9a7f ("f2fs: drop obsolete node page when it is truncated") Cc: Signed-off-by: Pan Bian Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 42ea42acb487..19a0d83aae65 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -827,6 +827,7 @@ static int truncate_node(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; int err; + pgoff_t index; err = f2fs_get_node_info(sbi, dn->nid, &ni); if (err) @@ -846,10 +847,11 @@ static int truncate_node(struct dnode_of_data *dn) clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); + index = dn->node_page->index; f2fs_put_page(dn->node_page, 1); invalidate_mapping_pages(NODE_MAPPING(sbi), - dn->node_page->index, dn->node_page->index); + index, index); dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); -- cgit v1.2.3 From 58d7ab7163d9119a05c576506692a4d90ca22f65 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 22 Dec 2018 11:22:26 +0100 Subject: f2fs: fix validation of the block count in sanity_check_raw_super commit 88960068f25fcc3759455d85460234dcc9d43fef upstream. Treat "block_count" from struct f2fs_super_block as 64-bit little endian value in sanity_check_raw_super() because struct f2fs_super_block declares "block_count" as "__le64". This fixes a bug where the superblock validation fails on big endian devices with the following error: F2FS-fs (sda1): Wrong segment_count / block_count (61439 > 0) F2FS-fs (sda1): Can't find valid F2FS filesystem in 1th superblock F2FS-fs (sda1): Wrong segment_count / block_count (61439 > 0) F2FS-fs (sda1): Can't find valid F2FS filesystem in 2th superblock As result of this the partition cannot be mounted. With this patch applied the superblock validation works fine and the partition can be mounted again: F2FS-fs (sda1): Mounted with checkpoint version = 7c84 My little endian x86-64 hardware was able to mount the partition without this fix. To confirm that mounting f2fs filesystems works on big endian machines again I tested this on a 32-bit MIPS big endian (lantiq) device. Fixes: 0cfe75c5b01199 ("f2fs: enhance sanity_check_raw_super() to avoid potential overflows") Cc: stable@vger.kernel.org Signed-off-by: Martin Blumenstingl Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 287c9fe9fff9..338138b34993 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2267,10 +2267,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (segment_count > (le32_to_cpu(raw_super->block_count) >> 9)) { + if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) { f2fs_msg(sb, KERN_INFO, - "Wrong segment_count / block_count (%u > %u)", - segment_count, le32_to_cpu(raw_super->block_count)); + "Wrong segment_count / block_count (%u > %llu)", + segment_count, le64_to_cpu(raw_super->block_count)); return 1; } -- cgit v1.2.3 From 5036fcd9b14516f62efae6ed0c42dfbb9798b643 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 26 Dec 2018 19:54:07 -0800 Subject: f2fs: sanity check of xattr entry size commit 64beba0558fce7b59e9a8a7afd77290e82a22163 upstream. There is a security report where f2fs_getxattr() has a hole to expose wrong memory region when the image is malformed like this. f2fs_getxattr: entry->e_name_len: 4, size: 12288, buffer_size: 16384, len: 4 Cc: Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/xattr.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 77a010e625f5..087e53a2d96c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -291,7 +291,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, - void **base_addr) + void **base_addr, int *base_size) { void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; @@ -302,8 +302,8 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!size && !inline_size) return -ENODATA; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), - inline_size + size + XATTR_PADDING_SIZE, GFP_NOFS); + *base_size = inline_size + size + XATTR_PADDING_SIZE; + txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); if (!txattr_addr) return -ENOMEM; @@ -315,8 +315,10 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, *xe = __find_inline_xattr(inode, txattr_addr, &last_addr, index, len, name); - if (*xe) + if (*xe) { + *base_size = inline_size; goto check; + } } /* read from xattr node block */ @@ -477,6 +479,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, int error = 0; unsigned int size, len; void *base_addr = NULL; + int base_size; if (name == NULL) return -EINVAL; @@ -487,7 +490,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr); + &entry, &base_addr, &base_size); up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -501,6 +504,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (buffer) { char *pval = entry->e_name + entry->e_name_len; + + if (base_size - (pval - (char *)base_addr) < size) { + error = -ERANGE; + goto out; + } memcpy(buffer, pval, size); } error = size; -- cgit v1.2.3 From 1827d1c439bc7e0e7e1341aabd7e0ca6b6e194f6 Mon Sep 17 00:00:00 2001 From: Georgy A Bystrenin Date: Fri, 21 Dec 2018 00:11:42 -0600 Subject: CIFS: Fix error mapping for SMB2_LOCK command which caused OFD lock problem commit 9a596f5b39593414c0ec80f71b94a226286f084e upstream. While resolving a bug with locks on samba shares found a strange behavior. When a file locked by one node and we trying to lock it from another node it fail with errno 5 (EIO) but in that case errno must be set to (EACCES | EAGAIN). This isn't happening when we try to lock file second time on same node. In this case it returns EACCES as expected. Also this issue not reproduces when we use SMB1 protocol (vers=1.0 in mount options). Further investigation showed that the mapping from status_to_posix_error is different for SMB1 and SMB2+ implementations. For SMB1 mapping is [NT_STATUS_LOCK_NOT_GRANTED to ERRlock] (See fs/cifs/netmisc.c line 66) but for SMB2+ mapping is [STATUS_LOCK_NOT_GRANTED to -EIO] (see fs/cifs/smb2maperror.c line 383) Quick changes in SMB2+ mapping from EIO to EACCES has fixed issue. BUG: https://bugzilla.kernel.org/show_bug.cgi?id=201971 Signed-off-by: Georgy A Bystrenin Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2maperror.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 20a2d304c603..c3ae8c1d6089 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -379,8 +379,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_NONEXISTENT_EA_ENTRY, -EIO, "STATUS_NONEXISTENT_EA_ENTRY"}, {STATUS_NO_EAS_ON_FILE, -ENODATA, "STATUS_NO_EAS_ON_FILE"}, {STATUS_EA_CORRUPT_ERROR, -EIO, "STATUS_EA_CORRUPT_ERROR"}, - {STATUS_FILE_LOCK_CONFLICT, -EIO, "STATUS_FILE_LOCK_CONFLICT"}, - {STATUS_LOCK_NOT_GRANTED, -EIO, "STATUS_LOCK_NOT_GRANTED"}, + {STATUS_FILE_LOCK_CONFLICT, -EACCES, "STATUS_FILE_LOCK_CONFLICT"}, + {STATUS_LOCK_NOT_GRANTED, -EACCES, "STATUS_LOCK_NOT_GRANTED"}, {STATUS_DELETE_PENDING, -ENOENT, "STATUS_DELETE_PENDING"}, {STATUS_CTL_FILE_NOT_SUPPORTED, -ENOSYS, "STATUS_CTL_FILE_NOT_SUPPORTED"}, -- cgit v1.2.3 From ba77e8c7f7044394c213e1221e630943c3379a6b Mon Sep 17 00:00:00 2001 From: Paul Aurich Date: Mon, 31 Dec 2018 14:13:34 -0800 Subject: smb3: fix large reads on encrypted connections commit 6d2f84eee098540ae857998fe32f29b9e2cd9613 upstream. When passing a large read to receive_encrypted_read(), ensure that the demultiplex_thread knows that a MID was processed. Without this, those operations never complete. This is a similar issue/fix to lease break handling: commit 7af929d6d05ba5564139718e30d5bc96bdbc716a ("smb3: fix lease break problem introduced by compounding") CC: Stable # 4.19+ Fixes: b24df3e30cbf ("cifs: update receive_encrypted_standard to handle compounded responses") Signed-off-by: Paul Aurich Tested-by: Yves-Alexis Perez Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 812da3e56a22..f44bb4a304e9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3184,8 +3184,10 @@ smb3_receive_transform(struct TCP_Server_Info *server, } /* TODO: add support for compounds containing READ. */ - if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) + if (pdu_length > CIFSMaxBufSize + MAX_HEADER_SIZE(server)) { + *num_mids = 1; return receive_encrypted_read(server, &mids[0]); + } return receive_encrypted_standard(server, mids, bufs, num_mids); } -- cgit v1.2.3 From 03acbec28a24fd961b0e78904d5644a30bb5cfc5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 15 Nov 2018 13:15:05 +0300 Subject: dlm: fixed memory leaks after failed ls_remove_names allocation commit b982896cdb6e6a6b89d86dfb39df489d9df51e14 upstream. If allocation fails on last elements of array need to free already allocated elements. v2: just move existing out_rsbtbl label to right place Fixes 789924ba635f ("dlm: fix race between remove and lookup") Cc: stable@kernel.org # 3.6 Signed-off-by: Vasily Averin Signed-off-by: David Teigland Signed-off-by: Greg Kroah-Hartman --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 5ba94be006ee..6a1529e478f3 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -680,11 +680,11 @@ static int new_lockspace(const char *name, const char *cluster, kfree(ls->ls_recover_buf); out_lkbidr: idr_destroy(&ls->ls_lkbidr); + out_rsbtbl: for (i = 0; i < DLM_REMOVE_NAMES_MAX; i++) { if (ls->ls_remove_names[i]) kfree(ls->ls_remove_names[i]); } - out_rsbtbl: vfree(ls->ls_rsbtbl); out_lsfree: if (do_unreg) -- cgit v1.2.3 From 294776562f96efbd9b6a64348020b9e96ccf6e9e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 15 Nov 2018 13:18:18 +0300 Subject: dlm: possible memory leak on error path in create_lkb() commit 23851e978f31eda8b2d01bd410d3026659ca06c7 upstream. Fixes 3d6aa675fff9 ("dlm: keep lkbs in idr") Cc: stable@kernel.org # 3.1 Signed-off-by: Vasily Averin Signed-off-by: David Teigland Signed-off-by: Greg Kroah-Hartman --- fs/dlm/lock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index cc91963683de..2cb125cc21c9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1209,6 +1209,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret) if (rv < 0) { log_error(ls, "create_lkb idr error %d", rv); + dlm_free_lkb(lkb); return rv; } -- cgit v1.2.3 From c5fa01a0153fc4df23777034bc47eae31ac27731 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 15 Nov 2018 13:18:24 +0300 Subject: dlm: lost put_lkb on error path in receive_convert() and receive_unlock() commit c0174726c3976e67da8649ac62cae43220ae173a upstream. Fixes 6d40c4a708e0 ("dlm: improve error and debug messages") Cc: stable@kernel.org # 3.5 Signed-off-by: Vasily Averin Signed-off-by: David Teigland Signed-off-by: Greg Kroah-Hartman --- fs/dlm/lock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 2cb125cc21c9..03d767b94f7b 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -4180,6 +4180,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms) (unsigned long long)lkb->lkb_recover_seq, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } @@ -4233,6 +4234,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms) lkb->lkb_id, lkb->lkb_remid, ms->m_header.h_nodeid, ms->m_lkid); error = -ENOENT; + dlm_put_lkb(lkb); goto fail; } -- cgit v1.2.3 From 30d3dfd4c42005311a0a06aa5bb9d3557fdc2e0e Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 15 Nov 2018 13:18:56 +0300 Subject: dlm: memory leaks on error path in dlm_user_request() commit d47b41aceeadc6b58abc9c7c6485bef7cfb75636 upstream. According to comment in dlm_user_request() ua should be freed in dlm_free_lkb() after successful attach to lkb. However ua is attached to lkb not in set_lock_args() but later, inside request_lock(). Fixes 597d0cae0f99 ("[DLM] dlm: user locks") Cc: stable@kernel.org # 2.6.19 Signed-off-by: Vasily Averin Signed-off-by: David Teigland Signed-off-by: Greg Kroah-Hartman --- fs/dlm/lock.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 03d767b94f7b..a928ba008d7d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5795,20 +5795,20 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } - - /* After ua is attached to lkb it will be freed by dlm_free_lkb(). - When DLM_IFL_USER is set, the dlm knows that this is a userspace - lock and that lkb_astparam is the dlm_user_args structure. */ - error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); - lkb->lkb_flags |= DLM_IFL_USER; - if (error) { + kfree(ua->lksb.sb_lvbptr); + ua->lksb.sb_lvbptr = NULL; + kfree(ua); __put_lkb(ls, lkb); goto out; } + /* After ua is attached to lkb it will be freed by dlm_free_lkb(). + When DLM_IFL_USER is set, the dlm knows that this is a userspace + lock and that lkb_astparam is the dlm_user_args structure. */ + lkb->lkb_flags |= DLM_IFL_USER; error = request_lock(ls, lkb, name, namelen, &args); switch (error) { -- cgit v1.2.3 From 310486107d4105ceb0a1f7884074ff0da32e7d1c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 26 Nov 2018 18:45:35 +0100 Subject: gfs2: Get rid of potential double-freeing in gfs2_create_inode commit 6ff9b09e00a441599f3aacdf577254455a048bc9 upstream. In gfs2_create_inode, after setting and releasing the acl / default_acl, the acl / default_acl pointers are not set to NULL as they should be. In that state, when the function reaches label fail_free_acls, gfs2_create_inode will try to release the same acls again. Fix that by setting the pointers to NULL after releasing the acls. Slightly simplify the logic. Also, posix_acl_release checks for NULL already, so there is no need to duplicate those checks here. Fixes: e01580bf9e4d ("gfs2: use generic posix ACL infrastructure") Reported-by: Pan Bian Cc: Christoph Hellwig Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/inode.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 648f0ca1ad57..998051c4aea7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -744,17 +744,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, the gfs2 structures. */ if (default_acl) { error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto fail_gunlock3; posix_acl_release(default_acl); + default_acl = NULL; } if (acl) { - if (!error) - error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto fail_gunlock3; posix_acl_release(acl); + acl = NULL; } - if (error) - goto fail_gunlock3; - error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, &gfs2_initxattrs, NULL); if (error) @@ -789,10 +791,8 @@ fail_free_inode: } gfs2_rsqa_delete(ip, NULL); fail_free_acls: - if (default_acl) - posix_acl_release(default_acl); - if (acl) - posix_acl_release(acl); + posix_acl_release(default_acl); + posix_acl_release(acl); fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); -- cgit v1.2.3 From 6ef56c9ad7f151f62992f524acce093f1a15d45b Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 4 Dec 2018 15:06:27 +0100 Subject: gfs2: Fix loop in gfs2_rbm_find commit 2d29f6b96d8f80322ed2dd895bca590491c38d34 upstream. Fix the resource group wrap-around logic in gfs2_rbm_find that commit e579ed4f44 broke. The bug can lead to unnecessary repeated scanning of the same bitmaps; there is a risk that future changes will turn this into an endless loop. Fixes: e579ed4f44 ("GFS2: Introduce rbm field bii") Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 449d0cb45a84..e978f6930575 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1747,9 +1747,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if (ret == -E2BIG) { + n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; - n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; -- cgit v1.2.3 From 0b6001b941af35cde42ac3fdd4079488f2f19a82 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 1 Nov 2018 13:39:49 -0400 Subject: lockd: Show pid of lockd for remote locks commit b8eee0e90f9797b747113638bc75e739b192ad38 upstream. Commit 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks") specified that the l_pid returned for F_GETLK on a local file that has a remote lock should be the pid of the lock manager process. That commit, while updating other filesystems, failed to update lockd, such that locks created by lockd had their fl_pid set to that of the remote process holding the lock. Fix that here to be the pid of lockd. Also, fix the client case so that the returned lock pid is negative, which indicates a remote lock on a remote file. Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific...") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Coddington Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/lockd/clntproc.c | 2 +- fs/lockd/xdr.c | 4 ++-- fs/lockd/xdr4.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index d20b92f271c2..0a67dd4250e9 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -442,7 +442,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) fl->fl_start = req->a_res.lock.fl.fl_start; fl->fl_end = req->a_res.lock.fl.fl_end; fl->fl_type = req->a_res.lock.fl.fl_type; - fl->fl_pid = 0; + fl->fl_pid = -req->a_res.lock.fl.fl_pid; break; default: status = nlm_stat_to_errno(req->a_res.status); diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 7147e4aebecc..9846f7e95282 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -127,7 +127,7 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock) locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = (pid_t)lock->svid; + fl->fl_pid = current->tgid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ start = ntohl(*p++); @@ -269,7 +269,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32) 0; - lock->fl.fl_pid = (pid_t)lock->svid; + lock->fl.fl_pid = current->tgid; if (!(p = nlm_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 7ed9edf9aed4..70154f376695 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -119,7 +119,7 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = (pid_t)lock->svid; + fl->fl_pid = current->tgid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ p = xdr_decode_hyper(p, &start); @@ -266,7 +266,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32) 0; - lock->fl.fl_pid = (pid_t)lock->svid; + lock->fl.fl_pid = current->tgid; if (!(p = nlm4_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, -- cgit v1.2.3 From 3f26e68af5c0a3a0704a44dd0286c785b603a5c8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Nov 2018 11:21:40 -0500 Subject: nfsd4: zero-length WRITE should succeed commit fdec6114ee1f0f43b1ad081ad8d46b23ba126d70 upstream. Zero-length writes are legal; from 5661 section 18.32.3: "If the count is zero, the WRITE will succeed and return a count of zero subject to permissions checking". This check is unnecessary and is causing zero-length reads to return EINVAL. Cc: stable@vger.kernel.org Fixes: 3fd9557aec91 "NFSD: Refactor the generic write vector fill helper" Cc: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9d6b4f0f1a25..f35aa9f88b5e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1015,8 +1015,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist, &write->wr_head, write->wr_buflen); - if (!nvecs) - return nfserr_io; WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, -- cgit v1.2.3 From 9797226bf661499ddfb7d754d155f594d5742f5e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 29 Nov 2018 11:22:50 +0800 Subject: ceph: don't update importing cap's mseq when handing cap export commit 3c1392d4c49962a31874af14ae9ff289cb2b3851 upstream. Updating mseq makes client think importer mds has accepted all prior cap messages and importer mds knows what caps client wants. Actually some cap messages may have been dropped because of mseq mismatch. If mseq is left untouched, importing cap's mds_wanted later will get reset by cap import message. Cc: stable@vger.kernel.org Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/caps.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dd7dfdd2ba13..eadffaa39f4e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3566,7 +3566,6 @@ retry: tcap->cap_id = t_cap_id; tcap->seq = t_seq - 1; tcap->issue_seq = t_seq - 1; - tcap->mseq = t_mseq; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) -- cgit v1.2.3 From d7068618ae1fbb80fc16bd7c58798e208a696483 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Jan 2019 11:44:41 +0000 Subject: Btrfs: fix deadlock when using free space tree due to block group creation commit a6d8654d885d7d79a3fb82da64eaa489ca332a82 upstream. When modifying the free space tree we can end up COWing one of its extent buffers which in turn might result in allocating a new chunk, which in turn can result in flushing (finish creation) of pending block groups. If that happens we can deadlock because creating a pending block group needs to update the free space tree, and if any of the updates tries to modify the same extent buffer that we are COWing, we end up in a deadlock since we try to write lock twice the same extent buffer. So fix this by skipping pending block group creation if we are COWing an extent buffer from the free space tree. This is a case missed by commit 5ce555578e091 ("Btrfs: fix deadlock when writing out free space caches"). Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202173 Fixes: 5ce555578e091 ("Btrfs: fix deadlock when writing out free space caches") CC: stable@vger.kernel.org # 4.18+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index fa18520529f3..7ad6f2eec711 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1051,19 +1051,21 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; /* - * If we are COWing a node/leaf from the extent, chunk or device trees, - * make sure that we do not finish block group creation of pending block - * groups. We do this to avoid a deadlock. + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. * COWing can result in allocation of a new chunk, and flushing pending * block groups (btrfs_create_pending_block_groups()) can be triggered * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk and device trees, therefore we could - * deadlock with ourselves since we are holding a lock on an extent - * buffer that btrfs_create_pending_block_groups() may try to COW later. + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. */ if (root == fs_info->extent_root || root == fs_info->chunk_root || - root == fs_info->dev_root) + root == fs_info->dev_root || + root == fs_info->free_space_root) trans->can_flush_pending_bgs = false; cow = btrfs_alloc_tree_block(trans, root, parent_start, -- cgit v1.2.3 From d1130682d1270495f49282535854221bbdf20638 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 19 Dec 2018 22:49:09 +0000 Subject: CIFS: Fix adjustment of credits for MTU requests commit b983f7e92348d7e7d091db1b78b7915e9dd3d63a upstream. Currently for MTU requests we allocate maximum possible credits in advance and then adjust them according to the request size. While we were adjusting the number of credits belonging to the server, we were skipping adjustment of credits belonging to the request. This patch fixes it by setting request credits to CreditCharge field value of SMB2 packet header. Also ask 1 credit more for async read and write operations to increase parallelism and match the behavior of other operations. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f54d07bda067..dba986524917 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3185,12 +3185,14 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = shdr->CreditCharge; + shdr->CreditRequest = + cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); + rdata->credits = le16_to_cpu(shdr->CreditCharge); flags |= CIFS_HAS_CREDITS; } @@ -3462,12 +3464,14 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = shdr->CreditCharge; + shdr->CreditRequest = + cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 1); spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(shdr->CreditCharge); spin_unlock(&server->req_lock); wake_up(&server->request_q); + wdata->credits = le16_to_cpu(shdr->CreditCharge); flags |= CIFS_HAS_CREDITS; } -- cgit v1.2.3 From c3606c64678369dc108f397109569cfa0ce0c101 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 3 Jan 2019 16:45:13 -0800 Subject: CIFS: Do not set credits to 1 if the server didn't grant anything commit 33fa5c8b8a7dbe6353a56eaa654b790348890d42 upstream. Currently we reset the number of total credits granted by the server to 1 if the server didn't grant us anything int the response. This violates the SMB3 protocol - we need to trust the server and use the credit values from the response. Fix this by removing the corresponding code. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/transport.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 333729cf46cd..a610381f8140 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -885,8 +885,6 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) if (midQ[i]->resp_buf) credits += ses->server->ops->get_credits(midQ[i]); - if (!credits) - credits = 1; for (i = 0; i < num_rqst; i++) { if (rc < 0) -- cgit v1.2.3 From d2f76f6f9fa963d337e72616f6e06b8bbac5eb7f Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 10 Jan 2019 11:27:28 -0800 Subject: CIFS: Do not hide EINTR after sending network packets commit ee13919c2e8d1f904e035ad4b4239029a8994131 upstream. Currently we hide EINTR code returned from sock_sendmsg() and return 0 instead. This makes a caller think that we successfully completed the network operation which is not true. Fix this by properly returning EINTR to callers. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index a610381f8140..005c2af9d696 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -378,7 +378,7 @@ smbd_done: if (rc < 0 && rc != -EINTR) cifs_dbg(VFS, "Error %d sending data on socket to server\n", rc); - else + else if (rc > 0) rc = 0; return rc; -- cgit v1.2.3 From 7dcc5b36ea7f5f5d15907e439befa14cecffc9f1 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 22 Dec 2018 12:40:05 -0800 Subject: CIFS: Fix credit computation for compounded requests commit 8544f4aa9dd19a04d1244dae10feecc813ccf175 upstream. In SMB3 protocol every part of the compound chain consumes credits individually, so we need to call wait_for_free_credits() for each of the PDUs in the chain. If an operation is interrupted, we must ensure we return all credits taken from the server structure back. Without this patch server can sometimes disconnect the session due to credit mismatches, especially when first operation(s) are large writes. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/transport.c | 59 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 005c2af9d696..66348b3d28e6 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -786,7 +786,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, int i, j, rc = 0; int timeout, optype; struct mid_q_entry *midQ[MAX_COMPOUND]; - unsigned int credits = 0; + bool cancelled_mid[MAX_COMPOUND] = {false}; + unsigned int credits[MAX_COMPOUND] = {0}; char *buf; timeout = flags & CIFS_TIMEOUT_MASK; @@ -804,13 +805,31 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, return -ENOENT; /* - * Ensure that we do not send more than 50 overlapping requests - * to the same server. We may make this configurable later or - * use ses->maxReq. + * Ensure we obtain 1 credit per request in the compound chain. + * It can be optimized further by waiting for all the credits + * at once but this can wait long enough if we don't have enough + * credits due to some heavy operations in progress or the server + * not granting us much, so a fallback to the current approach is + * needed anyway. */ - rc = wait_for_free_request(ses->server, timeout, optype); - if (rc) - return rc; + for (i = 0; i < num_rqst; i++) { + rc = wait_for_free_request(ses->server, timeout, optype); + if (rc) { + /* + * We haven't sent an SMB packet to the server yet but + * we already obtained credits for i requests in the + * compound chain - need to return those credits back + * for future use. Note that we need to call add_credits + * multiple times to match the way we obtained credits + * in the first place and to account for in flight + * requests correctly. + */ + for (j = 0; j < i; j++) + add_credits(ses->server, 1, optype); + return rc; + } + credits[i] = 1; + } /* * Make sure that we sign in the same order that we send on this socket @@ -826,8 +845,10 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); mutex_unlock(&ses->server->srv_mutex); + /* Update # of requests on wire to server */ - add_credits(ses->server, 1, optype); + for (j = 0; j < num_rqst; j++) + add_credits(ses->server, credits[j], optype); return PTR_ERR(midQ[i]); } @@ -874,17 +895,16 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { midQ[i]->mid_flags |= MID_WAIT_CANCELLED; midQ[i]->callback = DeleteMidQEntry; - spin_unlock(&GlobalMid_Lock); - add_credits(ses->server, 1, optype); - return rc; + cancelled_mid[i] = true; } spin_unlock(&GlobalMid_Lock); } } for (i = 0; i < num_rqst; i++) - if (midQ[i]->resp_buf) - credits += ses->server->ops->get_credits(midQ[i]); + if (!cancelled_mid[i] && midQ[i]->resp_buf + && (midQ[i]->mid_state == MID_RESPONSE_RECEIVED)) + credits[i] = ses->server->ops->get_credits(midQ[i]); for (i = 0; i < num_rqst; i++) { if (rc < 0) @@ -892,8 +912,9 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, rc = cifs_sync_mid_result(midQ[i], ses->server); if (rc != 0) { - add_credits(ses->server, credits, optype); - return rc; + /* mark this mid as cancelled to not free it below */ + cancelled_mid[i] = true; + goto out; } if (!midQ[i]->resp_buf || @@ -940,9 +961,11 @@ out: * This is prevented above by using a noop callback that will not * wake this thread except for the very last PDU. */ - for (i = 0; i < num_rqst; i++) - cifs_delete_mid(midQ[i]); - add_credits(ses->server, credits, optype); + for (i = 0; i < num_rqst; i++) { + if (!cancelled_mid[i]) + cifs_delete_mid(midQ[i]); + add_credits(ses->server, credits[i], optype); + } return rc; } -- cgit v1.2.3 From 2a71a47e03ffa7aa68893406f8134977cb164b59 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 8 Jan 2019 18:30:57 +0000 Subject: cifs: Fix potential OOB access of lock element array commit b9a74cde94957d82003fb9f7ab4777938ca851cd upstream. If maxBuf is small but non-zero, it could result in a zero sized lock element array which we would then try and access OOB. Signed-off-by: Ross Lagerwall Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 8 ++++---- fs/cifs/smb2file.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8d41ca7bfcf1..7b637fc27990 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1120,10 +1120,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) { free_xid(xid); return -EINVAL; } @@ -1460,10 +1460,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) return -EINVAL; max_num = (max_buf - sizeof(struct smb_hdr)) / diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 4ed10dd086e6..2fc3d31967ee 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -122,10 +122,10 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, /* * Accessing maxBuf is racy with cifs_reconnect - need to store value - * and check it for zero before using. + * and check it before using. */ max_buf = tcon->ses->server->maxBuf; - if (!max_buf) + if (max_buf < sizeof(struct smb2_lock_element)) return -EINVAL; max_num = max_buf / sizeof(struct smb2_lock_element); -- cgit v1.2.3 From 7c2ea25e1364d9cdcbc7c44a92dccaf86e799f3a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 24 Dec 2018 20:27:08 -0500 Subject: ext4: make sure enough credits are reserved for dioread_nolock writes commit 812c0cab2c0dfad977605dbadf9148490ca5d93f upstream. There are enough credits reserved for most dioread_nolock writes; however, if the extent tree is sufficiently deep, and/or quota is enabled, the code was not allowing for all eventualities when reserving journal credits for the unwritten extent conversion. This problem can be seen using xfstests ext4/034: WARNING: CPU: 1 PID: 257 at fs/ext4/ext4_jbd2.c:271 __ext4_handle_dirty_metadata+0x10c/0x180 Workqueue: ext4-rsv-conversion ext4_end_io_rsv_work RIP: 0010:__ext4_handle_dirty_metadata+0x10c/0x180 ... EXT4-fs: ext4_free_blocks:4938: aborting transaction: error 28 in __ext4_handle_dirty_metadata EXT4: jbd2_journal_dirty_metadata failed: handle type 11 started at line 4921, credits 4/0, errcode -28 EXT4-fs error (device dm-1) in ext4_free_blocks:4950: error 28 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 36abbdafb26e..0a5388347fca 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2748,7 +2748,8 @@ static int ext4_writepages(struct address_space *mapping, * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ - rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits); + rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, + PAGE_SIZE >> inode->i_blkbits); } /* -- cgit v1.2.3 From 926cdac10439eda9ab7a11fd5f28eda203ccbb26 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 25 Dec 2018 00:56:33 -0500 Subject: ext4: fix a potential fiemap/page fault deadlock w/ inline_data commit 2b08b1f12cd664dc7d5c84ead9ff25ae97ad5491 upstream. The ext4_inline_data_fiemap() function calls fiemap_fill_next_extent() while still holding the xattr semaphore. This is not necessary and it triggers a circular lockdep warning. This is because fiemap_fill_next_extent() could trigger a page fault when it writes into page which triggers a page fault. If that page is mmaped from the inline file in question, this could very well result in a deadlock. This problem can be reproduced using generic/519 with a file system configuration which has the inline_data feature enabled. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 27373d88b5f0..56f6e1782d5f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1890,12 +1890,12 @@ int ext4_inline_data_fiemap(struct inode *inode, physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); - if (physical) - error = fiemap_fill_next_extent(fieinfo, start, physical, - inline_len, flags); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); + if (physical) + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); return (error < 0 ? error : 0); } -- cgit v1.2.3 From 01db6e5cf81f905028de903f2290c10172226043 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 30 Dec 2018 23:20:39 -0500 Subject: ext4: avoid kernel warning when writing the superblock to a dead device commit e86807862e6880809f191c4cea7f88a489f0ed34 upstream. The xfstests generic/475 test switches the underlying device with dm-error while running a stress test. This results in a large number of file system errors, and since we can't lock the buffer head when marking the superblock dirty in the ext4_grp_locked_error() case, it's possible the superblock to be !buffer_uptodate() without buffer_write_io_error() being true. We need to set buffer_uptodate() before we call mark_buffer_dirty() or this will trigger a WARN_ON. It's safe to do this since the superblock must have been properly read into memory or the mount would have been successful. So if buffer_uptodate() is not set, we can safely assume that this happened due to a failed attempt to write the superblock. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee0f30852835..a1cf7d68b4f0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4904,7 +4904,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) ext4_superblock_csum_set(sb); if (sync) lock_buffer(sbh); - if (buffer_write_io_error(sbh)) { + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the -- cgit v1.2.3 From da38a1b47b02b01f96976bbf75466bc1ae519d76 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Dec 2018 00:10:48 -0500 Subject: ext4: use ext4_write_inode() when fsyncing w/o a journal commit ad211f3e94b314a910d4af03178a0b52a7d1ee0a upstream. In no-journal mode, we previously used __generic_file_fsync() in no-journal mode. This triggers a lockdep warning, and in addition, it's not safe to depend on the inode writeback mechanism in the case ext4. We can solve both problems by calling ext4_write_inode() directly. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/fsync.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 26a7fe5c4fd3..87a7ff00ef62 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -116,8 +116,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } + ret = file_write_and_wait_range(file, start, end); + if (ret) + return ret; + if (!journal) { - ret = __generic_file_fsync(file, start, end, datasync); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL + }; + + ret = ext4_write_inode(inode, &wbc); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) @@ -125,9 +133,6 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = file_write_and_wait_range(file, start, end); - if (ret) - return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. -- cgit v1.2.3 From bb80ad0dc3924f193eaa752e9da2e9384f58e627 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Dec 2018 00:11:07 -0500 Subject: ext4: track writeback errors using the generic tracking infrastructure commit 95cb67138746451cc84cf8e516e14989746e93b0 upstream. We already using mapping_set_error() in fs/ext4/page_io.c, so all we need to do is to use file_check_and_advance_wb_err() when handling fsync() requests in ext4_sync_file(). Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/fsync.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 87a7ff00ef62..712f00995390 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -164,6 +164,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = err; } out: + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; trace_ext4_sync_file_exit(inode, ret); return ret; } -- cgit v1.2.3 From 5dc41af3d19e94253c95f62e8ac01158dd8df078 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 31 Dec 2018 22:34:31 -0500 Subject: ext4: fix special inode number checks in __ext4_iget() commit 191ce17876c9367819c4b0a25b503c0f6d9054d8 upstream. The check for special (reserved) inode number checks in __ext4_iget() was broken by commit 8a363970d1dc: ("ext4: avoid declaring fs inconsistent due to invalid file handles"). This was caused by a botched reversal of the sense of the flag now known as EXT4_IGET_SPECIAL (when it was previously named EXT4_IGET_NORMAL). Fix the logic appropriately. Fixes: 8a363970d1dc ("ext4: avoid declaring fs inconsistent...") Signed-off-by: Theodore Ts'o Reported-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0a5388347fca..2c43c5b92229 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4803,7 +4803,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if (((flags & EXT4_IGET_NORMAL) && + if ((!(flags & EXT4_IGET_SPECIAL) && (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { -- cgit v1.2.3 From 829431a2a5a8495ee3a144de1be12e34dec45f91 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Nov 2018 09:48:12 +0000 Subject: Btrfs: fix access to available allocation bits when starting balance commit 5a8067c0d17feb7579db0476191417b441a8996e upstream. The available allocation bits members from struct btrfs_fs_info are protected by a sequence lock, and when starting balance we access them incorrectly in two different ways: 1) In the read sequence lock loop at btrfs_balance() we use the values we read from fs_info->avail_*_alloc_bits and we can immediately do actions that have side effects and can not be undone (printing a message and jumping to a label). This is wrong because a retry might be needed, so our actions must not have side effects and must be repeatable as long as read_seqretry() returns a non-zero value. In other words, we were essentially ignoring the sequence lock; 2) Right below the read sequence lock loop, we were reading the values from avail_metadata_alloc_bits and avail_data_alloc_bits without any protection from concurrent writers, that is, reading them outside of the read sequence lock critical section. So fix this by making sure we only read the available allocation bits while in a read sequence lock critical section and that what we do in the critical section is repeatable (has nothing that can not be undone) so that any eventual retry that is needed is handled properly. Fixes: de98ced9e743 ("Btrfs: use seqlock to protect fs_info->avail_{data, metadata, system}_alloc_bits") Fixes: 14506127979a ("btrfs: fix a bogus warning when converting only data or metadata") Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f4405e430da6..223334f08530 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3712,6 +3712,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, int ret; u64 num_devices; unsigned seq; + bool reducing_integrity; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -3796,24 +3797,30 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, !(bctl->sys.target & allowed)) || ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && (fs_info->avail_metadata_alloc_bits & allowed) && - !(bctl->meta.target & allowed))) { - if (bctl->flags & BTRFS_BALANCE_FORCE) { - btrfs_info(fs_info, - "balance: force reducing metadata integrity"); - } else { - btrfs_err(fs_info, - "balance: reduces metadata integrity, use --force if you want this"); - ret = -EINVAL; - goto out; - } - } + !(bctl->meta.target & allowed))) + reducing_integrity = true; + else + reducing_integrity = false; + + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->data.target : fs_info->avail_data_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - /* if we're not converting, the target field is uninitialized */ - meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->meta.target : fs_info->avail_metadata_alloc_bits; - data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? - bctl->data.target : fs_info->avail_data_alloc_bits; + if (reducing_integrity) { + if (bctl->flags & BTRFS_BALANCE_FORCE) { + btrfs_info(fs_info, + "balance: force reducing metadata integrity"); + } else { + btrfs_err(fs_info, + "balance: reduces metadata integrity, use --force if you want this"); + ret = -EINVAL; + goto out; + } + } + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { int meta_index = btrfs_bg_flags_to_raid_index(meta_target); -- cgit v1.2.3 From 79aa5c0daa5c7a2c4de945f4964702f3fcb6f8a3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 19 Nov 2018 14:15:36 +0000 Subject: Btrfs: fix deadlock when enabling quotas due to concurrent snapshot creation commit 9a6f209e36500efac51528132a3e3083586eda5f upstream. If the quota enable and snapshot creation ioctls are called concurrently we can get into a deadlock where the task enabling quotas will deadlock on the fs_info->qgroup_ioctl_lock mutex because it attempts to lock it twice, or the task creating a snapshot tries to commit the transaction while the task enabling quota waits for the former task to commit the transaction while holding the mutex. The following time diagrams show how both cases happen. First scenario: CPU 0 CPU 1 btrfs_ioctl() btrfs_ioctl_quota_ctl() btrfs_quota_enable() mutex_lock(fs_info->qgroup_ioctl_lock) btrfs_start_transaction() btrfs_ioctl() btrfs_ioctl_snap_create_v2 create_snapshot() --> adds snapshot to the list pending_snapshots of the current transaction btrfs_commit_transaction() create_pending_snapshots() create_pending_snapshot() qgroup_account_snapshot() btrfs_qgroup_inherit() mutex_lock(fs_info->qgroup_ioctl_lock) --> deadlock, mutex already locked by this task at btrfs_quota_enable() Second scenario: CPU 0 CPU 1 btrfs_ioctl() btrfs_ioctl_quota_ctl() btrfs_quota_enable() mutex_lock(fs_info->qgroup_ioctl_lock) btrfs_start_transaction() btrfs_ioctl() btrfs_ioctl_snap_create_v2 create_snapshot() --> adds snapshot to the list pending_snapshots of the current transaction btrfs_commit_transaction() --> waits for task at CPU 0 to release its transaction handle btrfs_commit_transaction() --> sees another task started the transaction commit first --> releases its transaction handle --> waits for the transaction commit to be completed by the task at CPU 1 create_pending_snapshot() qgroup_account_snapshot() btrfs_qgroup_inherit() mutex_lock(fs_info->qgroup_ioctl_lock) --> deadlock, task at CPU 0 has the mutex locked but it is waiting for us to finish the transaction commit So fix this by setting the quota enabled flag in fs_info after committing the transaction at btrfs_quota_enable(). This ends up serializing quota enable and snapshot creation as if the snapshot creation happened just before the quota enable request. The quota rescan task, scheduled after committing the transaction in btrfs_quote_enable(), will do the accounting. Fixes: 6426c7ad697d ("btrfs: qgroup: Fix qgroup accounting when creating snapshot") Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/qgroup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ff434663d65b..e1fcb28ad4cc 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1013,16 +1013,22 @@ out_add_root: btrfs_abort_transaction(trans, ret); goto out_free_path; } - spin_lock(&fs_info->qgroup_lock); - fs_info->quota_root = quota_root; - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - spin_unlock(&fs_info->qgroup_lock); ret = btrfs_commit_transaction(trans); trans = NULL; if (ret) goto out_free_path; + /* + * Set quota enabled flag after committing the transaction, to avoid + * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot + * creation. + */ + spin_lock(&fs_info->qgroup_lock); + fs_info->quota_root = quota_root; + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + spin_unlock(&fs_info->qgroup_lock); + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); -- cgit v1.2.3 From 7a1b9b76bac76498657b9322164957c2a50c573b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 10 Dec 2018 17:53:35 +0000 Subject: Btrfs: use nofs context when initializing security xattrs to avoid deadlock commit 827aa18e7b903c5ff3b3cd8fec328a99b1dbd411 upstream. When initializing the security xattrs, we are holding a transaction handle therefore we need to use a GFP_NOFS context in order to avoid a deadlock with reclaim in case it's triggered. Fixes: 39a27ec1004e8 ("btrfs: use GFP_KERNEL for xattr and acl allocations") Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/xattr.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ea78c3d6dcfc..f141b45ce349 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" @@ -422,9 +423,15 @@ static int btrfs_initxattrs(struct inode *inode, { const struct xattr *xattr; struct btrfs_trans_handle *trans = fs_info; + unsigned int nofs_flag; char *name; int err = 0; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name) + 1, GFP_KERNEL); @@ -440,6 +447,7 @@ static int btrfs_initxattrs(struct inode *inode, if (err < 0) break; } + memalloc_nofs_restore(nofs_flag); return err; } -- cgit v1.2.3 From 4675f90ef892b7071c09e30f80565adf628fff3b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 9 Jan 2019 15:02:23 +0100 Subject: Revert "btrfs: balance dirty metadata pages in btrfs_finish_ordered_io" commit 77b7aad195099e7c6da11e94b7fa6ef5e6fb0025 upstream. This reverts commit e73e81b6d0114d4a303205a952ab2e87c44bd279. This patch causes a few problems: - adds latency to btrfs_finish_ordered_io - as btrfs_finish_ordered_io is used for free space cache, generating more work from btrfs_btree_balance_dirty_nodelay could end up in the same workque, effectively deadlocking 12260 kworker/u96:16+btrfs-freespace-write D [<0>] balance_dirty_pages+0x6e6/0x7ad [<0>] balance_dirty_pages_ratelimited+0x6bb/0xa90 [<0>] btrfs_finish_ordered_io+0x3da/0x770 [<0>] normal_work_helper+0x1c5/0x5a0 [<0>] process_one_work+0x1ee/0x5a0 [<0>] worker_thread+0x46/0x3d0 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff Transaction commit will wait on the freespace cache: 838 btrfs-transacti D [<0>] btrfs_start_ordered_extent+0x154/0x1e0 [<0>] btrfs_wait_ordered_range+0xbd/0x110 [<0>] __btrfs_wait_cache_io+0x49/0x1a0 [<0>] btrfs_write_dirty_block_groups+0x10b/0x3b0 [<0>] commit_cowonly_roots+0x215/0x2b0 [<0>] btrfs_commit_transaction+0x37e/0x910 [<0>] transaction_kthread+0x14d/0x180 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff And then writepages ends up waiting on transaction commit: 9520 kworker/u96:13+flush-btrfs-1 D [<0>] wait_current_trans+0xac/0xe0 [<0>] start_transaction+0x21b/0x4b0 [<0>] cow_file_range_inline+0x10b/0x6b0 [<0>] cow_file_range.isra.69+0x329/0x4a0 [<0>] run_delalloc_range+0x105/0x3c0 [<0>] writepage_delalloc+0x119/0x180 [<0>] __extent_writepage+0x10c/0x390 [<0>] extent_write_cache_pages+0x26f/0x3d0 [<0>] extent_writepages+0x4f/0x80 [<0>] do_writepages+0x17/0x60 [<0>] __writeback_single_inode+0x59/0x690 [<0>] writeback_sb_inodes+0x291/0x4e0 [<0>] __writeback_inodes_wb+0x87/0xb0 [<0>] wb_writeback+0x3bb/0x500 [<0>] wb_workfn+0x40d/0x610 [<0>] process_one_work+0x1ee/0x5a0 [<0>] worker_thread+0x1e0/0x3d0 [<0>] kthread+0xf5/0x130 [<0>] ret_from_fork+0x24/0x30 [<0>] 0xffffffffffffffff Eventually, we have every process in the system waiting on balance_dirty_pages(), and nobody is able to make progress on page writeback. The original patch tried to fix an OOM condition, that happened on 4.4 but no success reproducing that on later kernels (4.19 and 4.20). This is more likely a problem in OOM itself. Link: https://lore.kernel.org/linux-btrfs/20180528054821.9092-1-ethanlien@synology.com/ Reported-by: Chris Mason CC: stable@vger.kernel.org # 4.18+ CC: ethanlien Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 14c85e61134d..4f6dc56b4f4d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3151,9 +3151,6 @@ out: /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - /* Try to release some metadata so we don't get an OOM but don't wait */ - btrfs_btree_balance_dirty_nodelay(fs_info); - return ret; } -- cgit v1.2.3 From 01634ac56393272de45f1f88be9d28f52f91a272 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Nov 2018 14:05:45 -0500 Subject: btrfs: wait on ordered extents on abort cleanup commit 74d5d229b1bf60f93bff244b2dfc0eb21ec32a07 upstream. If we flip read-only before we initiate writeback on all dirty pages for ordered extents we've created then we'll have ordered extents left over on umount, which results in all sorts of bad things happening. Fix this by making sure we wait on ordered extents if we have to do the aborted transaction cleanup stuff. generic/475 can produce this warning: [ 8531.177332] WARNING: CPU: 2 PID: 11997 at fs/btrfs/disk-io.c:3856 btrfs_free_fs_root+0x95/0xa0 [btrfs] [ 8531.183282] CPU: 2 PID: 11997 Comm: umount Tainted: G W 5.0.0-rc1-default+ #394 [ 8531.185164] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),BIOS rel-1.11.2-0-gf9626cc-prebuilt.qemu-project.org 04/01/2014 [ 8531.187851] RIP: 0010:btrfs_free_fs_root+0x95/0xa0 [btrfs] [ 8531.193082] RSP: 0018:ffffb1ab86163d98 EFLAGS: 00010286 [ 8531.194198] RAX: ffff9f3449494d18 RBX: ffff9f34a2695000 RCX:0000000000000000 [ 8531.195629] RDX: 0000000000000002 RSI: 0000000000000001 RDI:0000000000000000 [ 8531.197315] RBP: ffff9f344e930000 R08: 0000000000000001 R09:0000000000000000 [ 8531.199095] R10: 0000000000000000 R11: ffff9f34494d4ff8 R12:ffffb1ab86163dc0 [ 8531.200870] R13: ffff9f344e9300b0 R14: ffffb1ab86163db8 R15:0000000000000000 [ 8531.202707] FS: 00007fc68e949fc0(0000) GS:ffff9f34bd800000(0000)knlGS:0000000000000000 [ 8531.204851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 8531.205942] CR2: 00007ffde8114dd8 CR3: 000000002dfbd000 CR4:00000000000006e0 [ 8531.207516] Call Trace: [ 8531.208175] btrfs_free_fs_roots+0xdb/0x170 [btrfs] [ 8531.210209] ? wait_for_completion+0x5b/0x190 [ 8531.211303] close_ctree+0x157/0x350 [btrfs] [ 8531.212412] generic_shutdown_super+0x64/0x100 [ 8531.213485] kill_anon_super+0x14/0x30 [ 8531.214430] btrfs_kill_super+0x12/0xa0 [btrfs] [ 8531.215539] deactivate_locked_super+0x29/0x60 [ 8531.216633] cleanup_mnt+0x3b/0x70 [ 8531.217497] task_work_run+0x98/0xc0 [ 8531.218397] exit_to_usermode_loop+0x83/0x90 [ 8531.219324] do_syscall_64+0x15b/0x180 [ 8531.220192] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 8531.221286] RIP: 0033:0x7fc68e5e4d07 [ 8531.225621] RSP: 002b:00007ffde8116608 EFLAGS: 00000246 ORIG_RAX:00000000000000a6 [ 8531.227512] RAX: 0000000000000000 RBX: 00005580c2175970 RCX:00007fc68e5e4d07 [ 8531.229098] RDX: 0000000000000001 RSI: 0000000000000000 RDI:00005580c2175b80 [ 8531.230730] RBP: 0000000000000000 R08: 00005580c2175ba0 R09:00007ffde8114e80 [ 8531.232269] R10: 0000000000000000 R11: 0000000000000246 R12:00005580c2175b80 [ 8531.233839] R13: 00007fc68eac61c4 R14: 00005580c2175a68 R15:0000000000000000 Leaving a tree in the rb-tree: 3853 void btrfs_free_fs_root(struct btrfs_root *root) 3854 { 3855 iput(root->ino_cache_inode); 3856 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); CC: stable@vger.kernel.org Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik [ add stacktrace ] Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d4a7f7ca4145..d96d1390068a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4155,6 +4155,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); + + /* + * We need this here because if we've been flipped read-only we won't + * get sync() from the umount, so we need to make sure any ordered + * extents that haven't had their dirty pages IO start writeout yet + * actually get run and error out properly. + */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, -- cgit v1.2.3 From 483ac8e65a8a645d0ba31ab7077ac50debeb4127 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 20 Jan 2019 14:33:34 -0800 Subject: pstore/ram: Avoid allocation and leak of platform data commit 5631e8576a3caf606cdc375f97425a67983b420c upstream. Yue Hu noticed that when parsing device tree the allocated platform data was never freed. Since it's not used beyond the function scope, this switches to using a stack variable instead. Reported-by: Yue Hu Fixes: 35da60941e44 ("pstore/ram: add Device Tree bindings") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/ram.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 03cd59375abe..eb67bb7f04de 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -713,18 +713,15 @@ static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct ramoops_platform_data *pdata = dev->platform_data; + struct ramoops_platform_data pdata_local; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; if (dev_of_node(dev) && !pdata) { - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - pr_err("cannot allocate platform data buffer\n"); - err = -ENOMEM; - goto fail_out; - } + pdata = &pdata_local; + memset(pdata, 0, sizeof(*pdata)); err = ramoops_parse_dt(pdev, pdata); if (err < 0) -- cgit v1.2.3 From 1e11b1d630f8fdbca3843259e581fdec0941dc03 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 14 Jan 2019 09:48:10 +0100 Subject: blockdev: Fix livelocks on loop device commit 04906b2f542c23626b0ef6219b808406f8dddbe9 upstream. bd_set_size() updates also block device's block size. This is somewhat unexpected from its name and at this point, only blkdev_open() uses this functionality. Furthermore, this can result in changing block size under a filesystem mounted on a loop device which leads to livelocks inside __getblk_gfp() like: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 10863 Comm: syz-executor0 Not tainted 4.18.0-rc5+ #151 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__sanitizer_cov_trace_pc+0x3f/0x50 kernel/kcov.c:106 ... Call Trace: init_page_buffers+0x3e2/0x530 fs/buffer.c:904 grow_dev_page fs/buffer.c:947 [inline] grow_buffers fs/buffer.c:1009 [inline] __getblk_slow fs/buffer.c:1036 [inline] __getblk_gfp+0x906/0xb10 fs/buffer.c:1313 __bread_gfp+0x2d/0x310 fs/buffer.c:1347 sb_bread include/linux/buffer_head.h:307 [inline] fat12_ent_bread+0x14e/0x3d0 fs/fat/fatent.c:75 fat_ent_read_block fs/fat/fatent.c:441 [inline] fat_alloc_clusters+0x8ce/0x16e0 fs/fat/fatent.c:489 fat_add_cluster+0x7a/0x150 fs/fat/inode.c:101 __fat_get_block fs/fat/inode.c:148 [inline] ... Trivial reproducer for the problem looks like: truncate -s 1G /tmp/image losetup /dev/loop0 /tmp/image mkfs.ext4 -b 1024 /dev/loop0 mount -t ext4 /dev/loop0 /mnt losetup -c /dev/loop0 l /mnt Fix the problem by moving initialization of a block device block size into a separate function and call it when needed. Thanks to Tetsuo Handa for help with debugging the problem. Reported-by: syzbot+9933e4476f365f5d5a1b@syzkaller.appspotmail.com Signed-off-by: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 38b8ce05cbc7..cdbb888a8d4a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -104,6 +104,20 @@ void invalidate_bdev(struct block_device *bdev) } EXPORT_SYMBOL(invalidate_bdev); +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_block_size = bsize; + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ @@ -1408,18 +1422,9 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_logical_block_size(bdev); - inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); inode_unlock(bdev->bd_inode); - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_block_size = bsize; - bdev->bd_inode->i_blkbits = blksize_bits(bsize); } EXPORT_SYMBOL(bd_set_size); @@ -1496,8 +1501,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + set_init_blocksize(bdev); + } /* * If the device is invalidated, rescan partition @@ -1532,6 +1539,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); + set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) -- cgit v1.2.3 From c356972f27cc8194ecb1c0473d318a607300e7c2 Mon Sep 17 00:00:00 2001 From: Daniel Santos Date: Fri, 19 Oct 2018 03:30:20 -0500 Subject: jffs2: Fix use of uninitialized delayed_work, lockdep breakage [ Upstream commit a788c5272769ddbcdbab297cf386413eeac04463 ] jffs2_sync_fs makes the assumption that if CONFIG_JFFS2_FS_WRITEBUFFER is defined then a write buffer is available and has been initialized. However, this does is not the case when the mtd device has no out-of-band buffer: int jffs2_nand_flash_setup(struct jffs2_sb_info *c) { if (!c->mtd->oobsize) return 0; ... The resulting call to cancel_delayed_work_sync passing a uninitialized (but zeroed) delayed_work struct forces lockdep to become disabled. [ 90.050639] overlayfs: upper fs does not support tmpfile. [ 90.652264] INFO: trying to register non-static key. [ 90.662171] the code is fine but needs lockdep annotation. [ 90.673090] turning off the locking correctness validator. [ 90.684021] CPU: 0 PID: 1762 Comm: mount_root Not tainted 4.14.63 #0 [ 90.696672] Stack : 00000000 00000000 80d8f6a2 00000038 805f0000 80444600 8fe364f4 805dfbe7 [ 90.713349] 80563a30 000006e2 8068370c 00000001 00000000 00000001 8e2fdc48 ffffffff [ 90.730020] 00000000 00000000 80d90000 00000000 00000106 00000000 6465746e 312e3420 [ 90.746690] 6b636f6c 03bf0000 f8000000 20676e69 00000000 80000000 00000000 8e2c2a90 [ 90.763362] 80d90000 00000001 00000000 8e2c2a90 00000003 80260dc0 08052098 80680000 [ 90.780033] ... [ 90.784902] Call Trace: [ 90.789793] [<8000f0d8>] show_stack+0xb8/0x148 [ 90.798659] [<8005a000>] register_lock_class+0x270/0x55c [ 90.809247] [<8005cb64>] __lock_acquire+0x13c/0xf7c [ 90.818964] [<8005e314>] lock_acquire+0x194/0x1dc [ 90.828345] [<8003f27c>] flush_work+0x200/0x24c [ 90.837374] [<80041dfc>] __cancel_work_timer+0x158/0x210 [ 90.847958] [<801a8770>] jffs2_sync_fs+0x20/0x54 [ 90.857173] [<80125cf4>] iterate_supers+0xf4/0x120 [ 90.866729] [<80158fc4>] sys_sync+0x44/0x9c [ 90.875067] [<80014424>] syscall_common+0x34/0x58 Signed-off-by: Daniel Santos Reviewed-by: Hou Tao Signed-off-by: Boris Brezillon Signed-off-by: Sasha Levin --- fs/jffs2/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 902a7dd10e5c..bb6ae387469f 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -101,7 +101,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - cancel_delayed_work_sync(&c->wbuf_dwork); + if (jffs2_is_writebuffered(c)) + cancel_delayed_work_sync(&c->wbuf_dwork); #endif mutex_lock(&c->alloc_sem); -- cgit v1.2.3 From 265242d82a3c6a8bd9120d06b4801f8d7ae9a346 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 3 Nov 2018 16:38:18 -0700 Subject: pstore/ram: Do not treat empty buffers as valid [ Upstream commit 30696378f68a9e3dad6bfe55938b112e72af00c2 ] The ramoops backend currently calls persistent_ram_save_old() even if a buffer is empty. While this appears to work, it is does not seem like the right thing to do and could lead to future bugs so lets avoid that. It also prevents misleading prints in the logs which claim the buffer is valid. I got something like: found existing buffer, size 0, start 0 When I was expecting: no valid data in buffer (sig = ...) This bails out early (and reports with pr_debug()), since it's an acceptable state. Signed-off-by: Joel Fernandes (Google) Co-developed-by: Kees Cook Signed-off-by: Kees Cook Signed-off-by: Sasha Levin --- fs/pstore/ram_core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 0792595ebcfb..3c777ec80d47 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -496,6 +496,11 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { + if (buffer_size(prz) == 0) { + pr_debug("found existing empty buffer\n"); + return 0; + } + if (buffer_size(prz) > prz->buffer_size || buffer_start(prz) > buffer_size(prz)) pr_info("found existing invalid buffer, size %zu, start %zu\n", -- cgit v1.2.3 From bb5717a4a16592d22f3d5ca244b3e0a341b4062c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 5 Oct 2018 17:45:54 +0800 Subject: btrfs: volumes: Make sure there is no overlap of dev extents at mount time [ Upstream commit 5eb193812a42dc49331f25137a38dfef9612d3e4 ] Enhance btrfs_verify_dev_extents() to remember previous checked dev extents, so it can verify no dev extents can overlap. Analysis from Hans: "Imagine allocating a DATA|DUP chunk. In the chunk allocator, we first set... max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE ... which is 10GiB. Then... /* we don't want a chunk larger than 10% of writeable space */ max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); Imagine we only have one 7880MiB block device in this filesystem. Now max_chunk_size is down to 788MiB. The next step in the code is to search for max_stripe_size * dev_stripes amount of free space on the device, which is in our example 1GiB * 2 = 2GiB. Imagine the device has exactly 1578MiB free in one contiguous piece. This amount of bytes will be put in devices_info[ndevs - 1].max_avail Next we recalculate the stripe_size (which is actually the device extent length), based on the actual maximum amount of available raw disk space: stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); stripe_size is now 789MiB Next we do... data_stripes = num_stripes / ncopies ...where data_stripes ends up as 1, because num_stripes is 2 (the amount of device extents we're going to have), and DUP has ncopies 2. Next there's a check... if (stripe_size * data_stripes > max_chunk_size) ...which matches because 789MiB * 1 > 788MiB. We go into the if code, and next is... stripe_size = div_u64(max_chunk_size, data_stripes); ...which resets stripe_size to max_chunk_size: 788MiB Next is a fun one... /* bump the answer up to a 16MB boundary */ stripe_size = round_up(stripe_size, SZ_16M); ...which changes stripe_size from 788MiB to 800MiB. We're not done changing stripe_size yet... /* But don't go higher than the limits we found while searching * for free extents */ stripe_size = min(devices_info[ndevs - 1].max_avail, stripe_size); This is bad. max_avail is twice the stripe_size (we need to fit 2 device extents on the same device for DUP). The result here is that 800MiB < 1578MiB, so it's unchanged. However, the resulting DUP chunk will need 1600MiB disk space, which isn't there, and the second dev_extent might extend into the next thing (next dev_extent? end of device?) for 22MiB. The last shown line of code relies on a situation where there's twice the value of stripe_size present as value for the variable stripe_size when it's DUP. This was actually the case before commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling", from which I quote: "[...] in the meantime there's a check to see if the stripe_size does not exceed max_chunk_size. Since during this check stripe_size is twice the amount as intended, the check will reduce the stripe_size to max_chunk_size if the actual correct to be used stripe_size is more than half the amount of max_chunk_size." In the previous version of the code, the 16MiB alignment (why is this done, by the way?) would result in a 50% chance that it would actually do an 8MiB alignment for the individual dev_extents, since it was operating on double the size. Does this matter? Does it matter that stripe_size can be set to anything which is not 16MiB aligned because of the amount of remaining available disk space which is just taken? What is the main purpose of this round_up? The most straightforward thing to do seems something like... stripe_size = min( div_u64(devices_info[ndevs - 1].max_avail, dev_stripes), stripe_size ) ..just putting half of the max_avail into stripe_size." Link: https://lore.kernel.org/linux-btrfs/b3461a38-e5f8-f41d-c67c-2efac8129054@mendix.com/ Reported-by: Hans van Kranenburg Signed-off-by: Qu Wenruo [ add analysis from report ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/volumes.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 223334f08530..dbb893d0ae81 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7474,6 +7474,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) struct btrfs_path *path; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; + u64 prev_devid = 0; + u64 prev_dev_ext_end = 0; int ret = 0; key.objectid = 1; @@ -7518,10 +7520,22 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); physical_len = btrfs_dev_extent_length(leaf, dext); + /* Check if this dev extent overlaps with the previous one */ + if (devid == prev_devid && physical_offset < prev_dev_ext_end) { + btrfs_err(fs_info, +"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", + devid, physical_offset, prev_dev_ext_end); + ret = -EUCLEAN; + goto out; + } + ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) goto out; + prev_devid = devid; + prev_dev_ext_end = physical_offset + physical_len; + ret = btrfs_next_item(root, path); if (ret < 0) goto out; -- cgit v1.2.3 From 720b86a53a1024996708121abfd7bc490d68768f Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Thu, 4 Oct 2018 23:24:40 +0200 Subject: btrfs: alloc_chunk: fix more DUP stripe size handling [ Upstream commit baf92114c7e6dd6124aa3d506e4bc4b694da3bc3 ] Commit 92e222df7b "btrfs: alloc_chunk: fix DUP stripe size handling" fixed calculating the stripe_size for a new DUP chunk. However, the same calculation reappears a bit later, and that one was not changed yet. The resulting bug that is exposed is that the newly allocated device extents ('stripes') can have a few MiB overlap with the next thing stored after them, which is another device extent or the end of the disk. The scenario in which this can happen is: * The block device for the filesystem is less than 10GiB in size. * The amount of contiguous free unallocated disk space chosen to use for chunk allocation is 20% of the total device size, or a few MiB more or less. An example: - The filesystem device is 7880MiB (max_chunk_size gets set to 788MiB) - There's 1578MiB unallocated raw disk space left in one contiguous piece. In this case stripe_size is first calculated as 789MiB, (half of 1578MiB). Since 789MiB (stripe_size * data_stripes) > 788MiB (max_chunk_size), we enter the if block. Now stripe_size value is immediately overwritten while calculating an adjusted value based on max_chunk_size, which ends up as 788MiB. Next, the value is rounded up to a 16MiB boundary, 800MiB, which is actually more than the value we had before. However, the last comparison fails to detect this, because it's comparing the value with the total amount of free space, which is about twice the size of stripe_size. In the example above, this means that the resulting raw disk space being allocated is 1600MiB, while only a gap of 1578MiB has been found. The second device extent object for this DUP chunk will overlap for 22MiB with whatever comes next. The underlying problem here is that the stripe_size is reused all the time for different things. So, when entering the code in the if block, stripe_size is immediately overwritten with something else. If later we decide we want to have the previous value back, then the logic to compute it was copy pasted in again. With this change, the value in stripe_size is not unnecessarily destroyed, so the duplicated calculation is not needed any more. Signed-off-by: Hans van Kranenburg Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/volumes.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dbb893d0ae81..0ee1cd4b56fb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4768,19 +4768,17 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, /* * Use the number of data stripes to figure out how big this chunk * is really going to be in terms of logical address space, - * and compare that answer with the max chunk size + * and compare that answer with the max chunk size. If it's higher, + * we try to reduce stripe_size. */ if (stripe_size * data_stripes > max_chunk_size) { - stripe_size = div_u64(max_chunk_size, data_stripes); - - /* bump the answer up to a 16MB boundary */ - stripe_size = round_up(stripe_size, SZ_16M); - /* - * But don't go higher than the limits we found while searching - * for free extents + * Reduce stripe_size, round it up to a 16MB boundary again and + * then use it, unless it ends up being even bigger than the + * previous value we had already. */ - stripe_size = min(devices_info[ndevs - 1].max_avail, + stripe_size = min(round_up(div_u64(max_chunk_size, + data_stripes), SZ_16M), stripe_size); } -- cgit v1.2.3 From 38b17eee7074490c15128b596a35c837d9d94c60 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 14 Nov 2018 13:50:26 +0800 Subject: btrfs: fix use-after-free due to race between replace start and cancel [ Upstream commit d189dd70e2556181732598956d808ea53cc8774e ] The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/dev-replace.c | 63 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index b2b283e48439..8fed470bb7e1 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -800,39 +800,58 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; btrfs_dev_replace_write_unlock(dev_replace); - goto leave; + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; + tgt_device = dev_replace->tgtdev; + src_device = dev_replace->srcdev; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_scrub_cancel(fs_info); + /* btrfs_dev_replace_finishing() will handle the cleanup part */ + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + break; case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + /* + * Scrub doing the replace isn't running so we need to do the + * cleanup step of btrfs_dev_replace_finishing() here + */ result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; tgt_device = dev_replace->tgtdev; src_device = dev_replace->srcdev; dev_replace->tgtdev = NULL; dev_replace->srcdev = NULL; - break; - } - dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; - dev_replace->time_stopped = ktime_get_real_seconds(); - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_write_unlock(dev_replace); - btrfs_scrub_cancel(fs_info); + dev_replace->replace_state = + BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; + dev_replace->time_stopped = ktime_get_real_seconds(); + dev_replace->item_needs_writeback = 1; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - WARN_ON(ret); + btrfs_dev_replace_write_unlock(dev_replace); - btrfs_info_in_rcu(fs_info, - "dev_replace from %s (devid %llu) to %s canceled", - btrfs_dev_name(src_device), src_device->devid, - btrfs_dev_name(tgt_device)); + btrfs_scrub_cancel(fs_info); + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } + ret = btrfs_commit_transaction(trans); + WARN_ON(ret); - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(tgt_device); + btrfs_info_in_rcu(fs_info, + "suspended dev_replace from %s (devid %llu) to %s canceled", + btrfs_dev_name(src_device), src_device->devid, + btrfs_dev_name(tgt_device)); + + if (tgt_device) + btrfs_destroy_dev_replace_tgtdev(tgt_device); + break; + default: + result = -EINVAL; + } -leave: mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return result; } -- cgit v1.2.3 From 310f8296d6305249ea1f4bbb6fb9a9690e003451 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 12 Dec 2018 15:14:17 +0100 Subject: btrfs: improve error handling of btrfs_add_link [ Upstream commit 1690dd41e0cb1dade80850ed8a3eb0121b96d22f ] In the error handling block, err holds the return value of either btrfs_del_root_ref() or btrfs_del_inode_ref() but it hasn't been checked since it's introduction with commit fe66a05a0679 (Btrfs: improve error handling for btrfs_insert_dir_item callers) in 2012. If the error handling in the error handling fails, there's not much left to do and the abort either happened earlier in the callees or is necessary here. So if one of btrfs_del_root_ref() or btrfs_del_inode_ref() failed, abort the transaction, but still return the original code of the failure stored in 'ret' as this will be reported to the user. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4f6dc56b4f4d..83b3a626c796 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6440,14 +6440,19 @@ fail_dir_item: err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name, name_len); - + if (err) + btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, name_len, ino, parent_ino, &local_index); + if (err) + btrfs_abort_transaction(trans, err); } + + /* Return the original error code */ return ret; } -- cgit v1.2.3 From 876b79b973f2ca50cd9a57cc4263f91576cf25af Mon Sep 17 00:00:00 2001 From: Javier Barrio Date: Thu, 13 Dec 2018 01:06:29 +0100 Subject: quota: Lock s_umount in exclusive mode for Q_XQUOTA{ON,OFF} quotactls. [ Upstream commit 41c4f85cdac280d356df1f483000ecec4a8868be ] Commit 1fa5efe3622db58cb8c7b9a50665e9eb9a6c7e97 (ext4: Use generic helpers for quotaon and quotaoff) made possible to call quotactl(Q_XQUOTAON/OFF) on ext4 filesystems with sysfile quota support. This leads to calling dquot_enable/disable without s_umount held in excl. mode, because quotactl_cmd_onoff checks only for Q_QUOTAON/OFF. The following WARN_ON_ONCE triggers (in this case for dquot_enable, ext4, latest Linus' tree): [ 117.807056] EXT4-fs (dm-0): mounted filesystem with ordered data mode. Opts: quota,prjquota [...] [ 155.036847] WARNING: CPU: 0 PID: 2343 at fs/quota/dquot.c:2469 dquot_enable+0x34/0xb9 [ 155.036851] Modules linked in: quota_v2 quota_tree ipv6 af_packet joydev mousedev psmouse serio_raw pcspkr i2c_piix4 intel_agp intel_gtt e1000 ttm drm_kms_helper drm agpgart fb_sys_fops syscopyarea sysfillrect sysimgblt i2c_core input_leds kvm_intel kvm irqbypass qemu_fw_cfg floppy evdev parport_pc parport button crc32c_generic dm_mod ata_generic pata_acpi ata_piix libata loop ext4 crc16 mbcache jbd2 usb_storage usbcore sd_mod scsi_mod [ 155.036901] CPU: 0 PID: 2343 Comm: qctl Not tainted 4.20.0-rc6-00025-gf5d582777bcb #9 [ 155.036903] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 155.036911] RIP: 0010:dquot_enable+0x34/0xb9 [ 155.036915] Code: 41 56 41 55 41 54 55 53 4c 8b 6f 28 74 02 0f 0b 4d 8d 7d 70 49 89 fc 89 cb 41 89 d6 89 f5 4c 89 ff e8 23 09 ea ff 85 c0 74 0a <0f> 0b 4c 89 ff e8 8b 09 ea ff 85 db 74 6a 41 8b b5 f8 00 00 00 0f [ 155.036918] RSP: 0018:ffffb09b00493e08 EFLAGS: 00010202 [ 155.036922] RAX: 0000000000000001 RBX: 0000000000000008 RCX: 0000000000000008 [ 155.036924] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9781b67cd870 [ 155.036926] RBP: 0000000000000002 R08: 0000000000000000 R09: 61c8864680b583eb [ 155.036929] R10: ffffb09b00493e48 R11: ffffffffff7ce7d4 R12: ffff9781b7ee8d78 [ 155.036932] R13: ffff9781b67cd800 R14: 0000000000000004 R15: ffff9781b67cd870 [ 155.036936] FS: 00007fd813250b88(0000) GS:ffff9781ba000000(0000) knlGS:0000000000000000 [ 155.036939] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 155.036942] CR2: 00007fd812ff61d6 CR3: 000000007c882000 CR4: 00000000000006b0 [ 155.036951] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 155.036953] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 155.036955] Call Trace: [ 155.037004] dquot_quota_enable+0x8b/0xd0 [ 155.037011] kernel_quotactl+0x628/0x74e [ 155.037027] ? do_mprotect_pkey+0x2a6/0x2cd [ 155.037034] __x64_sys_quotactl+0x1a/0x1d [ 155.037041] do_syscall_64+0x55/0xe4 [ 155.037078] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 155.037105] RIP: 0033:0x7fd812fe1198 [ 155.037109] Code: 02 77 0d 48 89 c1 48 c1 e9 3f 75 04 48 8b 04 24 48 83 c4 50 5b c3 48 83 ec 08 49 89 ca 48 63 d2 48 63 ff b8 b3 00 00 00 0f 05 <48> 89 c7 e8 c1 eb ff ff 5a c3 48 63 ff b8 bb 00 00 00 0f 05 48 89 [ 155.037112] RSP: 002b:00007ffe8cd7b050 EFLAGS: 00000206 ORIG_RAX: 00000000000000b3 [ 155.037116] RAX: ffffffffffffffda RBX: 00007ffe8cd7b148 RCX: 00007fd812fe1198 [ 155.037119] RDX: 0000000000000000 RSI: 00007ffe8cd7cea9 RDI: 0000000000580102 [ 155.037121] RBP: 00007ffe8cd7b0f0 R08: 000055fc8eba8a9d R09: 0000000000000000 [ 155.037124] R10: 00007ffe8cd7b074 R11: 0000000000000206 R12: 00007ffe8cd7b168 [ 155.037126] R13: 000055fc8eba8897 R14: 0000000000000000 R15: 0000000000000000 [ 155.037131] ---[ end trace 210f864257175c51 ]--- and then the syscall proceeds without s_umount locking. This patch locks the superblock ->s_umount sem. in exclusive mode for all Q_XQUOTAON/OFF quotactls too in addition to Q_QUOTAON/OFF. AFAICT, other than ext4, only xfs and ocfs2 are affected by this change. The VFS will now call in xfs_quota_* functions with s_umount held, which wasn't the case before. This looks good to me but I can not say for sure. Ext4 and ocfs2 where already beeing called with s_umount exclusive via quota_quotaon/off which is basically the same. Signed-off-by: Javier Barrio Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/quota/quota.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f0cbf58ad4da..fd5dd806f1b9 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -791,7 +791,8 @@ static int quotactl_cmd_write(int cmd) /* Return true if quotactl command is manipulating quota on/off state */ static bool quotactl_cmd_onoff(int cmd) { - return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF); + return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF) || + (cmd == Q_XQUOTAON) || (cmd == Q_XQUOTAOFF); } /* -- cgit v1.2.3 From c9dcb871b1a99301ba65685be95cbd93a5c540fe Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 21 Dec 2018 08:42:50 -0800 Subject: iomap: don't search past page end in iomap_is_partially_uptodate [ Upstream commit 3cc31fa65d85610574c0f6a474e89f4c419923d5 ] iomap_is_partially_uptodate() is intended to check wither blocks within the selected range of a not-uptodate page are uptodate; if the range we care about is up to date, it's an optimization. However, the iomap implementation continues to check all blocks up to from+count, which is beyond the page, and can even be well beyond the iop->uptodate bitmap. I think the worst that will happen is that we may eventually find a zero bit and return "not partially uptodate" when it would have otherwise returned true, and skip the optimization. Still, it's clearly an invalid memory access that must be fixed. So: fix this by limiting the search to within the page as is done in the non-iomap variant, block_is_partially_uptodate(). Zorro noticed thiswhen KASAN went off for 512 byte blocks on a 64k page system: BUG: KASAN: slab-out-of-bounds in iomap_is_partially_uptodate+0x1a0/0x1e0 Read of size 8 at addr ffff800120c3a318 by task fsstress/22337 Reported-by: Zorro Lang Signed-off-by: Eric Sandeen Signed-off-by: Eric Sandeen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Sasha Levin --- fs/iomap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index ec15cf2ec696..e57fb1e534c5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -488,16 +488,29 @@ done: } EXPORT_SYMBOL_GPL(iomap_readpages); +/* + * iomap_is_partially_uptodate checks whether blocks within a page are + * uptodate or not. + * + * Returns true if all blocks which correspond to a file portion + * we want to read within the page are uptodate. + */ int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count) { struct iomap_page *iop = to_iomap_page(page); struct inode *inode = page->mapping->host; - unsigned first = from >> inode->i_blkbits; - unsigned last = (from + count - 1) >> inode->i_blkbits; + unsigned len, first, last; unsigned i; + /* Limit range to one page */ + len = min_t(unsigned, PAGE_SIZE - from, count); + + /* First and last blocks in range within page */ + first = from >> inode->i_blkbits; + last = (from + len - 1) >> inode->i_blkbits; + if (iop) { for (i = first; i <= last; i++) if (!test_bit(i, iop->uptodate)) -- cgit v1.2.3 From 5a404f39f8fad2249387dfd0d15c43942f6bb4fb Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 28 Dec 2018 00:32:50 -0800 Subject: ocfs2: fix panic due to unrecovered local alloc [ Upstream commit 532e1e54c8140188e192348c790317921cb2dc1c ] mount.ocfs2 ignore the inconsistent error that journal is clean but local alloc is unrecovered. After mount, local alloc not empty, then reserver cluster didn't alloc a new local alloc window, reserveration map is empty(ocfs2_reservation_map.m_bitmap_len = 0), that triggered the following panic. This issue was reported at https://oss.oracle.com/pipermail/ocfs2-devel/2015-May/010854.html and was advised to fixed during mount. But this is a very unusual inconsistent state, usually journal dirty flag should be cleared at the last stage of umount until every other things go right. We may need do further debug to check that. Any way to avoid possible futher corruption, mount should be abort and fsck should be run. (mount.ocfs2,1765,1):ocfs2_load_local_alloc:353 ERROR: Local alloc hasn't been recovered! found = 6518, set = 6518, taken = 8192, off = 15912372 ocfs2: Mounting device (202,64) on (node 0, slot 3) with ordered data mode. o2dlm: Joining domain 89CEAC63CC4F4D03AC185B44E0EE0F3F ( 0 1 2 3 4 5 6 8 ) 8 nodes ocfs2: Mounting device (202,80) on (node 0, slot 3) with ordered data mode. o2hb: Region 89CEAC63CC4F4D03AC185B44E0EE0F3F (xvdf) is now a quorum device o2net: Accepted connection from node yvwsoa17p (num 7) at 172.22.77.88:7777 o2dlm: Node 7 joins domain 64FE421C8C984E6D96ED12C55FEE2435 ( 0 1 2 3 4 5 6 7 8 ) 9 nodes o2dlm: Node 7 joins domain 89CEAC63CC4F4D03AC185B44E0EE0F3F ( 0 1 2 3 4 5 6 7 8 ) 9 nodes ------------[ cut here ]------------ kernel BUG at fs/ocfs2/reservations.c:507! invalid opcode: 0000 [#1] SMP Modules linked in: ocfs2 rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs fscache lockd grace ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue configfs sunrpc ipt_REJECT nf_reject_ipv4 nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_reject_ipv6 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr ipv6 ovmapi ppdev parport_pc parport xen_netfront fb_sys_fops sysimgblt sysfillrect syscopyarea acpi_cpufreq pcspkr i2c_piix4 i2c_core sg ext4 jbd2 mbcache2 sr_mod cdrom xen_blkfront pata_acpi ata_generic ata_piix floppy dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 4349 Comm: startWebLogic.s Not tainted 4.1.12-124.19.2.el6uek.x86_64 #2 Hardware name: Xen HVM domU, BIOS 4.4.4OVM 09/06/2018 task: ffff8803fb04e200 ti: ffff8800ea4d8000 task.ti: ffff8800ea4d8000 RIP: 0010:[] [] __ocfs2_resv_find_window+0x498/0x760 [ocfs2] Call Trace: ocfs2_resmap_resv_bits+0x10d/0x400 [ocfs2] ocfs2_claim_local_alloc_bits+0xd0/0x640 [ocfs2] __ocfs2_claim_clusters+0x178/0x360 [ocfs2] ocfs2_claim_clusters+0x1f/0x30 [ocfs2] ocfs2_convert_inline_data_to_extents+0x634/0xa60 [ocfs2] ocfs2_write_begin_nolock+0x1c6/0x1da0 [ocfs2] ocfs2_write_begin+0x13e/0x230 [ocfs2] generic_perform_write+0xbf/0x1c0 __generic_file_write_iter+0x19c/0x1d0 ocfs2_file_write_iter+0x589/0x1360 [ocfs2] __vfs_write+0xb8/0x110 vfs_write+0xa9/0x1b0 SyS_write+0x46/0xb0 system_call_fastpath+0x18/0xd7 Code: ff ff 8b 75 b8 39 75 b0 8b 45 c8 89 45 98 0f 84 e5 fe ff ff 45 8b 74 24 18 41 8b 54 24 1c e9 56 fc ff ff 85 c0 0f 85 48 ff ff ff <0f> 0b 48 8b 05 cf c3 de ff 48 ba 00 00 00 00 00 00 00 10 48 85 RIP __ocfs2_resv_find_window+0x498/0x760 [ocfs2] RSP ---[ end trace 566f07529f2edf3c ]--- Kernel panic - not syncing: Fatal exception Kernel Offset: disabled Link: http://lkml.kernel.org/r/20181121020023.3034-2-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Yiwen Jiang Acked-by: Joseph Qi Cc: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/ocfs2/localalloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 7642b6712c39..30208233f65b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -345,13 +345,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) if (num_used || alloc->id1.bitmap1.i_used || alloc->id1.bitmap1.i_total - || la->la_bm_off) - mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + || la->la_bm_off) { + mlog(ML_ERROR, "inconsistent detected, clean journal with" + " unrecovered local alloc, please run fsck.ocfs2!\n" "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + status = -EINVAL; + goto bail; + } + osb->local_alloc_bh = alloc_bh; osb->local_alloc_state = OCFS2_LA_ENABLED; -- cgit v1.2.3 From 2011eb74180356d56afa47c238b8861f8e45ecf6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 28 Dec 2018 00:38:47 -0800 Subject: userfaultfd: clear flag if remap event not enabled [ Upstream commit 3cfd22be0ad663248fadfc8f6ffa3e255c394552 ] When the process being tracked does mremap() without UFFD_FEATURE_EVENT_REMAP on the corresponding tracking uffd file handle, we should not generate the remap event, and at the same time we should clear all the uffd flags on the new VMA. Without this patch, we can still have the VM_UFFD_MISSING|VM_UFFD_WP flags on the new VMA even the fault handling process does not even know the existance of the VMA. Link: http://lkml.kernel.org/r/20181211053409.20317-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Andrea Arcangeli Acked-by: Mike Rapoport Reviewed-by: William Kucharski Cc: Andrea Arcangeli Cc: Mike Rapoport Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Pavel Emelyanov Cc: Pravin Shedge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/userfaultfd.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7a85e609fc27..d8b8323e80f4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -736,10 +736,18 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx; ctx = vma->vm_userfaultfd_ctx.ctx; - if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { + + if (!ctx) + return; + + if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); WRITE_ONCE(ctx->mmap_changing, true); + } else { + /* Drop uffd context if remap feature not enabled */ + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); } } -- cgit v1.2.3 From 3e05ceedf1439eea0a1306e30935ef5ee65e5d4f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 10 Jan 2019 15:41:09 +0800 Subject: ceph: clear inode pointer when snap realm gets dropped by its inode commit d95e674c01cfb5461e8b9fdeebf6d878c9b80b2f upstream. snap realm and corresponding inode have pointers to each other. The two pointer should get clear at the same time. Otherwise, snap realm's pointer may reference freed inode. Cc: stable@vger.kernel.org # 4.17+ Signed-off-by: "Yan, Zheng" Reviewed-by: Luis Henriques Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/caps.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index eadffaa39f4e..c7542e8dd096 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1030,6 +1030,8 @@ static void drop_inode_snap_realm(struct ceph_inode_info *ci) list_del_init(&ci->i_snap_realm_item); ci->i_snap_realm_counter++; ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; spin_unlock(&realm->inodes_with_caps_lock); ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, realm); -- cgit v1.2.3 From a719cbe07847346f6f42a6133e231603d752c69d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 1 Jan 2019 18:54:26 +0900 Subject: inotify: Fix fd refcount leak in inotify_add_watch(). commit 125892edfe69915a227d8d125ff0e1cd713178f4 upstream. Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") forgot to call fdput() before bailing out. Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") CC: stable@vger.kernel.org Signed-off-by: Tetsuo Handa Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/notify/inotify/inotify_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ac6978d3208c..780bba695453 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -724,8 +724,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ - if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) - return -EINVAL; + if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { + ret = -EINVAL; + goto fput_and_out; + } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { -- cgit v1.2.3 From 07b9e5e35e8f4189a0a3b87beaca1094d5c18b24 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 17 Jan 2019 08:21:24 -0800 Subject: CIFS: Fix possible hang during async MTU reads and writes commit acc58d0bab55a50e02c25f00bd6a210ee121595f upstream. When doing MTU i/o we need to leave some credits for possible reopen requests and other operations happening in parallel. Currently we leave 1 credit which is not enough even for reopen only: we need at least 2 credits if durable handle reconnect fails. Also there may be other operations at the same time including compounding ones which require 3 credits at a time each. Fix this by leaving 8 credits which is big enough to cover most scenarios. Was able to reproduce this when server was configured to give out fewer credits than usual. The proper fix would be to reconnect a file handle first and then obtain credits for an MTU request but this leads to bigger code changes and should happen in other patches. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f44bb4a304e9..0c2bbeab5f64 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -154,14 +154,14 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, scredits = server->credits; /* can deadlock with reopen */ - if (scredits == 1) { + if (scredits <= 8) { *num = SMB2_MAX_BUFFER_SIZE; *credits = 0; break; } - /* leave one credit for a possible reopen */ - scredits--; + /* leave some credits for reopen and other ops */ + scredits -= 8; *num = min_t(unsigned int, size, scredits * SMB2_MAX_BUFFER_SIZE); -- cgit v1.2.3 From 0380ed9b1cd39a582cd14b21072d59fb66bc7d50 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 17 Jan 2019 15:29:26 -0800 Subject: CIFS: Fix credits calculations for reads with errors commit 8004c78c68e894e4fd5ac3c22cc22eb7dc24cabc upstream. Currently we mark MID as malformed if we get an error from server in a read response. This leads to not properly processing credits in the readv callback. Fix this by marking such a response as normal received response and process it appropriately. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifssmb.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5657b79dbc99..269471c8f42b 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1458,18 +1458,26 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) } static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +__cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid, + bool malformed) { int length; - struct cifs_readdata *rdata = mid->callback_data; length = cifs_discard_remaining_data(server); - dequeue_mid(mid, rdata->result); + dequeue_mid(mid, malformed); mid->resp_buf = server->smallbuf; server->smallbuf = NULL; return length; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + struct cifs_readdata *rdata = mid->callback_data; + + return __cifs_readv_discard(server, mid, rdata->result); +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1511,12 +1519,23 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; } + /* set up first two iov for signature check and to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = server->total_read - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - return cifs_readv_discard(server, mid); + /* normal error on read response */ + return __cifs_readv_discard(server, mid, false); } /* Is there enough to get to the rest of the READ_RSP header? */ @@ -1560,14 +1579,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) server->total_read += length; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->total_read - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", - rdata->iov[0].iov_base, server->total_read); - /* how much data is in the response? */ #ifdef CONFIG_CIFS_SMB_DIRECT use_rdma_mr = rdata->mr; -- cgit v1.2.3 From 2ae6fedbd5cb4144f5cf40681b9a3ea5464d31bd Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 15:38:11 -0800 Subject: CIFS: Fix credit calculation for encrypted reads with errors commit ec678eae746dd25766a61c4095e2b649d3b20b09 upstream. We do need to account for credits received in error responses to read requests on encrypted sessions. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 0c2bbeab5f64..ffb55c174482 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2901,11 +2901,23 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, server->ops->is_status_pending(buf, server, 0)) return -1; - rdata->result = server->ops->map_error(buf, false); + /* set up first two iov to get credits */ + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = 4; + rdata->iov[1].iov_base = buf + 4; + rdata->iov[1].iov_len = + min_t(unsigned int, buf_len, server->vals->read_rsp_size) - 4; + cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", + rdata->iov[0].iov_base, rdata->iov[0].iov_len); + cifs_dbg(FYI, "1: iov_base=%p iov_len=%zu\n", + rdata->iov[1].iov_base, rdata->iov[1].iov_len); + + rdata->result = server->ops->map_error(buf, true); if (rdata->result != 0) { cifs_dbg(FYI, "%s: server returned error %d\n", __func__, rdata->result); - dequeue_mid(mid, rdata->result); + /* normal error on read response */ + dequeue_mid(mid, false); return 0; } @@ -2978,14 +2990,6 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, return 0; } - /* set up first iov for signature check */ - rdata->iov[0].iov_base = buf; - rdata->iov[0].iov_len = 4; - rdata->iov[1].iov_base = buf + 4; - rdata->iov[1].iov_len = server->vals->read_rsp_size - 4; - cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n", - rdata->iov[0].iov_base, server->vals->read_rsp_size); - length = rdata->copy_into_pages(server, rdata, &iter); kfree(bvec); -- cgit v1.2.3 From 779c65bb77391b9e096a0cd1d22c039e9a133911 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 17:25:36 -0800 Subject: CIFS: Do not reconnect TCP session in add_credits() commit ef68e831840c40c7d01b328b3c0f5d8c4796c232 upstream. When executing add_credits() we currently call cifs_reconnect() if the number of credits is zero and there are no requests in flight. In this case we may call cifs_reconnect() recursively twice and cause memory corruption given the following sequence of functions: mid1.callback() -> add_credits() -> cifs_reconnect() -> -> mid2.callback() -> add_credits() -> cifs_reconnect(). Fix this by avoiding to call cifs_reconnect() in add_credits() and checking for zero credits in the demultiplex thread. Cc: Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 21 +++++++++++++++++++++ fs/cifs/smb2ops.c | 32 +++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 52d71b64c0c6..d0bba175117c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -533,6 +533,21 @@ server_unresponsive(struct TCP_Server_Info *server) return false; } +static inline bool +zero_credits(struct TCP_Server_Info *server) +{ + int val; + + spin_lock(&server->req_lock); + val = server->credits + server->echo_credits + server->oplock_credits; + if (server->in_flight == 0 && val == 0) { + spin_unlock(&server->req_lock); + return true; + } + spin_unlock(&server->req_lock); + return false; +} + static int cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) { @@ -545,6 +560,12 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) for (total_read = 0; msg_data_left(smb_msg); total_read += length) { try_to_freeze(); + /* reconnect if no credits and no requests in flight */ + if (zero_credits(server)) { + cifs_reconnect(server); + return -ECONNABORTED; + } + if (server_unresponsive(server)) return -ECONNABORTED; if (cifs_rdma_enabled(server) && server->smbd_conn) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index ffb55c174482..237d7281ada3 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -34,6 +34,7 @@ #include "cifs_ioctl.h" #include "smbdirect.h" +/* Change credits for different ops and return the total number of credits */ static int change_conf(struct TCP_Server_Info *server) { @@ -41,17 +42,15 @@ change_conf(struct TCP_Server_Info *server) server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: - return -1; + return 0; case 1: server->echoes = false; server->oplocks = false; - cifs_dbg(VFS, "disabling echoes and oplocks\n"); break; case 2: server->echoes = true; server->oplocks = false; server->echo_credits = 1; - cifs_dbg(FYI, "disabling oplocks\n"); break; default: server->echoes = true; @@ -64,14 +63,15 @@ change_conf(struct TCP_Server_Info *server) server->echo_credits = 1; } server->credits -= server->echo_credits + server->oplock_credits; - return 0; + return server->credits + server->echo_credits + server->oplock_credits; } static void smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, const int optype) { - int *val, rc = 0; + int *val, rc = -1; + spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); *val += add; @@ -95,8 +95,26 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, } spin_unlock(&server->req_lock); wake_up(&server->request_q); - if (rc) - cifs_reconnect(server); + + if (server->tcpStatus == CifsNeedReconnect) + return; + + switch (rc) { + case -1: + /* change_conf hasn't been executed */ + break; + case 0: + cifs_dbg(VFS, "Possible client or server bug - zero credits\n"); + break; + case 1: + cifs_dbg(VFS, "disabling echoes and oplocks\n"); + break; + case 2: + cifs_dbg(FYI, "disabling oplocks\n"); + break; + default: + cifs_dbg(FYI, "add %u credits total=%d\n", add, rc); + } } static void -- cgit v1.2.3 From 06d9f987201feeada1e48e501db5f4a202541db3 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 23 Jan 2019 16:20:38 +1000 Subject: smb3: add credits we receive from oplock/break PDUs commit 2e5700bdde438ed708b36d8acd0398dc73cbf759 upstream. Otherwise we gradually leak credits leading to potential hung session. Signed-off-by: Ronnie Sahlberg CC: Stable Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2misc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 6a9c47541c53..7b8b58fb4d3f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -648,6 +648,13 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) return false; + if (rsp->sync_hdr.CreditRequest) { + spin_lock(&server->req_lock); + server->credits += le16_to_cpu(rsp->sync_hdr.CreditRequest); + spin_unlock(&server->req_lock); + wake_up(&server->request_q); + } + if (rsp->StructureSize != smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) { if (le16_to_cpu(rsp->StructureSize) == 44) -- cgit v1.2.3 From c6961288a5f465d7753408a8b6b8158907476ea6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 26 Jan 2019 12:21:32 -0800 Subject: CIFS: Do not count -ENODATA as failure for query directory commit 8e6e72aeceaaed5aeeb1cb43d3085de7ceb14f79 upstream. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index dba986524917..ff13f8325540 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3714,8 +3714,8 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { srch_inf->endOfSearch = true; rc = 0; - } - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } else + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } -- cgit v1.2.3 From 6e7045ec336be8eafa26270efe013e5a17371cf6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 25 Jan 2019 11:38:53 -0800 Subject: CIFS: Fix trace command logging for SMB2 reads and writes commit 7d42e72fe8ee5ab70b1af843dd7d8615e6fb0abe upstream. Currently we log success once we send an async IO request to the server. Instead we need to analyse a response and then log success or failure for a particular command. Also fix argument list for read logging. Cc: # 4.18 Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ff13f8325540..f5a8139f61c0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3127,8 +3127,17 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) + if (rdata->result) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); + trace_smb3_read_err(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, rdata->offset, + rdata->bytes, rdata->result); + } else + trace_smb3_read_done(0 /* xid */, + rdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + rdata->offset, rdata->got_bytes); queue_work(cifsiod_wq, &rdata->work); DeleteMidQEntry(mid); @@ -3203,13 +3212,11 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rc) { kref_put(&rdata->refcount, cifs_readdata_release); cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); - trace_smb3_read_err(rc, 0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); - } else - trace_smb3_read_done(0 /* xid */, io_parms.persistent_fid, - io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length); + trace_smb3_read_err(0 /* xid */, io_parms.persistent_fid, + io_parms.tcon->tid, + io_parms.tcon->ses->Suid, + io_parms.offset, io_parms.length, rc); + } cifs_small_buf_release(buf); return rc; @@ -3253,10 +3260,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); + trace_smb3_read_err(xid, req->PersistentFileId, + io_parms->tcon->tid, ses->Suid, + io_parms->offset, io_parms->length, + rc); } - trace_smb3_read_err(rc, xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, io_parms->length); free_rsp_buf(resp_buftype, rsp_iov.iov_base); return rc == -ENODATA ? 0 : rc; } else @@ -3342,8 +3350,17 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->mr = NULL; } #endif - if (wdata->result) + if (wdata->result) { cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); + trace_smb3_write_err(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, wdata->offset, + wdata->bytes, wdata->result); + } else + trace_smb3_write_done(0 /* no xid */, + wdata->cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, + wdata->offset, wdata->bytes); queue_work(cifsiod_wq, &wdata->work); DeleteMidQEntry(mid); @@ -3485,10 +3502,7 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata->bytes, rc); kref_put(&wdata->refcount, release); cifs_stats_fail_inc(tcon, SMB2_WRITE_HE); - } else - trace_smb3_write_done(0 /* no xid */, req->PersistentFileId, - tcon->tid, tcon->ses->Suid, wdata->offset, - wdata->bytes); + } async_writev_out: cifs_small_buf_release(req); -- cgit v1.2.3 From e9d56f920bb26eee1a1e574463e7a1fea5a1164e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 18 Jan 2019 15:54:34 -0800 Subject: CIFS: Do not consider -ENODATA as stat failure for reads commit 082aaa8700415f6471ec9c5ef0c8307ca214989a upstream. When doing reads beyound the end of a file the server returns error STATUS_END_OF_FILE error which is mapped to -ENODATA. Currently we report it as a failure which confuses read stats. Change it to not consider -ENODATA as failure for stat purposes. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f5a8139f61c0..8a01e89ff827 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3127,7 +3127,7 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->mr = NULL; } #endif - if (rdata->result) { + if (rdata->result && rdata->result != -ENODATA) { cifs_stats_fail_inc(tcon, SMB2_READ_HE); trace_smb3_read_err(0 /* xid */, rdata->cfile->fid.persistent_fid, -- cgit v1.2.3 From bb4e1ff5a8dde1f036e2aefcbdbe098f4098c92d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 30 Jan 2019 13:52:36 -0500 Subject: fs/dcache: Fix incorrect nr_dentry_unused accounting in shrink_dcache_sb() commit 1dbd449c9943e3145148cc893c2461b72ba6fef0 upstream. The nr_dentry_unused per-cpu counter tracks dentries in both the LRU lists and the shrink lists where the DCACHE_LRU_LIST bit is set. The shrink_dcache_sb() function moves dentries from the LRU list to a shrink list and subtracts the dentry count from nr_dentry_unused. This is incorrect as the nr_dentry_unused count will also be decremented in shrink_dentry_list() via d_shrink_del(). To fix this double decrement, the decrement in the shrink_dcache_sb() function is taken out. Fixes: 4e717f5c1083 ("list_lru: remove special case function list_lru_dispose_all." Cc: stable@kernel.org Signed-off-by: Waiman Long Reviewed-by: Dave Chinner Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2e7e8d85e9b4..cb515f183482 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1202,15 +1202,11 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, */ void shrink_dcache_sb(struct super_block *sb) { - long freed; - do { LIST_HEAD(dispose); - freed = list_lru_walk(&sb->s_dentry_lru, + list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); - - this_cpu_sub(nr_dentry_unused, freed); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } -- cgit v1.2.3 From 0a3275d785858074d5e462222c7182b34a89bdc9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 29 Jan 2019 15:52:55 -0500 Subject: NFS: Fix up return value on fatal errors in nfs_page_async_flush() commit 8fc75bed96bb94e23ca51bd9be4daf65c57697bf upstream. Ensure that we return the fatal error value that caused us to exit nfs_page_async_flush(). Fixes: c373fff7bd25 ("NFSv4: Don't special case "launder"") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.12+ Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/write.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 586726a590d8..d790faff8e47 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -621,11 +621,12 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_set_page_writeback(page); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); - ret = 0; + ret = req->wb_context->error; /* If there is a fatal error that covers this write, just exit */ - if (nfs_error_is_fatal_on_server(req->wb_context->error)) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; + ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -635,9 +636,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, nfs_context_set_write_error(req->wb_context, ret); if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - } + } else + ret = -EAGAIN; nfs_redirty_request(req); - ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); -- cgit v1.2.3 From 8b9be9db8a2ac5ec2046cc95df35487e5aaa136c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 30 Jan 2019 21:30:36 +0100 Subject: gfs2: Revert "Fix loop in gfs2_rbm_find" commit e74c98ca2d6ae4376cc15fa2a22483430909d96b upstream. This reverts commit 2d29f6b96d8f80322ed2dd895bca590491c38d34. It turns out that the fix can lead to a ~20 percent performance regression in initial writes to the page cache according to iozone. Let's revert this for now to have more time for a proper fix. Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/rgrp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index e978f6930575..449d0cb45a84 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1747,9 +1747,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_iter; } if (ret == -E2BIG) { - n += rbm->bii - initial_bii; rbm->bii = 0; rbm->offset = 0; + n += (rbm->bii - initial_bii); goto res_covered_end_of_rgrp; } return ret; -- cgit v1.2.3 From 5bce143671f36cf63f70eab2185b2c5910203873 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 25 Jan 2019 11:48:51 +0000 Subject: Btrfs: fix deadlock when allocating tree block during leaf/node split commit a6279470762c19ba97e454f90798373dccdf6148 upstream. When splitting a leaf or node from one of the trees that are modified when flushing pending block groups (extent, chunk, device and free space trees), we need to allocate a new tree block, which in turn can result in the need to allocate a new block group. After allocating the new block group we may need to flush new block groups that were previously allocated during the course of the current transaction, which is what may cause a deadlock due to attempts to write lock twice the same leaf or node, as when splitting a leaf or node we are holding a write lock on it and its parent node. The same type of deadlock can also happen when increasing the tree's height, since we are holding a lock on the existing root while allocating the tree block to use as the new root node. An example trace when the deadlock happens during the leaf split path is: [27175.293054] CPU: 0 PID: 3005 Comm: kworker/u17:6 Tainted: G W 4.19.16 #1 [27175.293942] Hardware name: Penguin Computing Relion 1900/MD90-FS0-ZB-XX, BIOS R15 06/25/2018 [27175.294846] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] (...) [27175.298384] RSP: 0018:ffffab2087107758 EFLAGS: 00010246 [27175.299269] RAX: 0000000000000bbd RBX: ffff9fadc7141c48 RCX: 0000000000000001 [27175.300155] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff9fadc7141c48 [27175.301023] RBP: 0000000000000001 R08: ffff9faeb6ac1040 R09: ffff9fa9c0000000 [27175.301887] R10: 0000000000000000 R11: 0000000000000040 R12: ffff9fb21aac8000 [27175.302743] R13: ffff9fb1a64d6a20 R14: 0000000000000001 R15: ffff9fb1a64d6a18 [27175.303601] FS: 0000000000000000(0000) GS:ffff9fb21fa00000(0000) knlGS:0000000000000000 [27175.304468] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [27175.305339] CR2: 00007fdc8743ead8 CR3: 0000000763e0a006 CR4: 00000000003606f0 [27175.306220] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [27175.307087] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [27175.307940] Call Trace: [27175.308802] btrfs_search_slot+0x779/0x9a0 [btrfs] [27175.309669] ? update_space_info+0xba/0xe0 [btrfs] [27175.310534] btrfs_insert_empty_items+0x67/0xc0 [btrfs] [27175.311397] btrfs_insert_item+0x60/0xd0 [btrfs] [27175.312253] btrfs_create_pending_block_groups+0xee/0x210 [btrfs] [27175.313116] do_chunk_alloc+0x25f/0x300 [btrfs] [27175.313984] find_free_extent+0x706/0x10d0 [btrfs] [27175.314855] btrfs_reserve_extent+0x9b/0x1d0 [btrfs] [27175.315707] btrfs_alloc_tree_block+0x100/0x5b0 [btrfs] [27175.316548] split_leaf+0x130/0x610 [btrfs] [27175.317390] btrfs_search_slot+0x94d/0x9a0 [btrfs] [27175.318235] btrfs_insert_empty_items+0x67/0xc0 [btrfs] [27175.319087] alloc_reserved_file_extent+0x84/0x2c0 [btrfs] [27175.319938] __btrfs_run_delayed_refs+0x596/0x1150 [btrfs] [27175.320792] btrfs_run_delayed_refs+0xed/0x1b0 [btrfs] [27175.321643] delayed_ref_async_start+0x81/0x90 [btrfs] [27175.322491] normal_work_helper+0xd0/0x320 [btrfs] [27175.323328] ? move_linked_works+0x6e/0xa0 [27175.324160] process_one_work+0x191/0x370 [27175.324976] worker_thread+0x4f/0x3b0 [27175.325763] kthread+0xf8/0x130 [27175.326531] ? rescuer_thread+0x320/0x320 [27175.327284] ? kthread_create_worker_on_cpu+0x50/0x50 [27175.328027] ret_from_fork+0x35/0x40 [27175.328741] ---[ end trace 300a1b9f0ac30e26 ]--- Fix this by preventing the flushing of new blocks groups when splitting a leaf/node and when inserting a new root node for one of the trees modified by the flushing operation, similar to what is done when COWing a node/leaf from on of these trees. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202383 Reported-by: Eli V CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.c | 78 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7ad6f2eec711..48ac8b7c43a5 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1003,6 +1003,48 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return 0; } +static struct extent_buffer *alloc_tree_block_no_bg_flush( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent_start, + const struct btrfs_disk_key *disk_key, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *ret; + + /* + * If we are COWing a node/leaf from the extent, chunk, device or free + * space trees, make sure that we do not finish block group creation of + * pending block groups. We do this to avoid a deadlock. + * COWing can result in allocation of a new chunk, and flushing pending + * block groups (btrfs_create_pending_block_groups()) can be triggered + * when finishing allocation of a new chunk. Creation of a pending block + * group modifies the extent, chunk, device and free space trees, + * therefore we could deadlock with ourselves since we are holding a + * lock on an extent buffer that btrfs_create_pending_block_groups() may + * try to COW later. + * For similar reasons, we also need to delay flushing pending block + * groups when splitting a leaf or node, from one of those trees, since + * we are holding a write lock on it and its parent or when inserting a + * new root node for one of those trees. + */ + if (root == fs_info->extent_root || + root == fs_info->chunk_root || + root == fs_info->dev_root || + root == fs_info->free_space_root) + trans->can_flush_pending_bgs = false; + + ret = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, disk_key, level, + hint, empty_size); + trans->can_flush_pending_bgs = true; + + return ret; +} + /* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked @@ -1050,28 +1092,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) parent_start = parent->start; - /* - * If we are COWing a node/leaf from the extent, chunk, device or free - * space trees, make sure that we do not finish block group creation of - * pending block groups. We do this to avoid a deadlock. - * COWing can result in allocation of a new chunk, and flushing pending - * block groups (btrfs_create_pending_block_groups()) can be triggered - * when finishing allocation of a new chunk. Creation of a pending block - * group modifies the extent, chunk, device and free space trees, - * therefore we could deadlock with ourselves since we are holding a - * lock on an extent buffer that btrfs_create_pending_block_groups() may - * try to COW later. - */ - if (root == fs_info->extent_root || - root == fs_info->chunk_root || - root == fs_info->dev_root || - root == fs_info->free_space_root) - trans->can_flush_pending_bgs = false; - - cow = btrfs_alloc_tree_block(trans, root, parent_start, - root->root_key.objectid, &disk_key, level, - search_start, empty_size); - trans->can_flush_pending_bgs = true; + cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3383,8 +3405,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &lower_key, level, root->node->start, 0); + c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, + root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3513,8 +3535,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, level, c->start, 0); + split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, + c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4298,8 +4320,8 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, + l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); -- cgit v1.2.3 From 9ee5987f311fc2df9b75fe08fac51cea520f8e84 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 30 Jan 2019 07:54:12 -0600 Subject: btrfs: On error always free subvol_name in btrfs_mount commit 532b618bdf237250d6d4566536d4b6ce3d0a31fe upstream. The subvol_name is allocated in btrfs_parse_subvol_options and is consumed and freed in mount_subvol. Add a free to the error paths that don't call mount_subvol so that it is guaranteed that subvol_name is freed when an error happens. Fixes: 312c89fbca06 ("btrfs: cleanup btrfs_mount() using btrfs_mount_root()") Cc: stable@vger.kernel.org # v4.19+ Reviewed-by: Nikolay Borisov Signed-off-by: "Eric W. Biederman" Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8ad145820ea8..8888337a95b6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1677,6 +1677,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, flags | SB_RDONLY, device_name, data); if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } @@ -1686,12 +1687,14 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error < 0) { root = ERR_PTR(error); mntput(mnt_root); + kfree(subvol_name); goto out; } } } if (IS_ERR(mnt_root)) { root = ERR_CAST(mnt_root); + kfree(subvol_name); goto out; } -- cgit v1.2.3 From c0be624777ba33bac9bf17e2f9854e90ac6d78d8 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 20 Nov 2018 15:16:36 -0200 Subject: cifs: Always resolve hostname before reconnecting commit 28eb24ff75c5ac130eb326b3b4d0dcecfc0f427d upstream. In case a hostname resolves to a different IP address (e.g. long running mounts), make sure to resolve it every time prior to calling generic_ip_connect() in reconnect. Suggested-by: Steve French Signed-off-by: Paulo Alcantara Signed-off-by: Steve French Signed-off-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d0bba175117c..a5ea742654aa 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -50,6 +50,7 @@ #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "dns_resolve.h" #include "ntlmssp.h" #include "nterr.h" #include "rfc1002pdu.h" @@ -317,6 +318,53 @@ static void cifs_prune_tlinks(struct work_struct *work); static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, const char *devname, bool is_smb3); +/* + * Resolve hostname and set ip addr in tcp ses. Useful for hostnames that may + * get their ip addresses changed at some point. + * + * This should be called with server->srv_mutex held. + */ +#ifdef CONFIG_CIFS_DFS_UPCALL +static int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + int rc; + int len; + char *unc, *ipaddr = NULL; + + if (!server->hostname) + return -EINVAL; + + len = strlen(server->hostname) + 3; + + unc = kmalloc(len, GFP_KERNEL); + if (!unc) { + cifs_dbg(FYI, "%s: failed to create UNC path\n", __func__); + return -ENOMEM; + } + snprintf(unc, len, "\\\\%s", server->hostname); + + rc = dns_resolve_server_name_to_ip(unc, &ipaddr); + kfree(unc); + + if (rc < 0) { + cifs_dbg(FYI, "%s: failed to resolve server part of %s to IP: %d\n", + __func__, server->hostname, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr, + strlen(ipaddr)); + kfree(ipaddr); + + return !rc ? -1 : 0; +} +#else +static inline int reconn_set_ipaddr(struct TCP_Server_Info *server) +{ + return 0; +} +#endif + /* * cifs tcp session reconnection * @@ -417,6 +465,11 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); + rc = reconn_set_ipaddr(server); + if (rc) { + cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n", + __func__, rc); + } mutex_unlock(&server->srv_mutex); msleep(3000); } else { -- cgit v1.2.3 From ee9268a9b55b82bcc77c64bf137eb9aa7cf87aa9 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 8 Nov 2018 14:04:50 -0500 Subject: dlm: Don't swamp the CPU with callbacks queued during recovery [ Upstream commit 216f0efd19b9cc32207934fd1b87a45f2c4c593e ] Before this patch, recovery would cause all callbacks to be delayed, put on a queue, and afterward they were all queued to the callback work queue. This patch does the same thing, but occasionally takes a break after 25 of them so it won't swamp the CPU at the expense of other RT processes like corosync. Signed-off-by: Bob Peterson Signed-off-by: David Teigland Signed-off-by: Sasha Levin --- fs/dlm/ast.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 562fa8c3edff..47ee66d70109 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -292,6 +292,8 @@ void dlm_callback_suspend(struct dlm_ls *ls) flush_workqueue(ls->ls_callback_wq); } +#define MAX_CB_QUEUE 25 + void dlm_callback_resume(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; @@ -302,15 +304,23 @@ void dlm_callback_resume(struct dlm_ls *ls) if (!ls->ls_callback_wq) return; +more: mutex_lock(&ls->ls_cb_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) { list_del_init(&lkb->lkb_cb_list); queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work); count++; + if (count == MAX_CB_QUEUE) + break; } mutex_unlock(&ls->ls_cb_mutex); if (count) log_rinfo(ls, "dlm_callback_resume %d", count); + if (count == MAX_CB_QUEUE) { + count = 0; + cond_resched(); + goto more; + } } -- cgit v1.2.3 From c96d2b9d35b9e3e75360152d45cfc8d6fa0e521a Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 6 Nov 2018 10:25:29 +0800 Subject: f2fs: move dir data flush to write checkpoint process [ Upstream commit b61ac5b720146c619c7cdf17eff2551b934399e5 ] This patch move dir data flush to write checkpoint process, by doing this, it may reduce some time for dir fsync. pre: -f2fs_do_sync_file enter -file_write_and_wait_range <- flush & wait -write_checkpoint -do_checkpoint <- wait all -f2fs_do_sync_file exit now: -f2fs_do_sync_file enter -write_checkpoint -block_operations <- flush dir & no wait -do_checkpoint <- wait all -f2fs_do_sync_file exit Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5474aaa274b9..fd36aa6569dc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -220,6 +220,9 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, trace_f2fs_sync_file_enter(inode); + if (S_ISDIR(inode->i_mode)) + goto go_write; + /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); -- cgit v1.2.3 From ddab3d0a38a5824462a16a7a123ad4abd165f3ad Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Wed, 14 Nov 2018 19:34:28 +0800 Subject: f2fs: fix race between write_checkpoint and write_begin [ Upstream commit 2866fb16d67992195b0526d19e65acb6640fb87f ] The following race could lead to inconsistent SIT bitmap: Task A Task B ====== ====== f2fs_write_checkpoint block_operations f2fs_lock_all down_write(node_change) down_write(node_write) ... sync ... up_write(node_change) f2fs_file_write_iter set_inode_flag(FI_NO_PREALLOC) ...... f2fs_write_begin(index=0, has inline data) prepare_write_begin __do_map_lock(AIO) => down_read(node_change) f2fs_convert_inline_page => update SIT __do_map_lock(AIO) => up_read(node_change) f2fs_flush_sit_entries <= inconsistent SIT finish write checkpoint sudden-power-off If SPO occurs after checkpoint is finished, SIT bitmap will be set incorrectly. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/data.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 11f28342f641..08314fb42652 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2259,6 +2259,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, bool locked = false; struct extent_info ei = {0,0,0}; int err = 0; + int flag; /* * we already allocated all the blocks, so we don't need to get @@ -2268,9 +2269,15 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; + /* f2fs_lock_op avoids race between write CP and convert_inline_page */ + if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode)) + flag = F2FS_GET_BLOCK_DEFAULT; + else + flag = F2FS_GET_BLOCK_PRE_AIO; + if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + __do_map_lock(sbi, flag, true); locked = true; } restart: @@ -2308,6 +2315,7 @@ restart: f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); + WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO); locked = true; goto restart; } @@ -2321,7 +2329,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); + __do_map_lock(sbi, flag, false); return err; } -- cgit v1.2.3 From d54d612a73a6105201ed8662e283665d079ff6ab Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Wed, 21 Nov 2018 07:21:38 +0800 Subject: f2fs: fix wrong return value of f2fs_acl_create [ Upstream commit f6176473a0c7472380eef72ebeb330cf9485bf0a ] When call f2fs_acl_create_masq() failed, the caller f2fs_acl_create() should return -EIO instead of -ENOMEM, this patch makes it consistent with posix_acl_create() which has been fixed in commit beaf226b863a ("posix_acl: don't ignore return value of posix_acl_create_masq()"). Fixes: 83dfe53c185e ("f2fs: fix reference leaks in f2fs_acl_create") Signed-off-by: Tiezhu Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/acl.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 111824199a88..b9fe937a3c70 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -352,12 +352,14 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return PTR_ERR(p); clone = f2fs_acl_clone(p, GFP_NOFS); - if (!clone) - goto no_mem; + if (!clone) { + ret = -ENOMEM; + goto release_acl; + } ret = f2fs_acl_create_masq(clone, mode); if (ret < 0) - goto no_mem_clone; + goto release_clone; if (ret == 0) posix_acl_release(clone); @@ -371,11 +373,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, return 0; -no_mem_clone: +release_clone: posix_acl_release(clone); -no_mem: +release_acl: posix_acl_release(p); - return -ENOMEM; + return ret; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, -- cgit v1.2.3 From 2f0fb76b3e411462b69e7df2e3cb0063051f2492 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 27 Nov 2018 15:54:17 -0500 Subject: nfsd4: fix crash on writing v4_end_grace before nfsd startup [ Upstream commit 62a063b8e7d1db684db3f207261a466fa3194e72 ] Anatoly Trosinenko reports that this: 1) Checkout fresh master Linux branch (tested with commit e195ca6cb) 2) Copy x84_64-config-4.14 to .config, then enable NFS server v4 and build 3) From `kvm-xfstests shell`: results in NULL dereference in locks_end_grace. Check that nfsd has been started before trying to end the grace period. Reported-by: Anatoly Trosinenko Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin --- fs/nfsd/nfsctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7fb9f7c667b1..899174c7a8ae 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,6 +1126,8 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': + if (nn->nfsd_serv) + return -EBUSY; nfsd4_end_grace(nn); break; default: -- cgit v1.2.3 From 6a1d712b43811fa2992b56108a633531406eac48 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Dec 2018 14:29:20 +0100 Subject: udf: Fix BUG on corrupted inode [ Upstream commit d288d95842f1503414b7eebce3773bac3390457e ] When inode is corrupted so that extent type is invalid, some functions (such as udf_truncate_extents()) will just BUG. Check that extent type is valid when loading the inode to memory. Reported-by: Anatoly Trosinenko Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/udf/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 5df554a9f9c9..ae796e10f68b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1357,6 +1357,12 @@ reread: iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; + if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && + iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { + ret = -EIO; + goto out; + } iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; -- cgit v1.2.3 From 3733632e8bc09971dccf8550e3872fd696d89d92 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 15 Oct 2018 10:45:17 +0800 Subject: btrfs: harden agaist duplicate fsid on scanned devices [ Upstream commit a9261d4125c97ce8624e9941b75dee1b43ad5df9 ] It's not that impossible to imagine that a device OR a btrfs image is copied just by using the dd or the cp command. Which in case both the copies of the btrfs will have the same fsid. If on the system with automount enabled, the copied FS gets scanned. We have a known bug in btrfs, that we let the device path be changed after the device has been mounted. So using this loop hole the new copied device would appears as if its mounted immediately after it's been copied. For example: Initially.. /dev/mmcblk0p4 is mounted as / $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part / |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/mmcblk0p4 Copy mmcblk0 to sda $ dd if=/dev/mmcblk0 of=/dev/sda And immediately after the copy completes the change in the device superblock is notified which the automount scans using btrfs device scan and the new device sda becomes the mounted root device. $ lsblk NAME MAJ:MIN RM SIZE RO TYPE MOUNTPOINT sda 8:0 1 14.9G 0 disk |-sda4 8:4 1 4G 0 part / |-sda2 8:2 1 500M 0 part |-sda3 8:3 1 256M 0 part `-sda1 8:1 1 256M 0 part mmcblk0 179:0 0 29.2G 0 disk |-mmcblk0p4 179:4 0 4G 0 part |-mmcblk0p2 179:2 0 500M 0 part /boot |-mmcblk0p3 179:3 0 256M 0 part [SWAP] `-mmcblk0p1 179:1 0 256M 0 part /boot/efi $ btrfs fi show / Label: none uuid: 07892354-ddaa-4443-90ea-f76a06accaba Total devices 1 FS bytes used 1.40GiB devid 1 size 4.00GiB used 3.00GiB path /dev/sda4 The bug is quite nasty that you can't either unmount /dev/sda4 or /dev/mmcblk0p4. And the problem does not get solved until you take sda out of the system on to another system to change its fsid using the 'btrfstune -u' command. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/volumes.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0ee1cd4b56fb..285f64f2de5f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -850,6 +850,35 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(-EEXIST); } + /* + * We are going to replace the device path for a given devid, + * make sure it's the same device if the device is mounted + */ + if (device->bdev) { + struct block_device *path_bdev; + + path_bdev = lookup_bdev(path); + if (IS_ERR(path_bdev)) { + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_CAST(path_bdev); + } + + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, + "duplicate device fsid:devid for %pU:%llu old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, + "device fsid %pU devid %llu moved old:%s new:%s", + disk_super->fsid, devid, + rcu_str_deref(device->name), path); + } + name = rcu_string_strdup(path, GFP_NOFS); if (!name) { mutex_unlock(&fs_devices->device_list_mutex); -- cgit v1.2.3 From f5d5b54349125f4765b51153960676ea4d81b82b Mon Sep 17 00:00:00 2001 From: Ethan Lien Date: Thu, 1 Nov 2018 14:49:03 +0800 Subject: btrfs: use tagged writepage to mitigate livelock of snapshot [ Upstream commit 3cd24c698004d2f7668e0eb9fc1f096f533c791b ] Snapshot is expected to be fast. But if there are writers steadily creating dirty pages in our subvolume, the snapshot may take a very long time to complete. To fix the problem, we use tagged writepage for snapshot flusher as we do in the generic write_cache_pages(), so we can omit pages dirtied after the snapshot command. This does not change the semantics regarding which data get to the snapshot, if there are pages being dirtied during the snapshotting operation. There's a sync called before snapshot is taken in old/new case, any IO in flight just after that may be in the snapshot but this depends on other system effects that might still sync the IO. We do a simple snapshot speed test on a Intel D-1531 box: fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio original: 1m58sec patched: 6.54sec This is the best case for this patch since for a sequential write case, we omit nearly all pages dirtied after the snapshot command. For a multi writers, random write test: fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio original: 15.83sec patched: 10.35sec The improvement is smaller compared to the sequential write case, since we omit only half of the pages dirtied after snapshot command. Reviewed-by: Nikolay Borisov Signed-off-by: Ethan Lien Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/btrfs_inode.h | 1 + fs/btrfs/ctree.h | 2 +- fs/btrfs/extent_io.c | 17 +++++++++++++++-- fs/btrfs/inode.c | 11 +++++++---- fs/btrfs/ioctl.c | 2 +- 5 files changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7177d1d33584..45f5cf9cd203 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -29,6 +29,7 @@ enum { BTRFS_INODE_IN_DELALLOC_LIST, BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, + BTRFS_INODE_SNAPSHOT_FLUSH, }; /* in memory btrfs inode */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2cddfe7806a4..82682da5a40d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, unsigned int extra_bits, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dd6faab02bb..79f82f2ec4d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3928,12 +3928,25 @@ static int extent_write_cache_pages(struct address_space *mapping, range_whole = 1; scanned = 1; } - if (wbc->sync_mode == WB_SYNC_ALL) + + /* + * We do the tagged writepage as long as the snapshot flush bit is set + * and we are the first one who do the filemap_flush() on this inode. + * + * The nr_to_write == LONG_MAX is needed to make sure other flushers do + * not race in and drop the bit. + */ + if (range_whole && wbc->nr_to_write == LONG_MAX && + test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &BTRFS_I(inode)->runtime_flags)) + wbc->tagged_writepages = 1; + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83b3a626c796..59f361f7d0c1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10005,7 +10005,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr) +static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -10033,6 +10033,9 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr) } spin_unlock(&root->delalloc_lock); + if (snapshot) + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, + &binode->runtime_flags); work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); @@ -10066,7 +10069,7 @@ out: return ret; } -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -10074,7 +10077,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1); + ret = start_delalloc_inodes(root, -1, true); if (ret > 0) ret = 0; return ret; @@ -10103,7 +10106,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr); + ret = start_delalloc_inodes(root, nr, false); btrfs_put_fs_root(root); if (ret < 0) goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c9152155fcbf..8bf9cce11213 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -778,7 +778,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, wait_event(root->subv_writers->wait, percpu_counter_sum(&root->subv_writers->counter) == 0); - ret = btrfs_start_delalloc_inodes(root); + ret = btrfs_start_delalloc_snapshot(root); if (ret) goto dec_and_free; -- cgit v1.2.3 From 8c642d71906b54820ee30c91dbbdd593eabf796e Mon Sep 17 00:00:00 2001 From: Chris Perl Date: Mon, 17 Dec 2018 10:56:38 -0500 Subject: NFS: nfs_compare_mount_options always compare auth flavors. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 594d1644cd59447f4fceb592448d5cd09eb09b5e ] This patch removes the check from nfs_compare_mount_options to see if a `sec' option was passed for the current mount before comparing auth flavors and instead just always compares auth flavors. Consider the following scenario: You have a server with the address 192.168.1.1 and two exports /export/a and /export/b. The first export supports `sys' and `krb5' security, the second just `sys'. Assume you start with no mounts from the server. The following results in EIOs being returned as the kernel nfs client incorrectly thinks it can share the underlying `struct nfs_server's: $ mkdir /tmp/{a,b} $ sudo mount -t nfs -o vers=3,sec=krb5 192.168.1.1:/export/a /tmp/a $ sudo mount -t nfs -o vers=3 192.168.1.1:/export/b /tmp/b $ df >/dev/null df: ‘/tmp/b’: Input/output error Signed-off-by: Chris Perl Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index ac4b2f005778..5ef2c71348bd 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2409,8 +2409,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n goto Ebusy; if (a->acdirmax != b->acdirmax) goto Ebusy; - if (b->auth_info.flavor_len > 0 && - clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) + if (clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor) goto Ebusy; return 1; Ebusy: -- cgit v1.2.3 From 5d3b4cd8734b41ea04e384d231a09244cfcaebca Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 13 Dec 2018 08:06:16 +1000 Subject: cifs: check ntwrk_buf_start for NULL before dereferencing it [ Upstream commit 59a63e479ce36a3f24444c3a36efe82b78e4a8e0 ] RHBZ: 1021460 There is an issue where when multiple threads open/close the same directory ntwrk_buf_start might end up being NULL, causing the call to smbCalcSize later to oops with a NULL deref. The real bug is why this happens and why this can become NULL for an open cfile, which should not be allowed. This patch tries to avoid a oops until the time when we fix the underlying issue. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/readdir.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index e169e1a5fd35..3925a7bfc74d 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -655,7 +655,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, /* scan and find it */ int i; char *cur_ent; - char *end_of_smb = cfile->srch_inf.ntwrk_buf_start + + char *end_of_smb; + + if (cfile->srch_inf.ntwrk_buf_start == NULL) { + cifs_dbg(VFS, "ntwrk_buf_start is NULL during readdir\n"); + return -EIO; + } + + end_of_smb = cfile->srch_inf.ntwrk_buf_start + server->ops->calc_smb_size( cfile->srch_inf.ntwrk_buf_start, server); -- cgit v1.2.3 From 69e7f87745e70ca8d9b8a72e045738236d367670 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Wed, 26 Dec 2018 11:20:29 +0530 Subject: f2fs: fix use-after-free issue when accessing sbi->stat_info [ Upstream commit 60aa4d5536ab7fe32433ca1173bd9d6633851f27 ] iput() on sbi->node_inode can update sbi->stat_info in the below context, if the f2fs_write_checkpoint() has failed with error. f2fs_balance_fs_bg+0x1ac/0x1ec f2fs_write_node_pages+0x4c/0x260 do_writepages+0x80/0xbc __writeback_single_inode+0xdc/0x4ac writeback_single_inode+0x9c/0x144 write_inode_now+0xc4/0xec iput+0x194/0x22c f2fs_put_super+0x11c/0x1e8 generic_shutdown_super+0x70/0xf4 kill_block_super+0x2c/0x5c kill_f2fs_super+0x44/0x50 deactivate_locked_super+0x60/0x8c deactivate_super+0x68/0x74 cleanup_mnt+0x40/0x78 Fix this by moving f2fs_destroy_stats() further below iput() in both f2fs_put_super() and f2fs_fill_super() paths. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/super.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 338138b34993..c9639ef0e8d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1039,9 +1039,6 @@ static void f2fs_put_super(struct super_block *sb) f2fs_write_checkpoint(sbi, &cpc); } - /* f2fs_write_checkpoint can update stat informaion */ - f2fs_destroy_stats(sbi); - /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. @@ -1061,6 +1058,12 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->node_inode); iput(sbi->meta_inode); + /* + * iput() can update stat information, if f2fs_write_checkpoint() + * above failed with error. + */ + f2fs_destroy_stats(sbi); + /* destroy f2fs internal modules */ f2fs_destroy_node_manager(sbi); f2fs_destroy_segment_manager(sbi); @@ -2980,30 +2983,30 @@ try_onemore: f2fs_build_gc_manager(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); - goto free_nm; + goto free_stats; } - err = f2fs_build_stats(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); - goto free_stats; + goto free_node_inode; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; - goto free_stats; + goto free_node_inode; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -3121,12 +3124,12 @@ free_sysfs: free_root_inode: dput(sb->s_root); sb->s_root = NULL; -free_stats: - f2fs_destroy_stats(sbi); free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); +free_stats: + f2fs_destroy_stats(sbi); free_nm: f2fs_destroy_node_manager(sbi); free_sm: -- cgit v1.2.3 From df13b0369bc007b044b6d3c634892b325f783b83 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 18 Dec 2018 16:39:24 +0530 Subject: f2fs: fix sbi->extent_list corruption issue [ Upstream commit e4589fa545e0020dbbc3c9bde35f35f949901392 ] When there is a failure in f2fs_fill_super() after/during the recovery of fsync'd nodes, it frees the current sbi and retries again. This time the mount is successful, but the files that got recovered before retry, still holds the extent tree, whose extent nodes list is corrupted since sbi and sbi->extent_list is freed up. The list_del corruption issue is observed when the file system is getting unmounted and when those recoverd files extent node is being freed up in the below context. list_del corruption. prev->next should be fffffff1e1ef5480, but was (null) <...> kernel BUG at kernel/msm-4.14/lib/list_debug.c:53! lr : __list_del_entry_valid+0x94/0xb4 pc : __list_del_entry_valid+0x94/0xb4 <...> Call trace: __list_del_entry_valid+0x94/0xb4 __release_extent_node+0xb0/0x114 __free_extent_tree+0x58/0x7c f2fs_shrink_extent_tree+0xdc/0x3b0 f2fs_leave_shrinker+0x28/0x7c f2fs_put_super+0xfc/0x1e0 generic_shutdown_super+0x70/0xf4 kill_block_super+0x2c/0x5c kill_f2fs_super+0x44/0x50 deactivate_locked_super+0x60/0x8c deactivate_super+0x68/0x74 cleanup_mnt+0x40/0x78 __cleanup_mnt+0x1c/0x28 task_work_run+0x48/0xd0 do_notify_resume+0x678/0xe98 work_pending+0x8/0x14 Fix this by not creating extents for those recovered files if shrinker is not registered yet. Once mount is successful and shrinker is registered, those files can have extents again. Signed-off-by: Sahitya Tummala Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/shrinker.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ecb735142276..42aef5c94927 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2613,10 +2613,19 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (!test_opt(sbi, EXTENT_CACHE) || is_inode_flag_set(inode, FI_NO_EXTENT)) return false; + /* + * for recovered files during mount do not create extents + * if shrinker is not registered. + */ + if (list_empty(&sbi->s_list)) + return false; + return S_ISREG(inode->i_mode); } diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 36cfd816c160..29042e6d5126 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -138,6 +138,6 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi) f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi)); spin_lock(&f2fs_list_lock); - list_del(&sbi->s_list); + list_del_init(&sbi->s_list); spin_unlock(&f2fs_list_lock); } -- cgit v1.2.3 From 69e63b49ddf9bc786f1930d5aaa9b8673c326ee8 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 28 Dec 2018 00:32:57 -0800 Subject: ocfs2: don't clear bh uptodate for block read [ Upstream commit 70306d9dce75abde855cefaf32b3f71eed8602a3 ] For sync io read in ocfs2_read_blocks_sync(), first clear bh uptodate flag and submit the io, second wait io done, last check whether bh uptodate, if not return io error. If two sync io for the same bh were issued, it could be the first io done and set uptodate flag, but just before check that flag, the second io came in and cleared uptodate, then ocfs2_read_blocks_sync() for the first io will return IO error. Indeed it's not necessary to clear uptodate flag, as the io end handler end_buffer_read_sync() will set or clear it based on io succeed or failed. The following message was found from a nfs server but the underlying storage returned no error. [4106438.567376] (nfsd,7146,3):ocfs2_get_suballoc_slot_bit:2780 ERROR: read block 1238823695 failed -5 [4106438.567569] (nfsd,7146,3):ocfs2_get_suballoc_slot_bit:2812 ERROR: status = -5 [4106438.567611] (nfsd,7146,3):ocfs2_test_inode_bit:2894 ERROR: get alloc slot and bit failed -5 [4106438.567643] (nfsd,7146,3):ocfs2_test_inode_bit:2932 ERROR: status = -5 [4106438.567675] (nfsd,7146,3):ocfs2_get_dentry:94 ERROR: test inode bit failed -5 Same issue in non sync read ocfs2_read_blocks(), fixed it as well. Link: http://lkml.kernel.org/r/20181121020023.3034-4-junxiao.bi@oracle.com Signed-off-by: Junxiao Bi Reviewed-by: Changwei Ge Reviewed-by: Yiwen Jiang Cc: Joel Becker Cc: Joseph Qi Cc: Jun Piao Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/ocfs2/buffer_head_io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1d098c3c00e0..9f8250df99f1 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -152,7 +152,6 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, #endif } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, 0, bh); @@ -306,7 +305,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, continue; } - clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ if (validate) set_buffer_needs_validate(bh); -- cgit v1.2.3 From 066206bc6e9e3cacd4334703a18d2d6b04fe763a Mon Sep 17 00:00:00 2001 From: Larry Chen Date: Fri, 28 Dec 2018 00:32:46 -0800 Subject: ocfs2: improve ocfs2 Makefile [ Upstream commit 9e6aea22802b5684c7e1d69822aeb0844dd01953 ] Included file path was hard-wired in the ocfs2 makefile, which might causes some confusion when compiling ocfs2 as an external module. Say if we compile ocfs2 module as following. cp -r /kernel/tree/fs/ocfs2 /other/dir/ocfs2 cd /other/dir/ocfs2 make -C /path/to/kernel_source M=`pwd` modules Acutally, the compiler wil try to find included file in /kernel/tree/fs/ocfs2, rather than the directory /other/dir/ocfs2. To fix this little bug, we introduce the var $(src) provided by kbuild. $(src) means the absolute path of the running kbuild file. Link: http://lkml.kernel.org/r/20181108085546.15149-1-lchen@suse.com Signed-off-by: Larry Chen Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/ocfs2/Makefile | 2 +- fs/ocfs2/dlm/Makefile | 2 +- fs/ocfs2/dlmfs/Makefile | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 99ee093182cb..cc9b32b9db7c 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src) obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index bd1aab1f49a4..ef2854422a6e 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile index eed3db8c5b49..33431a0296a3 100644 --- a/fs/ocfs2/dlmfs/Makefile +++ b/fs/ocfs2/dlmfs/Makefile @@ -1,4 +1,4 @@ -ccflags-y := -Ifs/ocfs2 +ccflags-y := -I$(src)/.. obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o -- cgit v1.2.3 From 9cb8f8088d9a105af7e763a315771e39eb8c16a4 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 3 Jan 2019 15:27:09 -0800 Subject: fs/epoll: drop ovflist branch prediction [ Upstream commit 76699a67f3041ff4c7af6d6ee9be2bfbf1ffb671 ] The ep->ovflist is a secondary ready-list to temporarily store events that might occur when doing sproc without holding the ep->wq.lock. This accounts for every time we check for ready events and also send events back to userspace; both callbacks, particularly the latter because of copy_to_user, can account for a non-trivial time. As such, the unlikely() check to see if the pointer is being used, seems both misleading and sub-optimal. In fact, we go to an awful lot of trouble to sync both lists, and populating the ovflist is far from an uncommon scenario. For example, profiling a concurrent epoll_wait(2) benchmark, with CONFIG_PROFILE_ANNOTATED_BRANCHES shows that for a two threads a 33% incorrect rate was seen; and when incrementally increasing the number of epoll instances (which is used, for example for multiple queuing load balancing models), up to a 90% incorrect rate was seen. Similarly, by deleting the prediction, 3% throughput boost was seen across incremental threads. Link: http://lkml.kernel.org/r/20181108051006.18751-4-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 42bbe6824b4b..58f48ea0db23 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1154,7 +1154,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ - if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { + if (ep->ovflist != EP_UNACTIVE_PTR) { if (epi->next == EP_UNACTIVE_PTR) { epi->next = ep->ovflist; ep->ovflist = epi; -- cgit v1.2.3 From ab5f7407125778bb05274f4a6999c67c4a72d927 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Jan 2019 15:28:07 -0800 Subject: exec: load_script: don't blindly truncate shebang string [ Upstream commit 8099b047ecc431518b9bb6bdbba3549bbecdc343 ] load_script() simply truncates bprm->buf and this is very wrong if the length of shebang string exceeds BINPRM_BUF_SIZE-2. This can silently truncate i_arg or (worse) we can execute the wrong binary if buf[2:126] happens to be the valid executable path. Change load_script() to return ENOEXEC if it can't find '\n' or zero in bprm->buf. Note that '\0' can come from either prepare_binprm()->memset() or from kernel_read(), we do not care. Link: http://lkml.kernel.org/r/20181112160931.GA28463@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Ben Woodard Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/binfmt_script.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 7cde3f46ad26..d0078cbb718b 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -42,10 +42,14 @@ static int load_script(struct linux_binprm *bprm) fput(bprm->file); bprm->file = NULL; - bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; - if ((cp = strchr(bprm->buf, '\n')) == NULL) - cp = bprm->buf+BINPRM_BUF_SIZE-1; + for (cp = bprm->buf+2;; cp++) { + if (cp >= bprm->buf + BINPRM_BUF_SIZE) + return -ENOEXEC; + if (!*cp || (*cp == '\n')) + break; + } *cp = '\0'; + while (cp > bprm->buf) { cp--; if ((*cp == ' ') || (*cp == '\t')) -- cgit v1.2.3 From 62c7c0a8709b254060cb5f76e9ed6a39f6aae20f Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 4 Feb 2019 08:54:18 -0800 Subject: xfs: Fix xqmstats offsets in /proc/fs/xfs/xqmstat commit 41657e5507b13e963be906d5d874f4f02374fd5c upstream. The addition of FIBT, RMAP and REFCOUNT changed the offsets into __xfssats structure. This caused xqmstat_proc_show() to display garbage data via /proc/fs/xfs/xqmstat, once it relies on the offsets marked via macros. Fix it. Fixes: 00f4e4f9 xfs: add rmap btree stats infrastructure Fixes: aafc3c24 xfs: support the XFS_BTNUM_FINOBT free inode btree type Fixes: 46eeb521 xfs: introduce refcount btree definitions Signed-off-by: Carlos Maiolino Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 4e4423153071..740ac9674848 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -119,7 +119,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) int j; seq_printf(m, "qm"); - for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + for (j = XFSSTAT_END_REFCOUNT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; -- cgit v1.2.3 From a585ac0e767b26e74b0156f014331b48b56f0974 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Feb 2019 08:54:19 -0800 Subject: xfs: cancel COW blocks before swapext commit 96987eea537d6ccd98704a71958f9ba02da80843 upstream. We need to make sure we have no outstanding COW blocks before we swap extents, as there is nothing preventing us from having preallocated COW delalloc on either inode that swapext is called on. That case can easily be reproduced by running generic/324 in always_cow mode: [ 620.760572] XFS: Assertion failed: tip->i_delayed_blks == 0, file: fs/xfs/xfs_bmap_util.c, line: 1669 [ 620.761608] ------------[ cut here ]------------ [ 620.762171] kernel BUG at fs/xfs/xfs_message.c:102! [ 620.762732] invalid opcode: 0000 [#1] SMP PTI [ 620.763272] CPU: 0 PID: 24153 Comm: xfs_fsr Tainted: G W 4.19.0-rc1+ #4182 [ 620.764203] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014 [ 620.765202] RIP: 0010:assfail+0x20/0x28 [ 620.765646] Code: 31 ff e8 83 fc ff ff 0f 0b c3 48 89 f1 41 89 d0 48 c7 c6 48 ca 8d 82 48 89 fa 38 [ 620.767758] RSP: 0018:ffffc9000898bc10 EFLAGS: 00010202 [ 620.768359] RAX: 0000000000000000 RBX: ffff88012f14ba40 RCX: 0000000000000000 [ 620.769174] RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff828560d9 [ 620.769982] RBP: ffff88012f14b300 R08: 0000000000000000 R09: 0000000000000000 [ 620.770788] R10: 000000000000000a R11: f000000000000000 R12: ffffc9000898bc98 [ 620.771638] R13: ffffc9000898bc9c R14: ffff880130b5e2b8 R15: ffff88012a1fa2a8 [ 620.772504] FS: 00007fdc36e0fbc0(0000) GS:ffff88013ba00000(0000) knlGS:0000000000000000 [ 620.773475] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 620.774168] CR2: 00007fdc3604d000 CR3: 0000000132afc000 CR4: 00000000000006f0 [ 620.774978] Call Trace: [ 620.775274] xfs_swap_extent_forks+0x2a0/0x2e0 [ 620.775792] xfs_swap_extents+0x38b/0xab0 [ 620.776256] xfs_ioc_swapext+0x121/0x140 [ 620.776709] xfs_file_ioctl+0x328/0xc90 [ 620.777154] ? rcu_read_lock_sched_held+0x50/0x60 [ 620.777694] ? xfs_iunlock+0x233/0x260 [ 620.778127] ? xfs_setattr_nonsize+0x3be/0x6a0 [ 620.778647] do_vfs_ioctl+0x9d/0x680 [ 620.779071] ? ksys_fchown+0x47/0x80 [ 620.779552] ksys_ioctl+0x35/0x70 [ 620.780040] __x64_sys_ioctl+0x11/0x20 [ 620.780530] do_syscall_64+0x4b/0x190 [ 620.780927] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 620.781467] RIP: 0033:0x7fdc364d0f07 [ 620.781900] Code: b3 66 90 48 8b 05 81 5f 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 28 [ 620.784044] RSP: 002b:00007ffe2a766038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 620.784896] RAX: ffffffffffffffda RBX: 0000000000000025 RCX: 00007fdc364d0f07 [ 620.785667] RDX: 0000560296ca2fc0 RSI: 00000000c0c0586d RDI: 0000000000000005 [ 620.786398] RBP: 0000000000000025 R08: 0000000000001200 R09: 0000000000000000 [ 620.787283] R10: 0000000000000432 R11: 0000000000000246 R12: 0000000000000005 [ 620.788051] R13: 0000000000000000 R14: 0000000000001000 R15: 0000000000000006 [ 620.788927] Modules linked in: [ 620.789340] ---[ end trace 9503b7417ffdbdb0 ]--- [ 620.790065] RIP: 0010:assfail+0x20/0x28 [ 620.790642] Code: 31 ff e8 83 fc ff ff 0f 0b c3 48 89 f1 41 89 d0 48 c7 c6 48 ca 8d 82 48 89 fa 38 [ 620.793038] RSP: 0018:ffffc9000898bc10 EFLAGS: 00010202 [ 620.793609] RAX: 0000000000000000 RBX: ffff88012f14ba40 RCX: 0000000000000000 [ 620.794317] RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff828560d9 [ 620.795025] RBP: ffff88012f14b300 R08: 0000000000000000 R09: 0000000000000000 [ 620.795778] R10: 000000000000000a R11: f000000000000000 R12: ffffc9000898bc98 [ 620.796675] R13: ffffc9000898bc9c R14: ffff880130b5e2b8 R15: ffff88012a1fa2a8 [ 620.797782] FS: 00007fdc36e0fbc0(0000) GS:ffff88013ba00000(0000) knlGS:0000000000000000 [ 620.798908] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 620.799594] CR2: 00007fdc3604d000 CR3: 0000000132afc000 CR4: 00000000000006f0 [ 620.800424] Kernel panic - not syncing: Fatal exception [ 620.801191] Kernel Offset: disabled [ 620.801597] ---[ end Kernel panic - not syncing: Fatal exception ]--- Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_bmap_util.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 6de8d90041ff..9d1e5c3a661e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1824,6 +1824,12 @@ xfs_swap_extents( if (error) goto out_unlock; + if (xfs_inode_has_cow_data(tip)) { + error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); + if (error) + return error; + } + /* * Extent "swapping" with rmap requires a permanent reservation and * a block reservation because it's really just a remap operation -- cgit v1.2.3 From b6095cbd7841fff310c1ad135ae2c7da14df9da4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 4 Feb 2019 08:54:20 -0800 Subject: xfs: Fix error code in 'xfs_ioc_getbmap()' commit 132bf6723749f7219c399831eeb286dbbb985429 upstream. In this function, once 'buf' has been allocated, we unconditionally return 0. However, 'error' is set to some error codes in several error handling paths. Before commit 232b51948b99 ("xfs: simplify the xfs_getbmap interface") this was not an issue because all error paths were returning directly, but now that some cleanup at the end may be needed, we must propagate the error code. Fixes: 232b51948b99 ("xfs: simplify the xfs_getbmap interface") Signed-off-by: Christophe JAILLET Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0ef5ece5634c..bad90479ade2 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1616,7 +1616,7 @@ xfs_ioc_getbmap( error = 0; out_free_buf: kmem_free(buf); - return 0; + return error; } struct getfsmap_info { -- cgit v1.2.3 From a96f3a55143a4716ed10088c8637eced24ed1e75 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Feb 2019 08:54:21 -0800 Subject: xfs: fix overflow in xfs_attr3_leaf_verify commit 837514f7a4ca4aca06aec5caa5ff56d33ef06976 upstream. generic/070 on 64k block size filesystems is failing with a verifier corruption on writeback or an attribute leaf block: [ 94.973083] XFS (pmem0): Metadata corruption detected at xfs_attr3_leaf_verify+0x246/0x260, xfs_attr3_leaf block 0x811480 [ 94.975623] XFS (pmem0): Unmount and run xfs_repair [ 94.976720] XFS (pmem0): First 128 bytes of corrupted metadata buffer: [ 94.978270] 000000004b2e7b45: 00 00 00 00 00 00 00 00 3b ee 00 00 00 00 00 00 ........;....... [ 94.980268] 000000006b1db90b: 00 00 00 00 00 81 14 80 00 00 00 00 00 00 00 00 ................ [ 94.982251] 00000000433f2407: 22 7b 5c 82 2d 5c 47 4c bb 31 1c 37 fa a9 ce d6 "{\.-\GL.1.7.... [ 94.984157] 0000000010dc7dfb: 00 00 00 00 00 81 04 8a 00 0a 18 e8 dd 94 01 00 ................ [ 94.986215] 00000000d5a19229: 00 a0 dc f4 fe 98 01 68 f0 d8 07 e0 00 00 00 00 .......h........ [ 94.988171] 00000000521df36c: 0c 2d 32 e2 fe 20 01 00 0c 2d 58 65 fe 0c 01 00 .-2.. ...-Xe.... [ 94.990162] 000000008477ae06: 0c 2d 5b 66 fe 8c 01 00 0c 2d 71 35 fe 7c 01 00 .-[f.....-q5.|.. [ 94.992139] 00000000a4a6bca6: 0c 2d 72 37 fc d4 01 00 0c 2d d8 b8 f0 90 01 00 .-r7.....-...... [ 94.994789] XFS (pmem0): xfs_do_force_shutdown(0x8) called from line 1453 of file fs/xfs/xfs_buf.c. Return address = ffffffff815365f3 This is failing this check: end = ichdr.freemap[i].base + ichdr.freemap[i].size; if (end < ichdr.freemap[i].base) >>>>> return __this_address; if (end > mp->m_attr_geo->blksize) return __this_address; And from the buffer output above, the freemap array is: freemap[0].base = 0x00a0 freemap[0].size = 0xdcf4 end = 0xdd94 freemap[1].base = 0xfe98 freemap[1].size = 0x0168 end = 0x10000 freemap[2].base = 0xf0d8 freemap[2].size = 0x07e0 end = 0xf8b8 These all look valid - the block size is 0x10000 and so from the last check in the above verifier fragment we know that the end of freemap[1] is valid. The problem is that end is declared as: uint16_t end; And (uint16_t)0x10000 = 0. So we have a verifier bug here, not a corruption. Fix the verifier to use uint32_t types for the check and hence avoid the overflow. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=201577 Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/libxfs/xfs_attr_leaf.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 6fc5425b1474..2652d00842d6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -243,7 +243,7 @@ xfs_attr3_leaf_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; - uint16_t end; + uint32_t end; /* must be 32bit - see below */ int i; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -293,6 +293,11 @@ xfs_attr3_leaf_verify( /* * Quickly check the freemap information. Attribute data has to be * aligned to 4-byte boundaries, and likewise for the free space. + * + * Note that for 64k block size filesystems, the freemap entries cannot + * overflow as they are only be16 fields. However, when checking end + * pointer of the freemap, we have to be careful to detect overflows and + * so use uint32_t for those checks. */ for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { if (ichdr.freemap[i].base > mp->m_attr_geo->blksize) @@ -303,7 +308,9 @@ xfs_attr3_leaf_verify( return __this_address; if (ichdr.freemap[i].size & 0x3) return __this_address; - end = ichdr.freemap[i].base + ichdr.freemap[i].size; + + /* be care of 16 bit overflows here */ + end = (uint32_t)ichdr.freemap[i].base + ichdr.freemap[i].size; if (end < ichdr.freemap[i].base) return __this_address; if (end > mp->m_attr_geo->blksize) -- cgit v1.2.3 From c3a66bf4ce403f3d4ff083f6fd75604e7d8dc450 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 4 Feb 2019 08:54:22 -0800 Subject: xfs: fix shared extent data corruption due to missing cow reservation commit 59e4293149106fb92530f8e56fa3992d8548c5e6 upstream. Page writeback indirectly handles shared extents via the existence of overlapping COW fork blocks. If COW fork blocks exist, writeback always performs the associated copy-on-write regardless if the underlying blocks are actually shared. If the blocks are shared, then overlapping COW fork blocks must always exist. fstests shared/010 reproduces a case where a buffered write occurs over a shared block without performing the requisite COW fork reservation. This ultimately causes writeback to the shared extent and data corruption that is detected across md5 checks of the filesystem across a mount cycle. The problem occurs when a buffered write lands over a shared extent that crosses an extent size hint boundary and that also happens to have a partial COW reservation that doesn't cover the start and end blocks of the data fork extent. For example, a buffered write occurs across the file offset (in FSB units) range of [29, 57]. A shared extent exists at blocks [29, 35] and COW reservation already exists at blocks [32, 34]. After accommodating a COW extent size hint of 32 blocks and the existing reservation at offset 32, xfs_reflink_reserve_cow() allocates 32 blocks of reservation at offset 0 and returns with COW reservation across the range of [0, 34]. The associated data fork extent is still [29, 35], however, which isn't fully covered by the COW reservation. This leads to a buffered write at file offset 35 over a shared extent without associated COW reservation. Writeback eventually kicks in, performs an overwrite of the underlying shared block and causes the associated data corruption. Update xfs_reflink_reserve_cow() to accommodate the fact that a delalloc allocation request may not fully cover the extent in the data fork. Trim the data fork extent appropriately, just as is done for shared extent boundaries and/or existing COW reservations that happen to overlap the start of the data fork extent. This prevents shared/010 failures due to data corruption on reflink enabled filesystems. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_reflink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 42ea7bab9144..7088f44c0c59 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -302,6 +302,7 @@ xfs_reflink_reserve_cow( if (error) return error; + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); trace_xfs_reflink_cow_alloc(ip, &got); return 0; } -- cgit v1.2.3 From 5a7455e922b4ee1e92390f2f46ccb5269f79728d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Feb 2019 08:54:23 -0800 Subject: xfs: fix transient reference count error in xfs_buf_resubmit_failed_buffers commit d43aaf1685aa471f0593685c9f54d53e3af3cf3f upstream. When retrying a failed inode or dquot buffer, xfs_buf_resubmit_failed_buffers() clears all the failed flags from the inde/dquot log items. In doing so, it also drops all the reference counts on the buffer that the failed log items hold. This means it can drop all the active references on the buffer and hence free the buffer before it queues it for write again. Putting the buffer on the delwri queue takes a reference to the buffer (so that it hangs around until it has been written and completed), but this goes bang if the buffer has already been freed. Hence we need to add the buffer to the delwri queue before we remove the failed flags from the log items attached to the buffer to ensure it always remains referenced during the resubmit process. Reported-by: Josef Bacik Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_buf_item.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 12d8455bfbb2..010db5f8fb00 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1233,9 +1233,23 @@ xfs_buf_iodone( } /* - * Requeue a failed buffer for writeback + * Requeue a failed buffer for writeback. * - * Return true if the buffer has been re-queued properly, false otherwise + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + * + * Return true if the buffer was added to the buffer list, false if it was + * already on the buffer list. */ bool xfs_buf_resubmit_failed_buffers( @@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers( struct list_head *buffer_list) { struct xfs_log_item *lip; + bool ret; + + ret = xfs_buf_delwri_queue(bp, buffer_list); /* - * Clear XFS_LI_FAILED flag from all items before resubmit - * - * XFS_LI_FAILED set/clear is protected by ail_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) xfs_clear_li_failed(lip); - /* Add this buffer back to the delayed write list */ - return xfs_buf_delwri_queue(bp, buffer_list); + return ret; } -- cgit v1.2.3 From 886f0de16239a2596aa08a4fbaed10ee5b21a103 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 4 Feb 2019 08:54:24 -0800 Subject: xfs: delalloc -> unwritten COW fork allocation can go wrong commit 9230a0b65b47fe6856c4468ec0175c4987e5bede upstream. Long saga. There have been days spent following this through dead end after dead end in multi-GB event traces. This morning, after writing a trace-cmd wrapper that enabled me to be more selective about XFS trace points, I discovered that I could get just enough essential tracepoints enabled that there was a 50:50 chance the fsx config would fail at ~115k ops. If it didn't fail at op 115547, I stopped fsx at op 115548 anyway. That gave me two traces - one where the problem manifested, and one where it didn't. After refining the traces to have the necessary information, I found that in the failing case there was a real extent in the COW fork compared to an unwritten extent in the working case. Walking back through the two traces to the point where the CWO fork extents actually diverged, I found that the bad case had an extra unwritten extent in it. This is likely because the bug it led me to had triggered multiple times in those 115k ops, leaving stray COW extents around. What I saw was a COW delalloc conversion to an unwritten extent (as they should always be through xfs_iomap_write_allocate()) resulted in a /written extent/: xfs_writepage: dev 259:0 ino 0x83 pgoff 0x17000 size 0x79a00 offset 0 length 0 xfs_iext_remove: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/2 offset 32 block 152 count 20 flag 1 caller xfs_bmap_add_extent_delay_real xfs_bmap_pre_update: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/1 offset 1 block 4503599627239429 count 31 flag 0 caller xfs_bmap_add_extent_delay_real xfs_bmap_post_update: dev 259:0 ino 0x83 state RC|LF|RF|COW cur 0xffff888247b899c0/1 offset 1 block 121 count 51 flag 0 caller xfs_bmap_add_ex Basically, Cow fork before: 0 1 32 52 +H+DDDDDDDDDDDD+UUUUUUUUUUU+ PREV RIGHT COW delalloc conversion allocates: 1 32 +uuuuuuuuuuuu+ NEW And the result according to the xfs_bmap_post_update trace was: 0 1 32 52 +H+wwwwwwwwwwwwwwwwwwwwwwww+ PREV Which is clearly wrong - it should be a merged unwritten extent, not an unwritten extent. That lead me to look at the LEFT_FILLING|RIGHT_FILLING|RIGHT_CONTIG case in xfs_bmap_add_extent_delay_real(), and sure enough, there's the bug. It takes the old delalloc extent (PREV) and adds the length of the RIGHT extent to it, takes the start block from NEW, removes the RIGHT extent and then updates PREV with the new extent. What it fails to do is update PREV.br_state. For delalloc, this is always XFS_EXT_NORM, while in this case we are converting the delayed allocation to unwritten, so it needs to be updated to XFS_EXT_UNWRITTEN. This LF|RF|RC case does not do this, and so the resultant extent is always written. And that's the bug I've been chasing for a week - a bmap btree bug, not a reflink/dedupe/copy_file_range bug, but a BMBT bug introduced with the recent in core extent tree scalability enhancements. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/libxfs/xfs_bmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a47670332326..3a496ffe6551 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1683,10 +1683,13 @@ xfs_bmap_add_extent_delay_real( case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. - * The right neighbor is contiguous, the left is not. + * The right neighbor is contiguous, the left is not. Take care + * with delay -> unwritten extent allocation here because the + * delalloc record we are overwriting is always written. */ PREV.br_startblock = new->br_startblock; PREV.br_blockcount += RIGHT.br_blockcount; + PREV.br_state = new->br_state; xfs_iext_next(ifp, &bma->icur); xfs_iext_remove(bma->ip, &bma->icur, state); -- cgit v1.2.3 From 757332c643ca474dae6cfdacb5a1fce68580c82d Mon Sep 17 00:00:00 2001 From: Ye Yin Date: Mon, 4 Feb 2019 08:54:25 -0800 Subject: fs/xfs: fix f_ffree value for statfs when project quota is set commit de7243057e7cefa923fa5f467c0f1ec24eef41d2 upsream. When project is set, we should use inode limit minus the used count Signed-off-by: Ye Yin Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_qm_bhv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 73a1d77ec187..3091e4bc04ef 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -40,7 +40,7 @@ xfs_fill_statvfs_from_dquot( statp->f_files = limit; statp->f_ffree = (statp->f_files > dqp->q_res_icount) ? - (statp->f_ffree - dqp->q_res_icount) : 0; + (statp->f_files - dqp->q_res_icount) : 0; } } -- cgit v1.2.3 From c6c20af69c617121331e231f797893d9dca0c9c5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 4 Feb 2019 08:54:26 -0800 Subject: xfs: fix PAGE_MASK usage in xfs_free_file_space commit a579121f94aba4e8bad1a121a0fad050d6925296 upstream. In commit e53c4b598, I *tried* to teach xfs to force writeback when we fzero/fpunch right up to EOF so that if EOF is in the middle of a page, the post-EOF part of the page gets zeroed before we return to userspace. Unfortunately, I missed the part where PAGE_MASK is ~(PAGE_SIZE - 1), which means that we totally fail to zero if we're fpunching and EOF is within the first page. Worse yet, the same PAGE_MASK thinko plagues the filemap_write_and_wait_range call, so we'd initiate writeback of the entire file, which (mostly) masked the thinko. Drop the tricky PAGE_MASK and replace it with correct usage of PAGE_SIZE and the proper rounding macros. Fixes: e53c4b598 ("xfs: ensure post-EOF zeroing happens after zeroing part of a file") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/xfs_bmap_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 9d1e5c3a661e..211b06e4702e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1175,9 +1175,9 @@ xfs_free_file_space( * page could be mmap'd and iomap_zero_range doesn't do that for us. * Writeback of the eof page will do this, albeit clumsily. */ - if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) { + if (offset + len >= XFS_ISIZE(ip) && offset_in_page(offset + len) > 0) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - (offset + len) & ~PAGE_MASK, LLONG_MAX); + round_down(offset + len, PAGE_SIZE), LLONG_MAX); } return error; -- cgit v1.2.3 From 0c802cbaa6db4203fc559829548e19ae5ee8c9c2 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 4 Feb 2019 08:54:27 -0800 Subject: xfs: fix inverted return from xfs_btree_sblock_verify_crc commit 7d048df4e9b05ba89b74d062df59498aa81f3785 upstream. xfs_btree_sblock_verify_crc is a bool so should not be returning a failaddr_t; worse, if xfs_log_check_lsn fails it returns __this_address which looks like a boolean true (i.e. success) to the caller. (interestingly xfs_btree_lblock_verify_crc doesn't have the issue) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Luis Chamberlain Signed-off-by: Sasha Levin --- fs/xfs/libxfs/xfs_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 34c6d7bd4d18..bbdae2b4559f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -330,7 +330,7 @@ xfs_btree_sblock_verify_crc( if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) - return __this_address; + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } -- cgit v1.2.3 From 1f78052b0991b6c19cdff093e1e9cad297d291cf Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 1 Feb 2019 09:36:36 -0800 Subject: xfs: eof trim writeback mapping as soon as it is cached commit aa6ee4ab69293969867ab09b57546d226ace3d7a upstream. The cached writeback mapping is EOF trimmed to try and avoid races between post-eof block management and writeback that result in sending cached data to a stale location. The cached mapping is currently trimmed on the validation check, which leaves a race window between the time the mapping is cached and when it is trimmed against the current inode size. For example, if a new mapping is cached by delalloc conversion on a blocksize == page size fs, we could cycle various locks, perform memory allocations, etc. in the writeback codepath before the associated mapping is eventually trimmed to i_size. This leaves enough time for a post-eof truncate and file append before the cached mapping is trimmed. The former event essentially invalidates a range of the cached mapping and the latter bumps the inode size such the trim on the next writepage event won't trim all of the invalid blocks. fstest generic/464 reproduces this scenario occasionally and causes a lost writeback and stale delalloc blocks warning on inode inactivation. To work around this problem, trim the cached writeback mapping as soon as it is cached in addition to on subsequent validation checks. This is a minor tweak to tighten the race window as much as possible until a proper invalidation mechanism is available. Fixes: 40214d128e07 ("xfs: trim writepage mapping to within eof") Cc: # v4.14+ Signed-off-by: Brian Foster Reviewed-by: Allison Henderson Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_aops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 49f5f5896a43..b697866946d2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -449,6 +449,7 @@ xfs_map_blocks( } wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); return 0; allocate_blocks: @@ -459,6 +460,7 @@ allocate_blocks: ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || imap.br_startoff + imap.br_blockcount <= cow_fsb); wpc->imap = imap; + xfs_trim_extent_eof(&wpc->imap, ip); trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); return 0; } -- cgit v1.2.3 From 48be0eb05e7ebd0560b60938c5b82c93538877ba Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 12 Jan 2019 02:39:05 +0100 Subject: fuse: call pipe_buf_release() under pipe lock commit 9509941e9c534920ccc4771ae70bd6cbbe79df1c upstream. Some of the pipe_buf_release() handlers seem to assume that the pipe is locked - in particular, anon_pipe_buf_release() accesses pipe->tmp_page without taking any extra locks. From a glance through the callers of pipe_buf_release(), it looks like FUSE is the only one that calls pipe_buf_release() without having the pipe locked. This bug should only lead to a memory leak, nothing terrible. Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index bf0da0382c9e..c5e38061a48b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2032,8 +2032,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); + pipe_lock(pipe); for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); + pipe_unlock(pipe); out: kvfree(bufs); -- cgit v1.2.3 From f99027ab63bee0cbbde549ff663aef8b1c1944b5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: fuse: decrement NR_WRITEBACK_TEMP on the right page commit a2ebba824106dabe79937a9f29a875f837e1b6d4 upstream. NR_WRITEBACK_TEMP is accounted on the temporary page in the request, not the page cache page. Fixes: 8b284dc47291 ("fuse: writepages: handle same page rewrites") Cc: # v3.13 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fbd6978479cb..bd500c3b7858 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1778,7 +1778,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, spin_unlock(&fc->lock); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); -- cgit v1.2.3 From 6ccc9e1128e50e518ecdf9e1b120c735e40de25a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 16 Jan 2019 10:27:59 +0100 Subject: fuse: handle zero sized retrieve correctly commit 97e1532ef81acb31c30f9e75bf00306c33a77812 upstream. Dereferencing req->page_descs[0] will Oops if req->max_pages is zero. Reported-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com Tested-by: syzbot+c1e36d30ee3416289cc0@syzkaller.appspotmail.com Fixes: b2430d7567a3 ("fuse: add per-page descriptor to fuse_req") Cc: # v3.9 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c5e38061a48b..baaed4d05b22 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1698,7 +1698,6 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, req->in.h.nodeid = outarg->nodeid; req->in.numargs = 2; req->in.argpages = 1; - req->page_descs[0].offset = offset; req->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; @@ -1713,6 +1712,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, this_num = min_t(unsigned, num, PAGE_SIZE - offset); req->pages[req->num_pages] = page; + req->page_descs[req->num_pages].offset = offset; req->page_descs[req->num_pages].length = this_num; req->num_pages++; -- cgit v1.2.3 From c619140d4843e1e54b32fadadc79e0fba909f55d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 23 Jan 2019 11:27:02 +0100 Subject: debugfs: fix debugfs_rename parameter checking commit d88c93f090f708c18195553b352b9f205e65418f upstream. debugfs_rename() needs to check that the dentries passed into it really are valid, as sometimes they are not (i.e. if the return value of another debugfs call is passed into this one.) So fix this up by properly checking if the two parent directories are errors (they are allowed to be NULL), and if the dentry to rename is not NULL or an error. Cc: stable Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 13b01351dd1c..41ef452c1fcf 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -787,6 +787,13 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *trap; struct name_snapshot old_name; + if (IS_ERR(old_dir)) + return old_dir; + if (IS_ERR(new_dir)) + return new_dir; + if (IS_ERR_OR_NULL(old_dentry)) + return old_dentry; + trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) -- cgit v1.2.3 From 28f49e768d212445dc8d5964df6f37df8d73c70d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 31 Jan 2019 23:41:11 -0500 Subject: Revert "ext4: use ext4_write_inode() when fsyncing w/o a journal" commit 8fdd60f2ae3682caf2a7258626abc21eb4711892 upstream. This reverts commit ad211f3e94b314a910d4af03178a0b52a7d1ee0a. As Jan Kara pointed out, this change was unsafe since it means we lose the call to sync_mapping_buffers() in the nojournal case. The original point of the commit was avoid taking the inode mutex (since it causes a lockdep warning in generic/113); but we need the mutex in order to call sync_mapping_buffers(). The real fix to this problem was discussed here: https://lore.kernel.org/lkml/20181025150540.259281-4-bvanassche@acm.org The proposed patch was to fix a syzbot complaint, but the problem can also demonstrated via "kvm-xfstests -c nojournal generic/113". Multiple solutions were discused in the e-mail thread, but none have landed in the kernel as of this writing. Anyway, commit ad211f3e94b314 is absolutely the wrong way to suppress the lockdep, so revert it. Fixes: ad211f3e94b314a910d4af03178a0b52a7d1ee0a ("ext4: use ext4_write_inode() when fsyncing w/o a journal") Signed-off-by: Theodore Ts'o Reported: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/fsync.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 712f00995390..5508baa11bb6 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -116,16 +116,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = file_write_and_wait_range(file, start, end); - if (ret) - return ret; - if (!journal) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL - }; - - ret = ext4_write_inode(inode, &wbc); + ret = __generic_file_fsync(file, start, end, datasync); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) @@ -133,6 +125,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } + ret = file_write_and_wait_range(file, start, end); + if (ret) + return ret; /* * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. -- cgit v1.2.3 From c2109f05b7fd5d2e5472f45dbcc343786c504719 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Feb 2019 15:02:18 -0800 Subject: Revert "exec: load_script: don't blindly truncate shebang string" commit cb5b020a8d38f77209d0472a0fea755299a8ec78 upstream. This reverts commit 8099b047ecc431518b9bb6bdbba3549bbecdc343. It turns out that people do actually depend on the shebang string being truncated, and on the fact that an interpreter (like perl) will often just re-interpret it entirely to get the full argument list. Reported-by: Samuel Dionne-Riel Acked-by: Kees Cook Cc: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_script.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index d0078cbb718b..7cde3f46ad26 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -42,14 +42,10 @@ static int load_script(struct linux_binprm *bprm) fput(bprm->file); bprm->file = NULL; - for (cp = bprm->buf+2;; cp++) { - if (cp >= bprm->buf + BINPRM_BUF_SIZE) - return -ENOEXEC; - if (!*cp || (*cp == '\n')) - break; - } + bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; + if ((cp = strchr(bprm->buf, '\n')) == NULL) + cp = bprm->buf+BINPRM_BUF_SIZE-1; *cp = '\0'; - while (cp > bprm->buf) { cp--; if ((*cp == ' ') || (*cp == '\t')) -- cgit v1.2.3 From 63715c1f0a67a1e92631fca7f4b4dff89eb6be23 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Tue, 8 Jan 2019 18:30:56 +0000 Subject: cifs: Limit memory used by lock request calls to a page [ Upstream commit 92a8109e4d3a34fb6b115c9098b51767dc933444 ] The code tries to allocate a contiguous buffer with a size supplied by the server (maxBuf). This could fail if memory is fragmented since it results in high order allocations for commonly used server implementations. It is also wasteful since there are probably few locks in the usual case. Limit the buffer to be no larger than a page to avoid memory allocation failures due to fragmentation. Signed-off-by: Ross Lagerwall Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/file.c | 8 ++++++++ fs/cifs/smb2file.c | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7b637fc27990..23db881daab5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1128,6 +1128,10 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); @@ -1466,6 +1470,10 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (max_buf < (sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE))) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb_hdr) + sizeof(LOCKING_ANDX_RANGE) > + PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf - sizeof(struct smb_hdr), + PAGE_SIZE); max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 2fc3d31967ee..b204e84b87fb 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -128,6 +128,8 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (max_buf < sizeof(struct smb2_lock_element)) return -EINVAL; + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) @@ -264,6 +266,8 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) return -EINVAL; } + BUILD_BUG_ON(sizeof(struct smb2_lock_element) > PAGE_SIZE); + max_buf = min_t(unsigned int, max_buf, PAGE_SIZE); max_num = max_buf / sizeof(struct smb2_lock_element); buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { -- cgit v1.2.3 From b1765ebd9d12c3aa46e1c94764723a373ad57341 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 15 Jan 2019 15:08:48 -0800 Subject: CIFS: Do not assume one credit for async responses [ Upstream commit 0fd1d37b0501efc6e295f56ab55cdaff784aa50c ] If we don't receive a response we can't assume that the server granted one credit. Assume zero credits in such cases. Signed-off-by: Pavel Shilovsky Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/smb2pdu.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8a01e89ff827..1e5a1171212f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2814,9 +2814,10 @@ smb2_echo_callback(struct mid_q_entry *mid) { struct TCP_Server_Info *server = mid->callback_data; struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; - if (mid->mid_state == MID_RESPONSE_RECEIVED) + if (mid->mid_state == MID_RESPONSE_RECEIVED + || mid->mid_state == MID_RESPONSE_MALFORMED) credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); DeleteMidQEntry(mid); @@ -3073,7 +3074,7 @@ smb2_readv_callback(struct mid_q_entry *mid) struct TCP_Server_Info *server = tcon->ses->server; struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)rdata->iov[0].iov_base; - unsigned int credits_received = 1; + unsigned int credits_received = 0; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 2, .rq_pages = rdata->pages, @@ -3112,6 +3113,9 @@ smb2_readv_callback(struct mid_q_entry *mid) task_io_account_read(rdata->got_bytes); cifs_stats_bytes_read(tcon, rdata->got_bytes); break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(shdr->CreditRequest); + /* fall through */ default: if (rdata->result != -ENODATA) rdata->result = -EIO; @@ -3305,7 +3309,7 @@ smb2_writev_callback(struct mid_q_entry *mid) struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); unsigned int written; struct smb2_write_rsp *rsp = (struct smb2_write_rsp *)mid->resp_buf; - unsigned int credits_received = 1; + unsigned int credits_received = 0; switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -3333,6 +3337,9 @@ smb2_writev_callback(struct mid_q_entry *mid) case MID_RETRY_NEEDED: wdata->result = -EAGAIN; break; + case MID_RESPONSE_MALFORMED: + credits_received = le16_to_cpu(rsp->sync_hdr.CreditRequest); + /* fall through */ default: wdata->result = -EIO; break; -- cgit v1.2.3 From 93769fef8d6122285b94b13fc8872847182f5440 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 14 Feb 2019 12:33:19 -0500 Subject: Revert "nfsd4: return default lease period" commit 3bf6b57ec2ec945e5a6edf5c202a754f1e852ecd upstream. This reverts commit d6ebf5088f09472c1136cd506bdc27034a6763f8. I forgot that the kernel's default lease period should never be decreased! After a kernel upgrade, the kernel has no way of knowing on its own what the previous lease time was. Unless userspace tells it otherwise, it will assume the previous lease period was the same. So if we decrease this value in a kernel upgrade, we end up enforcing a grace period that's too short, and clients will fail to reclaim state in time. Symptoms may include EIO and log messages like "NFS: nfs4_reclaim_open_state: Lock reclaim failed!" There was no real justification for the lease period decrease anyway. Reported-by: Donald Buczek Fixes: d6ebf5088f09 "nfsd4: return default lease period" Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 899174c7a8ae..39b835d7c445 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1239,8 +1239,8 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - nn->nfsd4_lease = 45; /* default lease time */ - nn->nfsd4_grace = 45; + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); -- cgit v1.2.3 From 8d485d3a628bb46a86f232d9960ef64533aaf29d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 12 Feb 2019 15:35:51 -0800 Subject: Revert "mm: don't reclaim inodes with many attached pages" commit 69056ee6a8a3d576ed31e38b3b14c70d6c74edcc upstream. This reverts commit a76cf1a474d7d ("mm: don't reclaim inodes with many attached pages"). This change causes serious changes to page cache and inode cache behaviour and balance, resulting in major performance regressions when combining worklaods such as large file copies and kernel compiles. https://bugzilla.kernel.org/show_bug.cgi?id=202441 This change is a hack to work around the problems introduced by changing how agressive shrinkers are on small caches in commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects"). It creates more problems than it solves, wasn't adequately reviewed or tested, so it needs to be reverted. Link: http://lkml.kernel.org/r/20190130041707.27750-2-david@fromorbit.com Fixes: a76cf1a474d7d ("mm: don't reclaim inodes with many attached pages") Signed-off-by: Dave Chinner Cc: Wolfgang Walter Cc: Roman Gushchin Cc: Spock Cc: Rik van Riel Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 65ae154df760..42f6d25f32a5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* - * Recently referenced inodes and inodes with many attached pages - * get one more pass. - */ - if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; -- cgit v1.2.3 From dd5f4d067a2cdda47b9f3b6c119642a2b81edee7 Mon Sep 17 00:00:00 2001 From: Sandeep Patil Date: Tue, 12 Feb 2019 15:36:11 -0800 Subject: mm: proc: smaps_rollup: fix pss_locked calculation commit 27dd768ed8db48beefc4d9e006c58e7a00342bde upstream. The 'pss_locked' field of smaps_rollup was being calculated incorrectly. It accumulated the current pss everytime a locked VMA was found. Fix that by adding to 'pss_locked' the same time as that of 'pss' if the vma being walked is locked. Link: http://lkml.kernel.org/r/20190203065425.14650-1-sspatil@android.com Fixes: 493b0e9d945f ("mm: add /proc/pid/smaps_rollup") Signed-off-by: Sandeep Patil Acked-by: Vlastimil Babka Reviewed-by: Joel Fernandes (Google) Cc: Alexey Dobriyan Cc: Daniel Colascione Cc: [4.14.x, 4.19.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a027473561c6..d76fe166f6ce 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -423,7 +423,7 @@ struct mem_size_stats { }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - bool compound, bool young, bool dirty) + bool compound, bool young, bool dirty, bool locked) { int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; @@ -450,24 +450,31 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + if (locked) + mss->pss_locked += (u64)size << PSS_SHIFT; return; } for (i = 0; i < nr; i++, page++) { int mapcount = page_mapcount(page); + unsigned long pss = (PAGE_SIZE << PSS_SHIFT); if (mapcount >= 2) { if (dirty || PageDirty(page)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; - mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + mss->pss += pss / mapcount; + if (locked) + mss->pss_locked += pss / mapcount; } else { if (dirty || PageDirty(page)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; - mss->pss += PAGE_SIZE << PSS_SHIFT; + mss->pss += pss; + if (locked) + mss->pss_locked += pss; } } } @@ -490,6 +497,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; if (pte_present(*pte)) { @@ -532,7 +540,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -541,6 +549,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; + bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page; /* FOLL_DUMP will return -EFAULT on huge zero page */ @@ -555,7 +564,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, /* pass */; else VM_BUG_ON_PAGE(1, page); - smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -737,11 +746,8 @@ static void smap_gather_stats(struct vm_area_struct *vma, } } #endif - /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); - if (vma->vm_flags & VM_LOCKED) - mss->pss_locked += mss->pss; } #define SEQ_PUT_DEC(str, val) \ -- cgit v1.2.3 From b8d7fb1efb960f52730b231f5f8c6c63d11bc00b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Feb 2019 15:18:52 +0800 Subject: ceph: avoid repeatedly adding inode to mdsc->snap_flush_list commit 04242ff3ac0abbaa4362f97781dac268e6c3541a upstream. Otherwise, mdsc->snap_flush_list may get corrupted. Cc: stable@vger.kernel.org Signed-off-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/snap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 041c27ea8de1..f74193da0e09 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + if (list_empty(&ci->i_snap_flush_item)) + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } -- cgit v1.2.3 From a89e0d5c603ac3154670df31a4f871df629507c7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 20 Feb 2019 22:19:42 -0800 Subject: proc, oom: do not report alien mms when setting oom_score_adj commit b2b469939e93458753cfbf8282ad52636495965e upstream. Tetsuo has reported that creating a thousands of processes sharing MM without SIGHAND (aka alien threads) and setting /proc//oom_score_adj will swamp the kernel log and takes ages [1] to finish. This is especially worrisome that all that printing is done under RCU lock and this can potentially trigger RCU stall or softlockup detector. The primary reason for the printk was to catch potential users who might depend on the behavior prior to 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") but after more than 2 years without a single report I guess it is safe to simply remove the printk altogether. The next step should be moving oom_score_adj over to the mm struct and remove all the tasks crawling as suggested by [2] [1] http://lkml.kernel.org/r/97fce864-6f75-bca5-14bc-12c9f890e740@i-love.sakura.ne.jp [2] http://lkml.kernel.org/r/20190117155159.GA4087@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20190212102129.26288-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Acked-by: Johannes Weiner Cc: David Rientjes Cc: Yong-Taek Lee Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 7e9f07bf260d..81d77b15b347 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1084,10 +1084,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { - pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", - task_pid_nr(p), p->comm, - p->signal->oom_score_adj, oom_adj, - task_pid_nr(task), task->comm); p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; -- cgit v1.2.3 From dc4ec1bad9e2dfc6308c7c53f95f491b22cef897 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Don't set vnode->cb_s_break in afs_validate() [ Upstream commit 4882a27cec24319d10f95e978ecc80050e3e3e15 ] A cb_interest record is not necessarily attached to the vnode on entry to afs_validate(), which can cause an oops when we try to bring the vnode's cb_s_break up to date in the default case (ie. no current callback promise and the vnode has not been deleted). Fix this by simply removing the line, as vnode->cb_s_break will be set when needed by afs_register_server_cb_interest() when we next get a callback promise from RPC call. The oops looks something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 ... RIP: 0010:afs_validate+0x66/0x250 [kafs] ... Call Trace: afs_d_revalidate+0x8d/0x340 [kafs] ? __d_lookup+0x61/0x150 lookup_dcache+0x44/0x70 ? lookup_dcache+0x44/0x70 __lookup_hash+0x24/0xa0 do_unlinkat+0x11d/0x2c0 __x64_sys_unlink+0x23/0x30 do_syscall_64+0x4d/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: ae3b7361dc0e ("afs: Fix validation/callback interaction") Signed-off-by: Marc Dionne Signed-off-by: David Howells Signed-off-by: Sasha Levin --- fs/afs/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 071075d775a9..26aa2d111a28 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -411,7 +411,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { valid = true; } else { - vnode->cb_s_break = vnode->cb_interest->server->cb_s_break; vnode->cb_v_break = vnode->volume->cb_v_break; valid = false; } -- cgit v1.2.3 From 5d6af6f9dd2f99381b147431e30a72fbd625fe70 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Jan 2019 17:23:54 +0000 Subject: afs: Fix key refcounting in file locking code [ Upstream commit 59d49076ae3e6912e6d7df2fd68e2337f3d02036 ] Fix the refcounting of the authentication keys in the file locking code. The vnode->lock_key member points to a key on which it expects to be holding a ref, but it isn't always given an extra ref, however. Fixes: 0fafdc9f888b ("afs: Fix file locking") Signed-off-by: David Howells Signed-off-by: Sasha Levin --- fs/afs/flock.c | 4 ++-- fs/afs/inode.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/flock.c b/fs/afs/flock.c index dc62d15a964b..1bb300ef362b 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -208,7 +208,7 @@ again: /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; goto again; @@ -413,7 +413,7 @@ static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) /* The new front of the queue now owns the state variables. */ next = list_entry(vnode->pending_locks.next, struct file_lock, fl_u.afs.link); - vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_key = key_get(afs_file_key(next->fl_file)); vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 26aa2d111a28..0726e40db0f8 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -542,6 +542,8 @@ void afs_evict_inode(struct inode *inode) #endif afs_put_permits(rcu_access_pointer(vnode->permit_cache)); + key_put(vnode->lock_key); + vnode->lock_key = NULL; _leave(""); } -- cgit v1.2.3 From c5a1dc256cc2ac6c08f637994e513a8cb65be3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ernesto=20A=2E=20Fern=C3=A1ndez?= Date: Mon, 8 Oct 2018 20:58:23 -0300 Subject: direct-io: allow direct writes to empty inodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8b9433eb4de3c26a9226c981c283f9f4896ae030 ] On a DIO_SKIP_HOLES filesystem, the ->get_block() method is currently not allowed to create blocks for an empty inode. This confusion comes from trying to bit shift a negative number, so check the size of the inode first. The problem is most visible for hfsplus, because the fallback to buffered I/O doesn't happen and the write fails with EIO. This is in part the fault of the module, because it gives a wrong return value on ->get_block(); that will be fixed in a separate patch. Reviewed-by: Jeff Moyer Reviewed-by: Jan Kara Signed-off-by: Ernesto A. Fernández Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/direct-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 199146036093..1abb7634b2d5 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -679,6 +679,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, unsigned long fs_count; /* Number of filesystem-sized blocks */ int create; unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; + loff_t i_size; /* * If there was a memory error and we've overwritten all the @@ -708,8 +709,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, */ create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> - i_blkbits)) + i_size = i_size_read(dio->inode); + if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) create = 0; } -- cgit v1.2.3 From edca54b897bb16d2ad7c0549b7bce2e5e2bc36e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Dec 2017 08:38:30 -0800 Subject: writeback: synchronize sync(2) against cgroup writeback membership switches [ Upstream commit 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 ] sync_inodes_sb() can race against cgwb (cgroup writeback) membership switches and fail to writeback some inodes. For example, if an inode switches to another wb while sync_inodes_sb() is in progress, the new wb might not be visible to bdi_split_work_to_wbs() at all or the inode might jump from a wb which hasn't issued writebacks yet to one which already has. This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb switch path against sync_inodes_sb() so that sync_inodes_sb() is guaranteed to see all the target wbs and inodes can't jump wbs to escape syncing. v2: Fixed misplaced rwsem init. Spotted by Jiufei. Signed-off-by: Tejun Heo Reported-by: Jiufei Xue Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com Acked-by: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/fs-writeback.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..82ce6d4f7e31 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -331,11 +331,22 @@ struct inode_switch_wbs_context { struct work_struct work; }; +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + down_write(&bdi->wb_switch_rwsem); +} + +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + up_write(&bdi->wb_switch_rwsem); +} + static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) bool switched = false; void **slot; + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -435,6 +452,8 @@ skip_switch: spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); + up_read(&bdi->wb_switch_rwsem); + if (switched) { wb_wakeup(new_wb); wb_put(old_wb); @@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; + /* + * Avoid starting new switches while sync_inodes_sb() is in + * progress. Otherwise, if the down_write protected issue path + * blocks heavily, we might end up starting a large number of + * switches which will block on the rwsem. + */ + if (!down_read_trylock(&bdi->wb_switch_rwsem)) + return; + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - return; + goto out_unlock; /* find and pin the new wb */ rcu_read_lock(); @@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - return; + goto out_unlock; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); +out_unlock: + up_read(&bdi->wb_switch_rwsem); } /** @@ -894,6 +924,9 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -2420,8 +2453,11 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(bdi, &done); + bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } -- cgit v1.2.3 From 527cabfffbc5f55c111d510a918b33fb9fbe537d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 28 Feb 2019 16:22:02 -0800 Subject: hugetlbfs: fix races and page leaks during migration commit cb6acd01e2e43fd8bad11155752b7699c3d0fb76 upstream. hugetlb pages should only be migrated if they are 'active'. The routines set/clear_page_huge_active() modify the active state of hugetlb pages. When a new hugetlb page is allocated at fault time, set_page_huge_active is called before the page is locked. Therefore, another thread could race and migrate the page while it is being added to page table by the fault code. This race is somewhat hard to trigger, but can be seen by strategically adding udelay to simulate worst case scheduling behavior. Depending on 'how' the code races, various BUG()s could be triggered. To address this issue, simply delay the set_page_huge_active call until after the page is successfully added to the page table. Hugetlb pages can also be leaked at migration time if the pages are associated with a file in an explicitly mounted hugetlbfs filesystem. For example, consider a two node system with 4GB worth of huge pages available. A program mmaps a 2G file in a hugetlbfs filesystem. It then migrates the pages associated with the file from one node to another. When the program exits, huge page counts are as follows: node0 1024 free_hugepages 1024 nr_hugepages node1 0 free_hugepages 1024 nr_hugepages Filesystem Size Used Avail Use% Mounted on nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool That is as expected. 2G of huge pages are taken from the free_hugepages counts, and 2G is the size of the file in the explicitly mounted filesystem. If the file is then removed, the counts become: node0 1024 free_hugepages 1024 nr_hugepages node1 1024 free_hugepages 1024 nr_hugepages Filesystem Size Used Avail Use% Mounted on nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool Note that the filesystem still shows 2G of pages used, while there actually are no huge pages in use. The only way to 'fix' the filesystem accounting is to unmount the filesystem If a hugetlb page is associated with an explicitly mounted filesystem, this information in contained in the page_private field. At migration time, this information is not preserved. To fix, simply transfer page_private from old to new page at migration time if necessary. There is a related race with removing a huge page from a file and migration. When a huge page is removed from the pagecache, the page_mapping() field is cleared, yet page_private remains set until the page is actually freed by free_huge_page(). A page could be migrated while in this state. However, since page_mapping() is not set the hugetlbfs specific routine to transfer page_private is not called and we leak the page count in the filesystem. To fix that, check for this condition before migrating a huge page. If the condition is detected, return EBUSY for the page. Link: http://lkml.kernel.org/r/74510272-7319-7372-9ea6-ec914734c179@oracle.com Link: http://lkml.kernel.org/r/20190212221400.3512-1-mike.kravetz@oracle.com Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Mel Gorman Cc: Davidlohr Bueso Cc: [mike.kravetz@oracle.com: v2] Link: http://lkml.kernel.org/r/7534d322-d782-8ac6-1c8d-a8dc380eb3ab@oracle.com [mike.kravetz@oracle.com: update comment and changelog] Link: http://lkml.kernel.org/r/420bcfd6-158b-38e4-98da-26d0cd85bd01@oracle.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hugetlbfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 32920a10100e..a7fa037b876b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -859,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; + + /* + * page_private is subpool pointer in hugetlb pages. Transfer to + * new page. PagePrivate is not associated with page_private for + * hugetlb pages and can not be set here as only page_huge_active + * pages can be migrated. + */ + if (page_private(page)) { + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + } + if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else -- cgit v1.2.3 From f5e66cdb51fd352d7091cb2516b24734e524cb33 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 8 Feb 2019 16:59:49 -0800 Subject: aio: Fix locking in aio_poll() commit d3d6a18d7d351cbcc9b33dbedf710e65f8ce1595 upstream. wake_up_locked() may but does not have to be called with interrupts disabled. Since the fuse filesystem calls wake_up_locked() without disabling interrupts aio_poll_wake() may be called with interrupts enabled. Since the kioctx.ctx_lock may be acquired from IRQ context, all code that acquires that lock from thread context must disable interrupts. Hence change the spin_trylock() call in aio_poll_wake() into a spin_trylock_irqsave() call. This patch fixes the following lockdep complaint: ===================================================== WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected 5.0.0-rc4-next-20190131 #23 Not tainted ----------------------------------------------------- syz-executor2/13779 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: 0000000098ac1230 (&fiq->waitq){+.+.}, at: spin_lock include/linux/spinlock.h:329 [inline] 0000000098ac1230 (&fiq->waitq){+.+.}, at: aio_poll fs/aio.c:1772 [inline] 0000000098ac1230 (&fiq->waitq){+.+.}, at: __io_submit_one fs/aio.c:1875 [inline] 0000000098ac1230 (&fiq->waitq){+.+.}, at: io_submit_one+0xedf/0x1cf0 fs/aio.c:1908 and this task is already holding: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: spin_lock_irq include/linux/spinlock.h:354 [inline] 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: aio_poll fs/aio.c:1771 [inline] 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: __io_submit_one fs/aio.c:1875 [inline] 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: io_submit_one+0xeb6/0x1cf0 fs/aio.c:1908 which would create a new lock dependency: (&(&ctx->ctx_lock)->rlock){..-.} -> (&fiq->waitq){+.+.} but this new dependency connects a SOFTIRQ-irq-safe lock: (&(&ctx->ctx_lock)->rlock){..-.} ... which became SOFTIRQ-irq-safe at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160 spin_lock_irq include/linux/spinlock.h:354 [inline] free_ioctx_users+0x2d/0x4a0 fs/aio.c:610 percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline] percpu_ref_put include/linux/percpu-refcount.h:301 [inline] percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline] percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2486 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2799 [inline] rcu_core+0x928/0x1390 kernel/rcu/tree.c:2780 __do_softirq+0x266/0x95a kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:654 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164 kthread+0x357/0x430 kernel/kthread.c:247 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 to a SOFTIRQ-irq-unsafe lock: (&fiq->waitq){+.+.} ... which became SOFTIRQ-irq-unsafe at: ... lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415 fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676 fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687 fuse_send_init fs/fuse/inode.c:989 [inline] fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214 mount_nodev+0x68/0x110 fs/super.c:1392 fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239 legacy_get_tree+0xf2/0x200 fs/fs_context.c:590 vfs_get_tree+0x123/0x450 fs/super.c:1481 do_new_mount fs/namespace.c:2610 [inline] do_mount+0x1436/0x2c40 fs/namespace.c:2932 ksys_mount+0xdb/0x150 fs/namespace.c:3148 __do_sys_mount fs/namespace.c:3162 [inline] __se_sys_mount fs/namespace.c:3159 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fiq->waitq); local_irq_disable(); lock(&(&ctx->ctx_lock)->rlock); lock(&fiq->waitq); lock(&(&ctx->ctx_lock)->rlock); *** DEADLOCK *** 1 lock held by syz-executor2/13779: #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: spin_lock_irq include/linux/spinlock.h:354 [inline] #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: aio_poll fs/aio.c:1771 [inline] #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: __io_submit_one fs/aio.c:1875 [inline] #0: 000000003c46111c (&(&ctx->ctx_lock)->rlock){..-.}, at: io_submit_one+0xeb6/0x1cf0 fs/aio.c:1908 the dependencies between SOFTIRQ-irq-safe lock and the holding lock: -> (&(&ctx->ctx_lock)->rlock){..-.} { IN-SOFTIRQ-W at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160 spin_lock_irq include/linux/spinlock.h:354 [inline] free_ioctx_users+0x2d/0x4a0 fs/aio.c:610 percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline] percpu_ref_put include/linux/percpu-refcount.h:301 [inline] percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline] percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2486 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2799 [inline] rcu_core+0x928/0x1390 kernel/rcu/tree.c:2780 __do_softirq+0x266/0x95a kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:654 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164 kthread+0x357/0x430 kernel/kthread.c:247 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 INITIAL USE at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160 spin_lock_irq include/linux/spinlock.h:354 [inline] __do_sys_io_cancel fs/aio.c:2052 [inline] __se_sys_io_cancel fs/aio.c:2035 [inline] __x64_sys_io_cancel+0xd5/0x5a0 fs/aio.c:2035 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe } ... key at: [] __key.52370+0x0/0x40 ... acquired at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] aio_poll fs/aio.c:1772 [inline] __io_submit_one fs/aio.c:1875 [inline] io_submit_one+0xedf/0x1cf0 fs/aio.c:1908 __do_sys_io_submit fs/aio.c:1953 [inline] __se_sys_io_submit fs/aio.c:1923 [inline] __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe the dependencies between the lock to be acquired and SOFTIRQ-irq-unsafe lock: -> (&fiq->waitq){+.+.} { HARDIRQ-ON-W at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415 fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676 fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687 fuse_send_init fs/fuse/inode.c:989 [inline] fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214 mount_nodev+0x68/0x110 fs/super.c:1392 fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239 legacy_get_tree+0xf2/0x200 fs/fs_context.c:590 vfs_get_tree+0x123/0x450 fs/super.c:1481 do_new_mount fs/namespace.c:2610 [inline] do_mount+0x1436/0x2c40 fs/namespace.c:2932 ksys_mount+0xdb/0x150 fs/namespace.c:3148 __do_sys_mount fs/namespace.c:3162 [inline] __se_sys_mount fs/namespace.c:3159 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe SOFTIRQ-ON-W at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415 fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676 fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687 fuse_send_init fs/fuse/inode.c:989 [inline] fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214 mount_nodev+0x68/0x110 fs/super.c:1392 fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239 legacy_get_tree+0xf2/0x200 fs/fs_context.c:590 vfs_get_tree+0x123/0x450 fs/super.c:1481 do_new_mount fs/namespace.c:2610 [inline] do_mount+0x1436/0x2c40 fs/namespace.c:2932 ksys_mount+0xdb/0x150 fs/namespace.c:3148 __do_sys_mount fs/namespace.c:3162 [inline] __se_sys_mount fs/namespace.c:3159 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe INITIAL USE at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] flush_bg_queue+0x1f3/0x3c0 fs/fuse/dev.c:415 fuse_request_queue_background+0x2d1/0x580 fs/fuse/dev.c:676 fuse_request_send_background+0x58/0x120 fs/fuse/dev.c:687 fuse_send_init fs/fuse/inode.c:989 [inline] fuse_fill_super+0x13bb/0x1730 fs/fuse/inode.c:1214 mount_nodev+0x68/0x110 fs/super.c:1392 fuse_mount+0x2d/0x40 fs/fuse/inode.c:1239 legacy_get_tree+0xf2/0x200 fs/fs_context.c:590 vfs_get_tree+0x123/0x450 fs/super.c:1481 do_new_mount fs/namespace.c:2610 [inline] do_mount+0x1436/0x2c40 fs/namespace.c:2932 ksys_mount+0xdb/0x150 fs/namespace.c:3148 __do_sys_mount fs/namespace.c:3162 [inline] __se_sys_mount fs/namespace.c:3159 [inline] __x64_sys_mount+0xbe/0x150 fs/namespace.c:3159 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe } ... key at: [] __key.43450+0x0/0x40 ... acquired at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] aio_poll fs/aio.c:1772 [inline] __io_submit_one fs/aio.c:1875 [inline] io_submit_one+0xedf/0x1cf0 fs/aio.c:1908 __do_sys_io_submit fs/aio.c:1953 [inline] __se_sys_io_submit fs/aio.c:1923 [inline] __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe stack backtrace: CPU: 0 PID: 13779 Comm: syz-executor2 Not tainted 5.0.0-rc4-next-20190131 #23 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_bad_irq_dependency kernel/locking/lockdep.c:1573 [inline] check_usage.cold+0x60f/0x940 kernel/locking/lockdep.c:1605 check_irq_usage kernel/locking/lockdep.c:1650 [inline] check_prev_add_irq kernel/locking/lockdep_states.h:8 [inline] check_prev_add kernel/locking/lockdep.c:1860 [inline] check_prevs_add kernel/locking/lockdep.c:1968 [inline] validate_chain kernel/locking/lockdep.c:2339 [inline] __lock_acquire+0x1f12/0x4790 kernel/locking/lockdep.c:3320 lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3826 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] aio_poll fs/aio.c:1772 [inline] __io_submit_one fs/aio.c:1875 [inline] io_submit_one+0xedf/0x1cf0 fs/aio.c:1908 __do_sys_io_submit fs/aio.c:1953 [inline] __se_sys_io_submit fs/aio.c:1923 [inline] __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Reported-by: syzbot Cc: Christoph Hellwig Cc: Avi Kivity Cc: Miklos Szeredi Cc: Fixes: e8693bcfa0b4 ("aio: allow direct aio poll comletions for keyed wakeups") # v4.19 Signed-off-by: Miklos Szeredi [ bvanassche: added a comment ] Reluctantly-Acked-by: Christoph Hellwig Signed-off-by: Bart Van Assche Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 44551d96eaa4..45d5ef8dd0a8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1661,6 +1661,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct poll_iocb *req = container_of(wait, struct poll_iocb, wait); struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll); __poll_t mask = key_to_poll(key); + unsigned long flags; req->woken = true; @@ -1669,10 +1670,15 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (!(mask & req->events)) return 0; - /* try to complete the iocb inline if we can: */ - if (spin_trylock(&iocb->ki_ctx->ctx_lock)) { + /* + * Try to complete the iocb inline if we can. Use + * irqsave/irqrestore because not all filesystems (e.g. fuse) + * call this function with IRQs disabled and because IRQs + * have to be disabled before ctx_lock is obtained. + */ + if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { list_del(&iocb->ki_list); - spin_unlock(&iocb->ki_ctx->ctx_lock); + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); list_del_init(&req->wait.entry); aio_poll_complete(iocb, mask); -- cgit v1.2.3 From b60d90b2d3d14c426693a0a34041db11be66d29e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 19 Feb 2019 10:10:38 +0800 Subject: exec: Fix mem leak in kernel_read_file commit f612acfae86af7ecad754ae6a46019be9da05b8e upstream. syzkaller report this: BUG: memory leak unreferenced object 0xffffc9000488d000 (size 9195520): comm "syz-executor.0", pid 2752, jiffies 4294787496 (age 18.757s) hex dump (first 32 bytes): ff ff ff ff ff ff ff ff a8 00 00 00 01 00 00 00 ................ 02 00 00 00 00 00 00 00 80 a1 7a c1 ff ff ff ff ..........z..... backtrace: [<000000000863775c>] __vmalloc_node mm/vmalloc.c:1795 [inline] [<000000000863775c>] __vmalloc_node_flags mm/vmalloc.c:1809 [inline] [<000000000863775c>] vmalloc+0x8c/0xb0 mm/vmalloc.c:1831 [<000000003f668111>] kernel_read_file+0x58f/0x7d0 fs/exec.c:924 [<000000002385813f>] kernel_read_file_from_fd+0x49/0x80 fs/exec.c:993 [<0000000011953ff1>] __do_sys_finit_module+0x13b/0x2a0 kernel/module.c:3895 [<000000006f58491f>] do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 [<00000000ee78baf4>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<00000000241f889b>] 0xffffffffffffffff It should goto 'out_free' lable to free allocated buf while kernel_read fails. Fixes: 39d637af5aa7 ("vfs: forbid write access when reading a file into memory") Signed-off-by: YueHaibing Signed-off-by: Al Viro Cc: Thibaut Sautereau Signed-off-by: Greg Kroah-Hartman --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 1ebf6e5a521d..433b1257694a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -929,7 +929,7 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, bytes = kernel_read(file, *buf + pos, i_size - pos, &pos); if (bytes < 0) { ret = bytes; - goto out; + goto out_free; } if (bytes == 0) -- cgit v1.2.3 From d23792f53f0de5706520f1302a8a4143e13a1680 Mon Sep 17 00:00:00 2001 From: Piotr Jaroszynski Date: Sun, 27 Jan 2019 08:46:45 -0800 Subject: iomap: get/put the page in iomap_page_create/release() [ Upstream commit 8e47a457321ca1a74ad194ab5dcbca764bc70731 ] migrate_page_move_mapping() expects pages with private data set to have a page_count elevated by 1. This is what used to happen for xfs through the buffer_heads code before the switch to iomap in commit 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads"). Not having the count elevated causes move_pages() to fail on memory mapped files coming from xfs. Make iomap compatible with the migrate_page_move_mapping() assumption by elevating the page count as part of iomap_page_create() and lowering it in iomap_page_release(). It causes the move_pages() syscall to misbehave on memory mapped files from xfs. It does not not move any pages, which I suppose is "just" a perf issue, but it also ends up returning a positive number which is out of spec for the syscall. Talking to Michal Hocko, it sounds like returning positive numbers might be a necessary update to move_pages() anyway though. Fixes: 82cb14175e7d ("xfs: add support for sub-pagesize writeback without buffer_heads") Signed-off-by: Piotr Jaroszynski [hch: actually get/put the page iomap_migrate_page() to make it work properly] Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Sasha Levin --- fs/iomap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index e57fb1e534c5..2e3e64012db7 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -117,6 +117,12 @@ iomap_page_create(struct inode *inode, struct page *page) atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); + + /* + * migrate_page_move_mapping() assumes that pages with private data have + * their count elevated by 1. + */ + get_page(page); set_page_private(page, (unsigned long)iop); SetPagePrivate(page); return iop; @@ -133,6 +139,7 @@ iomap_page_release(struct page *page) WARN_ON_ONCE(atomic_read(&iop->write_count)); ClearPagePrivate(page); set_page_private(page, 0); + put_page(page); kfree(iop); } @@ -565,8 +572,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (page_has_private(page)) { ClearPagePrivate(page); + get_page(newpage); set_page_private(newpage, page_private(page)); set_page_private(page, 0); + put_page(page); SetPagePrivate(newpage); } -- cgit v1.2.3 From d9ba842efdf068c53095b15f68377327259c3d16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Jan 2019 08:58:58 -0800 Subject: iomap: fix a use after free in iomap_dio_rw [ Upstream commit 4ea899ead2786a30aaa8181fefa81a3df4ad28f6 ] Introduce a local wait_for_completion variable to avoid an access to the potentially freed dio struture after dropping the last reference count. Also use the chance to document the completion behavior to make the refcounting clear to the reader of the code. Fixes: ff6a9292e6 ("iomap: implement direct I/O") Reported-by: Chandan Rajendra Reported-by: Darrick J. Wong Signed-off-by: Christoph Hellwig Tested-by: Chandan Rajendra Tested-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Sasha Levin --- fs/iomap.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 2e3e64012db7..fac45206418a 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -1787,6 +1787,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos, start = pos; loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; + bool wait_for_completion = is_sync_kiocb(iocb); struct blk_plug plug; struct iomap_dio *dio; @@ -1806,7 +1807,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->end_io = end_io; dio->error = 0; dio->flags = 0; - dio->wait_for_completion = is_sync_kiocb(iocb); dio->submit.iter = iter; dio->submit.waiter = current; @@ -1861,7 +1861,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; - if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && + if (iov_iter_rw(iter) == WRITE && !wait_for_completion && !inode->i_sb->s_dio_done_wq) { ret = sb_init_dio_done_wq(inode->i_sb); if (ret < 0) @@ -1877,7 +1877,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret <= 0) { /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { - dio->wait_for_completion = true; + wait_for_completion = true; ret = 0; } break; @@ -1899,8 +1899,24 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; + /* + * We are about to drop our additional submission reference, which + * might be the last reference to the dio. There are three three + * different ways we can progress here: + * + * (a) If this is the last reference we will always complete and free + * the dio ourselves. + * (b) If this is not the last reference, and we serve an asynchronous + * iocb, we must never touch the dio after the decrement, the + * I/O completion handler will complete and free it. + * (c) If this is not the last reference, but we serve a synchronous + * iocb, the I/O completion handler will wake us up on the drop + * of the final reference, and we will complete and free it here + * after we got woken by the I/O completion handler. + */ + dio->wait_for_completion = wait_for_completion; if (!atomic_dec_and_test(&dio->ref)) { - if (!dio->wait_for_completion) + if (!wait_for_completion) return -EIOCBQUEUED; for (;;) { @@ -1917,9 +1933,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } - ret = iomap_dio_complete(dio); - - return ret; + return iomap_dio_complete(dio); out_free_dio: kfree(dio); -- cgit v1.2.3 From 5c72ca3bf62502ace23aa6281b6dd6867417d642 Mon Sep 17 00:00:00 2001 From: Yao Liu Date: Mon, 28 Jan 2019 19:44:14 +0800 Subject: nfs: Fix NULL pointer dereference of dev_name [ Upstream commit 80ff00172407e0aad4b10b94ef0816fc3e7813cb ] There is a NULL pointer dereference of dev_name in nfs_parse_devname() The oops looks something like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 ... RIP: 0010:nfs_fs_mount+0x3b6/0xc20 [nfs] ... Call Trace: ? ida_alloc_range+0x34b/0x3d0 ? nfs_clone_super+0x80/0x80 [nfs] ? nfs_free_parsed_mount_data+0x60/0x60 [nfs] mount_fs+0x52/0x170 ? __init_waitqueue_head+0x3b/0x50 vfs_kern_mount+0x6b/0x170 do_mount+0x216/0xdc0 ksys_mount+0x83/0xd0 __x64_sys_mount+0x25/0x30 do_syscall_64+0x65/0x220 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix this by adding a NULL check on dev_name Signed-off-by: Yao Liu Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 5ef2c71348bd..6b666d187907 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1906,6 +1906,11 @@ static int nfs_parse_devname(const char *dev_name, size_t len; char *end; + if (unlikely(!dev_name || !*dev_name)) { + dfprintk(MOUNT, "NFS: device name not specified\n"); + return -EINVAL; + } + /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); -- cgit v1.2.3 From 6efd69d63339904fce3095723e470fd8f86a49a8 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 29 Jan 2019 12:46:16 +1000 Subject: cifs: fix computation for MAX_SMB2_HDR_SIZE [ Upstream commit 58d15ed1203f4d858c339ea4d7dafa94bd2a56d3 ] The size of the fixed part of the create response is 88 bytes not 56. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky Signed-off-by: Sasha Levin --- fs/cifs/smb2pdu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 8fb7887f2b3d..437257d1116f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ -#define MAX_SMB2_HDR_SIZE 0x00b0 +/* 52 transform hdr + 64 hdr + 88 create rsp */ +#define MAX_SMB2_HDR_SIZE 204 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) -- cgit v1.2.3 From 845d73be1b4cba6c353e99f79eff0f17e0d7c1d8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 1 Feb 2019 14:20:01 -0800 Subject: proc: fix /proc/net/* after setns(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1fde6f21d90f8ba5da3cb9c54ca991ed72696c43 ] /proc entries under /proc/net/* can't be cached into dcache because setns(2) can change current net namespace. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: avoid vim miscolorization] [adobriyan@gmail.com: write test, add dummy ->d_revalidate hook: necessary if /proc/net/* is pinned at setns time] Link: http://lkml.kernel.org/r/20190108192350.GA12034@avx2 Link: http://lkml.kernel.org/r/20190107162336.GA9239@avx2 Fixes: 1da4d377f943fe4194ffb9fb9c26cc58fad4dd24 ("proc: revalidate misc dentries") Signed-off-by: Alexey Dobriyan Reported-by: Mateusz Stępień Reported-by: Ahmad Fatoum Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/proc/generic.c | 4 +++- fs/proc/internal.h | 1 + fs/proc/proc_net.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ae109429a88..e39bac94dead 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -256,7 +256,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &proc_misc_dentry_ops); + d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); @@ -429,6 +429,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); + ent->proc_dops = &proc_misc_dentry_ops; + out: return ent; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5185d7f6a51e..95b14196f284 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -44,6 +44,7 @@ struct proc_dir_entry { struct completion *pde_unload_completion; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; + const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d5e0fcb3439e..a7b12435519e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -38,6 +38,22 @@ static struct net *get_proc_net(const struct inode *inode) return maybe_get_net(PDE_NET(PDE(inode))); } +static int proc_net_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + return 0; +} + +static const struct dentry_operations proc_net_dentry_ops = { + .d_revalidate = proc_net_d_revalidate, + .d_delete = always_delete_dentry, +}; + +static void pde_force_lookup(struct proc_dir_entry *pde) +{ + /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ + pde->proc_dops = &proc_net_dentry_ops; +} + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; @@ -90,6 +106,7 @@ struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -133,6 +150,7 @@ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_seq_fops; p->seq_ops = ops; p->state_size = state_size; @@ -181,6 +199,7 @@ struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; return proc_register(parent, p); @@ -223,6 +242,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; + pde_force_lookup(p); p->proc_fops = &proc_net_single_fops; p->single_show = show; p->write = write; -- cgit v1.2.3 From 3d0acc076f5ffb40d28d1210a6b987034d17b46c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Feb 2019 14:21:23 -0800 Subject: fs/drop_caches.c: avoid softlockups in drop_pagecache_sb() [ Upstream commit c27d82f52f75fc9d8d9d40d120d2a96fdeeada5e ] When superblock has lots of inodes without any pagecache (like is the case for /proc), drop_pagecache_sb() will iterate through all of them without dropping sb->s_inode_list_lock which can lead to softlockups (one of our customers hit this). Fix the problem by going to the slow path and doing cond_resched() in case the process needs rescheduling. Link: http://lkml.kernel.org/r/20190114085343.15011-1-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/drop_caches.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 82377017130f..d31b6c72b476 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -21,8 +21,13 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); + /* + * We must skip inodes in unusual state. We may also skip + * inodes without pages but we deliberately won't in case + * we need to reschedule to avoid softlockups. + */ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { + (inode->i_mapping->nrpages == 0 && !need_resched())) { spin_unlock(&inode->i_lock); continue; } @@ -30,6 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); + cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; -- cgit v1.2.3 From 1efb234ec25193233d71997c769f6504d283c671 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Fri, 1 Feb 2019 14:21:26 -0800 Subject: autofs: drop dentry reference only when it is never used [ Upstream commit 63ce5f552beb9bdb41546b3a26c4374758b21815 ] autofs_expire_run() calls dput(dentry) to drop the reference count of dentry. However, dentry is read via autofs_dentry_ino(dentry) after that. This may result in a use-free-bug. The patch drops the reference count of dentry only when it is never used. Link: http://lkml.kernel.org/r/154725122396.11260.16053424107144453867.stgit@pluto-themaw-net Signed-off-by: Pan Bian Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/autofs/expire.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index d441244b79df..28d9c2b1b3bb 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -596,7 +596,6 @@ int autofs_expire_run(struct super_block *sb, pkt.len = dentry->d_name.len; memcpy(pkt.name, dentry->d_name.name, pkt.len); pkt.name[pkt.len] = '\0'; - dput(dentry); if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) ret = -EFAULT; @@ -609,6 +608,8 @@ int autofs_expire_run(struct super_block *sb, complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); + dput(dentry); + return ret; } -- cgit v1.2.3 From 929278903367a8a9ca7b701b4d0af9a6ba5110a3 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Fri, 1 Feb 2019 14:21:29 -0800 Subject: autofs: fix error return in autofs_fill_super() [ Upstream commit f585b283e3f025754c45bbe7533fc6e5c4643700 ] In autofs_fill_super() on error of get inode/make root dentry the return should be ENOMEM as this is the only failure case of the called functions. Link: http://lkml.kernel.org/r/154725123240.11260.796773942606871359.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/autofs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 846c052569dd..3c14a8e45ffb 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -255,8 +255,10 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) } root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); - if (!root) + if (!root) { + ret = -ENOMEM; goto fail_ino; + } pipe = NULL; root->d_fsdata = ino; -- cgit v1.2.3 From 72426ed2a149c3cd0c33b409ca10b3918f399cc0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 21 Jan 2019 22:49:37 +0900 Subject: fs: ratelimit __find_get_block_slow() failure message. [ Upstream commit 43636c804df0126da669c261fc820fb22f62bfc2 ] When something let __find_get_block_slow() hit all_mapped path, it calls printk() for 100+ times per a second. But there is no need to print same message with such high frequency; it is just asking for stall warning, or at least bloating log files. [ 399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8 [ 399.873324][T15342] b_state=0x00000029, b_size=512 [ 399.878403][T15342] device loop0 blocksize: 4096 [ 399.883296][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8 [ 399.890400][T15342] b_state=0x00000029, b_size=512 [ 399.895595][T15342] device loop0 blocksize: 4096 [ 399.900556][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8 [ 399.907471][T15342] b_state=0x00000029, b_size=512 [ 399.912506][T15342] device loop0 blocksize: 4096 This patch reduces frequency to up to once per a second, in addition to concatenating three lines into one. [ 399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8, b_state=0x00000029, b_size=512, device loop0 blocksize: 4096 Signed-off-by: Tetsuo Handa Reviewed-by: Jan Kara Cc: Dmitry Vyukov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/buffer.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 6f1ae3ac9789..c083c4b3c1e7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) struct buffer_head *head; struct page *page; int all_mapped = 1; + static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1); index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); @@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * file io on the block device and getblk. It gets dealt with * elsewhere, don't buffer_error if we had some unmapped buffers */ - if (all_mapped) { - printk("__find_get_block_slow() failed. " - "block=%llu, b_blocknr=%llu\n", - (unsigned long long)block, - (unsigned long long)bh->b_blocknr); - printk("b_state=0x%08lx, b_size=%zu\n", - bh->b_state, bh->b_size); - printk("device %pg blocksize: %d\n", bdev, - 1 << bd_inode->i_blkbits); + ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE); + if (all_mapped && __ratelimit(&last_warned)) { + printk("__find_get_block_slow() failed. block=%llu, " + "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, " + "device %pg blocksize: %d\n", + (unsigned long long)block, + (unsigned long long)bh->b_blocknr, + bh->b_state, bh->b_size, bdev, + 1 << bd_inode->i_blkbits); } out_unlock: spin_unlock(&bd_mapping->private_lock); -- cgit v1.2.3 From 4f5a4c8881064c2eac658528390a6f2e7253c9b5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 6 Mar 2019 15:41:57 +0100 Subject: gfs2: Fix missed wakeups in find_insert_glock commit 605b0487f0bc1ae9963bf52ece0f5c8055186f81 upstream. Mark Syms has reported seeing tasks that are stuck waiting in find_insert_glock. It turns out that struct lm_lockname contains four padding bytes on 64-bit architectures that function glock_waitqueue doesn't skip when hashing the glock name. As a result, we can end up waking up the wrong waitqueue, and the waiting tasks may be stuck forever. Fix that by using ht_parms.key_len instead of sizeof(struct lm_lockname) for the key length. Reported-by: Mark Syms Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/glock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4614ee25f621..9d566e62684c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -107,7 +107,7 @@ static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { - u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } -- cgit v1.2.3 From b4d965a37d89cd8611984ea16c85903d01ac967a Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 24 Oct 2018 11:50:33 +1000 Subject: cifs: allow calling SMB2_xxx_free(NULL) commit 32a1fb36f6e50183871c2c1fcf5493c633e84732 upstream. Change these free functions to allow passing NULL as the argument and treat it as a no-op just like free(NULL) would. Or, if rqst->rq_iov is NULL. The second scenario could happen for smb2_queryfs() if the call to SMB2_query_info_init() fails and we go to qfs_exit to clean up and free all resources. In that case we have not yet assigned rqst[2].rq_iov and thus the rq_iov dereference in SMB2_close_free() will cause a NULL pointer dereference. [ bp: upstream patch also fixes SMB2_set_info_free which was introduced in 4.20 ] Fixes: 1eb9fb52040f ("cifs: create SMB2_open_init()/SMB2_open_free() helpers") Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Reviewed-by: Aurelien Aptel CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1e5a1171212f..a2d701775c49 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2243,10 +2243,12 @@ SMB2_open_free(struct smb_rqst *rqst) { int i; - cifs_small_buf_release(rqst->rq_iov[0].iov_base); - for (i = 1; i < rqst->rq_nvec; i++) - if (rqst->rq_iov[i].iov_base != smb2_padding) - kfree(rqst->rq_iov[i].iov_base); + if (rqst && rqst->rq_iov) { + cifs_small_buf_release(rqst->rq_iov[0].iov_base); + for (i = 1; i < rqst->rq_nvec; i++) + if (rqst->rq_iov[i].iov_base != smb2_padding) + kfree(rqst->rq_iov[i].iov_base); + } } int @@ -2535,7 +2537,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, void SMB2_close_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } int @@ -2685,7 +2688,8 @@ SMB2_query_info_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, void SMB2_query_info_free(struct smb_rqst *rqst) { - cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ } static int -- cgit v1.2.3 From 2835c059726a100c1de78fd5219cbe4395a894b1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 28 Dec 2018 11:00:38 -0800 Subject: f2fs: wait on atomic writes to count F2FS_CP_WB_DATA commit 31867b23d7d1ee3535136c6a410a6cf56f666bfc upstream. Otherwise, we can get wrong counts incurring checkpoint hang. IO_W (CP: -24, Data: 24, Flush: ( 0 0 1), Discard: ( 0 0)) Thread A Thread B - f2fs_write_data_pages - __write_data_page - f2fs_submit_page_write - inc_page_count(F2FS_WB_DATA) type is F2FS_WB_DATA due to file is non-atomic one - f2fs_ioc_start_atomic_write - set_inode_flag(FI_ATOMIC_FILE) - f2fs_write_end_io - dec_page_count(F2FS_WB_CP_DATA) type is F2FS_WB_DATA due to file becomes atomic one Cc: Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/file.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fd36aa6569dc..81c1dd635a8d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1736,10 +1736,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1747,7 +1749,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); -- cgit v1.2.3 From e08ba890dc29250fafdfa7c9dba62ccfeec8ef7f Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Jan 2019 14:35:13 +0800 Subject: 9p: use inode->i_lock to protect i_size_write() under 32-bit commit 5e3cc1ee1405a7eb3487ed24f786dec01b4cbe1f upstream. Use inode->i_lock to protect i_size_write(), else i_size_read() in generic_fillattr() may loop infinitely in read_seqcount_begin() when multiple processes invoke v9fs_vfs_getattr() or v9fs_vfs_getattr_dotl() simultaneously under 32-bit SMP environment, and a soft lockup will be triggered as show below: watchdog: BUG: soft lockup - CPU#5 stuck for 22s! [stat:2217] Modules linked in: CPU: 5 PID: 2217 Comm: stat Not tainted 5.0.0-rc1-00005-g7f702faf5a9e #4 Hardware name: Generic DT based system PC is at generic_fillattr+0x104/0x108 LR is at 0xec497f00 pc : [<802b8898>] lr : [] psr: 200c0013 sp : ec497e20 ip : ed608030 fp : ec497e3c r10: 00000000 r9 : ec497f00 r8 : ed608030 r7 : ec497ebc r6 : ec497f00 r5 : ee5c1550 r4 : ee005780 r3 : 0000052d r2 : 00000000 r1 : ec497f00 r0 : ed608030 Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: ac48006a DAC: 00000051 CPU: 5 PID: 2217 Comm: stat Not tainted 5.0.0-rc1-00005-g7f702faf5a9e #4 Hardware name: Generic DT based system Backtrace: [<8010d974>] (dump_backtrace) from [<8010dc88>] (show_stack+0x20/0x24) [<8010dc68>] (show_stack) from [<80a1d194>] (dump_stack+0xb0/0xdc) [<80a1d0e4>] (dump_stack) from [<80109f34>] (show_regs+0x1c/0x20) [<80109f18>] (show_regs) from [<801d0a80>] (watchdog_timer_fn+0x280/0x2f8) [<801d0800>] (watchdog_timer_fn) from [<80198658>] (__hrtimer_run_queues+0x18c/0x380) [<801984cc>] (__hrtimer_run_queues) from [<80198e60>] (hrtimer_run_queues+0xb8/0xf0) [<80198da8>] (hrtimer_run_queues) from [<801973e8>] (run_local_timers+0x28/0x64) [<801973c0>] (run_local_timers) from [<80197460>] (update_process_times+0x3c/0x6c) [<80197424>] (update_process_times) from [<801ab2b8>] (tick_nohz_handler+0xe0/0x1bc) [<801ab1d8>] (tick_nohz_handler) from [<80843050>] (arch_timer_handler_virt+0x38/0x48) [<80843018>] (arch_timer_handler_virt) from [<80180a64>] (handle_percpu_devid_irq+0x8c/0x240) [<801809d8>] (handle_percpu_devid_irq) from [<8017ac20>] (generic_handle_irq+0x34/0x44) [<8017abec>] (generic_handle_irq) from [<8017b344>] (__handle_domain_irq+0x6c/0xc4) [<8017b2d8>] (__handle_domain_irq) from [<801022e0>] (gic_handle_irq+0x4c/0x88) [<80102294>] (gic_handle_irq) from [<80101a30>] (__irq_svc+0x70/0x98) [<802b8794>] (generic_fillattr) from [<8056b284>] (v9fs_vfs_getattr_dotl+0x74/0xa4) [<8056b210>] (v9fs_vfs_getattr_dotl) from [<802b8904>] (vfs_getattr_nosec+0x68/0x7c) [<802b889c>] (vfs_getattr_nosec) from [<802b895c>] (vfs_getattr+0x44/0x48) [<802b8918>] (vfs_getattr) from [<802b8a74>] (vfs_statx+0x9c/0xec) [<802b89d8>] (vfs_statx) from [<802b9428>] (sys_lstat64+0x48/0x78) [<802b93e0>] (sys_lstat64) from [<80101000>] (ret_fast_syscall+0x0/0x28) [dominique.martinet@cea.fr: updated comment to not refer to a function in another subsystem] Link: http://lkml.kernel.org/r/20190124063514.8571-2-houtao1@huawei.com Cc: stable@vger.kernel.org Fixes: 7549ae3e81cc ("9p: Use the i_size_[read, write]() macros instead of using inode->i_size directly.") Reported-by: Xing Gaopeng Signed-off-by: Hou Tao Signed-off-by: Dominique Martinet Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs_vfs.h | 23 +++++++++++++++++++++-- fs/9p/vfs_file.c | 6 +++++- fs/9p/vfs_inode.c | 23 +++++++++++------------ fs/9p/vfs_inode_dotl.c | 27 ++++++++++++++------------- fs/9p/vfs_super.c | 4 ++-- 5 files changed, 53 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 5a0db6dec8d1..aaee1e6584e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,6 +40,9 @@ */ #define P9_LOCK_TIMEOUT (30*HZ) +/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */ +#define V9FS_STAT2INODE_KEEP_ISIZE 1 + extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; @@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); -void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb, unsigned int flags); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); @@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) } int v9fs_open_to_dotl_flags(int flags); + +static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) +{ + /* + * 32-bit need the lock, concurrent updates could break the + * sequences and make i_size_read() loop forever. + * 64-bit updates are atomic and can skip the locking. + */ + if (sizeof(i_size) > sizeof(long)) + spin_lock(&inode->i_lock); + i_size_write(inode, i_size); + if (sizeof(i_size) > sizeof(long)) + spin_unlock(&inode->i_lock); +} #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index ab3d5f5dbb00..c87e6d6ec069 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -442,7 +442,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) i_size = i_size_read(inode); if (iocb->ki_pos > i_size) { inode_add_bytes(inode, iocb->ki_pos - i_size); - i_size_write(inode, iocb->ki_pos); + /* + * Need to serialize against i_size_write() in + * v9fs_stat2inode() + */ + v9fs_i_size_write(inode, iocb->ki_pos); } return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85ff859d3af5..72b779bc0942 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode(st, inode, sb); + v9fs_stat2inode(st, inode, sb, 0); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); @@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem + * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, - struct super_block *sb) + struct super_block *sb, unsigned int flags) { umode_t mode; char ext[32]; @@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->length); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + inode->i_blocks = (stat->length + 512 - 1) >> 9; v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } @@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { int umode; dev_t rdev; - loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_stat(fid); @@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode(st, inode, inode->i_sb, flags); out: p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4823e1c46999..a950a927a626 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode_dotl(st, inode); + v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, d_inode(dentry)); + v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate + * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { @@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) } if (stat->st_result_mask & P9_STATS_RDEV) inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && + stat->st_result_mask & P9_STATS_SIZE) + v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } @@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { - loff_t i_size; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); @@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48ce50484e80..eeab9953af89 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root)); + v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; @@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb); + v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); -- cgit v1.2.3 From 6c023d86b3645fc2671bf2a7677fd208744edbb1 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 6 Feb 2019 06:09:43 -0500 Subject: NFS: Don't use page_file_mapping after removing the page [ Upstream commit d2ceb7e57086750ea6198a31fd942d98099a0786 ] If nfs_page_async_flush() removes the page from the mapping, then we can't use page_file_mapping() on it as nfs_updatepate() is wont to do when receiving an error. Instead, push the mapping to the stack before the page is possibly truncated. Fixes: 8fc75bed96bb ("NFS: Fix up return value on fatal errors in nfs_page_async_flush()") Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/write.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d790faff8e47..51d0b7913c04 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -238,9 +238,9 @@ out: } /* A writeback failed: mark the page as bad, and invalidate the page cache */ -static void nfs_set_pageerror(struct page *page) +static void nfs_set_pageerror(struct address_space *mapping) { - nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); + nfs_zap_mapping(mapping->host, mapping); } /* @@ -994,7 +994,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - nfs_set_pageerror(req->wb_page); + nfs_set_pageerror(page_file_mapping(req->wb_page)); nfs_context_set_write_error(req->wb_context, hdr->error); goto remove_req; } @@ -1330,7 +1330,8 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page_file_mapping(page)->host; + struct address_space *mapping = page_file_mapping(page); + struct inode *inode = mapping->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1348,7 +1349,7 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) - nfs_set_pageerror(page); + nfs_set_pageerror(mapping); else __set_page_dirty_nobuffers(page); out: -- cgit v1.2.3 From 726832821903bdd75ea9a409ba12fa60069a8268 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Feb 2019 16:20:25 +0000 Subject: keys: Fix dependency loop between construction record and auth key [ Upstream commit 822ad64d7e46a8e2c8b8a796738d7b657cbb146d ] In the request_key() upcall mechanism there's a dependency loop by which if a key type driver overrides the ->request_key hook and the userspace side manages to lose the authorisation key, the auth key and the internal construction record (struct key_construction) can keep each other pinned. Fix this by the following changes: (1) Killing off the construction record and using the auth key instead. (2) Including the operation name in the auth key payload and making the payload available outside of security/keys/. (3) The ->request_key hook is given the authkey instead of the cons record and operation name. Changes (2) and (3) allow the auth key to naturally be cleaned up if the keyring it is in is destroyed or cleared or the auth key is unlinked. Fixes: 7ee02a316600 ("keys: Fix dependency loop between construction record and auth key") Signed-off-by: David Howells Signed-off-by: James Morris Signed-off-by: Sasha Levin --- fs/nfs/nfs4idmap.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 3f23b6840547..bf34ddaa2ad7 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; - struct key_construction *key_cons; + struct key *authkey; struct idmap *idmap; }; @@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = { { Opt_find_err, NULL } }; -static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); @@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, static void nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) { - struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + struct key *authkey = idmap->idmap_upcall_data->authkey; kfree(idmap->idmap_upcall_data); idmap->idmap_upcall_data = NULL; - complete_request_key(cons, ret); + complete_request_key(authkey, ret); + key_put(authkey); } static void @@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) nfs_idmap_complete_pipe_upcall_locked(idmap, ret); } -static int nfs_idmap_legacy_upcall(struct key_construction *cons, - const char *op, - void *aux) +static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; + struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; - struct key *key = cons->key; + struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) @@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; - data->key_cons = cons; + data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) @@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, out2: kfree(data); out1: - complete_request_key(cons, ret); + complete_request_key(authkey, ret); return ret; } @@ -651,9 +652,10 @@ out: static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { + struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons; + struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; @@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (idmap->idmap_upcall_data == NULL) goto out_noupcall; - cons = idmap->idmap_upcall_data->key_cons; + authkey = idmap->idmap_upcall_data->authkey; + rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ret = nfs_idmap_read_and_verify_message(&im, &idmap->idmap_upcall_data->idmap_msg, - cons->key, cons->authkey); + rka->target_key, authkey); if (ret >= 0) { - key_set_timeout(cons->key, nfs_idmap_cache_timeout); + key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } -- cgit v1.2.3 From 7a8b048430c124175a28514e2ed7cb8230998ccc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jan 2019 11:41:29 -0500 Subject: fix cgroup_do_mount() handling of failure exits commit 399504e21a10be16dd1408ba0147367d9d82a10c upstream. same story as with last May fixes in sysfs (7b745a4e4051 "unfuck sysfs_mount()"); new_sb is left uninitialized in case of early errors in kernfs_mount_ns() and papering over it by treating any error from kernfs_mount_ns() as equivalent to !new_ns ends up conflating the cases when objects had never been transferred to a superblock with ones when that has happened and resulting new superblock had been dropped. Easily fixed (same way as in sysfs case). Additionally, there's a superblock leak on kernfs_node_dentry() failure *and* a dentry leak inside kernfs_node_dentry() itself - the latter on probably impossible errors, but the former not impossible to trigger (as the matter of fact, injecting allocation failures at that point *does* trigger it). Cc: stable@kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/mount.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index ff2716f9322e..0b22c39dad47 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -196,8 +196,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, return dentry; knparent = find_next_ancestor(kn, NULL); - if (WARN_ON(!knparent)) + if (WARN_ON(!knparent)) { + dput(dentry); return ERR_PTR(-EINVAL); + } do { struct dentry *dtmp; @@ -206,8 +208,10 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, if (kn == knparent) return dentry; kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) + if (WARN_ON(!kntmp)) { + dput(dentry); return ERR_PTR(-EINVAL); + } dtmp = lookup_one_len_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); -- cgit v1.2.3 From 3ed9f22e28dd3a2b18f65ee5442882ebded6413e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 13 Feb 2019 15:43:08 -0800 Subject: CIFS: Do not reset lease state to NONE on lease break commit 7b9b9edb49ad377b1e06abf14354c227e9ac4b06 upstream. Currently on lease break the client sets a caching level twice: when oplock is detected and when oplock is processed. While the 1st attempt sets the level to the value provided by the server, the 2nd one resets the level to None unconditionally. This happens because the oplock/lease processing code was changed to avoid races between page cache flushes and oplock breaks. The commit c11f1df5003d534 ("cifs: Wait for writebacks to complete before attempting write.") fixed the races for oplocks but didn't apply the same changes for leases resulting in overwriting the server granted value to None. Fix this by properly processing lease breaks. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2misc.c | 17 ++++++++++++++--- fs/cifs/smb2ops.c | 15 ++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 7b8b58fb4d3f..58700d2ba8cd 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -517,7 +517,6 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, __u8 lease_state; struct list_head *tmp; struct cifsFileInfo *cfile; - struct TCP_Server_Info *server = tcon->ses->server; struct cifs_pending_open *open; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & @@ -537,13 +536,25 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, cifs_dbg(FYI, "lease key match, lease break 0x%x\n", le32_to_cpu(rsp->NewLeaseState)); - server->ops->set_oplock_level(cinode, lease_state, 0, NULL); - if (ack_req) cfile->oplock_break_cancelled = false; else cfile->oplock_break_cancelled = true; + set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); + + /* + * Set or clear flags depending on the lease state being READ. + * HANDLE caching flag should be added when the client starts + * to defer closing remote file handles with HANDLE leases. + */ + if (lease_state & SMB2_LEASE_READ_CACHING_HE) + set_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + else + clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + queue_work(cifsoplockd_wq, &cfile->oplock_break); kfree(lw); return true; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 237d7281ada3..b66d630e0058 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2300,6 +2300,15 @@ smb2_downgrade_oplock(struct TCP_Server_Info *server, server->ops->set_oplock_level(cinode, 0, 0, NULL); } +static void +smb21_downgrade_oplock(struct TCP_Server_Info *server, + struct cifsInodeInfo *cinode, bool set_level2) +{ + server->ops->set_oplock_level(cinode, + set_level2 ? SMB2_LEASE_READ_CACHING_HE : + 0, 0, NULL); +} + static void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) @@ -3351,7 +3360,7 @@ struct smb_version_operations smb21_operations = { .print_stats = smb2_print_stats, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -3447,7 +3456,7 @@ struct smb_version_operations smb30_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, @@ -3551,7 +3560,7 @@ struct smb_version_operations smb311_operations = { .dump_share_caps = smb2_dump_share_caps, .is_oplock_break = smb2_is_valid_oplock_break, .handle_cancelled_mid = smb2_handle_cancelled_mid, - .downgrade_oplock = smb2_downgrade_oplock, + .downgrade_oplock = smb21_downgrade_oplock, .need_neg = smb2_need_neg, .negotiate = smb2_negotiate, .negotiate_wsize = smb2_negotiate_wsize, -- cgit v1.2.3 From dc8e8ad962a8add17737ec29396bb7a754907e52 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 4 Mar 2019 14:02:50 -0800 Subject: CIFS: Do not skip SMB2 message IDs on send failures commit c781af7e0c1fed9f1d0e0ec31b86f5b21a8dca17 upstream. When we hit failures during constructing MIDs or sending PDUs through the network, we end up not using message IDs assigned to the packet. The next SMB packet will skip those message IDs and continue with the next one. This behavior may lead to a server not granting us credits until we use the skipped IDs. Fix this by reverting the current ID to the original value if any errors occur before we push the packet through the network stack. This patch fixes the generic/310 test from the xfs-tests. Cc: # 4.19.x Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 19 +++++++++++++++++++ fs/cifs/smb2ops.c | 13 +++++++++++++ fs/cifs/smb2transport.c | 14 ++++++++++++-- fs/cifs/transport.c | 6 +++++- 4 files changed, 49 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9dcaed031843..80f33582059e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -235,6 +235,8 @@ struct smb_version_operations { int * (*get_credits_field)(struct TCP_Server_Info *, const int); unsigned int (*get_credits)(struct mid_q_entry *); __u64 (*get_next_mid)(struct TCP_Server_Info *); + void (*revert_current_mid)(struct TCP_Server_Info *server, + const unsigned int val); /* data offset from read response message */ unsigned int (*read_data_offset)(char *); /* @@ -756,6 +758,22 @@ get_next_mid(struct TCP_Server_Info *server) return cpu_to_le16(mid); } +static inline void +revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + if (server->ops->revert_current_mid) + server->ops->revert_current_mid(server, val); +} + +static inline void +revert_current_mid_from_hdr(struct TCP_Server_Info *server, + const struct smb2_sync_hdr *shdr) +{ + unsigned int num = le16_to_cpu(shdr->CreditCharge); + + return revert_current_mid(server, num > 0 ? num : 1); +} + static inline __u16 get_mid(const struct smb_hdr *smb) { @@ -1391,6 +1409,7 @@ struct mid_q_entry { struct kref refcount; struct TCP_Server_Info *server; /* server corresponding to this mid */ __u64 mid; /* multiplex id */ + __u16 credits; /* number of credits consumed by this mid */ __u32 pid; /* process id */ __u32 sequence_number; /* for CIFS signing */ unsigned long when_alloc; /* when mid was created */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b66d630e0058..d4d7d61a6fe2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -204,6 +204,15 @@ smb2_get_next_mid(struct TCP_Server_Info *server) return mid; } +static void +smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) +{ + spin_lock(&GlobalMid_Lock); + if (server->CurrentMid >= val) + server->CurrentMid -= val; + spin_unlock(&GlobalMid_Lock); +} + static struct mid_q_entry * smb2_find_mid(struct TCP_Server_Info *server, char *buf) { @@ -3256,6 +3265,7 @@ struct smb_version_operations smb20_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = cifs_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3350,6 +3360,7 @@ struct smb_version_operations smb21_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3445,6 +3456,7 @@ struct smb_version_operations smb30_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, @@ -3549,6 +3561,7 @@ struct smb_version_operations smb311_operations = { .get_credits = smb2_get_credits, .wait_mtu_credits = smb2_wait_mtu_credits, .get_next_mid = smb2_get_next_mid, + .revert_current_mid = smb2_revert_current_mid, .read_data_offset = smb2_read_data_offset, .read_data_length = smb2_read_data_length, .map_error = map_smb2_to_linux_error, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 7b351c65ee46..63264db78b89 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -576,6 +576,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; + unsigned int credits = le16_to_cpu(shdr->CreditCharge); if (server == NULL) { cifs_dbg(VFS, "Null TCP session in smb2_mid_entry_alloc\n"); @@ -586,6 +587,7 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, memset(temp, 0, sizeof(struct mid_q_entry)); kref_init(&temp->refcount); temp->mid = le64_to_cpu(shdr->MessageId); + temp->credits = credits > 0 ? credits : 1; temp->pid = current->pid; temp->command = shdr->Command; /* Always LE */ temp->when_alloc = jiffies; @@ -674,13 +676,18 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) smb2_seq_num_into_buf(ses->server, shdr); rc = smb2_get_mid_entry(ses, shdr, &mid); - if (rc) + if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); return ERR_PTR(rc); + } + rc = smb2_sign_rqst(rqst, ses->server); if (rc) { + revert_current_mid_from_hdr(ses->server, shdr); cifs_delete_mid(mid); return ERR_PTR(rc); } + return mid; } @@ -695,11 +702,14 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) smb2_seq_num_into_buf(server, shdr); mid = smb2_mid_entry_alloc(shdr, server); - if (mid == NULL) + if (mid == NULL) { + revert_current_mid_from_hdr(server, shdr); return ERR_PTR(-ENOMEM); + } rc = smb2_sign_rqst(rqst, server); if (rc) { + revert_current_mid_from_hdr(server, shdr); DeleteMidQEntry(mid); return ERR_PTR(rc); } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 66348b3d28e6..f2938bd95c40 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -638,6 +638,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); if (rc < 0) { + revert_current_mid(server, mid->credits); server->sequence_number -= 2; cifs_delete_mid(mid); } @@ -842,6 +843,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) { midQ[i] = ses->server->ops->setup_request(ses, &rqst[i]); if (IS_ERR(midQ[i])) { + revert_current_mid(ses->server, i); for (j = 0; j < i; j++) cifs_delete_mid(midQ[j]); mutex_unlock(&ses->server->srv_mutex); @@ -867,8 +869,10 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, for (i = 0; i < num_rqst; i++) cifs_save_when_sent(midQ[i]); - if (rc < 0) + if (rc < 0) { + revert_current_mid(ses->server, num_rqst); ses->server->sequence_number -= 2; + } mutex_unlock(&ses->server->srv_mutex); -- cgit v1.2.3 From 43eaa6cc17753f31768ddf1ed314e36abc0f2580 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 4 Mar 2019 17:48:01 -0800 Subject: CIFS: Fix read after write for files with read caching commit 6dfbd84684700cb58b34e8602c01c12f3d2595c8 upstream. When we have a READ lease for a file and have just issued a write operation to the server we need to purge the cache and set oplock/lease level to NONE to avoid reading stale data. Currently we do that only if a write operation succedeed thus not covering cases when a request was sent to the server but a negative error code was returned later for some other reasons (e.g. -EIOCBQUEUED or -EINTR). Fix this by turning off caching regardless of the error code being returned. The patches fixes generic tests 075 and 112 from the xfs-tests. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 23db881daab5..08761a6a039d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2871,14 +2871,16 @@ cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from) * these pages but not on the region from pos to ppos+len-1. */ written = cifs_user_writev(iocb, from); - if (written > 0 && CIFS_CACHE_READ(cinode)) { + if (CIFS_CACHE_READ(cinode)) { /* - * Windows 7 server can delay breaking level2 oplock if a write - * request comes - break it on the client to prevent reading - * an old data. + * We have read level caching and we have just sent a write + * request to the server thus making data in the cache stale. + * Zap the cache and set oplock/lease level to NONE to avoid + * reading stale data from the cache. All subsequent read + * operations will read new data from the server. */ cifs_zap_mapping(inode); - cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n", + cifs_dbg(FYI, "Set Oplock/Lease to NONE for inode=%p after write\n", inode); cinode->oplock = 0; } -- cgit v1.2.3 From 1c2123ff43122bb9e0578198391ee3cf752049d6 Mon Sep 17 00:00:00 2001 From: Varad Gautam Date: Thu, 24 Jan 2019 14:03:06 +0100 Subject: fs/devpts: always delete dcache dentry-s in dput() commit 73052b0daee0b750b39af18460dfec683e4f5887 upstream. d_delete only unhashes an entry if it is reached with dentry->d_lockref.count != 1. Prior to commit 8ead9dd54716 ("devpts: more pty driver interface cleanups"), d_delete was called on a dentry from devpts_pty_kill with two references held, which would trigger the unhashing, and the subsequent dputs would release it. Commit 8ead9dd54716 reworked devpts_pty_kill to stop acquiring the second reference from d_find_alias, and the d_delete call left the dentries still on the hashed list without actually ever being dropped from dcache before explicit cleanup. This causes the number of negative dentries for devpts to pile up, and an `ls /dev/pts` invocation can take seconds to return. Provide always_delete_dentry() from simple_dentry_operations as .d_delete for devpts, to make the dentry be dropped from dcache. Without this cleanup, the number of dentries in /dev/pts/ can be grown arbitrarily as: `python -c 'import pty; pty.spawn(["ls", "/dev/pts"])'` A systemtap probe on dcache_readdir to count d_subdirs shows this count to increase with each pty spawn invocation above: probe kernel.function("dcache_readdir") { subdirs = &@cast($file->f_path->dentry, "dentry")->d_subdirs; p = subdirs; p = @cast(p, "list_head")->next; i = 0 while (p != subdirs) { p = @cast(p, "list_head")->next; i = i+1; } printf("number of dentries: %d\n", i); } Fixes: 8ead9dd54716 ("devpts: more pty driver interface cleanups") Signed-off-by: Varad Gautam Reported-by: Zheng Wang Reported-by: Brandon Schwartz Root-caused-by: Maximilian Heyne Root-caused-by: Nicolas Pernas Maradei CC: David Woodhouse CC: Maximilian Heyne CC: Stefan Nuernberger CC: Amit Shah CC: Linus Torvalds CC: Greg Kroah-Hartman CC: Al Viro CC: Christian Brauner CC: Eric W. Biederman CC: Matthew Wilcox CC: Eric Biggers CC: # 4.9+ Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/devpts/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c53814539070..553a3f3300ae 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; + s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; error = -ENOMEM; -- cgit v1.2.3 From 2af926fd52fc287a92fc782ce074f349d86966aa Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 23 Jan 2019 15:19:17 +0100 Subject: splice: don't merge into linked buffers commit a0ce2f0aa6ad97c3d4927bf2ca54bcebdf062d55 upstream. Before this patch, it was possible for two pipes to affect each other after data had been transferred between them with tee(): ============ $ cat tee_test.c int main(void) { int pipe_a[2]; if (pipe(pipe_a)) err(1, "pipe"); int pipe_b[2]; if (pipe(pipe_b)) err(1, "pipe"); if (write(pipe_a[1], "abcd", 4) != 4) err(1, "write"); if (tee(pipe_a[0], pipe_b[1], 2, 0) != 2) err(1, "tee"); if (write(pipe_b[1], "xx", 2) != 2) err(1, "write"); char buf[5]; if (read(pipe_a[0], buf, 4) != 4) err(1, "read"); buf[4] = 0; printf("got back: '%s'\n", buf); } $ gcc -o tee_test tee_test.c $ ./tee_test got back: 'abxx' $ ============ As suggested by Al Viro, fix it by creating a separate type for non-mergeable pipe buffers, then changing the types of buffers in splice_pipe_to_pipe() and link_pipe(). Cc: Fixes: 7c77f0b3f920 ("splice: implement pipe to pipe splicing") Fixes: 70524490ee2e ("[PATCH] splice: add support for sys_tee()") Suggested-by: Al Viro Signed-off-by: Jann Horn Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/pipe.c | 14 ++++++++++++++ fs/splice.c | 4 ++++ 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index bdc5d3c0977d..c51750ed4011 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -234,6 +234,14 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = anon_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, @@ -242,6 +250,12 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) +{ + if (buf->ops == &anon_pipe_buf_ops) + buf->ops = &anon_pipe_buf_nomerge_ops; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { diff --git a/fs/splice.c b/fs/splice.c index b3daa971f597..29e92b506394 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1593,6 +1593,8 @@ retry: */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + obuf->len = len; opipe->nrbufs++; ibuf->offset += obuf->len; @@ -1667,6 +1669,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + if (obuf->len > len) obuf->len = len; -- cgit v1.2.3 From 6f048ae2d25f94490c15320ccc6ed8b6587f1258 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 11 Jan 2019 19:37:00 +0100 Subject: ovl: During copy up, first copy up data and then xattrs commit 5f32879ea35523b9842bdbdc0065e13635caada2 upstream. If a file with capability set (and hence security.capability xattr) is written kernel clears security.capability xattr. For overlay, during file copy up if xattrs are copied up first and then data is, copied up. This means data copy up will result in clearing of security.capability xattr file on lower has. And this can result into surprises. If a lower file has CAP_SETUID, then it should not be cleared over copy up (if nothing was actually written to file). This also creates problems with chown logic where it first copies up file and then tries to clear setuid bit. But by that time security.capability xattr is already gone (due to data copy up), and caller gets -ENODATA. This has been reported by Giuseppe here. https://github.com/containers/libpod/issues/2015#issuecomment-447824842 Fix this by copying up data first and then metadta. This is a regression which has been introduced by my commit as part of metadata only copy up patches. TODO: There will be some corner cases where a file is copied up metadata only and later data copy up happens and that will clear security.capability xattr. Something needs to be done about that too. Fixes: bd64e57586d3 ("ovl: During copy up, first copy up metadata and then data") Cc: # v4.19+ Reported-by: Giuseppe Scrivano Signed-off-by: Vivek Goyal Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/copy_up.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 1cc797a08a5b..59080f7ba02f 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -501,6 +501,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + err = ovl_copy_xattr(c->lowerpath.dentry, temp); if (err) return err; @@ -518,19 +536,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = temp; - - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); - if (err) - return err; - } - if (c->metacopy) { err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); -- cgit v1.2.3 From 205f149f1a358fc2a054461dd1432e52263cbc1e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 30 Jan 2019 14:01:57 -0500 Subject: ovl: Do not lose security.capability xattr over metadata file copy-up commit 993a0b2aec52754f0897b1dab4c453be8217cae5 upstream. If a file has been copied up metadata only, and later data is copied up, upper loses any security.capability xattr it has (underlying filesystem clears it as upon file write). From a user's point of view, this is just a file copy-up and that should not result in losing security.capability xattr. Hence, before data copy up, save security.capability xattr (if any) and restore it on upper after data copy up is complete. Signed-off-by: Vivek Goyal Reviewed-by: Amir Goldstein Fixes: 0c2888749363 ("ovl: A new xattr OVL_XATTR_METACOPY for file on upper") Cc: # v4.19+ Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/copy_up.c | 28 ++++++++++++++++++++++-- fs/overlayfs/overlayfs.h | 2 ++ fs/overlayfs/util.c | 55 ++++++++++++++++++++++++++++++------------------ 3 files changed, 63 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 59080f7ba02f..75eeee08d848 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -711,6 +711,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct path upperpath, datapath; int err; + char *capability = NULL; + ssize_t uninitialized_var(cap_size); ovl_path_upper(c->dentry, &upperpath); if (WARN_ON(upperpath.dentry == NULL)) @@ -720,15 +722,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(datapath.dentry == NULL)) return -EIO; + if (c->stat.size) { + err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, + &capability, 0); + if (err < 0 && err != -ENODATA) + goto out; + } + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); if (err) - return err; + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. + */ + if (capability) { + err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); if (err) - return err; + goto out_free; ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: return err; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index a3c0d9584312..d9c16ceebfe7 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index ace4fe4c39a9..c9a2e3c6d537 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -867,28 +867,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding) { - int res; - char *s, *next, *buf = NULL; + ssize_t res; + char *buf = NULL; - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = vfs_getxattr(dentry, name, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) - return NULL; + return -ENODATA; goto fail; } - buf = kzalloc(res + padding + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); + if (res != 0) { + buf = kzalloc(res + padding, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (res == 0) - goto invalid; + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + goto fail; + } + *value = buf; + + return res; + +fail: + pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + name, res); + kfree(buf); + return res; +} - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); + if (res == -ENODATA) + return NULL; if (res < 0) - goto fail; + return ERR_PTR(res); if (res == 0) goto invalid; @@ -904,15 +925,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) } return buf; - -err_free: - kfree(buf); - return ERR_PTR(res); -fail: - pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); - goto err_free; invalid: pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); res = -EINVAL; - goto err_free; + kfree(buf); + return ERR_PTR(res); } -- cgit v1.2.3 From 61f9209676e854973a8b7aaed93de16f727c4a32 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Dec 2018 21:16:45 +0000 Subject: Btrfs: setup a nofs context for memory allocation at btrfs_create_tree() commit b89f6d1fcb30a8cbdc18ce00c7d93792076af453 upstream. We are holding a transaction handle when creating a tree, therefore we can not allocate the root using GFP_KERNEL, as we could deadlock if reclaim is triggered by the allocation, therefore setup a nofs context. Fixes: 74e4d82757f74 ("btrfs: let callers of btrfs_alloc_root pass gfp flags") CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d96d1390068a..b4f61a3d560a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "ctree.h" #include "disk-io.h" @@ -1236,10 +1237,17 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root; struct btrfs_key key; + unsigned int nofs_flag; int ret = 0; uuid_le uuid = NULL_UUID_LE; + /* + * We're holding a transaction handle, so use a NOFS memory allocation + * context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); root = btrfs_alloc_root(fs_info, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!root) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 6e24f5a1ebb199c8dcd84f7b91fe8ba120ecfe32 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Dec 2018 21:16:56 +0000 Subject: Btrfs: setup a nofs context for memory allocation at __btrfs_set_acl commit a0873490660246db587849a9e172f2b7b21fa88a upstream. We are holding a transaction handle when setting an acl, therefore we can not allocate the xattr value buffer using GFP_KERNEL, as we could deadlock if reclaim is triggered by the allocation, therefore setup a nofs context. Fixes: 39a27ec1004e8 ("btrfs: use GFP_KERNEL for xattr and acl allocations") CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/acl.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 3b66c957ea6f..5810463dc6d2 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "ctree.h" @@ -72,8 +73,16 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, } if (acl) { + unsigned int nofs_flag; + size = posix_acl_xattr_size(acl->a_count); + /* + * We're holding a transaction handle, so use a NOFS memory + * allocation context to avoid deadlock if reclaim happens. + */ + nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); + memalloc_nofs_restore(nofs_flag); if (!value) { ret = -ENOMEM; goto out; -- cgit v1.2.3 From 1a00f7fd0fbf87faca8737e45193bada759744cb Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 18 Feb 2019 11:28:37 +0100 Subject: btrfs: ensure that a DUP or RAID1 block group has exactly two stripes commit 349ae63f40638a28c6fce52e8447c2d14b84cc0c upstream. We recently had a customer issue with a corrupted filesystem. When trying to mount this image btrfs panicked with a division by zero in calc_stripe_length(). The corrupt chunk had a 'num_stripes' value of 1. calc_stripe_length() takes this value and divides it by the number of copies the RAID profile is expected to have to calculate the amount of data stripes. As a DUP profile is expected to have 2 copies this division resulted in 1/2 = 0. Later then the 'data_stripes' variable is used as a divisor in the stripe length calculation which results in a division by 0 and thus a kernel panic. When encountering a filesystem with a DUP block group and a 'num_stripes' value unequal to 2, refuse mounting as the image is corrupted and will lead to unexpected behaviour. Code inspection showed a RAID1 block group has the same issues. Fixes: e06cd3dd7cea ("Btrfs: add validadtion checks for chunk loading") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 285f64f2de5f..c13f62182513 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6425,10 +6425,10 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, } if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { btrfs_err(fs_info, -- cgit v1.2.3 From 898488e2988c83555309743d7ce6ffe1f135c6a8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 14 Feb 2019 15:17:20 +0000 Subject: Btrfs: fix corruption reading shared and compressed extents after hole punching commit 8e928218780e2f1cf2f5891c7575e8f0b284fcce upstream. In the past we had data corruption when reading compressed extents that are shared within the same file and they are consecutive, this got fixed by commit 005efedf2c7d0 ("Btrfs: fix read corruption of compressed and shared extents") and by commit 808f80b46790f ("Btrfs: update fix for read corruption of compressed and shared extents"). However there was a case that was missing in those fixes, which is when the shared and compressed extents are referenced with a non-zero offset. The following shell script creates a reproducer for this issue: #!/bin/bash mkfs.btrfs -f /dev/sdc &> /dev/null mount -o compress /dev/sdc /mnt/sdc # Create a file with 3 consecutive compressed extents, each has an # uncompressed size of 128Kb and a compressed size of 4Kb. for ((i = 1; i <= 3; i++)); do head -c 4096 /dev/zero for ((j = 1; j <= 31; j++)); do head -c 4096 /dev/zero | tr '\0' "\377" done done > /mnt/sdc/foobar sync echo "Digest after file creation: $(md5sum /mnt/sdc/foobar)" # Clone the first extent into offsets 128K and 256K. xfs_io -c "reflink /mnt/sdc/foobar 0 128K 128K" /mnt/sdc/foobar xfs_io -c "reflink /mnt/sdc/foobar 0 256K 128K" /mnt/sdc/foobar sync echo "Digest after cloning: $(md5sum /mnt/sdc/foobar)" # Punch holes into the regions that are already full of zeroes. xfs_io -c "fpunch 0 4K" /mnt/sdc/foobar xfs_io -c "fpunch 128K 4K" /mnt/sdc/foobar xfs_io -c "fpunch 256K 4K" /mnt/sdc/foobar sync echo "Digest after hole punching: $(md5sum /mnt/sdc/foobar)" echo "Dropping page cache..." sysctl -q vm.drop_caches=1 echo "Digest after hole punching: $(md5sum /mnt/sdc/foobar)" umount /dev/sdc When running the script we get the following output: Digest after file creation: 5a0888d80d7ab1fd31c229f83a3bbcc8 /mnt/sdc/foobar linked 131072/131072 bytes at offset 131072 128 KiB, 1 ops; 0.0033 sec (36.960 MiB/sec and 295.6830 ops/sec) linked 131072/131072 bytes at offset 262144 128 KiB, 1 ops; 0.0015 sec (78.567 MiB/sec and 628.5355 ops/sec) Digest after cloning: 5a0888d80d7ab1fd31c229f83a3bbcc8 /mnt/sdc/foobar Digest after hole punching: 5a0888d80d7ab1fd31c229f83a3bbcc8 /mnt/sdc/foobar Dropping page cache... Digest after hole punching: fba694ae8664ed0c2e9ff8937e7f1484 /mnt/sdc/foobar This happens because after reading all the pages of the extent in the range from 128K to 256K for example, we read the hole at offset 256K and then when reading the page at offset 260K we don't submit the existing bio, which is responsible for filling all the page in the range 128K to 256K only, therefore adding the pages from range 260K to 384K to the existing bio and submitting it after iterating over the entire range. Once the bio completes, the uncompressed data fills only the pages in the range 128K to 256K because there's no more data read from disk, leaving the pages in the range 260K to 384K unfilled. It is just a slightly different variant of what was solved by commit 005efedf2c7d0 ("Btrfs: fix read corruption of compressed and shared extents"). Fix this by forcing a bio submit, during readpages(), whenever we find a compressed extent map for a page that is different from the extent map for the previous page or has a different starting offset (in case it's the same compressed extent), instead of the extent map's original start offset. A test case for fstests follows soon. Reported-by: Zygo Blaxell Fixes: 808f80b46790f ("Btrfs: update fix for read corruption of compressed and shared extents") Fixes: 005efedf2c7d0 ("Btrfs: fix read corruption of compressed and shared extents") Cc: stable@vger.kernel.org # 4.3+ Tested-by: Zygo Blaxell Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 79f82f2ec4d5..90b0a6eff535 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3002,11 +3002,11 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->orig_start) + *prev_em_start != em->start) force_bio_submit = true; if (prev_em_start) - *prev_em_start = em->orig_start; + *prev_em_start = em->start; free_extent_map(em); em = NULL; -- cgit v1.2.3 From cdf9941b770748eb007bbe98df618b0c705b7745 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 11 Feb 2019 00:02:05 -0500 Subject: ext4: fix check of inode in swap_inode_boot_loader commit 67a11611e1a5211f6569044fbf8150875764d1d0 upstream. Before really do swap between inode and boot inode, something need to check to avoid invalid or not permitted operation, like does this inode has inline data. But the condition check should be protected by inode lock to avoid change while swapping. Also some other condition will not change between swapping, but there has no problem to do this under inode lock. Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d37dafa1d133..597e8b617f92 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -116,15 +116,6 @@ static long swap_inode_boot_loader(struct super_block *sb, struct inode *inode_bl; struct ext4_inode_info *ei_bl; - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || - IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || - ext4_has_inline_data(inode)) - return -EINVAL; - - if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) - return -EPERM; - inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); @@ -137,6 +128,19 @@ static long swap_inode_boot_loader(struct super_block *sb, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); -- cgit v1.2.3 From 071f68163cc0e98d543685a188dc2d909b6bb464 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 11 Feb 2019 00:05:24 -0500 Subject: ext4: cleanup pagecache before swap i_data commit a46c68a318b08f819047843abf349aeee5d10ac2 upstream. While do swap, we should make sure there has no new dirty page since we should swap i_data between two inode: 1.We should lock i_mmap_sem with write to avoid new pagecache from mmap read/write; 2.Change filemap_flush to filemap_write_and_wait and move them to the space protected by inode lock to avoid new pagecache from buffer read/write. Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 597e8b617f92..ea05e8d641e9 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -121,9 +121,6 @@ static long swap_inode_boot_loader(struct super_block *sb, return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); - filemap_flush(inode->i_mapping); - filemap_flush(inode_bl->i_mapping); - /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); @@ -141,6 +138,15 @@ static long swap_inode_boot_loader(struct super_block *sb, goto journal_err_out; } + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -151,7 +157,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto journal_err_out; + goto err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -208,6 +214,8 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); +err_out: + up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); -- cgit v1.2.3 From 048bfb5bc05f7be392f025cdcd655878e88a4eaf Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 11 Feb 2019 00:14:02 -0500 Subject: ext4: update quota information while swapping boot loader inode commit aa507b5faf38784defe49f5e64605ac3c4425e26 upstream. While do swap between two inode, they swap i_data without update quota information. Also, swap_inode_boot_loader can do "revert" somtimes, so update the quota while all operations has been finished. Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 56 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ea05e8d641e9..eff68358fae7 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -68,8 +68,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_blocks, inode2->i_blocks); - swap(inode1->i_bytes, inode2->i_bytes); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); @@ -115,6 +113,9 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) @@ -180,6 +181,13 @@ static long swap_inode_boot_loader(struct super_block *sb, memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); @@ -193,24 +201,46 @@ static long swap_inode_boot_loader(struct super_block *sb, err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { + /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); - } else { - err = ext4_mark_inode_dirty(handle, inode_bl); - if (err < 0) { - ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", - inode_bl->i_ino, err); - /* Revert all changes: */ - swap_inode_data(inode, inode_bl); - ext4_mark_inode_dirty(handle, inode); - ext4_mark_inode_dirty(handle, inode_bl); - } + goto err_out1; + } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; } + + /* Bootloader inode should not be counted into quota information. */ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); + } + +err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); -- cgit v1.2.3 From a0d876c77705c1dd2df2d89e040bf570755cd676 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Mon, 11 Feb 2019 00:35:06 -0500 Subject: ext4: add mask of ext4 flags to swap commit abdc644e8cbac2e9b19763680e5a7cf9bab2bee7 upstream. The reason is that while swapping two inode, we swap the flags too. Some flags such as EXT4_JOURNAL_DATA_FL can really confuse the things since we're not resetting the address operations structure. The simplest way to keep things sane is to restrict the flags that can be swapped. Signed-off-by: yangerkun Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 3 +++ fs/ext4/ioctl.c | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 032cf9b92665..2ddf7833350d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -435,6 +435,9 @@ struct flex_groups { /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eff68358fae7..2e76fb55d94a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -63,6 +63,7 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; + unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); @@ -72,7 +73,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - swap(ei1->i_flags, ei2->i_flags); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); -- cgit v1.2.3 From 8a4fdc649ca9ac3b7b4fd363ec713d6d8ef38046 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Feb 2019 13:30:32 -0500 Subject: ext4: fix crash during online resizing commit f96c3ac8dfc24b4e38fc4c2eba5fea2107b929d1 upstream. When computing maximum size of filesystem possible with given number of group descriptor blocks, we forget to include s_first_data_block into the number of blocks. Thus for filesystems with non-zero s_first_data_block it can happen that computed maximum filesystem size is actually lower than current filesystem size which confuses the code and eventually leads to a BUG_ON in ext4_alloc_group_tables() hitting on flex_gd->count == 0. The problem can be reproduced like: truncate -s 100g /tmp/image mkfs.ext4 -b 1024 -E resize=262144 /tmp/image 32768 mount -t ext4 -o loop /tmp/image /mnt resize2fs /dev/loop0 262145 resize2fs /dev/loop0 300000 Fix the problem by properly including s_first_data_block into the computed number of filesystem blocks. Fixes: 1c6bd7173d66 "ext4: convert file system to meta_bg if needed..." Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 48421de803b7..3d9b18505c0c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1960,7 +1960,8 @@ retry: le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * - EXT4_BLOCKS_PER_GROUP(sb); + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } -- cgit v1.2.3 From 62600af3a7cb11d20439f745451db96bde7ff08c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 29 Jan 2019 17:17:24 +0100 Subject: ext2: Fix underflow in ext2_max_size() commit 1c2d14212b15a60300a2d4f6364753e87394c521 upstream. When ext2 filesystem is created with 64k block size, ext2_max_size() will return value less than 0. Also, we cannot write any file in this fs since the sb->maxbytes is less than 0. The core of the problem is that the size of block index tree for such large block size is more than i_blocks can carry. So fix the computation to count with this possibility. File size limits computed with the new function for the full range of possible block sizes look like: bits file_size 10 17247252480 11 275415851008 12 2196873666560 13 2197948973056 14 2198486220800 15 2198754754560 16 2198888906752 CC: stable@vger.kernel.org Reported-by: yangerkun Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext2/super.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0c38e31ec938..364e647d87c0 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -761,7 +761,8 @@ static loff_t ext2_max_size(int bits) { loff_t res = EXT2_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; + unsigned int upper_limit; + unsigned int ppb = 1 << (bits-2); /* This is calculated to be the largest file size for a * dense, file such that the total number of @@ -775,24 +776,34 @@ static loff_t ext2_max_size(int bits) /* total blocks in file system block size */ upper_limit >>= (bits - 9); + /* Compute how many blocks we can address by block tree */ + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + /* Does block tree limit file size? */ + if (res < upper_limit) + goto check_lfs; + res = upper_limit; + /* How many metadata blocks are needed for addressing upper_limit? */ + upper_limit -= EXT2_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; + upper_limit -= ppb; /* double indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)); - /* tripple indirect blocks */ - meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2))); - - upper_limit -= meta_blocks; - upper_limit <<= bits; - - res += 1LL << (bits-2); - res += 1LL << (2*(bits-2)); - res += 1LL << (3*(bits-2)); + if (upper_limit < ppb * ppb) { + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb); + res -= meta_blocks; + goto check_lfs; + } + meta_blocks += 1 + ppb; + upper_limit -= ppb * ppb; + /* tripple indirect blocks for the rest */ + meta_blocks += 1 + DIV_ROUND_UP(upper_limit, ppb) + + DIV_ROUND_UP(upper_limit, ppb*ppb); + res -= meta_blocks; +check_lfs: res <<= bits; - if (res > upper_limit) - res = upper_limit; - if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; -- cgit v1.2.3 From dbe4bc993836f64e8c26b109addc1ac6eb997bab Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sun, 10 Feb 2019 23:23:04 -0500 Subject: jbd2: clear dirty flag when revoking a buffer from an older transaction commit 904cdbd41d749a476863a0ca41f6f396774f26e4 upstream. Now, we capture a data corruption problem on ext4 while we're truncating an extent index block. Imaging that if we are revoking a buffer which has been journaled by the committing transaction, the buffer's jbddirty flag will not be cleared in jbd2_journal_forget(), so the commit code will set the buffer dirty flag again after refile the buffer. fsx kjournald2 jbd2_journal_commit_transaction jbd2_journal_revoke commit phase 1~5... jbd2_journal_forget belongs to older transaction commit phase 6 jbddirty not clear __jbd2_journal_refile_buffer __jbd2_journal_unfile_buffer test_clear_buffer_jbddirty mark_buffer_dirty Finally, if the freed extent index block was allocated again as data block by some other files, it may corrupt the file data after writing cached pages later, such as during unmount time. (In general, clean_bdev_aliases() related helpers should be invoked after re-allocation to prevent the above corruption, but unfortunately we missed it when zeroout the head of extra extent blocks in ext4_ext_handle_unwritten_extents()). This patch mark buffer as freed and set j_next_transaction to the new transaction when it already belongs to the committing transaction in jbd2_journal_forget(), so that commit code knows it should clear dirty bits when it is done with the buffer. This problem can be reproduced by xfstests generic/455 easily with seeds (3246 3247 3248 3249). Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/transaction.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index c0b66a7a795b..acb052c62c24 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1576,14 +1576,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ + /* ... but we CAN drop it from the new transaction through + * marking the buffer as freed and set j_next_transaction to + * the new transaction, so that not only the commit code + * knows it should clear dirty bits when it is done with the + * buffer, but also the buffer can be checkpointed only + * after the new transaction commits. */ - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); + set_buffer_freed(bh); + + if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); - jh->b_next_transaction = NULL; + jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); + } else { + J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified -- cgit v1.2.3 From 584f390d1039e06512f716f5c30567fdc3763a17 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 21 Feb 2019 11:24:09 -0500 Subject: jbd2: fix compile warning when using JBUFFER_TRACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 01215d3edb0f384ddeaa5e4a22c1ae5ff634149f upstream. The jh pointer may be used uninitialized in the two cases below and the compiler complain about it when enabling JBUFFER_TRACE macro, fix them. In file included from fs/jbd2/transaction.c:19:0: fs/jbd2/transaction.c: In function ‘jbd2_journal_get_undo_access’: ./include/linux/jbd2.h:1637:38: warning: ‘jh’ is used uninitialized in this function [-Wuninitialized] #define JBUFFER_TRACE(jh, info) do { printk("%s: %d\n", __func__, jh->b_jcount);} while (0) ^ fs/jbd2/transaction.c:1219:23: note: ‘jh’ was declared here struct journal_head *jh; ^ In file included from fs/jbd2/transaction.c:19:0: fs/jbd2/transaction.c: In function ‘jbd2_journal_dirty_metadata’: ./include/linux/jbd2.h:1637:38: warning: ‘jh’ may be used uninitialized in this function [-Wmaybe-uninitialized] #define JBUFFER_TRACE(jh, info) do { printk("%s: %d\n", __func__, jh->b_jcount);} while (0) ^ fs/jbd2/transaction.c:1332:23: note: ‘jh’ was declared here struct journal_head *jh; ^ Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/transaction.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index acb052c62c24..914e725c82c4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1219,11 +1219,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; - JBUFFER_TRACE(jh, "entry"); if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1334,15 +1335,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - if (!buffer_jbd(bh)) { - ret = -EUCLEAN; - goto out; - } + if (!buffer_jbd(bh)) + return -EUCLEAN; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is @@ -1376,9 +1379,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } journal = transaction->t_journal; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - jbd_lock_bh_state(bh); if (jh->b_modified == 0) { -- cgit v1.2.3 From be74fddc976e40a2a535168182783d82154d66e8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Feb 2019 09:21:38 -0500 Subject: NFS: Fix I/O request leakages commit f57dcf4c72113c745d83f1c65f7291299f65c14f upstream. When we fail to add the request to the I/O queue, we currently leave it to the caller to free the failed request. However since some of the requests that fail are actually created by nfs_pageio_add_request() itself, and are not passed back the caller, this leads to a leakage issue, which can again cause page locks to leak. This commit addresses the leakage by freeing the created requests on error, using desc->pg_completion_ops->error_cleanup() Signed-off-by: Trond Myklebust Fixes: a7d42ddb30997 ("nfs: add mirroring support to pgio layer") Cc: stable@vger.kernel.org # v4.0: c18b96a1b862: nfs: clean up rest of reqs Cc: stable@vger.kernel.org # v4.0: d600ad1f2bdb: NFS41: pop some layoutget Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Greg Kroah-Hartman --- fs/nfs/pagelist.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 3dbd15b47c27..5dfefae1b47d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -989,6 +989,17 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) } } +static void +nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + LIST_HEAD(head); + + nfs_list_remove_request(req); + nfs_list_add_request(req, &head); + desc->pg_completion_ops->error_cleanup(&head); +} + /** * nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor @@ -1026,10 +1037,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - if (mirror->pg_recoalesce) - return 0; + if (desc->pg_error < 0 || mirror->pg_recoalesce) + goto out_cleanup_subreq; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; @@ -1062,6 +1071,10 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; +out_cleanup_subreq: + if (req != subreq) + nfs_pageio_cleanup_request(desc, subreq); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1169,11 +1182,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - goto out_failed; + goto out_cleanup_subreq; } return 1; +out_cleanup_subreq: + if (req != dupreq) + nfs_pageio_cleanup_request(desc, dupreq); out_failed: /* remember fatal errors */ if (nfs_error_is_fatal(desc->pg_error)) -- cgit v1.2.3 From 63b0ee126f7ea398e3e68ae7e57598e080865a18 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Feb 2019 14:59:52 -0500 Subject: NFS: Fix an I/O request leakage in nfs_do_recoalesce commit 4d91969ed4dbcefd0e78f77494f0cb8fada9048a upstream. Whether we need to exit early, or just reprocess the list, we must not lost track of the request which failed to get recoalesced. Fixes: 03d5eb65b538 ("NFS: Fix a memory leak in nfs_do_recoalesce") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Greg Kroah-Hartman --- fs/nfs/pagelist.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 5dfefae1b47d..7c9b75c546e6 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1093,7 +1093,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); - nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) { -- cgit v1.2.3 From 2c648caf630db6f2e3429459b6a7fa4760275c58 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Feb 2019 16:08:25 -0500 Subject: NFS: Don't recoalesce on error in nfs_pageio_complete_mirror() commit 8127d82705998568b52ac724e28e00941538083d upstream. If the I/O completion failed with a fatal error, then we should just exit nfs_pageio_complete_mirror() rather than try to recoalesce. Fixes: a7d42ddb3099 ("nfs: add mirroring support to pgio layer") Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Greg Kroah-Hartman --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7c9b75c546e6..0ec6bce3dd69 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -1214,7 +1214,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); - if (!mirror->pg_recoalesce) + if (desc->pg_error < 0 || !mirror->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; -- cgit v1.2.3 From 10a68cdf1035689fe9bfb5e413110cefad510508 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Feb 2019 10:47:00 -0500 Subject: nfsd: fix performance-limiting session calculation commit c54f24e338ed2a35218f117a4a1afb5f9e2b4e64 upstream. We're unintentionally limiting the number of slots per nfsv4.1 session to 10. Often more than 10 simultaneous RPCs are needed for the best performance. This calculation was meant to prevent any one client from using up more than a third of the limit we set for total memory use across all clients and sessions. Instead, it's limiting the client to a third of the maximum for a single session. Fix this. Reported-by: Chris Tracy Cc: stable@vger.kernel.org Fixes: de766e570413 "nfsd: give out fewer session slots as limit approaches" Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9c6d1d57b598..bec75600e692 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1514,16 +1514,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; - int avail; + unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, * unless it's the only way to give this client a slot: */ - avail = clamp_t(int, avail, slotsize, avail/3); + avail = clamp_t(int, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); -- cgit v1.2.3 From 8056912c1c75fc27127dd7d092c197e01c63b6ad Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 4 Mar 2019 14:08:22 +1100 Subject: nfsd: fix memory corruption caused by readdir commit b602345da6cbb135ba68cf042df8ec9a73da7981 upstream. If the result of an NFSv3 readdir{,plus} request results in the "offset" on one entry having to be split across 2 pages, and is sized so that the next directory entry doesn't fit in the requested size, then memory corruption can happen. When encode_entry() is called after encoding the last entry that fits, it notices that ->offset and ->offset1 are set, and so stores the offset value in the two pages as required. It clears ->offset1 but *does not* clear ->offset. Normally this omission doesn't matter as encode_entry_baggage() will be called, and will set ->offset to a suitable value (not on a page boundary). But in the case where cd->buflen < elen and nfserr_toosmall is returned, ->offset is not reset. This means that nfsd3proc_readdirplus will see ->offset with a value 4 bytes before the end of a page, and ->offset1 set to NULL. It will try to write 8bytes to ->offset. If we are lucky, the next page will be read-only, and the system will BUG: unable to handle kernel paging request at... If we are unlucky, some innocent page will have the first 4 bytes corrupted. nfsd3proc_readdir() doesn't even check for ->offset1, it just blindly writes 8 bytes to the offset wherever it is. Fix this by clearing ->offset after it is used, and copying the ->offset handling code from nfsd3_proc_readdirplus into nfsd3_proc_readdir. (Note that the commit hash in the Fixes tag is from the 'history' tree - this bug predates git). Fixes: 0b1d57cf7654 ("[PATCH] kNFSd: Fix nfs3 dentry encoding") Fixes-URL: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=0b1d57cf7654 Cc: stable@vger.kernel.org (v2.6.12+) Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs3proc.c | 16 ++++++++++++++-- fs/nfsd/nfs3xdr.c | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9eb8086ea841..c9cf46e0c040 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); resp->count = resp->buffer - argp->buffer; - if (resp->offset) - xdr_encode_hyper(resp->offset, argp->cookie); + if (resp->offset) { + loff_t offset = argp->cookie; + + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } RETURN_STATUS(nfserr); } @@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } else { xdr_encode_hyper(resp->offset, offset); } + resp->offset = NULL; } RETURN_STATUS(nfserr); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9b973f4f7d01..83919116d5cb 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -921,6 +921,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, } else { xdr_encode_hyper(cd->offset, offset64); } + cd->offset = NULL; } /* -- cgit v1.2.3 From ecab6ab1c22726d2c439b587d103700f72210b4b Mon Sep 17 00:00:00 2001 From: Yihao Wu Date: Wed, 6 Mar 2019 21:03:50 +0800 Subject: nfsd: fix wrong check in write_v4_end_grace() commit dd838821f0a29781b185cd8fb8e48d5c177bd838 upstream. Commit 62a063b8e7d1 "nfsd4: fix crash on writing v4_end_grace before nfsd startup" is trying to fix a NULL dereference issue, but it mistakenly checks if the nfsd server is started. So fix it. Fixes: 62a063b8e7d1 "nfsd4: fix crash on writing v4_end_grace before nfsd startup" Cc: stable@vger.kernel.org Reviewed-by: Joseph Qi Signed-off-by: Yihao Wu Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 39b835d7c445..cb69660d0779 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': - if (nn->nfsd_serv) + if (!nn->nfsd_serv) return -EBUSY; nfsd4_end_grace(nn); break; -- cgit v1.2.3 From 4af185feb9df8fae93717df2fe31e51d89139172 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 1 Mar 2019 12:13:34 -0500 Subject: NFSv4.1: Reinitialise sequence results before retransmitting a request commit c1dffe0bf7f9c3d57d9f237a7cb2a81e62babd2b upstream. If we have to retransmit a request, we should ensure that we reinitialise the sequence results structure, since in the event of a signal we need to treat the request as if it had not been sent. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8220a168282e..e7abcf7629b3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -947,6 +947,13 @@ nfs4_sequence_process_interrupted(struct nfs_client *client, #endif /* !CONFIG_NFS_V4_1 */ +static void nfs41_sequence_res_init(struct nfs4_sequence_res *res) +{ + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; +} + static void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -958,10 +965,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, args->sa_slot = slot; res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } int nfs4_setup_sequence(struct nfs_client *client, @@ -1007,6 +1010,7 @@ int nfs4_setup_sequence(struct nfs_client *client, trace_nfs4_setup_sequence(session, args); out_start: + nfs41_sequence_res_init(res); rpc_call_start(task); return 0; -- cgit v1.2.3 From c72e90d94a148be5b04dd47d84303006d645d36e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Mar 2019 15:04:18 +0100 Subject: udf: Fix crash on IO error during truncate commit d3ca4651d05c0ff7259d087d8c949bcf3e14fb46 upstream. When truncate(2) hits IO error when reading indirect extent block the code just bugs with: kernel BUG at linux-4.15.0/fs/udf/truncate.c:249! ... Fix the problem by bailing out cleanly in case of IO error. CC: stable@vger.kernel.org Reported-by: jean-luc malet Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/truncate.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index b647f0bd150c..94220ba85628 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? */ + if (!epos.bh) + return; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> -- cgit v1.2.3 From 14c52acaac86ec7fb99c3a1287227664cfc5eb54 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 21 Mar 2019 14:59:02 +1000 Subject: cifs: allow guest mounts to work for smb3.11 commit e71ab2aa06f731a944993120b0eef1556c63b81c upstream. Fix Guest/Anonymous sessions so that they work with SMB 3.11. The commit noted below tightened the conditions and forced signing for the SMB2-TreeConnect commands as per MS-SMB2. However, this should only apply to normal user sessions and not for Guest/Anonumous sessions. Fixes: 6188f28bf608 ("Tree connect for SMB3.1.1 must be signed for non-encrypted shares") Signed-off-by: Ronnie Sahlberg CC: Stable Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index a2d701775c49..3d8db0504f5c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1500,9 +1500,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, iov[1].iov_base = unc_path; iov[1].iov_len = unc_path_len; - /* 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 */ + /* + * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 + * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1 + */ if ((ses->server->dialect == SMB311_PROT_ID) && - !smb3_encryption_required(tcon)) + !smb3_encryption_required(tcon) && + !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); -- cgit v1.2.3 From 38bd575b9aef70c54a2fc9559b01f077b335aaa4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Mar 2019 22:31:17 -0500 Subject: SMB3: Fix SMB3.1.1 guest mounts to Samba commit 8c11a607d1d9cd6e7f01fd6b03923597fb0ef95a upstream. Workaround problem with Samba responses to SMB3.1.1 null user (guest) mounts. The server doesn't set the expected flag in the session setup response so we have to do a similar check to what is done in smb3_validate_negotiate where we also check if the user is a null user (but not sec=krb5 since username might not be passed in on mount for Kerberos case). Note that the commit below tightened the conditions and forced signing for the SMB2-TreeConnect commands as per MS-SMB2. However, this should only apply to normal user sessions and not for cases where there is no user (even if server forgets to set the flag in the response) since we don't have anything useful to sign with. This is especially important now that the more secure SMB3.1.1 protocol is in the default dialect list. An earlier patch ("cifs: allow guest mounts to work for smb3.11") fixed the guest mounts to Windows. Fixes: 6188f28bf608 ("Tree connect for SMB3.1.1 must be signed for non-encrypted shares") Reviewed-by: Ronnie Sahlberg Reviewed-by: Paulo Alcantara CC: Stable Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 3d8db0504f5c..3d0db37d64ad 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1503,10 +1503,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, /* * 3.11 tcon req must be signed if not encrypted. See MS-SMB2 3.2.4.1.1 * unless it is guest or anonymous user. See MS-SMB2 3.2.5.3.1 + * (Samba servers don't always set the flag so also check if null user) */ if ((ses->server->dialect == SMB311_PROT_ID) && !smb3_encryption_required(tcon) && - !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL))) + !(ses->session_flags & + (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && + ((ses->user_name != NULL) || (ses->sectype == Kerberos))) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); -- cgit v1.2.3 From 558331d0205bea630477c2046af32dd1a5540f07 Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Thu, 14 Mar 2019 23:19:22 -0400 Subject: ext4: fix NULL pointer dereference while journal is aborted commit fa30dde38aa8628c73a6dded7cb0bba38c27b576 upstream. We see the following NULL pointer dereference while running xfstests generic/475: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 PGD 8000000c84bad067 P4D 8000000c84bad067 PUD c84e62067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 7 PID: 9886 Comm: fsstress Kdump: loaded Not tainted 5.0.0-rc8 #10 RIP: 0010:ext4_do_update_inode+0x4ec/0x760 ... Call Trace: ? jbd2_journal_get_write_access+0x42/0x50 ? __ext4_journal_get_write_access+0x2c/0x70 ? ext4_truncate+0x186/0x3f0 ext4_mark_iloc_dirty+0x61/0x80 ext4_mark_inode_dirty+0x62/0x1b0 ext4_truncate+0x186/0x3f0 ? unmap_mapping_pages+0x56/0x100 ext4_setattr+0x817/0x8b0 notify_change+0x1df/0x430 do_truncate+0x5e/0x90 ? generic_permission+0x12b/0x1a0 This is triggered because the NULL pointer handle->h_transaction was dereferenced in function ext4_update_inode_fsync_trans(). I found that the h_transaction was set to NULL in jbd2__journal_restart but failed to attached to a new transaction while the journal is aborted. Fix this by checking the handle before updating the inode. Fixes: b436b9bef84d ("ext4: Wait for proper transaction commit on fsync") Signed-off-by: Jiufei Xue Signed-off-by: Theodore Ts'o Reviewed-by: Joseph Qi Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 15b6dd733780..df908ef79cce 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -384,7 +384,7 @@ static inline void ext4_update_inode_fsync_trans(handle_t *handle, { struct ext4_inode_info *ei = EXT4_I(inode); - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && !is_handle_aborted(handle)) { ei->i_sync_tid = handle->h_transaction->t_tid; if (datasync) ei->i_datasync_tid = handle->h_transaction->t_tid; -- cgit v1.2.3 From 76c9ee6bd5d23caf5e57cd91282b8ff374e60437 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 14 Mar 2019 23:20:25 -0400 Subject: ext4: fix data corruption caused by unaligned direct AIO commit 372a03e01853f860560eade508794dd274e9b390 upstream. Ext4 needs to serialize unaligned direct AIO because the zeroing of partial blocks of two competing unaligned AIOs can result in data corruption. However it decides not to serialize if the potentially unaligned aio is past i_size with the rationale that no pending writes are possible past i_size. Unfortunately if the i_size is not block aligned and the second unaligned write lands past i_size, but still into the same block, it has the potential of corrupting the previous unaligned write to the same block. This is (very simplified) reproducer from Frank // 41472 = (10 * 4096) + 512 // 37376 = 41472 - 4096 ftruncate(fd, 41472); io_prep_pwrite(iocbs[0], fd, buf[0], 4096, 37376); io_prep_pwrite(iocbs[1], fd, buf[1], 4096, 41472); io_submit(io_ctx, 1, &iocbs[1]); io_submit(io_ctx, 1, &iocbs[2]); io_getevents(io_ctx, 2, 2, events, NULL); Without this patch the 512B range from 40960 up to the start of the second unaligned write (41472) is going to be zeroed overwriting the data written by the first write. This is a data corruption. 00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 00009200 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 * 0000a000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0000a200 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 With this patch the data corruption is avoided because we will recognize the unaligned_aio and wait for the unwritten extent conversion. 00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 00009200 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 * 0000a200 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 * 0000b200 Reported-by: Frank Sorenson Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Fixes: e9e3bcecf44c ("ext4: serialize unaligned asynchronous DIO") Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 69d65d49837b..98ec11f69cd4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -125,7 +125,7 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; - if (pos >= i_size_read(inode)) + if (pos >= ALIGN(i_size_read(inode), sb->s_blocksize)) return 0; if ((pos | iov_iter_alignment(from)) & blockmask) -- cgit v1.2.3 From d12d86411c03d9127f0bc0766e25618b9e5c7568 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 23 Mar 2019 11:43:05 -0400 Subject: ext4: brelse all indirect buffer in ext4_ind_remove_space() commit 674a2b27234d1b7afcb0a9162e81b2e53aeef217 upstream. All indirect buffers get by ext4_find_shared() should be released no mater the branch should be freed or not. But now, we forget to release the lower depth indirect buffers when removing space from the same higher depth indirect block. It will lead to buffer leak and futher more, it may lead to quota information corruption when using old quota, consider the following case. - Create and mount an empty ext4 filesystem without extent and quota features, - quotacheck and enable the user & group quota, - Create some files and write some data to them, and then punch hole to some files of them, it may trigger the buffer leak problem mentioned above. - Disable quota and run quotacheck again, it will create two new aquota files and write the checked quota information to them, which probably may reuse the freed indirect block(the buffer and page cache was not freed) as data block. - Enable quota again, it will invoke vfs_load_quota_inode()->invalidate_bdev() to try to clean unused buffers and pagecache. Unfortunately, because of the buffer of quota data block is still referenced, quota code cannot read the up to date quota info from the device and lead to quota information corruption. This problem can be reproduced by xfstests generic/231 on ext3 file system or ext4 file system without extent and quota features. This patch fix this problem by releasing the missing indirect buffers, in ext4_ind_remove_space(). Reported-by: Hulk Robot Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/indirect.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..9e96a0bd08d9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1387,10 +1387,14 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + } + while (partial2 > chain2) { + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + } return 0; } -- cgit v1.2.3 From 1fd916e879a9b2fabca931c012a59409e783d241 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 25 Feb 2019 17:11:03 +0800 Subject: f2fs: fix to avoid deadlock of atomic file operations commit 48432984d718c95cf13e26d487c2d1b697c3c01f upstream. Thread A Thread B - __fput - f2fs_release_file - drop_inmem_pages - mutex_lock(&fi->inmem_lock) - __revoke_inmem_pages - lock_page(page) - open - f2fs_setattr - truncate_setsize - truncate_inode_pages_range - lock_page(page) - truncate_cleanup_page - f2fs_invalidate_page - drop_inmem_page - mutex_lock(&fi->inmem_lock); We may encounter above ABBA deadlock as reported by Kyungtae Kim: I'm reporting a bug in linux-4.17.19: "INFO: task hung in drop_inmem_page" (no reproducer) I think this might be somehow related to the following: https://groups.google.com/forum/#!searchin/syzkaller-bugs/INFO$3A$20task$20hung$20in$20%7Csort:date/syzkaller-bugs/c6soBTrdaIo/AjAzPeIzCgAJ ========================================= INFO: task syz-executor7:10822 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D27024 10822 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 schedule_preempt_disabled+0x18/0x30 kernel/sched/core.c:3617 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0x5bd/0x1410 kernel/locking/mutex.c:893 mutex_lock_nested+0x1b/0x20 kernel/locking/mutex.c:908 drop_inmem_page+0xcb/0x810 fs/f2fs/segment.c:327 f2fs_invalidate_page+0x337/0x5e0 fs/f2fs/data.c:2401 do_invalidatepage mm/truncate.c:165 [inline] truncate_cleanup_page+0x261/0x330 mm/truncate.c:187 truncate_inode_pages_range+0x552/0x1610 mm/truncate.c:367 truncate_inode_pages mm/truncate.c:478 [inline] truncate_pagecache+0x6d/0x90 mm/truncate.c:801 truncate_setsize+0x81/0xa0 mm/truncate.c:826 f2fs_setattr+0x44f/0x1270 fs/f2fs/file.c:781 notify_change+0xa62/0xe80 fs/attr.c:313 do_truncate+0x12e/0x1e0 fs/open.c:63 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e459c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e45a6cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e45a700 INFO: task syz-executor7:10858 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor7 D28880 10858 6346 0x00000004 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 __rwsem_down_write_failed_common kernel/locking/rwsem-xadd.c:565 [inline] rwsem_down_write_failed+0x5e6/0xc90 kernel/locking/rwsem-xadd.c:594 call_rwsem_down_write_failed+0x17/0x30 arch/x86/lib/rwsem.S:117 __down_write arch/x86/include/asm/rwsem.h:142 [inline] down_write+0x58/0xa0 kernel/locking/rwsem.c:72 inode_lock include/linux/fs.h:713 [inline] do_truncate+0x120/0x1e0 fs/open.c:61 do_last fs/namei.c:2955 [inline] path_openat+0x2042/0x29f0 fs/namei.c:3505 do_filp_open+0x1bd/0x2c0 fs/namei.c:3540 do_sys_open+0x35e/0x4e0 fs/open.c:1101 __do_sys_open fs/open.c:1119 [inline] __se_sys_open fs/open.c:1114 [inline] __x64_sys_open+0x89/0xc0 fs/open.c:1114 do_syscall_64+0xc4/0x4e0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f734e3b4c68 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 RAX: ffffffffffffffda RBX: 00007f734e3b56cc RCX: 00000000004497b9 RDX: 0000000000000104 RSI: 00000000000a8280 RDI: 0000000020000080 RBP: 000000000071c238 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 0000000000007230 R14: 00000000006f02d0 R15: 00007f734e3b5700 INFO: task syz-executor5:10829 blocked for more than 120 seconds. Not tainted 4.17.19 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. syz-executor5 D28760 10829 6308 0x80000002 Call Trace: context_switch kernel/sched/core.c:2867 [inline] __schedule+0x721/0x1e60 kernel/sched/core.c:3515 schedule+0x88/0x1c0 kernel/sched/core.c:3559 io_schedule+0x21/0x80 kernel/sched/core.c:5179 wait_on_page_bit_common mm/filemap.c:1100 [inline] __lock_page+0x2b5/0x390 mm/filemap.c:1273 lock_page include/linux/pagemap.h:483 [inline] __revoke_inmem_pages+0xb35/0x11c0 fs/f2fs/segment.c:231 drop_inmem_pages+0xa3/0x3e0 fs/f2fs/segment.c:306 f2fs_release_file+0x2c7/0x330 fs/f2fs/file.c:1556 __fput+0x2c7/0x780 fs/file_table.c:209 ____fput+0x1a/0x20 fs/file_table.c:243 task_work_run+0x151/0x1d0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x8ba/0x30a0 kernel/exit.c:865 do_group_exit+0x13b/0x3a0 kernel/exit.c:968 get_signal+0x6bb/0x1650 kernel/signal.c:2482 do_signal+0x84/0x1b70 arch/x86/kernel/signal.c:810 exit_to_usermode_loop+0x155/0x190 arch/x86/entry/common.c:162 prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline] syscall_return_slowpath arch/x86/entry/common.c:265 [inline] do_syscall_64+0x445/0x4e0 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4497b9 RSP: 002b:00007f1c68e74ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: fffffffffffffe00 RBX: 000000000071bf80 RCX: 00000000004497b9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000071bf80 RBP: 000000000071bf80 R08: 0000000000000000 R09: 000000000071bf58 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007f1c68e759c0 R15: 00007f1c68e75700 This patch tries to use trylock_page to mitigate such deadlock condition for fix. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/segment.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30779aaa9dba..1fa6f8185766 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -216,7 +216,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -228,7 +229,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true); @@ -317,13 +327,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -427,12 +443,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; -- cgit v1.2.3 From 22dcb30fb9d8199b9a105f699b828dedb5b59d36 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Mar 2019 14:06:12 +0000 Subject: Btrfs: fix incorrect file size after shrinking truncate and fsync commit bf504110bc8aa05df48b0e5f0aa84bfb81e0574b upstream. If we do a shrinking truncate against an inode which is already present in the respective log tree and then rename it, as part of logging the new name we end up logging an inode item that reflects the old size of the file (the one which we previously logged) and not the new smaller size. The decision to preserve the size previously logged was added by commit 1a4bcf470c886b ("Btrfs: fix fsync data loss after adding hard link to inode") in order to avoid data loss after replaying the log. However that decision is only needed for the case the logged inode size is smaller then the current size of the inode, as explained in that commit's change log. If the current size of the inode is smaller then the previously logged size, we know a shrinking truncate happened and therefore need to use that smaller size. Example to trigger the problem: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ xfs_io -f -c "pwrite -S 0xab 0 8000" /mnt/foo $ xfs_io -c "fsync" /mnt/foo $ xfs_io -c "truncate 3000" /mnt/foo $ mv /mnt/foo /mnt/bar $ xfs_io -c "fsync" /mnt/bar $ mount /dev/sdb /mnt $ od -t x1 -A d /mnt/bar 0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab * 0008000 Once we rename the file, we log its name (and inode item), and because the inode was already logged before in the current transaction, we log it with a size of 8000 bytes because that is the size we previously logged (with the first fsync). As part of the rename, besides logging the inode, we do also sync the log, which is done since commit d4682ba03ef618 ("Btrfs: sync log after logging new name"), so the next fsync against our inode is effectively a no-op, since no new changes happened since the rename operation. Even if did not sync the log during the rename operation, the same problem (fize size of 8000 bytes instead of 3000 bytes) would be visible after replaying the log if the log ended up getting synced to disk through some other means, such as for example by fsyncing some other modified file. In the example above the fsync after the rename operation is there just because not every filesystem may guarantee logging/journalling the inode (and syncing the log/journal) during the rename operation, for example it is needed for f2fs, but not for ext4 and xfs. Fix this scenario by, when logging a new name (which is triggered by rename and link operations), using the current size of the inode instead of the previously logged inode size. A test case for fstests follows soon. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202695 CC: stable@vger.kernel.org # 4.4+ Reported-by: Seulbae Kim Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 0805f8c5e72d..b08c47a7976c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4504,6 +4504,19 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode + * size stored in the btree, return the inode's i_size, so + * that we get a correct inode size after replaying the log + * when before a power failure we had a shrinking truncate + * followed by addition of a new name (rename / new hard link). + * Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a + * write that expands the inode's size and logging a new name + * immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; } btrfs_release_path(path); -- cgit v1.2.3 From b57220cc982033dc8e1e8d398cb006b72b275213 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 6 Mar 2019 17:13:04 -0500 Subject: btrfs: remove WARN_ON in log_dir_items commit 2cc8334270e281815c3850c3adea363c51f21e0d upstream. When Filipe added the recursive directory logging stuff in 2f2ff0ee5e430 ("Btrfs: fix metadata inconsistencies after directory fsync") he specifically didn't take the directory i_mutex for the children directories that we need to log because of lockdep. This is generally fine, but can lead to this WARN_ON() tripping if we happen to run delayed deletion's in between our first search and our second search of dir_item/dir_indexes for this directory. We expect this to happen, so the WARN_ON() isn't necessary. Drop the WARN_ON() and add a comment so we know why this case can happen. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b08c47a7976c..5af65d53618b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3532,9 +3532,16 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* find the first key from this transaction again */ + /* + * Find the first key from this transaction again. See the note for + * log_new_dir_dentries, if we're logging a directory recursively we + * won't be holding its i_mutex, which means we can modify the directory + * while we're logging it. If we remove an entry between our first + * search and this search we'll not find the key again and can just + * bail. + */ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); - if (WARN_ON(ret != 0)) + if (ret != 0) goto done; /* -- cgit v1.2.3 From d952c337b25d3f3d3e4b2156b2ab83118c7951fd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 7 Mar 2019 15:40:50 +0100 Subject: btrfs: don't report readahead errors and don't update statistics commit 0cc068e6ee59c1fffbfa977d8bf868b7551d80ac upstream. As readahead is an optimization, all errors are usually filtered out, but still properly handled when the real read call is done. The commit 5e9d398240b2 ("btrfs: readpages() should submit IO as read-ahead") added REQ_RAHEAD to readpages() because that's only used for readahead (despite what one would expect from the callback name). This causes a flood of messages and inflated read error stats, so skip reporting in case it's readahead. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202403 Reported-by: LimeTech Fixes: 5e9d398240b2 ("btrfs: readpages() should submit IO as read-ahead") CC: stable@vger.kernel.org # 4.19+ Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c13f62182513..207f4e87445d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6051,7 +6051,7 @@ static void btrfs_end_bio(struct bio *bio) if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - else + else if (!(bio->bi_opf & REQ_RAHEAD)) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) -- cgit v1.2.3 From 1cf4ab01eb5aa72fab9acf405b4a9037dcc7cefc Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 14 Mar 2019 08:56:28 +0100 Subject: btrfs: raid56: properly unmap parity page in finish_parity_scrub() commit 3897b6f0a859288c22fb793fad11ec2327e60fcd upstream. Parity page is incorrectly unmapped in finish_parity_scrub(), triggering a reference counter bug on i386, i.e.: [ 157.662401] kernel BUG at mm/highmem.c:349! [ 157.666725] invalid opcode: 0000 [#1] SMP PTI The reason is that kunmap(p_page) was completely left out, so we never did an unmap for the p_page and the loop unmapping the rbio page was iterating over the wrong number of stripes: unmapping should be done with nr_data instead of rbio->real_stripes. Test case to reproduce the bug: - create a raid5 btrfs filesystem: # mkfs.btrfs -m raid5 -d raid5 /dev/sdb /dev/sdc /dev/sdd /dev/sde - mount it: # mount /dev/sdb /mnt - run btrfs scrub in a loop: # while :; do btrfs scrub start -BR /mnt; done BugLink: https://bugs.launchpad.net/bugs/1812845 Fixes: 5a6ac9eacb49 ("Btrfs, raid56: support parity scrub on raid56") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Johannes Thumshirn Signed-off-by: Andrea Righi Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/raid56.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index df41d7049936..927f9f3daddb 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2429,8 +2429,9 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, bitmap_clear(rbio->dbitmap, pagenr, 1); kunmap(p); - for (stripe = 0; stripe < rbio->real_stripes; stripe++) + for (stripe = 0; stripe < nr_data; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + kunmap(p_page); } __free_page(p_page); -- cgit v1.2.3 From 0ae3b84b3fa6cd988c8ee65053f6fa34bfda3034 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 18 Mar 2019 17:45:20 +0200 Subject: btrfs: Avoid possible qgroup_rsv_size overflow in btrfs_calculate_inode_block_rsv_size commit 139a56170de67101791d6e6c8e940c6328393fe9 upstream. qgroup_rsv_size is calculated as the product of outstanding_extent * fs_info->nodesize. The product is calculated with 32 bit precision since both variables are defined as u32. Yet qgroup_rsv_size expects a 64 bit result. Avoid possible multiplication overflow by casting outstanding_extent to u64. Such overflow would in the worst case (64K nodesize) require more than 65536 extents, which is quite large and i'ts not likely that it would happen in practice. Fixes-coverity-id: 1435101 Fixes: ff6bc37eb7f6 ("btrfs: qgroup: Use independent and accurate per inode qgroup rsv") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a16760b410b1..c0db7785cede 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5872,7 +5872,7 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, * * This is overestimating in most cases. */ - qgroup_rsv_size = outstanding_extents * fs_info->nodesize; + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; spin_lock(&block_rsv->lock); block_rsv->size = reserve_size; -- cgit v1.2.3 From fd1b25364fef1cb5d7898b751f531bf754a39af5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 19 Mar 2019 17:18:13 +0000 Subject: Btrfs: fix assertion failure on fsync with NO_HOLES enabled commit 0ccc3876e4b2a1559a4dbe3126dda4459d38a83b upstream. Back in commit a89ca6f24ffe4 ("Btrfs: fix fsync after truncate when no_holes feature is enabled") I added an assertion that is triggered when an inline extent is found to assert that the length of the (uncompressed) data the extent represents is the same as the i_size of the inode, since that is true most of the time I couldn't find or didn't remembered about any exception at that time. Later on the assertion was expanded twice to deal with a case of a compressed inline extent representing a range that matches the sector size followed by an expanding truncate, and another case where fallocate can update the i_size of the inode without adding or updating existing extents (if the fallocate range falls entirely within the first block of the file). These two expansion/fixes of the assertion were done by commit 7ed586d0a8241 ("Btrfs: fix assertion on fsync of regular file when using no-holes feature") and commit 6399fb5a0b69a ("Btrfs: fix assertion failure during fsync in no-holes mode"). These however missed the case where an falloc expands the i_size of an inode to exactly the sector size and inline extent exists, for example: $ mkfs.btrfs -f -O no-holes /dev/sdc $ mount /dev/sdc /mnt $ xfs_io -f -c "pwrite -S 0xab 0 1096" /mnt/foobar wrote 1096/1096 bytes at offset 0 1 KiB, 1 ops; 0.0002 sec (4.448 MiB/sec and 4255.3191 ops/sec) $ xfs_io -c "falloc 1096 3000" /mnt/foobar $ xfs_io -c "fsync" /mnt/foobar Segmentation fault $ dmesg [701253.602385] assertion failed: len == i_size || (len == fs_info->sectorsize && btrfs_file_extent_compression(leaf, extent) != BTRFS_COMPRESS_NONE) || (len < i_size && i_size < fs_info->sectorsize), file: fs/btrfs/tree-log.c, line: 4727 [701253.602962] ------------[ cut here ]------------ [701253.603224] kernel BUG at fs/btrfs/ctree.h:3533! [701253.603503] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [701253.603774] CPU: 2 PID: 7192 Comm: xfs_io Tainted: G W 5.0.0-rc8-btrfs-next-45 #1 [701253.604054] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [701253.604650] RIP: 0010:assfail.constprop.23+0x18/0x1a [btrfs] (...) [701253.605591] RSP: 0018:ffffbb48c186bc48 EFLAGS: 00010286 [701253.605914] RAX: 00000000000000de RBX: ffff921d0a7afc08 RCX: 0000000000000000 [701253.606244] RDX: 0000000000000000 RSI: ffff921d36b16868 RDI: ffff921d36b16868 [701253.606580] RBP: ffffbb48c186bcf0 R08: 0000000000000000 R09: 0000000000000000 [701253.606913] R10: 0000000000000003 R11: 0000000000000000 R12: ffff921d05d2de18 [701253.607247] R13: ffff921d03b54000 R14: 0000000000000448 R15: ffff921d059ecf80 [701253.607769] FS: 00007f14da906700(0000) GS:ffff921d36b00000(0000) knlGS:0000000000000000 [701253.608163] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [701253.608516] CR2: 000056087ea9f278 CR3: 00000002268e8001 CR4: 00000000003606e0 [701253.608880] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [701253.609250] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [701253.609608] Call Trace: [701253.609994] btrfs_log_inode+0xdfb/0xe40 [btrfs] [701253.610383] btrfs_log_inode_parent+0x2be/0xa60 [btrfs] [701253.610770] ? do_raw_spin_unlock+0x49/0xc0 [701253.611150] btrfs_log_dentry_safe+0x4a/0x70 [btrfs] [701253.611537] btrfs_sync_file+0x3b2/0x440 [btrfs] [701253.612010] ? do_sysinfo+0xb0/0xf0 [701253.612552] do_fsync+0x38/0x60 [701253.612988] __x64_sys_fsync+0x10/0x20 [701253.613360] do_syscall_64+0x60/0x1b0 [701253.613733] entry_SYSCALL_64_after_hwframe+0x49/0xbe [701253.614103] RIP: 0033:0x7f14da4e66d0 (...) [701253.615250] RSP: 002b:00007fffa670fdb8 EFLAGS: 00000246 ORIG_RAX: 000000000000004a [701253.615647] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f14da4e66d0 [701253.616047] RDX: 000056087ea9c260 RSI: 000056087ea9c260 RDI: 0000000000000003 [701253.616450] RBP: 0000000000000001 R08: 0000000000000020 R09: 0000000000000010 [701253.616854] R10: 000000000000009b R11: 0000000000000246 R12: 000056087ea9c260 [701253.617257] R13: 000056087ea9c240 R14: 0000000000000000 R15: 000056087ea9dd10 (...) [701253.619941] ---[ end trace e088d74f132b6da5 ]--- Updating the assertion again to allow for this particular case would result in a meaningless assertion, plus there is currently no risk of logging content that would result in any corruption after a log replay if the size of the data encoded in an inline extent is greater than the inode's i_size (which is not currently possibe either with or without compression), therefore just remove the assertion. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5af65d53618b..2f4f0958e5f2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4685,15 +4685,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(leaf, extent); - ASSERT(len == i_size || - (len == fs_info->sectorsize && - btrfs_file_extent_compression(leaf, extent) != - BTRFS_COMPRESS_NONE) || - (len < i_size && i_size < fs_info->sectorsize)); + BTRFS_FILE_EXTENT_INLINE) return 0; - } len = btrfs_file_extent_num_bytes(leaf, extent); /* Last extent goes beyond i_size, no need to log a hole. */ -- cgit v1.2.3 From da57cba4f3f1c63303cffcee97cf4d7bf1ff386a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 19 Mar 2019 11:33:24 +1100 Subject: NFS: fix mount/umount race in nlmclnt. commit 4a9be28c45bf02fa0436808bb6c0baeba30e120e upstream. If the last NFSv3 unmount from a given host races with a mount from the same host, we can destroy an nlm_host that is still in use. Specifically nlmclnt_lookup_host() can increment h_count on an nlm_host that nlmclnt_release_host() has just successfully called refcount_dec_and_test() on. Once nlmclnt_lookup_host() drops the mutex, nlm_destroy_host_lock() will be called to destroy the nlmclnt which is now in use again. The cause of the problem is that the dec_and_test happens outside the locked region. This is easily fixed by using refcount_dec_and_mutex_lock(). Fixes: 8ea6ecc8b075 ("lockd: Create client-side nlm_host cache") Cc: stable@vger.kernel.org (v2.6.38+) Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/lockd/host.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 93fb7cf0b92b..f0b5c987d6ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -290,12 +290,11 @@ void nlmclnt_release_host(struct nlm_host *host) WARN_ON_ONCE(host->h_server); - if (refcount_dec_and_test(&host->h_count)) { + if (refcount_dec_and_mutex_lock(&host->h_count, &nlm_host_mutex)) { WARN_ON_ONCE(!list_empty(&host->h_lockowners)); WARN_ON_ONCE(!list_empty(&host->h_granted)); WARN_ON_ONCE(!list_empty(&host->h_reclaim)); - mutex_lock(&nlm_host_mutex); nlm_destroy_host_locked(host); mutex_unlock(&nlm_host_mutex); } -- cgit v1.2.3 From 64751542d3f3a6e9ccf88dab1fd05cc47bcba301 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 19 Mar 2019 12:12:13 -0400 Subject: NFSv4.1 don't free interrupted slot on open commit 0cb98abb5bd13b9a636bde603d952d722688b428 upstream. Allow the async rpc task for finish and update the open state if needed, then free the slot. Otherwise, the async rpc unable to decode the reply. Signed-off-by: Olga Kornievskaia Fixes: ae55e59da0e4 ("pnfs: Don't release the sequence slot...") Cc: stable@vger.kernel.org # v4.18+ Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e7abcf7629b3..580e37bc3fe2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2909,7 +2909,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - nfs4_sequence_free_slot(&opendata->o_res.seq_res); + if (!opendata->cancelled) + nfs4_sequence_free_slot(&opendata->o_res.seq_res); return ret; } -- cgit v1.2.3 From 72b790c417b9f6ffec9d04b77b29103f451725f6 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 28 Mar 2019 20:43:30 -0700 Subject: fs/open.c: allow opening only regular files during execve() commit 73601ea5b7b18eb234219ae2adf77530f389da79 upstream. syzbot is hitting lockdep warning [1] due to trying to open a fifo during an execve() operation. But we don't need to open non regular files during an execve() operation, for all files which we will need are the executable file itself and the interpreter programs like /bin/sh and ld-linux.so.2 . Since the manpage for execve(2) says that execve() returns EACCES when the file or a script interpreter is not a regular file, and the manpage for uselib(2) says that uselib() can return EACCES, and we use FMODE_EXEC when opening for execve()/uselib(), we can bail out if a non regular file is requested with FMODE_EXEC set. Since this deadlock followed by khungtaskd warnings is trivially reproducible by a local unprivileged user, and syzbot's frequent crash due to this deadlock defers finding other bugs, let's workaround this deadlock until we get a chance to find a better solution. [1] https://syzkaller.appspot.com/bug?id=b5095bfec44ec84213bac54742a82483aad578ce Link: http://lkml.kernel.org/r/1552044017-7890-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Reported-by: syzbot Fixes: 8924feff66f35fe2 ("splice: lift pipe_lock out of splice_to_pipe()") Signed-off-by: Tetsuo Handa Acked-by: Kees Cook Cc: Al Viro Cc: Eric Biggers Cc: Dmitry Vyukov Cc: [4.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/open.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 0285ce7dbd51..f1c2f855fd43 100644 --- a/fs/open.c +++ b/fs/open.c @@ -733,6 +733,12 @@ static int do_dentry_open(struct file *f, return 0; } + /* Any file opened for execve()/uselib() has to be a regular file. */ + if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { + error = -EACCES; + goto cleanup_file; + } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) -- cgit v1.2.3 From 3b3fcc3d4ffd9046b6ede3f62f473f40b8bd7493 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 28 Mar 2019 20:43:38 -0700 Subject: ocfs2: fix inode bh swapping mixup in ocfs2_reflink_inodes_lock commit e6a9467ea14bae8691b0f72c500510c42ea8edb8 upstream. ocfs2_reflink_inodes_lock() can swap the inode1/inode2 variables so that we always grab cluster locks in order of increasing inode number. Unfortunately, we forget to swap the inode record buffer head pointers when we've done this, which leads to incorrect bookkeepping when we're trying to make the two inodes have the same refcount tree. This has the effect of causing filesystem shutdowns if you're trying to reflink data from inode 100 into inode 97, where inode 100 already has a refcount tree attached and inode 97 doesn't. The reflink code decides to copy the refcount tree pointer from 100 to 97, but uses inode 97's inode record to open the tree root (which it doesn't have) and blows up. This issue causes filesystem shutdowns and metadata corruption! Link: http://lkml.kernel.org/r/20190312214910.GK20533@magnolia Fixes: 29ac8e856cb369 ("ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features") Signed-off-by: Darrick J. Wong Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/refcounttree.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7a5ee145c733..fc197e599e8c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4716,22 +4716,23 @@ out: /* Lock an inode and grab a bh pointing to the inode. */ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, - struct buffer_head **bh1, + struct buffer_head **bh_s, struct inode *t_inode, - struct buffer_head **bh2) + struct buffer_head **bh_t) { - struct inode *inode1; - struct inode *inode2; + struct inode *inode1 = s_inode; + struct inode *inode2 = t_inode; struct ocfs2_inode_info *oi1; struct ocfs2_inode_info *oi2; + struct buffer_head *bh1 = NULL; + struct buffer_head *bh2 = NULL; bool same_inode = (s_inode == t_inode); + bool need_swap = (inode1->i_ino > inode2->i_ino); int status; /* First grab the VFS and rw locks. */ lock_two_nondirectories(s_inode, t_inode); - inode1 = s_inode; - inode2 = t_inode; - if (inode1->i_ino > inode2->i_ino) + if (need_swap) swap(inode1, inode2); status = ocfs2_rw_lock(inode1, 1); @@ -4754,17 +4755,13 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); - if (*bh1) - *bh1 = NULL; - if (*bh2) - *bh2 = NULL; - /* We always want to lock the one with the lower lockid first. */ if (oi1->ip_blkno > oi2->ip_blkno) mlog_errno(-ENOLCK); /* lock id1 */ - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + status = ocfs2_inode_lock_nested(inode1, &bh1, 1, + OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -4773,15 +4770,25 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, /* lock id2 */ if (!same_inode) { - status = ocfs2_inode_lock_nested(inode2, bh2, 1, + status = ocfs2_inode_lock_nested(inode2, &bh2, 1, OI_LS_REFLINK_TARGET); if (status < 0) { if (status != -ENOENT) mlog_errno(status); goto out_cl1; } - } else - *bh2 = *bh1; + } else { + bh2 = bh1; + } + + /* + * If we swapped inode order above, we have to swap the buffer heads + * before passing them back to the caller. + */ + if (need_swap) + swap(bh1, bh2); + *bh_s = bh1; + *bh_t = bh2; trace_ocfs2_double_lock_end( (unsigned long long)oi1->ip_blkno, @@ -4791,8 +4798,7 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode, out_cl1: ocfs2_inode_unlock(inode1, 1); - brelse(*bh1); - *bh1 = NULL; + brelse(bh1); out_rw2: ocfs2_rw_unlock(inode2, 1); out_i2: -- cgit v1.2.3 From 07d0d2bd957ad922cf571e7cabb6c34067142b93 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 28 Mar 2019 20:44:40 -0700 Subject: fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links commit 23da9588037ecdd4901db76a5b79a42b529c4ec3 upstream. Syzkaller reports: kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 5373 Comm: syz-executor.0 Not tainted 5.0.0-rc8+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:put_links+0x101/0x440 fs/proc/proc_sysctl.c:1599 Code: 00 0f 85 3a 03 00 00 48 8b 43 38 48 89 44 24 20 48 83 c0 38 48 89 c2 48 89 44 24 28 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 fe 02 00 00 48 8b 74 24 20 48 c7 c7 60 2a 9d 91 RSP: 0018:ffff8881d828f238 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8881e01b1140 RCX: ffffffff8ee98267 RDX: 0000000000000007 RSI: ffffc90001479000 RDI: ffff8881e01b1178 RBP: dffffc0000000000 R08: ffffed103ee27259 R09: ffffed103ee27259 R10: 0000000000000001 R11: ffffed103ee27258 R12: fffffffffffffff4 R13: 0000000000000006 R14: ffff8881f59838c0 R15: dffffc0000000000 FS: 00007f072254f700(0000) GS:ffff8881f7100000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff8b286668 CR3: 00000001f0542002 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: drop_sysctl_table+0x152/0x9f0 fs/proc/proc_sysctl.c:1629 get_subdir fs/proc/proc_sysctl.c:1022 [inline] __register_sysctl_table+0xd65/0x1090 fs/proc/proc_sysctl.c:1335 br_netfilter_init+0xbc/0x1000 [br_netfilter] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f072254ec58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000020000280 RDI: 0000000000000003 RBP: 00007f072254ec70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f072254f6bc R13: 00000000004bcefa R14: 00000000006f6fb0 R15: 0000000000000004 Modules linked in: br_netfilter(+) dvb_usb_dibusb_mc_common dib3000mc dibx000_common dvb_usb_dibusb_common dvb_usb_dw2102 dvb_usb classmate_laptop palmas_regulator cn videobuf2_v4l2 v4l2_common snd_soc_bd28623 mptbase snd_usb_usx2y snd_usbmidi_lib snd_rawmidi wmi libnvdimm lockd sunrpc grace rc_kworld_pc150u rc_core rtc_da9063 sha1_ssse3 i2c_cros_ec_tunnel adxl34x_spi adxl34x nfnetlink lib80211 i5500_temp dvb_as102 dvb_core videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops udc_core lnbp22 leds_lp3952 hid_roccat_ryos s1d13xxxfb mtd vport_geneve openvswitch nf_conncount nf_nat_ipv6 nsh geneve udp_tunnel ip6_udp_tunnel snd_soc_mt6351 sis_agp phylink snd_soc_adau1761_spi snd_soc_adau1761 snd_soc_adau17x1 snd_soc_core snd_pcm_dmaengine ac97_bus snd_compress snd_soc_adau_utils snd_soc_sigmadsp_regmap snd_soc_sigmadsp raid_class hid_roccat_konepure hid_roccat_common hid_roccat c2port_duramar2150 core mdio_bcm_unimac iptable_security iptable_raw iptable_mangle iptable_nat nf_nat_ipv4 nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim devlink vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel joydev mousedev ide_pci_generic piix aesni_intel aes_x86_64 ide_core crypto_simd atkbd cryptd glue_helper serio_raw ata_generic pata_acpi i2c_piix4 floppy sch_fq_codel ip_tables x_tables ipv6 [last unloaded: lm73] Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace 770020de38961fd0 ]--- A new dir entry can be created in get_subdir and its 'header->parent' is set to NULL. Only after insert_header success, it will be set to 'dir', otherwise 'header->parent' is set to NULL and drop_sysctl_table is called. However in err handling path of get_subdir, drop_sysctl_table also be called on 'new->header' regardless its value of parent pointer. Then put_links is called, which triggers NULL-ptr deref when access member of header->parent. In fact we have multiple error paths which call drop_sysctl_table() there, upon failure on insert_links() we also call drop_sysctl_table().And even in the successful case on __register_sysctl_table() we still always call drop_sysctl_table().This patch fix it. Link: http://lkml.kernel.org/r/20190314085527.13244-1-yuehaibing@huawei.com Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets") Signed-off-by: YueHaibing Reported-by: Hulk Robot Acked-by: Luis Chamberlain Cc: Kees Cook Cc: Alexey Dobriyan Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Al Viro Cc: Eric W. Biederman Cc: [3.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/proc_sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4d598a399bbf..d65390727541 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,7 +1626,8 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - put_links(header); + if (parent) + put_links(header); start_unregistering(header); if (!--header->count) kfree_rcu(header, rcu); -- cgit v1.2.3 From 2dbc7c66d6dafb7781c55d067d8e406d71a34ccc Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 23 Mar 2019 11:56:01 -0400 Subject: ext4: cleanup bh release code in ext4_ind_remove_space() commit 5e86bdda41534e17621d5a071b294943cae4376e upstream. Currently, we are releasing the indirect buffer where we are done with it in ext4_ind_remove_space(), so we can see the brelse() and BUFFER_TRACE() everywhere. It seems fragile and hard to read, and we may probably forget to release the buffer some day. This patch cleans up the code by putting of the code which releases the buffers to the end of the function. Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: Jari Ruusu Signed-off-by: Greg Kroah-Hartman --- fs/ext4/indirect.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 9e96a0bd08d9..e1801b288847 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1219,6 +1219,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t offsets[4], offsets2[4]; Indirect chain[4], chain2[4]; Indirect *partial, *partial2; + Indirect *p = NULL, *p2 = NULL; ext4_lblk_t max_block; __le32 nr = 0, nr2 = 0; int n = 0, n2 = 0; @@ -1260,7 +1261,7 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, } - partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); if (nr) { if (partial == chain) { /* Shared branch grows from the inode */ @@ -1285,13 +1286,11 @@ int ext4_ind_remove_space(handle_t *handle, struct inode *inode, partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } end_range: - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); if (nr2) { if (partial2 == chain2) { /* @@ -1321,16 +1320,14 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } goto do_indirects; } /* Punch happened within the same level (n == n2) */ - partial = ext4_find_shared(inode, n, offsets, chain, &nr); - partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); + partial = p = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = p2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); /* Free top, but only if partial2 isn't its subtree. */ if (nr) { @@ -1387,15 +1384,7 @@ end_range: partial->p + 1, partial2->p, (chain+n-1) - partial); - while (partial > chain) { - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - } - while (partial2 > chain2) { - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); - } - return 0; + goto cleanup; } /* @@ -1410,8 +1399,6 @@ end_range: partial->p + 1, (__le32 *)partial->bh->b_data+addr_per_block, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); partial--; } if (partial2 > chain2 && depth2 <= depth) { @@ -1419,11 +1406,21 @@ end_range: (__le32 *)partial2->bh->b_data, partial2->p, (chain2+n2-1) - partial2); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); partial2--; } } + +cleanup: + while (p && p > chain) { + BUFFER_TRACE(p->bh, "call brelse"); + brelse(p->bh); + p--; + } + while (p2 && p2 > chain2) { + BUFFER_TRACE(p2->bh, "call brelse"); + brelse(p2->bh); + p2--; + } return 0; do_indirects: @@ -1431,7 +1428,7 @@ do_indirects: switch (offsets[0]) { default: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_IND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); @@ -1439,7 +1436,7 @@ do_indirects: } case EXT4_IND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); @@ -1447,7 +1444,7 @@ do_indirects: } case EXT4_DIND_BLOCK: if (++n >= n2) - return 0; + break; nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); @@ -1456,5 +1453,5 @@ do_indirects: case EXT4_TIND_BLOCK: ; } - return 0; + goto cleanup; } -- cgit v1.2.3 From 2938651d36cae8c22e79d293fe9802cc9d5f97f3 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Mar 2019 18:44:16 +0100 Subject: CIFS: fix POSIX lock leak and invalid ptr deref [ Upstream commit bc31d0cdcfbadb6258b45db97e93b1c83822ba33 ] We have a customer reporting crashes in lock_get_status() with many "Leaked POSIX lock" messages preceeding the crash. Leaked POSIX lock on dev=0x0:0x56 ... Leaked POSIX lock on dev=0x0:0x56 ... Leaked POSIX lock on dev=0x0:0x56 ... Leaked POSIX lock on dev=0x0:0x53 ... Leaked POSIX lock on dev=0x0:0x53 ... Leaked POSIX lock on dev=0x0:0x53 ... Leaked POSIX lock on dev=0x0:0x53 ... POSIX: fl_owner=ffff8900e7b79380 fl_flags=0x1 fl_type=0x1 fl_pid=20709 Leaked POSIX lock on dev=0x0:0x4b ino... Leaked locks on dev=0x0:0x4b ino=0xf911400000029: POSIX: fl_owner=ffff89f41c870e00 fl_flags=0x1 fl_type=0x1 fl_pid=19592 stack segment: 0000 [#1] SMP Modules linked in: binfmt_misc msr tcp_diag udp_diag inet_diag unix_diag af_packet_diag netlink_diag rpcsec_gss_krb5 arc4 ecb auth_rpcgss nfsv4 md4 nfs nls_utf8 lockd grace cifs sunrpc ccm dns_resolver fscache af_packet iscsi_ibft iscsi_boot_sysfs vmw_vsock_vmci_transport vsock xfs libcrc32c sb_edac edac_core crct10dif_pclmul crc32_pclmul ghash_clmulni_intel drbg ansi_cprng vmw_balloon aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd joydev pcspkr vmxnet3 i2c_piix4 vmw_vmci shpchp fjes processor button ac btrfs xor raid6_pq sr_mod cdrom ata_generic sd_mod ata_piix vmwgfx crc32c_intel drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm serio_raw ahci libahci drm libata vmw_pvscsi sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua scsi_mod autofs4 Supported: Yes CPU: 6 PID: 28250 Comm: lsof Not tainted 4.4.156-94.64-default #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016 task: ffff88a345f28740 ti: ffff88c74005c000 task.ti: ffff88c74005c000 RIP: 0010:[] [] lock_get_status+0x9b/0x3b0 RSP: 0018:ffff88c74005fd90 EFLAGS: 00010202 RAX: ffff89bde83e20ae RBX: ffff89e870003d18 RCX: 0000000049534f50 RDX: ffffffff81a3541f RSI: ffffffff81a3544e RDI: ffff89bde83e20ae RBP: 0026252423222120 R08: 0000000020584953 R09: 000000000000ffff R10: 0000000000000000 R11: ffff88c74005fc70 R12: ffff89e5ca7b1340 R13: 00000000000050e5 R14: ffff89e870003d30 R15: ffff89e5ca7b1340 FS: 00007fafd64be800(0000) GS:ffff89f41fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000001c80018 CR3: 000000a522048000 CR4: 0000000000360670 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: 0000000000000208 ffffffff81a3d6b6 ffff89e870003d30 ffff89e870003d18 ffff89e5ca7b1340 ffff89f41738d7c0 ffff89e870003d30 ffff89e5ca7b1340 ffffffff8125e08f 0000000000000000 ffff89bc22b67d00 ffff88c74005ff28 Call Trace: [] locks_show+0x2f/0x70 [] seq_read+0x251/0x3a0 [] proc_reg_read+0x3c/0x70 [] __vfs_read+0x26/0x140 [] vfs_read+0x7a/0x120 [] SyS_read+0x42/0xa0 [] entry_SYSCALL_64_fastpath+0x1e/0xb7 When Linux closes a FD (close(), close-on-exec, dup2(), ...) it calls filp_close() which also removes all posix locks. The lock struct is initialized like so in filp_close() and passed down to cifs ... lock.fl_type = F_UNLCK; lock.fl_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; ... Note the FL_CLOSE flag, which hints the VFS code that this unlocking is done for closing the fd. filp_close() locks_remove_posix(filp, id); vfs_lock_file(filp, F_SETLK, &lock, NULL); return filp->f_op->lock(filp, cmd, fl) => cifs_lock() rc = cifs_setlk(file, flock, type, wait_flag, posix_lck, lock, unlock, xid); rc = server->ops->mand_unlock_range(cfile, flock, xid); if (flock->fl_flags & FL_POSIX && !rc) rc = locks_lock_file_wait(file, flock) Notice how we don't call locks_lock_file_wait() which does the generic VFS lock/unlock/wait work on the inode if rc != 0. If we are closing the handle, the SMB server is supposed to remove any locks associated with it. Similarly, cifs.ko frees and wakes up any lock and lock waiter when closing the file: cifs_close() cifsFileInfo_put(file->private_data) /* * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ down_write(&cifsi->lock_sem); list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { list_del(&li->llist); cifs_del_lock_waiters(li); kfree(li); } list_del(&cifs_file->llist->llist); kfree(cifs_file->llist); up_write(&cifsi->lock_sem); So we can safely ignore unlocking failures in cifs_lock() if they happen with the FL_CLOSE flag hint set as both the server and the client take care of it during the actual closing. This is not a proper fix for the unlocking failure but it's safe and it seems to prevent the lock leakages and crashes the customer experiences. Signed-off-by: Aurelien Aptel Signed-off-by: NeilBrown Signed-off-by: Steve French Acked-by: Pavel Shilovsky Signed-off-by: Sasha Levin --- fs/cifs/file.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 08761a6a039d..d847132ab027 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1631,8 +1631,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX && !rc) + if (flock->fl_flags & FL_POSIX) { + /* + * If this is a request to remove all locks because we + * are closing the file, it doesn't matter if the + * unlocking failed as both cifs.ko and the SMB server + * remove the lock on file close + */ + if (rc) { + cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); + if (!(flock->fl_flags & FL_CLOSE)) + return rc; + } rc = locks_lock_file_wait(file, flock); + } return rc; } -- cgit v1.2.3 From 198c99857b302ad46a4717dc6351b069fc5ac17f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 5 Mar 2019 19:32:26 +0800 Subject: f2fs: fix to adapt small inline xattr space in __find_inline_xattr() [ Upstream commit 2c28aba8b2e2a51749fa66e01b68e1cd5b53e022 ] With below testcase, we will fail to find existed xattr entry: 1. mkfs.f2fs -O extra_attr -O flexible_inline_xattr /dev/zram0 2. mount -t f2fs -o inline_xattr_size=1 /dev/zram0 /mnt/f2fs/ 3. touch /mnt/f2fs/file 4. setfattr -n "user.name" -v 0 /mnt/f2fs/file 5. getfattr -n "user.name" /mnt/f2fs/file /mnt/f2fs/file: user.name: No such attribute The reason is for inode which has very small inline xattr size, __find_inline_xattr() will fail to traverse any entry due to first entry may not be loaded from xattr node yet, later, we may skip to check entire xattr datas in __find_xattr(), result in such wrong condition. This patch adds condition to check such case to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/xattr.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 087e53a2d96c..409a637f7a92 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -227,11 +227,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -242,6 +242,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } -- cgit v1.2.3 From d7391962d723c92fbf02d2aa823e9f42413c76a7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 12 Mar 2019 15:44:27 +0800 Subject: f2fs: fix to avoid deadlock in f2fs_read_inline_dir() [ Upstream commit aadcef64b22f668c1a107b86d3521d9cac915c24 ] As Jiqun Li reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202883 sometimes, dead lock when make system call SYS_getdents64 with fsync() is called by another process. monkey running on android9.0 1. task 9785 held sbi->cp_rwsem and waiting lock_page() 2. task 10349 held mm_sem and waiting sbi->cp_rwsem 3. task 9709 held lock_page() and waiting mm_sem so this is a dead lock scenario. task stack is show by crash tools as following crash_arm64> bt ffffffc03c354080 PID: 9785 TASK: ffffffc03c354080 CPU: 1 COMMAND: "RxIoScheduler-3" >> #7 [ffffffc01b50fac0] __lock_page at ffffff80081b11e8 crash-arm64> bt 10349 PID: 10349 TASK: ffffffc018b83080 CPU: 1 COMMAND: "BUGLY_ASYNC_UPL" >> #3 [ffffffc01f8cfa40] rwsem_down_read_failed at ffffff8008a93afc PC: 00000033 LR: 00000000 SP: 00000000 PSTATE: ffffffffffffffff crash-arm64> bt 9709 PID: 9709 TASK: ffffffc03e7f3080 CPU: 1 COMMAND: "IntentService[A" >> #3 [ffffffc001e67850] rwsem_down_read_failed at ffffff8008a93afc >> #8 [ffffffc001e67b80] el1_ia at ffffff8008084fc4 PC: ffffff8008274114 [compat_filldir64+120] LR: ffffff80083584d4 [f2fs_fill_dentries+448] SP: ffffffc001e67b80 PSTATE: 80400145 X29: ffffffc001e67b80 X28: 0000000000000000 X27: 000000000000001a X26: 00000000000093d7 X25: ffffffc070d52480 X24: 0000000000000008 X23: 0000000000000028 X22: 00000000d43dfd60 X21: ffffffc001e67e90 X20: 0000000000000011 X19: ffffff80093a4000 X18: 0000000000000000 X17: 0000000000000000 X16: 0000000000000000 X15: 0000000000000000 X14: ffffffffffffffff X13: 0000000000000008 X12: 0101010101010101 X11: 7f7f7f7f7f7f7f7f X10: 6a6a6a6a6a6a6a6a X9: 7f7f7f7f7f7f7f7f X8: 0000000080808000 X7: ffffff800827409c X6: 0000000080808000 X5: 0000000000000008 X4: 00000000000093d7 X3: 000000000000001a X2: 0000000000000011 X1: ffffffc070d52480 X0: 0000000000800238 >> #9 [ffffffc001e67be0] f2fs_fill_dentries at ffffff80083584d0 PC: 0000003c LR: 00000000 SP: 00000000 PSTATE: 000000d9 X12: f48a02ff X11: d4678960 X10: d43dfc00 X9: d4678ae4 X8: 00000058 X7: d4678994 X6: d43de800 X5: 000000d9 X4: d43dfc0c X3: d43dfc10 X2: d46799c8 X1: 00000000 X0: 00001068 Below potential deadlock will happen between three threads: Thread A Thread B Thread C - f2fs_do_sync_file - f2fs_write_checkpoint - down_write(&sbi->node_change) -- 1) - do_page_fault - down_write(&mm->mmap_sem) -- 2) - do_wp_page - f2fs_vm_page_mkwrite - getdents64 - f2fs_read_inline_dir - lock_page -- 3) - f2fs_sync_node_pages - lock_page -- 3) - __do_map_lock - down_read(&sbi->node_change) -- 1) - f2fs_fill_dentries - dir_emit - compat_filldir64 - do_page_fault - down_read(&mm->mmap_sem) -- 2) Since f2fs_readdir is protected by inode.i_rwsem, there should not be any updates in inode page, we're safe to lookup dents in inode page without its lock held, so taking off the lock to improve concurrency of readdir and avoid potential deadlock. Reported-by: Jiqun Li Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/inline.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 115dc219344b..92703efde36e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -661,6 +661,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -669,7 +675,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? err : 0; } -- cgit v1.2.3 From 20141feb9bdec6e7daed7782c1535b31081aeb71 Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Tue, 5 Mar 2019 15:41:41 -0800 Subject: ocfs2: fix a panic problem caused by o2cb_ctl [ Upstream commit cc725ef3cb202ef2019a3c67c8913efa05c3cce6 ] In the process of creating a node, it will cause NULL pointer dereference in kernel if o2cb_ctl failed in the interval (mkdir, o2cb_set_node_attribute(node_num)] in function o2cb_add_node. The node num is initialized to 0 in function o2nm_node_group_make_item, o2nm_node_group_drop_item will mistake the node number 0 for a valid node number when we delete the node before the node number is set correctly. If the local node number of the current host happens to be 0, cluster->cl_local_node will be set to O2NM_INVALID_NODE_NUM while o2hb_thread still running. The panic stack is generated as follows: o2hb_thread \-o2hb_do_disk_heartbeat \-o2hb_check_own_slot |-slot = ®->hr_slots[o2nm_this_node()]; //o2nm_this_node() return O2NM_INVALID_NODE_NUM We need to check whether the node number is set when we delete the node. Link: http://lkml.kernel.org/r/133d8045-72cc-863e-8eae-5013f9f6bc51@huawei.com Signed-off-by: Jia Guo Reviewed-by: Joseph Qi Acked-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/ocfs2/cluster/nodemanager.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 0e4166cc23a0..4ac775e32240 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ -- cgit v1.2.3 From 9b4f27667402c303de02192a2ce84f4c1823be69 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Mon, 4 Feb 2019 13:36:53 +0530 Subject: f2fs: do not use mutex lock in atomic context [ Upstream commit 9083977dabf3833298ddcd40dee28687f1e6b483 ] Fix below warning coming because of using mutex lock in atomic context. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:98 in_atomic(): 1, irqs_disabled(): 0, pid: 585, name: sh Preemption disabled at: __radix_tree_preload+0x28/0x130 Call trace: dump_backtrace+0x0/0x2b4 show_stack+0x20/0x28 dump_stack+0xa8/0xe0 ___might_sleep+0x144/0x194 __might_sleep+0x58/0x8c mutex_lock+0x2c/0x48 f2fs_trace_pid+0x88/0x14c f2fs_set_node_page_dirty+0xd0/0x184 Do not use f2fs_radix_tree_insert() to avoid doing cond_resched() with spin_lock() acquired. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/trace.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index a1fcd00bbb2b..8ac1851a21c0 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -17,7 +17,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -61,23 +61,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -122,7 +128,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -150,7 +156,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -158,5 +164,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } -- cgit v1.2.3 From d609ecd887f8dd8288b9ff58042381a0ce801ec0 Mon Sep 17 00:00:00 2001 From: Shuriyc Chu Date: Tue, 5 Mar 2019 15:41:56 -0800 Subject: fs/file.c: initialize init_files.resize_wait [ Upstream commit 5704a06810682683355624923547b41540e2801a ] (Taken from https://bugzilla.kernel.org/show_bug.cgi?id=200647) 'get_unused_fd_flags' in kthread cause kernel crash. It works fine on 4.1, but causes crash after get 64 fds. It also cause crash on ubuntu1404/1604/1804, centos7.5, and the crash messages are almost the same. The crash message on centos7.5 shows below: start fd 61 start fd 62 start fd 63 BUG: unable to handle kernel NULL pointer dereference at (null) IP: __wake_up_common+0x2e/0x90 PGD 0 Oops: 0000 [#1] SMP Modules linked in: test(OE) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter devlink sunrpc kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd sg ppdev pcspkr virtio_balloon parport_pc parport i2c_piix4 joydev ip_tables xfs libcrc32c sr_mod cdrom sd_mod crc_t10dif crct10dif_generic ata_generic pata_acpi virtio_scsi virtio_console virtio_net cirrus drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32c_intel drm ata_piix serio_raw libata virtio_pci virtio_ring i2c_core virtio floppy dm_mirror dm_region_hash dm_log dm_mod CPU: 2 PID: 1820 Comm: test_fd Kdump: loaded Tainted: G OE ------------ 3.10.0-862.3.3.el7.x86_64 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014 task: ffff8e92b9431fa0 ti: ffff8e94247a0000 task.ti: ffff8e94247a0000 RIP: 0010:__wake_up_common+0x2e/0x90 RSP: 0018:ffff8e94247a2d18 EFLAGS: 00010086 RAX: 0000000000000000 RBX: ffffffff9d09daa0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffffff9d09daa0 RBP: ffff8e94247a2d50 R08: 0000000000000000 R09: ffff8e92b95dfda8 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff9d09daa8 R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8e9434e80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000017c686000 CR4: 00000000000207e0 Call Trace: __wake_up+0x39/0x50 expand_files+0x131/0x250 __alloc_fd+0x47/0x170 get_unused_fd_flags+0x30/0x40 test_fd+0x12a/0x1c0 [test] kthread+0xd1/0xe0 ret_from_fork_nospec_begin+0x21/0x21 Code: 66 90 55 48 89 e5 41 57 41 89 f7 41 56 41 89 ce 41 55 41 54 49 89 fc 49 83 c4 08 53 48 83 ec 10 48 8b 47 08 89 55 cc 4c 89 45 d0 <48> 8b 08 49 39 c4 48 8d 78 e8 4c 8d 69 e8 75 08 eb 3b 4c 89 ef RIP __wake_up_common+0x2e/0x90 RSP CR2: 0000000000000000 This issue exists since CentOS 7.5 3.10.0-862 and CentOS 7.4 (3.10.0-693.21.1 ) is ok. Root cause: the item 'resize_wait' is not initialized before being used. Reported-by: Richard Zhang Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 7ffd6e9d103d..780d29e58847 100644 --- a/fs/file.c +++ b/fs/file.c @@ -457,6 +457,7 @@ struct files_struct init_files = { .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), + .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) -- cgit v1.2.3 From 626d98bbdb30126719921f64dacaa3dc8ddc0c8a Mon Sep 17 00:00:00 2001 From: Louis Taylor Date: Wed, 27 Feb 2019 22:25:15 +0000 Subject: cifs: use correct format characters [ Upstream commit 259594bea574e515a148171b5cd84ce5cbdc028a ] When compiling with -Wformat, clang emits the following warnings: fs/cifs/smb1ops.c:312:20: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat] tgt_total_cnt, total_in_tgt); ^~~~~~~~~~~~ fs/cifs/cifs_dfs_ref.c:289:4: warning: format specifies type 'short' but the argument has type 'int' [-Wformat] ref->flags, ref->server_type); ^~~~~~~~~~ fs/cifs/cifs_dfs_ref.c:289:16: warning: format specifies type 'short' but the argument has type 'int' [-Wformat] ref->flags, ref->server_type); ^~~~~~~~~~~~~~~~ fs/cifs/cifs_dfs_ref.c:291:4: warning: format specifies type 'short' but the argument has type 'int' [-Wformat] ref->ref_flag, ref->path_consumed); ^~~~~~~~~~~~~ fs/cifs/cifs_dfs_ref.c:291:19: warning: format specifies type 'short' but the argument has type 'int' [-Wformat] ref->ref_flag, ref->path_consumed); ^~~~~~~~~~~~~~~~~~ The types of these arguments are unconditionally defined, so this patch updates the format character to the correct ones for ints and unsigned ints. Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Louis Taylor Signed-off-by: Steve French Reviewed-by: Nick Desaulniers Signed-off-by: Sasha Levin --- fs/cifs/cifs_dfs_ref.c | 4 ++-- fs/cifs/smb1ops.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b61df117fd4..563e2f6268c3 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -271,9 +271,9 @@ static void dump_referral(const struct dfs_info3_param *ref) { cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); - cifs_dbg(FYI, "DFS: fl: %hd, srv_type: %hd\n", + cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n", ref->flags, ref->server_type); - cifs_dbg(FYI, "DFS: ref_flags: %hd, path_consumed: %hd\n", + cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n", ref->ref_flag, ref->path_consumed); } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 378151e09e91..47db8eb6cbcf 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -308,7 +308,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr) remaining = tgt_total_cnt - total_in_tgt; if (remaining < 0) { - cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%hu\n", + cifs_dbg(FYI, "Server sent too much data. tgt_total_cnt=%hu total_in_tgt=%u\n", tgt_total_cnt, total_in_tgt); return -EPROTO; } -- cgit v1.2.3 From 4ab78f4d75c644cbe707a6eb858c3737c25c842f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 15 Feb 2019 00:08:25 +0800 Subject: f2fs: fix to check inline_xattr_size boundary correctly [ Upstream commit 500e0b28ecd3c5aade98f3c3a339d18dcb166bb6 ] We use below condition to check inline_xattr_size boundary: if (!F2FS_OPTION(sbi).inline_xattr_size || F2FS_OPTION(sbi).inline_xattr_size >= DEF_ADDRS_PER_INODE - F2FS_TOTAL_EXTRA_ATTR_SIZE - DEF_INLINE_RESERVED_SIZE - DEF_MIN_INLINE_SIZE) There is there problems in that check: - we should allow inline_xattr_size equaling to min size of inline {data,dentry} area. - F2FS_TOTAL_EXTRA_ATTR_SIZE and inline_xattr_size are based on different size unit, previous one is 4 bytes, latter one is 1 bytes. - DEF_MIN_INLINE_SIZE only indicate min size of inline data area, however, we need to consider min size of inline dentry area as well, minimal inline dentry should at least contain two entries: '.' and '..', so that min inline_dentry size is 40 bytes. .bitmap 1 * 1 = 1 .reserved 1 * 1 = 1 .dentry 11 * 2 = 22 .filename 8 * 2 = 16 total 40 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 42aef5c94927..a3ba20e5946f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -450,7 +450,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c9639ef0e8d5..79370b7fa9d2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -822,12 +822,13 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + if (F2FS_OPTION(sbi).inline_xattr_size < + sizeof(struct f2fs_xattr_header) / sizeof(__le32) || + F2FS_OPTION(sbi).inline_xattr_size > + DEF_ADDRS_PER_INODE - + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - + DEF_INLINE_RESERVED_SIZE - + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) { f2fs_msg(sb, KERN_ERR, "inline xattr size is out of range"); return -EINVAL; -- cgit v1.2.3 From d579b4eae836e8f47e28ea373a579fdb9d4e04b5 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 22 Jan 2019 09:46:45 +0900 Subject: cifs: Accept validate negotiate if server return NT_STATUS_NOT_SUPPORTED [ Upstream commit 969ae8e8d4ee54c99134d3895f2adf96047f5bee ] Old windows version or Netapp SMB server will return NT_STATUS_NOT_SUPPORTED since they do not allow or implement FSCTL_VALIDATE_NEGOTIATE_INFO. The client should accept the response provided it's properly signed. See https://blogs.msdn.microsoft.com/openspecification/2012/06/28/smb3-secure-dialect-negotiation/ and MS-SMB2 validate negotiate response processing: https://msdn.microsoft.com/en-us/library/hh880630.aspx Samba client had already handled it. https://bugzilla.samba.org/attachment.cgi?id=13285&action=edit Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/smb2pdu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 3d0db37d64ad..71f32d983384 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -881,8 +881,14 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */, (char *)pneg_inbuf, inbuflen, (char **)&pneg_rsp, &rsplen); - - if (rc != 0) { + if (rc == -EOPNOTSUPP) { + /* + * Old Windows versions or Netapp SMB server can return + * not supported error. Client should accept it. + */ + cifs_dbg(VFS, "Server does not support validate negotiate\n"); + return 0; + } else if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); rc = -EIO; goto out_free_inbuf; -- cgit v1.2.3 From 36a3219e617aa4650caf5dff858b98b95efc1d22 Mon Sep 17 00:00:00 2001 From: Yao Liu Date: Mon, 28 Jan 2019 19:47:28 +0800 Subject: cifs: Fix NULL pointer dereference of devname [ Upstream commit 68e2672f8fbd1e04982b8d2798dd318bf2515dd2 ] There is a NULL pointer dereference of devname in strspn() The oops looks something like: CIFS: Attempting to mount (null) BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 ... RIP: 0010:strspn+0x0/0x50 ... Call Trace: ? cifs_parse_mount_options+0x222/0x1710 [cifs] ? cifs_get_volume_info+0x2f/0x80 [cifs] cifs_setup_volume_info+0x20/0x190 [cifs] cifs_get_volume_info+0x50/0x80 [cifs] cifs_smb3_do_mount+0x59/0x630 [cifs] ? ida_alloc_range+0x34b/0x3d0 cifs_do_mount+0x11/0x20 [cifs] mount_fs+0x52/0x170 vfs_kern_mount+0x6b/0x170 do_mount+0x216/0xdc0 ksys_mount+0x83/0xd0 __x64_sys_mount+0x25/0x30 do_syscall_64+0x65/0x220 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix this by adding a NULL check on devname in cifs_parse_devname() Signed-off-by: Yao Liu Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/connect.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index a5ea742654aa..f31339db45fd 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1347,6 +1347,11 @@ cifs_parse_devname(const char *devname, struct smb_vol *vol) const char *delims = "/\\"; size_t len; + if (unlikely(!devname || !*devname)) { + cifs_dbg(VFS, "Device name not specified.\n"); + return -EINVAL; + } + /* make sure we have a valid UNC double delimiter prefix */ len = strspn(devname, delims); if (len != 2) -- cgit v1.2.3 From 6a817a7aed1c4074274118d6a27f8b094e2fe0dd Mon Sep 17 00:00:00 2001 From: luojiajun Date: Fri, 1 Mar 2019 00:30:00 -0500 Subject: jbd2: fix invalid descriptor block checksum [ Upstream commit 6e876c3dd205d30b0db6850e97a03d75457df007 ] In jbd2_journal_commit_transaction(), if we are in abort mode, we may flush the buffer without setting descriptor block checksum by goto start_journal_io. Then fs is mounted, jbd2_descriptor_block_csum_verify() failed. [ 271.379811] EXT4-fs (vdd): shut down requested (2) [ 271.381827] Aborting journal on device vdd-8. [ 271.597136] JBD2: Invalid checksum recovering block 22199 in log [ 271.598023] JBD2: recovery failed [ 271.598484] EXT4-fs (vdd): error loading journal Fix this problem by keep setting descriptor block checksum if the descriptor buffer is not NULL. This checksum problem can be reproduced by xfstests generic/388. Signed-off-by: luojiajun Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Sasha Levin --- fs/jbd2/commit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 150cc030b4d7..65ea0355a4f6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -691,9 +691,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) the last tag we set up. */ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - - jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* -- cgit v1.2.3 From 83c395332fdf0810264c3903dd56d76cde1e083c Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 26 Feb 2019 11:51:50 +0100 Subject: fs: fix guard_bio_eod to check for real EOD errors [ Upstream commit dce30ca9e3b676fb288c33c1f4725a0621361185 ] guard_bio_eod() can truncate a segment in bio to allow it to do IO on odd last sectors of a device. It already checks if the IO starts past EOD, but it does not consider the possibility of an IO request starting within device boundaries can contain more than one segment past EOD. In such cases, truncated_bytes can be bigger than PAGE_SIZE, and will underflow bvec->bv_len. Fix this by checking if truncated_bytes is lower than PAGE_SIZE. This situation has been found on filesystems such as isofs and vfat, which doesn't check the device size before mount, if the device is smaller than the filesystem itself, a readahead on such filesystem, which spans EOD, can trigger this situation, leading a call to zero_user() with a wrong size possibly corrupting memory. I didn't see any crash, or didn't let the system run long enough to check if memory corruption will be hit somewhere, but adding instrumentation to guard_bio_end() to check truncated_bytes size, was enough to see the error. The following script can trigger the error. MNT=/mnt IMG=./DISK.img DEV=/dev/loop0 mkfs.vfat $IMG mount $IMG $MNT cp -R /etc $MNT &> /dev/null umount $MNT losetup -D losetup --find --show --sizelimit 16247280 $IMG mount $DEV $MNT find $MNT -type f -exec cat {} + >/dev/null Kudos to Eric Sandeen for coming up with the reproducer above Reviewed-by: Ming Lei Signed-off-by: Carlos Maiolino Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index c083c4b3c1e7..a550e0d8e965 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3027,6 +3027,13 @@ void guard_bio_eod(int op, struct bio *bio) /* Uhhuh. We've got a bio that straddles the device size! */ truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); + /* + * The bio contains more than one segment which spans EOD, just return + * and let IO layer turn it into an EIO + */ + if (truncated_bytes > bvec->bv_len) + return; + /* Truncate the bio.. */ bio->bi_iter.bi_size -= truncated_bytes; bvec->bv_len -= truncated_bytes; -- cgit v1.2.3 From dcedd37957de3e54a194a044befefd7687e023a1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 25 Jan 2019 07:55:27 +0800 Subject: btrfs: qgroup: Make qgroup async transaction commit more aggressive [ Upstream commit f5fef4593653dfa2a865c485bb81415de51d5c99 ] [BUG] Btrfs qgroup will still hit EDQUOT under the following case: $ dev=/dev/test/test $ mnt=/mnt/btrfs $ umount $mnt &> /dev/null $ umount $dev &> /dev/null $ mkfs.btrfs -f $dev $ mount $dev $mnt -o nospace_cache $ btrfs subv create $mnt/subv $ btrfs quota enable $mnt $ btrfs quota rescan -w $mnt $ btrfs qgroup limit -e 1G $mnt/subv $ fallocate -l 900M $mnt/subv/padding $ sync $ rm $mnt/subv/padding # Hit EDQUOT $ xfs_io -f -c "pwrite 0 512M" $mnt/subv/real_file [CAUSE] Since commit a514d63882c3 ("btrfs: qgroup: Commit transaction in advance to reduce early EDQUOT"), btrfs is not forced to commit transaction to reclaim more quota space. Instead, we just check pertrans metadata reservation against some threshold and try to do asynchronously transaction commit. However in above case, the pertrans metadata reservation is pretty small thus it will never trigger asynchronous transaction commit. [FIX] Instead of only accounting pertrans metadata reservation, we calculate how much free space we have, and if there isn't much free space left, commit transaction asynchronously to try to free some space. This may slow down the fs when we have less than 32M free qgroup space, but should reduce a lot of false EDQUOT, so the cost should be acceptable. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/qgroup.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e1fcb28ad4cc..e46e83e87600 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2427,16 +2427,15 @@ out: /* * Two limits to commit transaction in advance. * - * For RATIO, it will be 1/RATIO of the remaining limit - * (excluding data and prealloc meta) as threshold. + * For RATIO, it will be 1/RATIO of the remaining limit as threshold. * For SIZE, it will be in byte unit as threshold. */ -#define QGROUP_PERTRANS_RATIO 32 -#define QGROUP_PERTRANS_SIZE SZ_32M +#define QGROUP_FREE_RATIO 32 +#define QGROUP_FREE_SIZE SZ_32M static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 limit; + u64 free; u64 threshold; if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && @@ -2455,20 +2454,21 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, */ if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - limit = qg->max_excl; - else - limit = qg->max_rfer; - threshold = (limit - qg->rsv.values[BTRFS_QGROUP_RSV_DATA] - - qg->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC]) / - QGROUP_PERTRANS_RATIO; - threshold = min_t(u64, threshold, QGROUP_PERTRANS_SIZE); + if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; + threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } else { + free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; + threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, + QGROUP_FREE_SIZE); + } /* * Use transaction_kthread to commit transaction, so we no * longer need to bother nested transaction nor lock context. */ - if (qg->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > threshold) + if (free < threshold) btrfs_commit_transaction_locksafe(fs_info); } -- cgit v1.2.3 From c26d61ea90db085ad4871b5987f1cf033ce6cfb0 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Thu, 6 Dec 2018 20:05:34 +0100 Subject: vfs: fix preadv64v2 and pwritev64v2 compat syscalls with offset == -1 [ Upstream commit cc4b1242d7e3b42eed73881fc749944146493e4f ] The preadv2 and pwritev2 syscalls are supposed to emulate the readv and writev syscalls when offset == -1. Therefore the compat code should check for offset before calling do_compat_preadv64 and do_compat_pwritev64. This is the case for the preadv2 and pwritev2 syscalls, but handling of offset == -1 is missing in their 64-bit equivalent. This patch fixes that, calling do_compat_readv and do_compat_writev when offset == -1. This fixes the following glibc tests on x32: - misc/tst-preadvwritev2 - misc/tst-preadvwritev64v2 Cc: Alexander Viro Cc: H.J. Lu Signed-off-by: Aurelien Jarno Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/read_write.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 8a2737f0d61d..562974a0616c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1241,6 +1241,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + return do_compat_preadv64(fd, vec, vlen, pos, flags); } #endif @@ -1347,6 +1350,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + return do_compat_pwritev64(fd, vec, vlen, pos, flags); } #endif -- cgit v1.2.3 From 9154420173d56cedc5739fdc3a57799e94ae8238 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Feb 2019 16:27:14 -0500 Subject: jbd2: fix race when writing superblock [ Upstream commit 538bcaa6261b77e71d37f5596c33127c1a3ec3f7 ] The jbd2 superblock is lockless now, so there is probably a race condition between writing it so disk and modifing contents of it, which may lead to checksum error. The following race is the one case that we have captured. jbd2 fsstress jbd2_journal_commit_transaction jbd2_journal_update_sb_log_tail jbd2_write_superblock jbd2_superblock_csum_set jbd2_journal_revoke jbd2_journal_set_features(revork) modify superblock submit_bh(checksum incorrect) Fix this by locking the buffer head before modifing it. We always write the jbd2 superblock after we modify it, so this just means calling the lock_buffer() a little earlier. This checksum corruption problem can be reproduced by xfstests generic/475. Reported-by: zhangyi (F) Suggested-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/jbd2/journal.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..88f2a49338a1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1356,6 +1356,10 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; @@ -1365,7 +1369,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); - lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1424,6 +1427,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); + lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); @@ -1454,18 +1458,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - read_unlock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? */ + unlock_buffer(journal->j_sb_buffer); return; } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); - read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, write_op); @@ -1488,9 +1491,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; int errcode; - read_lock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; - read_unlock(&journal->j_state_lock); if (errcode == -ESHUTDOWN) errcode = 0; jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); @@ -1894,28 +1896,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; + /* Load the checksum driver if necessary */ + if ((journal->j_chksum_driver == NULL) && + INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + lock_buffer(journal->j_sb_buffer); + /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - - /* Load the checksum driver */ - if (journal->j_chksum_driver == NULL) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", - 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c " - "driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, - sb->s_uuid, - sizeof(sb->s_uuid)); - } } /* If enabling v1 checksums, downgrade superblock */ @@ -1927,6 +1928,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); return 1; #undef COMPAT_FEATURE_ON -- cgit v1.2.3 From dbeca415575f2cbeb4594ab63fa83d97987994c1 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Tue, 15 Jan 2019 20:02:15 +0000 Subject: f2fs: UBSAN: set boolean value iostat_enable correctly [ Upstream commit ac92985864e187a1735502f6a02f54eaa655b2aa ] When setting /sys/fs/f2fs//iostat_enable with non-bool value, UBSAN reports the following warning. [ 7562.295484] ================================================================================ [ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10 [ 7562.297651] load of value 64 is not a valid value for type '_Bool' [ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79 [ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 7562.298662] Call Trace: [ 7562.298760] dump_stack+0x46/0x5b [ 7562.298811] ubsan_epilogue+0x9/0x40 [ 7562.298830] __ubsan_handle_load_invalid_value+0x72/0x90 [ 7562.298863] f2fs_file_write_iter+0x29f/0x3f0 [ 7562.298905] __vfs_write+0x115/0x160 [ 7562.298922] vfs_write+0xa7/0x190 [ 7562.298934] ksys_write+0x50/0xc0 [ 7562.298973] do_syscall_64+0x4a/0xe0 [ 7562.298992] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 7562.299001] RIP: 0033:0x7fa45ec19c00 [ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24 [ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00 [ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001 [ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000 [ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400 [ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000 [ 7562.299091] ================================================================================ So, if iostat_enable is enabled, set its value as true. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/sysfs.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 81c0e5337443..98887187af4c 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -273,10 +273,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } -- cgit v1.2.3 From 16515acd5bc33e86985fa93ee1b894815e52cdc5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Mar 2019 10:49:56 +0000 Subject: Btrfs: do not allow trimming when a fs is mounted with the nologreplay option commit f35f06c35560a86e841631f0243b83a984dc11a9 upstream. Whan a filesystem is mounted with the nologreplay mount option, which requires it to be mounted in RO mode as well, we can not allow discard on free space inside block groups, because log trees refer to extents that are not pinned in a block group's free space cache (pinning the extents is precisely the first phase of replaying a log tree). So do not allow the fitrim ioctl to do anything when the filesystem is mounted with the nologreplay option, because later it can be mounted RW without that option, which causes log replay to happen and result in either a failure to replay the log trees (leading to a mount failure), a crash or some silent corruption. Reported-by: Darrick J. Wong Fixes: 96da09192cda ("btrfs: Introduce new mount option to disable tree log replay") CC: stable@vger.kernel.org # 4.9+ Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8bf9cce11213..0eb333c62fe4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -496,6 +496,16 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + /* + * If the fs is mounted with nologreplay, which requires it to be + * mounted in RO mode as well, we can not allow discard on free space + * inside block groups, because log trees refer to extents that are not + * pinned in a block group's free space cache (pinning the extents is + * precisely the first phase of replaying a log tree). + */ + if (btrfs_test_opt(fs_info, NOLOGREPLAY)) + return -EROFS; + rcu_read_lock(); list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, dev_list) { -- cgit v1.2.3 From fbfbb996d58ebf5ddd0c2ae36efcb7015dc50e95 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 2 Apr 2019 18:07:38 +0800 Subject: btrfs: prop: fix zstd compression parameter validation commit 50398fde997f6be8faebdb5f38e9c9c467370f51 upstream. We let pass zstd compression parameter even if it is not fully valid. For example: $ btrfs prop set /btrfs compression zst $ btrfs prop get /btrfs compression compression=zst zlib and lzo are fine. Fix it by checking the correct prefix length. Fixes: 5c1aab1dd544 ("btrfs: Add zstd support") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/props.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dc6140013ae8..fd19f3078566 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -396,7 +396,7 @@ static int prop_compression_apply(struct inode *inode, btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (!strncmp("zlib", value, 4)) { type = BTRFS_COMPRESS_ZLIB; - } else if (!strncmp("zstd", value, len)) { + } else if (!strncmp("zstd", value, 4)) { type = BTRFS_COMPRESS_ZSTD; btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } else { -- cgit v1.2.3 From 54fb5c9da6cd665ddee9bcc5645b5f428dbfae2c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 2 Apr 2019 18:07:40 +0800 Subject: btrfs: prop: fix vanished compression property after failed set commit 272e5326c7837697882ce3162029ba893059b616 upstream. The compression property resets to NULL, instead of the old value if we fail to set the new compression parameter. $ btrfs prop get /btrfs compression compression=lzo $ btrfs prop set /btrfs compression zli ERROR: failed to set compression for /btrfs: Invalid argument $ btrfs prop get /btrfs compression This is because the compression property ->validate() is successful for 'zli' as the strncmp() used the length passed from the userspace. Fix it by using the expected string length in strncmp(). Fixes: 63541927c8d1 ("Btrfs: add support for inode properties") Fixes: 5c1aab1dd544 ("btrfs: Add zstd support") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/props.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index fd19f3078566..61d22a56c0ba 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -366,11 +366,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, static int prop_compression_validate(const char *value, size_t len) { - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) return 0; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) return 0; - else if (!strncmp("zstd", value, len)) + else if (!strncmp("zstd", value, 4)) return 0; return -EINVAL; -- cgit v1.2.3 From 543bb48dc48b4b65a51d91c9680afbdb573a9fa7 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 12 Apr 2019 10:09:16 +0800 Subject: block: fix the return errno for direct IO commit a89afe58f1a74aac768a5eb77af95ef4ee15beaa upstream. If the last bio returned is not dio->bio, the status of the bio will not assigned to dio->bio if it is error. This will cause the whole IO status wrong. ksoftirqd/21-117 [021] ..s. 4017.966090: 8,0 C N 4883648 [0] -0 [018] ..s. 4017.970888: 8,0 C WS 4924800 + 1024 [0] -0 [018] ..s. 4017.970909: 8,0 D WS 4935424 + 1024 [] -0 [018] ..s. 4017.970924: 8,0 D WS 4936448 + 321 [] ksoftirqd/21-117 [021] ..s. 4017.995033: 8,0 C R 4883648 + 336 [65475] ksoftirqd/21-117 [021] d.s. 4018.001988: myprobe1: (blkdev_bio_end_io+0x0/0x168) bi_status=7 ksoftirqd/21-117 [021] d.s. 4018.001992: myprobe: (aio_complete_rw+0x0/0x148) x0=0xffff802f2595ad80 res=0x12a000 res2=0x0 We always have to assign bio->bi_status to dio->bio.bi_status because we will only check dio->bio.bi_status when we return the whole IO to the upper layer. Fixes: 542ff7bf18c6 ("block: new direct I/O implementation") Cc: stable@vger.kernel.org Cc: Christoph Hellwig Cc: Jens Axboe Reviewed-by: Ming Lei Signed-off-by: Jason Yan Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index cdbb888a8d4a..1c25dae083a8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -296,10 +296,10 @@ static void blkdev_bio_end_io(struct bio *bio) struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; - if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - } else { + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; -- cgit v1.2.3 From ca306c17d2edcc8aa3bf1724a5cb1ecefc31ef3b Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Sat, 2 Mar 2019 09:17:32 +0800 Subject: inotify: Fix fsnotify_mark refcount leak in inotify_update_existing_watch() [ Upstream commit 62c9d2674b31d4c8a674bee86b7edc6da2803aea ] Commit 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") forgot to call fsnotify_put_mark() with IN_MASK_CREATE after fsnotify_find_mark() Fixes: 4d97f7d53da7dc83 ("inotify: Add flag IN_MASK_CREATE for inotify_add_watch()") Signed-off-by: ZhangXiaoxu Signed-off-by: Jan Kara Signed-off-by: Sasha Levin --- fs/notify/inotify/inotify_user.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 780bba695453..97a51690338e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; - else if (create) - return -EEXIST; + else if (create) { + ret = -EEXIST; + goto out; + } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); @@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* return the wd */ ret = i_mark->wd; +out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); -- cgit v1.2.3 From 90a1327e4ed3e2c840b86f1f35d5aec712c652c4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 14 Mar 2019 23:46:05 -0400 Subject: ext4: avoid panic during forced reboot [ Upstream commit 1dc1097ff60e4105216da7cd0aa99032b039a994 ] When admin calls "reboot -f" - i.e., does a hard system reboot by directly calling reboot(2) - ext4 filesystem mounted with errors=panic can panic the system. This happens because the underlying device gets disabled without unmounting the filesystem and thus some syscall running in parallel to reboot(2) can result in the filesystem getting IO errors. This is somewhat surprising to the users so try improve the behavior by switching to errors=remount-ro behavior when the system is running reboot(2). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a1cf7d68b4f0..abba7ece78e9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -430,6 +430,12 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) spin_unlock(&sbi->s_md_lock); } +static bool system_going_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -460,7 +466,12 @@ static void ext4_handle_error(struct super_block *sb) if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt(sb, ERRORS_RO)) { + /* + * We force ERRORS_RO behavior when system is rebooting. Otherwise we + * could panic during 'reboot -f' as the underlying device got already + * disabled. + */ + if (test_opt(sb, ERRORS_RO) || system_going_down()) { ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible @@ -468,8 +479,7 @@ static void ext4_handle_error(struct super_block *sb) */ smp_wmb(); sb->s_flags |= SB_RDONLY; - } - if (test_opt(sb, ERRORS_PANIC)) { + } else if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; -- cgit v1.2.3 From f5a94fd3b375f44bcb27b81595b22f2f464a47d2 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 15 Mar 2019 00:15:32 -0400 Subject: ext4: add missing brelse() in add_new_gdb_meta_bg() [ Upstream commit d64264d6218e6892edd832dc3a5a5857c2856c53 ] Currently in add_new_gdb_meta_bg() there is a missing brelse of gdb_bh in case ext4_journal_get_write_access() fails. Additionally kvfree() is missing in the same error path. Fix it by moving the ext4_journal_get_write_access() before the ext4 sb update as Ted suggested and release n_group_desc and gdb_bh in case it fails. Fixes: 61a9c11e5e7a ("ext4: add missing brelse() add_new_gdb_meta_bg()'s error path") Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/resize.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 3d9b18505c0c..90061c3d048b 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -932,11 +932,18 @@ static int add_new_gdb_meta_bg(struct super_block *sb, memcpy(n_group_desc, o_group_desc, EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + if (err) { + kvfree(n_group_desc); + brelse(gdb_bh); + return err; + } + EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; kvfree(o_group_desc); - BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); return err; } -- cgit v1.2.3 From a793860c0f52f28d0d5f052a20981057eb9fb9f1 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 15 Mar 2019 00:22:28 -0400 Subject: ext4: report real fs size after failed resize [ Upstream commit 6c7328400e0488f7d49e19e02290ba343b6811b2 ] Currently when the file system resize using ext4_resize_fs() fails it will report into log that "resized filesystem to ". However this may not be true in the case of failure. Use the current block count as returned by ext4_blocks_count() to report the block count. Additionally, report a warning that "error occurred during file system resize" Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/resize.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 90061c3d048b..e7ae26e36c9c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -2080,6 +2080,10 @@ out: free_flex_gd(flex_gd); if (resize_inode != NULL) iput(resize_inode); - ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", n_blocks_count); + if (err) + ext4_warning(sb, "error (%d) occurred during " + "file system resize", err); + ext4_msg(sb, KERN_INFO, "resized filesystem to %llu", + ext4_blocks_count(es)); return err; } -- cgit v1.2.3 From 40276e4e2fd050e7bed856b6278770347179ae37 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Mar 2019 15:58:38 -0500 Subject: fix incorrect error code mapping for OBJECTID_NOT_FOUND [ Upstream commit 85f9987b236cf46e06ffdb5c225cf1f3c0acb789 ] It was mapped to EIO which can be confusing when user space queries for an object GUID for an object for which the server file system doesn't support (or hasn't saved one). As Amir Goldstein suggested this is similar to ENOATTR (equivalently ENODATA in Linux errno definitions) so changing NT STATUS code mapping for OBJECTID_NOT_FOUND to ENODATA. Signed-off-by: Steve French CC: Amir Goldstein Signed-off-by: Sasha Levin --- fs/cifs/smb2maperror.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index c3ae8c1d6089..18814f1d67d9 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1036,7 +1036,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_UNFINISHED_CONTEXT_DELETED, -EIO, "STATUS_UNFINISHED_CONTEXT_DELETED"}, {STATUS_NO_TGT_REPLY, -EIO, "STATUS_NO_TGT_REPLY"}, - {STATUS_OBJECTID_NOT_FOUND, -EIO, "STATUS_OBJECTID_NOT_FOUND"}, + /* Note that ENOATTTR and ENODATA are the same errno */ + {STATUS_OBJECTID_NOT_FOUND, -ENODATA, "STATUS_OBJECTID_NOT_FOUND"}, {STATUS_NO_IP_ADDRESSES, -EIO, "STATUS_NO_IP_ADDRESSES"}, {STATUS_WRONG_CREDENTIAL_HANDLE, -EIO, "STATUS_WRONG_CREDENTIAL_HANDLE"}, -- cgit v1.2.3 From 83e3e89d66393f0209d4d236f0663a0de7b1927f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 8 Mar 2019 11:05:08 +0800 Subject: x86/gart: Exclude GART aperture from kcore [ Upstream commit ffc8599aa9763f39f6736a79da4d1575e7006f9a ] On machines where the GART aperture is mapped over physical RAM, /proc/kcore contains the GART aperture range. Accessing the GART range via /proc/kcore results in a kernel crash. vmcore used to have the same issue, until it was fixed with commit 2a3e83c6f96c ("x86/gart: Exclude GART aperture from vmcore")', leveraging existing hook infrastructure in vmcore to let /proc/vmcore return zeroes when attempting to read the aperture region, and so it won't read from the actual memory. Apply the same workaround for kcore. First implement the same hook infrastructure for kcore, then reuse the hook functions introduced in the previous vmcore fix. Just with some minor adjustment, rename some functions for more general usage, and simplify the hook infrastructure a bit as there is no module usage yet. Suggested-by: Baoquan He Signed-off-by: Kairui Song Signed-off-by: Thomas Gleixner Reviewed-by: Jiri Bohac Acked-by: Baoquan He Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Alexey Dobriyan Cc: Andrew Morton Cc: Omar Sandoval Cc: Dave Young Link: https://lkml.kernel.org/r/20190308030508.13548-1-kasong@redhat.com Signed-off-by: Sasha Levin --- fs/proc/kcore.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index d297fe4472a9..d0137e3e585e 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -54,6 +54,28 @@ static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; +/* + * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error + * Same as oldmem_pfn_is_ram in vmcore + */ +static int (*mem_pfn_is_ram)(unsigned long pfn); + +int __init register_mem_pfn_is_ram(int (*fn)(unsigned long pfn)) +{ + if (mem_pfn_is_ram) + return -EBUSY; + mem_pfn_is_ram = fn; + return 0; +} + +static int pfn_is_ram(unsigned long pfn) +{ + if (mem_pfn_is_ram) + return mem_pfn_is_ram(pfn); + else + return 1; +} + /* This doesn't grab kclist_lock, so it should only be used at init time. */ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, int type) @@ -465,6 +487,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) goto out; } m = NULL; /* skip the list anchor */ + } else if (!pfn_is_ram(__pa(start) >> PAGE_SHIFT)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ -- cgit v1.2.3 From 6fd66bec6d6a7e52bcb969dc35b67103ed717664 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 23 Mar 2019 12:10:29 -0400 Subject: ext4: prohibit fstrim in norecovery mode [ Upstream commit 18915b5873f07e5030e6fb108a050fa7c71c59fb ] The ext4 fstrim implementation uses the block bitmaps to find free space that can be discarded. If we haven't replayed the journal, the bitmaps will be stale and we absolutely *cannot* use stale metadata to zap the underlying storage. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2e76fb55d94a..5f24fdc140ad 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -999,6 +999,13 @@ resizefs_out: if (!blk_queue_discard(q)) return -EOPNOTSUPP; + /* + * We haven't replayed the journal, so we cannot use our + * block-bitmap-guided storage zapping commands. + */ + if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) + return -EROFS; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; -- cgit v1.2.3 From f9368366b4d190bacad99601fc5a026e13edc368 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 4 Sep 2018 03:52:17 +0800 Subject: f2fs: fix to avoid NULL pointer dereference on se->discard_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7d20c8abb2edcf962ca857d51f4d0f9cd4b19053 ] https://bugzilla.kernel.org/show_bug.cgi?id=200951 These is a NULL pointer dereference issue reported in bugzilla: Hi, in the setup there is a SATA SSD connected to a SATA-to-USB bridge. The disc is "Samsung SSD 850 PRO 256G" which supports TRIM. There are four partitions: sda1: FAT /boot sda2: F2FS / sda3: F2FS /home sda4: F2FS The bridge is ASMT1153e which uses the "uas" driver. There is no TRIM pass-through, so, when mounting it reports: mounting with "discard" option, but the device does not support discard The USB host is USB3.0 and UASP capable. It is the one on RK3399. Given this everything works fine, except there is no TRIM support. In order to enable TRIM a new UDEV rule is added [1]: /etc/udev/rules.d/10-sata-bridge-trim.rules: ACTION=="add|change", ATTRS{idVendor}=="174c", ATTRS{idProduct}=="55aa", SUBSYSTEM=="scsi_disk", ATTR{provisioning_mode}="unmap" After reboot any F2FS write hangs forever and dmesg reports: Unable to handle kernel NULL pointer dereference Also tested on a x86_64 system: works fine even with TRIM enabled. same disc same bridge different usb host controller different cpu architecture not root filesystem Regards, Vicenç. [1] Post #5 in https://bbs.archlinux.org/viewtopic.php?id=236280 Unable to handle kernel NULL pointer dereference at virtual address 000000000000003e Mem abort info: ESR = 0x96000004 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000626e3122 [000000000000003e] pgd=0000000000000000 Internal error: Oops: 96000004 [#1] SMP Modules linked in: overlay snd_soc_hdmi_codec rc_cec dw_hdmi_i2s_audio dw_hdmi_cec snd_soc_simple_card snd_soc_simple_card_utils snd_soc_rockchip_i2s rockchip_rga snd_soc_rockchip_pcm rockchipdrm videobuf2_dma_sg v4l2_mem2mem rtc_rk808 videobuf2_memops analogix_dp videobuf2_v4l2 videobuf2_common dw_hdmi dw_wdt cec rc_core videodev drm_kms_helper media drm rockchip_thermal rockchip_saradc realtek drm_panel_orientation_quirks syscopyarea sysfillrect sysimgblt fb_sys_fops dwmac_rk stmmac_platform stmmac pwm_bl squashfs loop crypto_user gpio_keys hid_kensington CPU: 5 PID: 957 Comm: nvim Not tainted 4.19.0-rc1-1-ARCH #1 Hardware name: Sapphire-RK3399 Board (DT) pstate: 00000005 (nzcv daif -PAN -UAO) pc : update_sit_entry+0x304/0x4b0 lr : update_sit_entry+0x108/0x4b0 sp : ffff00000ca13bd0 x29: ffff00000ca13bd0 x28: 000000000000003e x27: 0000000000000020 x26: 0000000000080000 x25: 0000000000000048 x24: ffff8000ebb85cf8 x23: 0000000000000253 x22: 00000000ffffffff x21: 00000000000535f2 x20: 00000000ffffffdf x19: ffff8000eb9e6800 x18: ffff8000eb9e6be8 x17: 0000000007ce6926 x16: 000000001c83ffa8 x15: 0000000000000000 x14: ffff8000f602df90 x13: 0000000000000006 x12: 0000000000000040 x11: 0000000000000228 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 00000000000535f2 x6 : ffff8000ebff3440 x5 : ffff8000ebff3440 x4 : ffff8000ebe3a6c8 x3 : 00000000ffffffff x2 : 0000000000000020 x1 : 0000000000000000 x0 : ffff8000eb9e5800 Process nvim (pid: 957, stack limit = 0x0000000063a78320) Call trace: update_sit_entry+0x304/0x4b0 f2fs_invalidate_blocks+0x98/0x140 truncate_node+0x90/0x400 f2fs_remove_inode_page+0xe8/0x340 f2fs_evict_inode+0x2b0/0x408 evict+0xe0/0x1e0 iput+0x160/0x260 do_unlinkat+0x214/0x298 __arm64_sys_unlinkat+0x3c/0x68 el0_svc_handler+0x94/0x118 el0_svc+0x8/0xc Code: f9400800 b9488400 36080140 f9400f01 (387c4820) ---[ end trace a0f21a307118c477 ]--- The reason is it is possible to enable discard flag on block queue via UDEV, but during mount, f2fs will initialize se->discard_map only if this flag is set, once the flag is set after mount, f2fs may dereference NULL pointer on se->discard_map. So this patch does below changes to fix this issue: - initialize and update se->discard_map all the time. - don't clear DISCARD option if device has no QUEUE_FLAG_DISCARD flag during mount. - don't issue small discard on zoned block device. - introduce some functions to enhance the readability. Signed-off-by: Chao Yu Tested-by: Vicente Bergas Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/debug.c | 3 +-- fs/f2fs/f2fs.h | 15 ++++++++++--- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 65 ++++++++++++++++++++++++------------------------------- fs/f2fs/super.c | 16 ++++---------- 5 files changed, 46 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 214a968962a1..ebe649d9793c 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -190,8 +190,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); - if (f2fs_discard_en(sbi)) - si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a3ba20e5946f..1f5d5f62bb77 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3409,11 +3409,20 @@ static inline int get_blkz_type(struct f2fs_sb_info *sbi, } #endif -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi) { - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + return f2fs_sb_has_blkzoned(sbi->sb); +} - return blk_queue_discard(q) || f2fs_sb_has_blkzoned(sbi->sb); +static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) +{ + return blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev)); +} + +static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) +{ + return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || + f2fs_hw_should_discard(sbi); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 81c1dd635a8d..9fc076ca002d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1983,7 +1983,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!blk_queue_discard(q)) + if (!f2fs_hw_support_discard(F2FS_SB(sb))) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1fa6f8185766..ac038563273d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1744,11 +1744,11 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) return false; if (!force) { - if (!test_opt(sbi, DISCARD) || !se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -1854,7 +1854,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, dirty_i->nr_dirty[PRE]--; } - if (!test_opt(sbi, DISCARD)) + if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && @@ -2044,8 +2044,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - !f2fs_test_and_set_bit(offset, se->discard_map)) + if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* don't overwrite by SSR to keep node chain */ @@ -2073,8 +2072,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (f2fs_discard_en(sbi) && - f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -2690,7 +2688,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ - if (test_opt(sbi, DISCARD)) + if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); @@ -3781,13 +3779,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #endif - if (f2fs_discard_en(sbi)) { - sit_i->sentries[start].discard_map - = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, - GFP_KERNEL); - if (!sit_i->sentries[start].discard_map) - return -ENOMEM; - } + sit_i->sentries[start].discard_map + = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, + GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -3935,18 +3931,16 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) total_node_blocks += se->valid_blocks; /* build discard map only one time */ - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; } if (sbi->segs_per_sec > 1) @@ -3984,16 +3978,13 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (f2fs_discard_en(sbi)) { - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; - } + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; } if (sbi->segs_per_sec > 1) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 79370b7fa9d2..6539969a1c7f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -360,7 +360,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); - struct request_queue *q; substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; @@ -415,14 +414,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; break; case Opt_discard: - q = bdev_get_queue(sb->s_bdev); - if (blk_queue_discard(q)) { - set_opt(sbi, DISCARD); - } else if (!f2fs_sb_has_blkzoned(sb)) { - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } + set_opt(sbi, DISCARD); break; case Opt_nodiscard: if (f2fs_sb_has_blkzoned(sb)) { @@ -1033,7 +1025,8 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ dropped = f2fs_wait_discard_bios(sbi); - if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) { + if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && + !sbi->discard_blks && !dropped) { struct cp_control cpc = { .reason = CP_UMOUNT | CP_TRIMMED, }; @@ -1403,8 +1396,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) - set_opt(sbi, DISCARD); + set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi->sb)) set_opt_mode(sbi, F2FS_MOUNT_LFS); else -- cgit v1.2.3 From db77c7890ed73cfbdeb65954f12f6e0dab51b823 Mon Sep 17 00:00:00 2001 From: Gertjan Halkes Date: Wed, 5 Sep 2018 15:41:29 +0900 Subject: 9p: do not trust pdu content for stat item size [ Upstream commit 2803cf4379ed252894f046cb8812a48db35294e3 ] v9fs_dir_readdir() could deadloop if a struct was sent with a size set to -2 Link: http://lkml.kernel.org/r/1536134432-11997-1-git-send-email-asmadeus@codewreck.org Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=88021 Signed-off-by: Gertjan Halkes Signed-off-by: Dominique Martinet Signed-off-by: Sasha Levin --- fs/9p/vfs_dir.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 48db9a9f13f9..cb6c4031af55 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -105,7 +105,6 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int err = 0; struct p9_fid *fid; int buflen; - int reclen = 0; struct p9_rdir *rdir; struct kvec kvec; @@ -138,11 +137,10 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) while (rdir->head < rdir->tail) { err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); - if (err) { + if (err <= 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); return -EIO; } - reclen = st.size+2; over = !dir_emit(ctx, st.name, strlen(st.name), v9fs_qid2ino(&st.qid), dt_type(&st)); @@ -150,8 +148,8 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (over) return 0; - rdir->head += reclen; - ctx->pos += reclen; + rdir->head += err; + ctx->pos += err; } } } -- cgit v1.2.3 From 4369f8a38085347d9cf78fe6261b1296f664132c Mon Sep 17 00:00:00 2001 From: Dinu-Razvan Chis-Serban Date: Wed, 5 Sep 2018 16:44:12 +0900 Subject: 9p locks: add mount option for lock retry interval [ Upstream commit 5e172f75e51e3de1b4274146d9b990f803cb5c2a ] The default P9_LOCK_TIMEOUT can be too long for some users exporting a local file system to a guest VM (30s), make this configurable at mount time. Link: http://lkml.kernel.org/r/1536295827-3181-1-git-send-email-asmadeus@codewreck.org Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=195727 Signed-off-by: Dinu-Razvan Chis-Serban Signed-off-by: Dominique Martinet Signed-off-by: Sasha Levin --- fs/9p/v9fs.c | 21 +++++++++++++++++++++ fs/9p/v9fs.h | 1 + fs/9p/vfs_file.c | 6 +++++- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 89bac3d2f05b..619128b55837 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -61,6 +61,8 @@ enum { Opt_cache_loose, Opt_fscache, Opt_mmap, /* Access options */ Opt_access, Opt_posixacl, + /* Lock timeout option */ + Opt_locktimeout, /* Error token */ Opt_err }; @@ -80,6 +82,7 @@ static const match_table_t tokens = { {Opt_cachetag, "cachetag=%s"}, {Opt_access, "access=%s"}, {Opt_posixacl, "posixacl"}, + {Opt_locktimeout, "locktimeout=%u"}, {Opt_err, NULL} }; @@ -187,6 +190,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif + v9ses->session_lock_timeout = P9_LOCK_TIMEOUT; if (!opts) return 0; @@ -359,6 +363,23 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) #endif break; + case Opt_locktimeout: + r = match_int(&args[0], &option); + if (r < 0) { + p9_debug(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + if (option < 1) { + p9_debug(P9_DEBUG_ERROR, + "locktimeout must be a greater than zero integer.\n"); + ret = -EINVAL; + continue; + } + v9ses->session_lock_timeout = (long)option * HZ; + break; + default: continue; } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 982e017acadb..129e5243a6bf 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -116,6 +116,7 @@ struct v9fs_session_info { struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; + long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c87e6d6ec069..05454a7e22dc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -154,6 +154,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; + struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); @@ -189,6 +190,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; + v9ses = v9fs_inode2v9ses(file_inode(filp)); + /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying @@ -202,7 +205,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; - if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0) + if (schedule_timeout_interruptible(v9ses->session_lock_timeout) + != 0) break; /* * p9_client_lock_dotl overwrites flock.client_id with the -- cgit v1.2.3 From 14b183214c08ba2dcd4fee8017879a1ad6f8f0e6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 6 Sep 2018 20:34:12 +0800 Subject: f2fs: fix to do sanity check with current segment number [ Upstream commit 042be0f849e5fc24116d0afecfaf926eed5cac63 ] https://bugzilla.kernel.org/show_bug.cgi?id=200219 Reproduction way: - mount image - run poc code - umount image F2FS-fs (loop1): Bitmap was wrongly set, blk:15364 ------------[ cut here ]------------ kernel BUG at /home/yuchao/git/devf2fs/segment.c:2061! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 2 PID: 17686 Comm: umount Tainted: G W O 4.18.0-rc2+ #39 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 EIP: update_sit_entry+0x459/0x4e0 [f2fs] Code: e8 1c b5 fd ff 0f 0b 0f 0b 8b 45 e4 c7 44 24 08 9c 7a 6c f8 c7 44 24 04 bc 4a 6c f8 89 44 24 0c 8b 06 89 04 24 e8 f7 b4 fd ff <0f> 0b 8b 45 e4 0f b6 d2 89 54 24 10 c7 44 24 08 60 7a 6c f8 c7 44 EAX: 00000032 EBX: 000000f8 ECX: 00000002 EDX: 00000001 ESI: d7177000 EDI: f520fe68 EBP: d6477c6c ESP: d6477c34 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010282 CR0: 80050033 CR2: b7fbe000 CR3: 2a99b3c0 CR4: 000406f0 Call Trace: f2fs_allocate_data_block+0x124/0x580 [f2fs] do_write_page+0x78/0x150 [f2fs] f2fs_do_write_node_page+0x25/0xa0 [f2fs] __write_node_page+0x2bf/0x550 [f2fs] f2fs_sync_node_pages+0x60e/0x6d0 [f2fs] ? sync_inode_metadata+0x2f/0x40 ? f2fs_write_checkpoint+0x28f/0x7d0 [f2fs] ? up_write+0x1e/0x80 f2fs_write_checkpoint+0x2a9/0x7d0 [f2fs] ? mark_held_locks+0x5d/0x80 ? _raw_spin_unlock_irq+0x27/0x50 kill_f2fs_super+0x68/0x90 [f2fs] deactivate_locked_super+0x3d/0x70 deactivate_super+0x40/0x60 cleanup_mnt+0x39/0x70 __cleanup_mnt+0x10/0x20 task_work_run+0x81/0xa0 exit_to_usermode_loop+0x59/0xa7 do_fast_syscall_32+0x1f5/0x22c entry_SYSENTER_32+0x53/0x86 EIP: 0xb7f95c51 Code: c1 1e f7 ff ff 89 e5 8b 55 08 85 d2 8b 81 64 cd ff ff 74 02 89 02 5d c3 8b 0c 24 c3 8b 1c 24 c3 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76 EAX: 00000000 EBX: 0871ab90 ECX: bfb2cd00 EDX: 00000000 ESI: 00000000 EDI: 0871ab90 EBP: 0871ab90 ESP: bfb2cd7c DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000246 Modules linked in: f2fs(O) crc32_generic bnep rfcomm bluetooth ecdh_generic snd_intel8x0 snd_ac97_codec ac97_bus snd_pcm snd_seq_midi snd_seq_midi_event snd_rawmidi snd_seq pcbc joydev aesni_intel snd_seq_device aes_i586 snd_timer crypto_simd snd cryptd soundcore mac_hid serio_raw video i2c_piix4 parport_pc ppdev lp parport hid_generic psmouse usbhid hid e1000 [last unloaded: f2fs] ---[ end trace d423f83982cfcdc5 ]--- The reason is, different log headers using the same segment, once one log's next block address is used by another log, it will cause panic as above. Main area: 24 segs, 24 secs 24 zones - COLD data: 0, 0, 0 - WARM data: 1, 1, 1 - HOT data: 20, 20, 20 - Dir dnode: 22, 22, 22 - File dnode: 22, 22, 22 - Indir nodes: 21, 21, 21 So this patch adds sanity check to detect such condition to avoid this issue. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/super.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6539969a1c7f..06017138eab0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2329,7 +2329,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int segment_count_main; unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count; - int i; + int i, j; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -2370,11 +2370,43 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_NODE_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_node_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Node segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) return 1; + for (j = i + 1; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u, %u) has the same " + "segno: %u", i, j, + le32_to_cpu(ckpt->cur_data_segno[i])); + return 1; + } + } + } + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + for (j = i; j < NR_CURSEG_DATA_TYPE; j++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) == + le32_to_cpu(ckpt->cur_data_segno[j])) { + f2fs_msg(sbi->sb, KERN_ERR, + "Data segment (%u) and Data segment (%u)" + " has the same segno: %u", i, j, + le32_to_cpu(ckpt->cur_node_segno[i])); + return 1; + } + } } sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); -- cgit v1.2.3 From 8722566b7870e1a0d631001c66bc11d590317a38 Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Fri, 12 Oct 2018 18:49:26 +0800 Subject: f2fs: cleanup dirty pages if recover failed [ Upstream commit 26b5a079197c8cb6725565968b7fd3299bd1877b ] During recover, we will try to create new dentries for inodes with dentry_mark. But if the parent is missing (e.g. killed by fsck), recover will break. But those recovered dirty pages are not cleanup. This will hit f2fs_bug_on: [ 53.519566] F2FS-fs (loop0): Found nat_bits in checkpoint [ 53.539354] F2FS-fs (loop0): recover_inode: ino = 5, name = file, inline = 3 [ 53.539402] F2FS-fs (loop0): recover_dentry: ino = 5, name = file, dir = 0, err = -2 [ 53.545760] F2FS-fs (loop0): Cannot recover all fsync data errno=-2 [ 53.546105] F2FS-fs (loop0): access invalid blkaddr:4294967295 [ 53.546171] WARNING: CPU: 1 PID: 1798 at fs/f2fs/checkpoint.c:163 f2fs_is_valid_blkaddr+0x26c/0x320 [ 53.546174] Modules linked in: [ 53.546183] CPU: 1 PID: 1798 Comm: mount Not tainted 4.19.0-rc2+ #1 [ 53.546186] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 53.546191] RIP: 0010:f2fs_is_valid_blkaddr+0x26c/0x320 [ 53.546195] Code: 85 bb 00 00 00 48 89 df 88 44 24 07 e8 ad a8 db ff 48 8b 3b 44 89 e1 48 c7 c2 40 03 72 a9 48 c7 c6 e0 01 72 a9 e8 84 3c ff ff <0f> 0b 0f b6 44 24 07 e9 8a 00 00 00 48 8d bf 38 01 00 00 e8 7c a8 [ 53.546201] RSP: 0018:ffff88006c067768 EFLAGS: 00010282 [ 53.546208] RAX: 0000000000000000 RBX: ffff880068844200 RCX: ffffffffa83e1a33 [ 53.546211] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff88006d51e590 [ 53.546215] RBP: 0000000000000005 R08: ffffed000daa3cb3 R09: ffffed000daa3cb3 [ 53.546218] R10: 0000000000000001 R11: ffffed000daa3cb2 R12: 00000000ffffffff [ 53.546221] R13: ffff88006a1f8000 R14: 0000000000000200 R15: 0000000000000009 [ 53.546226] FS: 00007fb2f3646840(0000) GS:ffff88006d500000(0000) knlGS:0000000000000000 [ 53.546229] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 53.546234] CR2: 00007f0fd77f0008 CR3: 00000000687e6002 CR4: 00000000000206e0 [ 53.546237] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 53.546240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 53.546242] Call Trace: [ 53.546248] f2fs_submit_page_bio+0x95/0x740 [ 53.546253] read_node_page+0x161/0x1e0 [ 53.546271] ? truncate_node+0x650/0x650 [ 53.546283] ? add_to_page_cache_lru+0x12c/0x170 [ 53.546288] ? pagecache_get_page+0x262/0x2d0 [ 53.546292] __get_node_page+0x200/0x660 [ 53.546302] f2fs_update_inode_page+0x4a/0x160 [ 53.546306] f2fs_write_inode+0x86/0xb0 [ 53.546317] __writeback_single_inode+0x49c/0x620 [ 53.546322] writeback_single_inode+0xe4/0x1e0 [ 53.546326] sync_inode_metadata+0x93/0xd0 [ 53.546330] ? sync_inode+0x10/0x10 [ 53.546342] ? do_raw_spin_unlock+0xed/0x100 [ 53.546347] f2fs_sync_inode_meta+0xe0/0x130 [ 53.546351] f2fs_fill_super+0x287d/0x2d10 [ 53.546367] ? vsnprintf+0x742/0x7a0 [ 53.546372] ? f2fs_commit_super+0x180/0x180 [ 53.546379] ? up_write+0x20/0x40 [ 53.546385] ? set_blocksize+0x5f/0x140 [ 53.546391] ? f2fs_commit_super+0x180/0x180 [ 53.546402] mount_bdev+0x181/0x200 [ 53.546406] mount_fs+0x94/0x180 [ 53.546411] vfs_kern_mount+0x6c/0x1e0 [ 53.546415] do_mount+0xe5e/0x1510 [ 53.546420] ? fs_reclaim_release+0x9/0x30 [ 53.546424] ? copy_mount_string+0x20/0x20 [ 53.546428] ? fs_reclaim_acquire+0xd/0x30 [ 53.546435] ? __might_sleep+0x2c/0xc0 [ 53.546440] ? ___might_sleep+0x53/0x170 [ 53.546453] ? __might_fault+0x4c/0x60 [ 53.546468] ? _copy_from_user+0x95/0xa0 [ 53.546474] ? memdup_user+0x39/0x60 [ 53.546478] ksys_mount+0x88/0xb0 [ 53.546482] __x64_sys_mount+0x5d/0x70 [ 53.546495] do_syscall_64+0x65/0x130 [ 53.546503] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 53.547639] ---[ end trace b804d1ea2fec893e ]--- So if recover fails, we need to drop all recovered data. Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/recovery.c | 32 +++++++++++++++++++++----------- fs/f2fs/super.c | 15 ++++++++++++++- 2 files changed, 35 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 9a8579fb3a30..ae0e5f2e67b4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -99,8 +99,12 @@ err_out: return ERR_PTR(err); } -static void del_fsync_inode(struct fsync_inode_entry *entry) +static void del_fsync_inode(struct fsync_inode_entry *entry, int drop) { + if (drop) { + /* inode should not be recovered, drop it */ + f2fs_inode_synced(entry->inode); + } iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); @@ -321,12 +325,12 @@ next: return err; } -static void destroy_fsync_dnodes(struct list_head *head) +static void destroy_fsync_dnodes(struct list_head *head, int drop) { struct fsync_inode_entry *entry, *tmp; list_for_each_entry_safe(entry, tmp, head, list) - del_fsync_inode(entry); + del_fsync_inode(entry, drop); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -561,7 +565,7 @@ out: } static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, - struct list_head *dir_list) + struct list_head *tmp_inode_list, struct list_head *dir_list) { struct curseg_info *curseg; struct page *page = NULL; @@ -615,7 +619,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, } if (entry->blkaddr == blkaddr) - del_fsync_inode(entry); + list_move_tail(&entry->list, tmp_inode_list); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -628,7 +632,7 @@ next: int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct list_head inode_list; + struct list_head inode_list, tmp_inode_list; struct list_head dir_list; int err; int ret = 0; @@ -659,6 +663,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&tmp_inode_list); INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ @@ -677,11 +682,16 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, &dir_list); + err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); + else { + /* restore s_flags to let iput() trash data */ + sbi->sb->s_flags = s_flags; + } skip: - destroy_fsync_dnodes(&inode_list); + destroy_fsync_dnodes(&inode_list, err); + destroy_fsync_dnodes(&tmp_inode_list, err); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), @@ -690,13 +700,13 @@ skip: if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); + } else { + clear_sbi_flag(sbi, SBI_POR_DOING); } - - clear_sbi_flag(sbi, SBI_POR_DOING); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ - destroy_fsync_dnodes(&dir_list); + destroy_fsync_dnodes(&dir_list, err); if (need_writecp) { set_sbi_flag(sbi, SBI_IS_RECOVERED); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 06017138eab0..2264f27fd26d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1885,6 +1885,19 @@ void f2fs_quota_off_umount(struct super_block *sb) } } +static void f2fs_truncate_quota_inode_pages(struct super_block *sb) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int type; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!dqopt->files[type]) + continue; + f2fs_inode_synced(dqopt->files[type]); + } +} + + static int f2fs_get_projid(struct inode *inode, kprojid_t *projid) { *projid = F2FS_I(inode)->i_projid; @@ -3131,10 +3144,10 @@ skip_recovery: free_meta: #ifdef CONFIG_QUOTA + f2fs_truncate_quota_inode_pages(sb); if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) f2fs_quota_off_umount(sbi->sb); #endif - f2fs_sync_inode_meta(sbi); /* * Some dirty meta pages can be produced by f2fs_recover_orphan_inodes() * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() -- cgit v1.2.3 From e9603cffb1ca7699b22cec844d4092a7a2e0e416 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 19 Oct 2018 01:58:22 -0500 Subject: cifs: fallback to older infolevels on findfirst queryinfo retry [ Upstream commit 3b7960caceafdfc2cdfe2850487f8d091eb41144 ] In cases where queryinfo fails, we have cases in cifs (vers=1.0) where with backupuid mounts we retry the query info with findfirst. This doesn't work to some NetApp servers which don't support WindowsXP (and later) infolevel 261 (SMB_FIND_FILE_ID_FULL_DIR_INFO) so in this case use other info levels (in this case it will usually be level 257, SMB_FIND_FILE_DIRECTORY_INFO). (Also fixes some indentation) See kernel bugzilla 201435 Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/cifs/inode.c | 67 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 020f49c15b30..b59ebed4f615 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -780,43 +780,50 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else if ((rc == -EACCES) && backup_cred(cifs_sb) && (strcmp(server->vals->version_string, SMB1_VERSION_STRING) == 0)) { - /* - * For SMB2 and later the backup intent flag is already - * sent if needed on open and there is no path based - * FindFirst operation to use to retry with - */ + /* + * For SMB2 and later the backup intent flag is already + * sent if needed on open and there is no path based + * FindFirst operation to use to retry with + */ - srchinf = kzalloc(sizeof(struct cifs_search_info), - GFP_KERNEL); - if (srchinf == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + srchinf = kzalloc(sizeof(struct cifs_search_info), + GFP_KERNEL); + if (srchinf == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } - srchinf->endOfSearch = false; + srchinf->endOfSearch = false; + if (tcon->unix_ext) + srchinf->info_level = SMB_FIND_FILE_UNIX; + else if ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) + srchinf->info_level = SMB_FIND_FILE_INFO_STANDARD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) srchinf->info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; + else /* no srvino useful for fallback to some netapp */ + srchinf->info_level = SMB_FIND_FILE_DIRECTORY_INFO; - srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | - CIFS_SEARCH_CLOSE_AT_END | - CIFS_SEARCH_BACKUP_SEARCH; + srchflgs = CIFS_SEARCH_CLOSE_ALWAYS | + CIFS_SEARCH_CLOSE_AT_END | + CIFS_SEARCH_BACKUP_SEARCH; - rc = CIFSFindFirst(xid, tcon, full_path, - cifs_sb, NULL, srchflgs, srchinf, false); - if (!rc) { - data = - (FILE_ALL_INFO *)srchinf->srch_entries_start; + rc = CIFSFindFirst(xid, tcon, full_path, + cifs_sb, NULL, srchflgs, srchinf, false); + if (!rc) { + data = (FILE_ALL_INFO *)srchinf->srch_entries_start; - cifs_dir_info_to_fattr(&fattr, - (FILE_DIRECTORY_INFO *)data, cifs_sb); - fattr.cf_uniqueid = le64_to_cpu( - ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); - validinum = true; + cifs_dir_info_to_fattr(&fattr, + (FILE_DIRECTORY_INFO *)data, cifs_sb); + fattr.cf_uniqueid = le64_to_cpu( + ((SEARCH_ID_FULL_DIR_INFO *)data)->UniqueId); + validinum = true; - cifs_buf_release(srchinf->ntwrk_buf_start); - } - kfree(srchinf); - if (rc) - goto cgii_exit; + cifs_buf_release(srchinf->ntwrk_buf_start); + } + kfree(srchinf); + if (rc) + goto cgii_exit; } else goto cgii_exit; -- cgit v1.2.3 From 48b0309f85ae819da5bbe992b8769729f39c66c7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 23 Feb 2019 09:48:27 +0800 Subject: f2fs: fix to dirty inode for i_mode recovery [ Upstream commit ca597bddedd94906cd761d8be6a3ad21292725de ] As Seulbae Kim reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202637 We didn't recover permission field correctly after sudden power-cut, the reason is in setattr we didn't add inode into global dirty list once i_mode is changed, so latter checkpoint triggered by fsync will not flush last i_mode into disk, result in this problem, fix it. Reported-by: Seulbae Kim Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Sasha Levin --- fs/f2fs/file.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9fc076ca002d..b3f46e3bec17 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -770,7 +770,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -830,8 +829,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -845,7 +842,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); -- cgit v1.2.3 From 8092ecc306d81186a64cda42411121f4d35aaff4 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Fri, 29 Mar 2019 10:49:12 +0100 Subject: CIFS: keep FileInfo handle live during oplock break commit b98749cac4a695f084a5ff076f4510b23e353ecd upstream. In the oplock break handler, writing pending changes from pages puts the FileInfo handle. If the refcount reaches zero it closes the handle and waits for any oplock break handler to return, thus causing a deadlock. To prevent this situation: * We add a wait flag to cifsFileInfo_put() to decide whether we should wait for running/pending oplock break handlers * We keep an additionnal reference of the SMB FileInfo handle so that for the rest of the handler putting the handle won't close it. - The ref is bumped everytime we queue the handler via the cifs_queue_oplock_break() helper. - The ref is decremented at the end of the handler This bug was triggered by xfstest 464. Also important fix to address the various reports of oops in smb2_push_mandatory_locks Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/file.c | 30 +++++++++++++++++++++++++----- fs/cifs/misc.c | 25 +++++++++++++++++++++++-- fs/cifs/smb2misc.c | 6 +++--- 4 files changed, 53 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 80f33582059e..6f227cc781e5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1263,6 +1263,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1763,6 +1764,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock; #endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); +void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d847132ab027..d6b45682833b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -358,12 +358,30 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/* - * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. Must be called without holding - * tcon->open_file_lock and cifs_file->file_info_lock. +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + _cifsFileInfo_put(cifs_file, true); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock and + * cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) { struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); @@ -411,7 +429,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = wait_oplock_handler ? + cancel_work_sync(&cifs_file->oplock_break) : false; if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -4170,6 +4189,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + _cifsFileInfo_put(cfile, false /* do not wait for ourself */); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 6926685e513c..facc94e159a1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -490,8 +490,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsoplockd_wq, - &netfile->oplock_break); + cifs_queue_oplock_break(netfile); netfile->oplock_break_cancelled = false; spin_unlock(&tcon->open_file_lock); @@ -588,6 +587,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) spin_unlock(&cinode->writers_lock); } +/** + * cifs_queue_oplock_break - queue the oplock break handler for cfile + * + * This function is called from the demultiplex thread when it + * receives an oplock break for @cfile. + * + * Assumes the tcon->open_file_lock is held. + * Assumes cfile->file_info_lock is NOT held. + */ +void cifs_queue_oplock_break(struct cifsFileInfo *cfile) +{ + /* + * Bump the handle refcount now while we hold the + * open_file_lock to enforce the validity of it for the oplock + * break handler. The matching put is done at the end of the + * handler. + */ + cifsFileInfo_get(cfile); + + queue_work(cifsoplockd_wq, &cfile->oplock_break); +} + void cifs_done_oplock_break(struct cifsInodeInfo *cinode) { clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 58700d2ba8cd..0a7ed2e3ad4f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -555,7 +555,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - queue_work(cifsoplockd_wq, &cfile->oplock_break); + cifs_queue_oplock_break(cfile); kfree(lw); return true; } @@ -719,8 +719,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsoplockd_wq, - &cfile->oplock_break); + + cifs_queue_oplock_break(cfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); -- cgit v1.2.3 From 8fb89b43b65fcd35f15d982712904b96fc64c68a Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Sat, 6 Apr 2019 15:47:38 +0800 Subject: cifs: Fix use-after-free in SMB2_write commit 6a3eb3360667170988f8a6477f6686242061488a upstream. There is a KASAN use-after-free: BUG: KASAN: use-after-free in SMB2_write+0x1342/0x1580 Read of size 8 at addr ffff8880b6a8e450 by task ln/4196 Should not release the 'req' because it will use in the trace. Fixes: eccb4422cf97 ("smb3: Add ftrace tracepoints for improved SMB3 debugging") Signed-off-by: ZhangXiaoxu Signed-off-by: Steve French CC: Stable 4.18+ Reviewed-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 71f32d983384..299d8e46f01f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3591,7 +3591,6 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { @@ -3609,6 +3608,7 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->offset, *nbytes); } + cifs_small_buf_release(req); free_rsp_buf(resp_buftype, rsp); return rc; } -- cgit v1.2.3 From c69330a855ab4342d304f67f8c1e7d1fa2686bec Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Sat, 6 Apr 2019 15:47:39 +0800 Subject: cifs: Fix use-after-free in SMB2_read commit 088aaf17aa79300cab14dbee2569c58cfafd7d6e upstream. There is a KASAN use-after-free: BUG: KASAN: use-after-free in SMB2_read+0x1136/0x1190 Read of size 8 at addr ffff8880b4e45e50 by task ln/1009 Should not release the 'req' because it will use in the trace. Fixes: eccb4422cf97 ("smb3: Add ftrace tracepoints for improved SMB3 debugging") Signed-off-by: ZhangXiaoxu Signed-off-by: Steve French CC: Stable 4.18+ Reviewed-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 299d8e46f01f..c6fd3acc5560 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3273,8 +3273,6 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rqst.rq_nvec = 1; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; if (rc) { @@ -3293,6 +3291,8 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); + cifs_small_buf_release(req); + *nbytes = le32_to_cpu(rsp->DataLength); if ((*nbytes > CIFS_MAX_MSGSIZE) || (*nbytes > io_parms->length)) { -- cgit v1.2.3 From 2fcee5eaae6ee1c2ccbe629ffd46c6674a98824c Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 10 Apr 2019 07:47:22 +1000 Subject: cifs: fix handle leak in smb2_query_symlink() commit e6d0fb7b34f264f72c33053558a360a6a734905e upstream. If we enter smb2_query_symlink() for something that is not a symlink and where the SMB2_open() would succeed we would never end up closing this handle and would thus leak a handle on the server. Fix this by immediately calling SMB2_close() on successfull open. Signed-off-by: Ronnie Sahlberg CC: Stable Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d4d7d61a6fe2..2001184afe70 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1906,6 +1906,8 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_iov, &resp_buftype); + if (!rc) + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); if (!rc || !err_iov.iov_base) { rc = -ENOENT; goto free_path; -- cgit v1.2.3 From 6ff17bc5936e5fab33de8064dc0690f6c8c789ca Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream. The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 18 ++++++++++++++++++ fs/userfaultfd.c | 9 +++++++++ 2 files changed, 27 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d76fe166f6ce..c5819baee35c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1138,6 +1138,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d8b8323e80f4..aaca81b5e119 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -630,6 +630,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -884,6 +886,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -906,6 +910,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1334,6 +1339,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1521,6 +1528,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; -- cgit v1.2.3 From 8766cc7d0d1d3de0bc0ede9cbf61532925b7f23f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 21 Feb 2019 11:17:34 -0500 Subject: ext4: fix some error pointer dereferences [ Upstream commit 7159a986b4202343f6cca3bb8079ecace5816fd6 ] We can't pass error pointers to brelse(). Fixes: fb265c9cb49e ("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases") Signed-off-by: Dan Carpenter Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Sasha Levin --- fs/ext4/xattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c0ba5206cd9d..006c277dc22e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); + bh = NULL; goto out; } @@ -2907,6 +2908,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (error == -EIO) EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3063,6 +3065,7 @@ ext4_xattr_block_cache_find(struct inode *inode, if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) return NULL; + bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { -- cgit v1.2.3 From d5bf783a09a06c81ca4783054355f1d243e124e7 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 23 Apr 2019 16:39:45 +1000 Subject: cifs: fix memory leak in SMB2_read commit 05fd5c2c61732152a6bddc318aae62d7e436629b upstream. Commit 088aaf17aa79300cab14dbee2569c58cfafd7d6e introduced a leak where if SMB2_read() returned an error we would return without freeing the request buffer. Cc: Stable Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c6fd3acc5560..33afb637e6f8 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3285,6 +3285,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, rc); } free_rsp_buf(resp_buftype, rsp_iov.iov_base); + cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else trace_smb3_read_done(xid, req->PersistentFileId, -- cgit v1.2.3 From ee231063ff954dbd46d1c55bbb6519e60dd9c649 Mon Sep 17 00:00:00 2001 From: Frank Sorenson Date: Tue, 16 Apr 2019 08:37:27 -0500 Subject: cifs: do not attempt cifs operation on smb2+ rename error commit 652727bbe1b17993636346716ae5867627793647 upstream. A path-based rename returning EBUSY will incorrectly try opening the file with a cifs (NT Create AndX) operation on an smb2+ mount, which causes the server to force a session close. If the mount is smb2+, skip the fallback. Signed-off-by: Frank Sorenson Signed-off-by: Steve French CC: Stable Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b59ebed4f615..1fadd314ae7f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1735,6 +1735,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0 || rc != -EBUSY) goto do_rename_exit; + /* Don't fall back to using SMB on SMB 2+ mount */ + if (server->vals->protocol_id != 0) + goto do_rename_exit; + /* open-file renames don't work across directories */ if (to_dentry->d_parent != from_dentry->d_parent) goto do_rename_exit; -- cgit v1.2.3 From cffeb9c84d20816a2173e3cfeca210c8bfa8e357 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 4 Apr 2019 23:59:25 +0200 Subject: tracing: Fix buffer_ref pipe ops commit b987222654f84f7b4ca95b3a55eca784cb30235b upstream. This fixes multiple issues in buffer_pipe_buf_ops: - The ->steal() handler must not return zero unless the pipe buffer has the only reference to the page. But generic_pipe_buf_steal() assumes that every reference to the pipe is tracked by the page's refcount, which isn't true for these buffers - buffer_pipe_buf_get(), which duplicates a buffer, doesn't touch the page's refcount. Fix it by using generic_pipe_buf_nosteal(), which refuses every attempted theft. It should be easy to actually support ->steal, but the only current users of pipe_buf_steal() are the virtio console and FUSE, and they also only use it as an optimization. So it's probably not worth the effort. - The ->get() and ->release() handlers can be invoked concurrently on pipe buffers backed by the same struct buffer_ref. Make them safe against concurrency by using refcount_t. - The pointers stored in ->private were only zeroed out when the last reference to the buffer_ref was dropped. As far as I know, this shouldn't be necessary anyway, but if we do it, let's always do it. Link: http://lkml.kernel.org/r/20190404215925.253531-1-jannh@google.com Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Al Viro Cc: stable@vger.kernel.org Fixes: 73a757e63114d ("ring-buffer: Return reader page back into existing ring buffer") Signed-off-by: Jann Horn Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- fs/splice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 29e92b506394..c78e0e3ff6c4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -333,8 +333,8 @@ const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) +int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return 1; } -- cgit v1.2.3 From 8d693ef0141c90dd94fe47be1727214bf3adb3dd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 15 Apr 2019 12:00:42 -0400 Subject: ceph: only use d_name directly when parent is locked commit 1bcb344086f3ecf8d6705f6d708441baa823beb3 upstream. Ben reported tripping the BUG_ON in create_request_message during some performance testing. Analysis of the vmcore showed that the length of the r_dentry->d_name string changed after we allocated the buffer, but before we encoded it. build_dentry_path returns pointers to d_name in the common case of non-snapped dentries, but this optimization isn't safe unless the parent directory is locked. When it isn't, have the code make a copy of the d_name while holding the d_lock. Cc: stable@vger.kernel.org Reported-by: Ben England Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mds_client.c | 61 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bc43c822426a..d87ca0bd1bcd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1945,10 +1945,39 @@ retry: return path; } +/* Duplicate the dentry->d_name.name safely */ +static int clone_dentry_name(struct dentry *dentry, const char **ppath, + int *ppathlen) +{ + u32 len; + char *name; + +retry: + len = READ_ONCE(dentry->d_name.len); + name = kmalloc(len + 1, GFP_NOFS); + if (!name) + return -ENOMEM; + + spin_lock(&dentry->d_lock); + if (dentry->d_name.len != len) { + spin_unlock(&dentry->d_lock); + kfree(name); + goto retry; + } + memcpy(name, dentry->d_name.name, len); + spin_unlock(&dentry->d_lock); + + name[len] = '\0'; + *ppath = name; + *ppathlen = len; + return 0; +} + static int build_dentry_path(struct dentry *dentry, struct inode *dir, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath, bool parent_locked) { + int ret; char *path; rcu_read_lock(); @@ -1957,8 +1986,15 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (dir && ceph_snap(dir) == CEPH_NOSNAP) { *pino = ceph_ino(dir); rcu_read_unlock(); - *ppath = dentry->d_name.name; - *ppathlen = dentry->d_name.len; + if (parent_locked) { + *ppath = dentry->d_name.name; + *ppathlen = dentry->d_name.len; + } else { + ret = clone_dentry_name(dentry, ppath, ppathlen); + if (ret) + return ret; + *pfreepath = true; + } return 0; } rcu_read_unlock(); @@ -1966,13 +2002,13 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } static int build_inode_path(struct inode *inode, const char **ppath, int *ppathlen, u64 *pino, - int *pfreepath) + bool *pfreepath) { struct dentry *dentry; char *path; @@ -1988,7 +2024,7 @@ static int build_inode_path(struct inode *inode, if (IS_ERR(path)) return PTR_ERR(path); *ppath = path; - *pfreepath = 1; + *pfreepath = true; return 0; } @@ -1999,7 +2035,7 @@ static int build_inode_path(struct inode *inode, static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, struct inode *rdiri, const char *rpath, u64 rino, const char **ppath, int *pathlen, - u64 *ino, int *freepath) + u64 *ino, bool *freepath, bool parent_locked) { int r = 0; @@ -2009,7 +2045,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, ceph_snap(rinode)); } else if (rdentry) { r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, - freepath); + freepath, parent_locked); dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, *ppath); } else if (rpath || rino) { @@ -2035,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; - int freepath1 = 0, freepath2 = 0; + bool freepath1 = false, freepath2 = false; int len; u16 releases; void *p, *end; @@ -2043,16 +2079,19 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, - &path1, &pathlen1, &ino1, &freepath1); + &path1, &pathlen1, &ino1, &freepath1, + test_bit(CEPH_MDS_R_PARENT_LOCKED, + &req->r_req_flags)); if (ret < 0) { msg = ERR_PTR(ret); goto out; } + /* If r_old_dentry is set, then assume that its parent is locked */ ret = set_request_path_attr(NULL, req->r_old_dentry, req->r_old_dentry_dir, req->r_path2, req->r_ino2.ino, - &path2, &pathlen2, &ino2, &freepath2); + &path2, &pathlen2, &ino2, &freepath2, true); if (ret < 0) { msg = ERR_PTR(ret); goto out_free1; -- cgit v1.2.3 From 246d2bf32da1df979b2bb0b4ca38b9b2b6ac2bf4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 17 Apr 2019 12:58:28 -0400 Subject: ceph: ensure d_name stability in ceph_dentry_hash() commit 76a495d666e5043ffc315695f8241f5e94a98849 upstream. Take the d_lock here to ensure that d_name doesn't change. Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/dir.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82928cea0209..7f3f64ba464f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1470,6 +1470,7 @@ void ceph_dentry_lru_del(struct dentry *dn) unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) { struct ceph_inode_info *dci = ceph_inode(dir); + unsigned hash; switch (dci->i_dir_layout.dl_dir_hash) { case 0: /* for backward compat */ @@ -1477,8 +1478,11 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) return dn->d_name.hash; default: - return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + spin_lock(&dn->d_lock); + hash = ceph_str_hash(dci->i_dir_layout.dl_dir_hash, dn->d_name.name, dn->d_name.len); + spin_unlock(&dn->d_lock); + return hash; } } -- cgit v1.2.3 From 950eec8126001c63ec6055d22d370d8ab833ed4f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 18 Apr 2019 11:24:57 +0800 Subject: ceph: fix ci->i_head_snapc leak commit 37659182bff1eeaaeadcfc8f853c6d2b6dbc3f47 upstream. We missed two places that i_wrbuffer_ref_head, i_wr_ref, i_dirty_caps and i_flushing_caps may change. When they are all zeros, we should free i_head_snapc. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/38224 Reported-and-tested-by: Luis Henriques Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mds_client.c | 9 +++++++++ fs/ceph/snap.c | 7 ++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d87ca0bd1bcd..bfcf11c70bfa 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1290,6 +1290,15 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (drop && + ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f74193da0e09..1f46b02f7314 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -568,7 +568,12 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) old_snapc = NULL; update_snapc: - if (ci->i_head_snapc) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ci->i_head_snapc = NULL; + } else { ci->i_head_snapc = ceph_get_snap_context(new_snapc); dout(" new snapc is %p\n", new_snapc); } -- cgit v1.2.3 From b4d4b5e4b83979b8b0888295bda1751b4340747c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 5 Apr 2019 08:54:37 -0700 Subject: nfsd: Don't release the callback slot unless it was actually held commit e6abc8caa6deb14be2a206253f7e1c5e37e9515b upstream. If there are multiple callbacks queued, waiting for the callback slot when the callback gets shut down, then they all currently end up acting as if they hold the slot, and call nfsd4_cb_sequence_done() resulting in interesting side-effects. In addition, the 'retry_nowait' path in nfsd4_cb_sequence_done() causes a loop back to nfsd4_cb_prepare() without first freeing the slot, which causes a deadlock when nfsd41_cb_get_slot() gets called a second time. This patch therefore adds a boolean to track whether or not the callback did pick up the slot, so that it can do the right thing in these 2 cases. Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4callback.c | 8 +++++++- fs/nfsd/state.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 601bf33c26a0..ebbb0285addb 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -926,8 +926,9 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) cb->cb_seq_status = 1; cb->cb_status = 0; if (minorversion) { - if (!nfsd41_cb_get_slot(clp, task)) + if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) return; + cb->cb_holds_slot = true; } rpc_call_start(task); } @@ -954,6 +955,9 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback return true; } + if (!cb->cb_holds_slot) + goto need_restart; + switch (cb->cb_seq_status) { case 0: /* @@ -992,6 +996,7 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback cb->cb_seq_status); } + cb->cb_holds_slot = false; clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -1199,6 +1204,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_seq_status = 1; cb->cb_status = 0; cb->cb_need_restart = false; + cb->cb_holds_slot = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 0b15dac7e609..0f07ad6dc1ef 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -70,6 +70,7 @@ struct nfsd4_callback { int cb_seq_status; int cb_status; bool cb_need_restart; + bool cb_holds_slot; }; struct nfsd4_callback_ops { -- cgit v1.2.3 From 4d476a00b3f98334fd8a680dc0d813f4d82a5f3d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 25 Apr 2019 22:24:05 -0700 Subject: fs/proc/proc_sysctl.c: Fix a NULL pointer dereference commit 89189557b47b35683a27c80ee78aef18248eefb4 upstream. Syzkaller report this: sysctl could not get directory: /net//bridge -12 kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN PTI CPU: 1 PID: 7027 Comm: syz-executor.0 Tainted: G C 5.1.0-rc3+ #8 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 RIP: 0010:__write_once_size include/linux/compiler.h:220 [inline] RIP: 0010:__rb_change_child include/linux/rbtree_augmented.h:144 [inline] RIP: 0010:__rb_erase_augmented include/linux/rbtree_augmented.h:186 [inline] RIP: 0010:rb_erase+0x5f4/0x19f0 lib/rbtree.c:459 Code: 00 0f 85 60 13 00 00 48 89 1a 48 83 c4 18 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 f2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 75 0c 00 00 4d 85 ed 4c 89 2e 74 ce 4c 89 ea 48 RSP: 0018:ffff8881bb507778 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: ffff8881f224b5b8 RCX: ffffffff818f3f6a RDX: 000000000000000a RSI: 0000000000000050 RDI: ffff8881f224b568 RBP: 0000000000000000 R08: ffffed10376a0ef4 R09: ffffed10376a0ef4 R10: 0000000000000001 R11: ffffed10376a0ef4 R12: ffff8881f224b558 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f3e7ce13700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fd60fbe9398 CR3: 00000001cb55c001 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: erase_entry fs/proc/proc_sysctl.c:178 [inline] erase_header+0xe3/0x160 fs/proc/proc_sysctl.c:207 start_unregistering fs/proc/proc_sysctl.c:331 [inline] drop_sysctl_table+0x558/0x880 fs/proc/proc_sysctl.c:1631 get_subdir fs/proc/proc_sysctl.c:1022 [inline] __register_sysctl_table+0xd65/0x1090 fs/proc/proc_sysctl.c:1335 br_netfilter_init+0x68/0x1000 [br_netfilter] do_one_initcall+0xbc/0x47d init/main.c:901 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Modules linked in: br_netfilter(+) backlight comedi(C) hid_sensor_hub max3100 ti_ads8688 udc_core fddi snd_mona leds_gpio rc_streamzap mtd pata_netcell nf_log_common rc_winfast udp_tunnel snd_usbmidi_lib snd_usb_toneport snd_usb_line6 snd_rawmidi snd_seq_device snd_hwdep videobuf2_v4l2 videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops rc_gadmei_rm008z 8250_of smm665 hid_tmff hid_saitek hwmon_vid rc_ati_tv_wonder_hd_600 rc_core pata_pdc202xx_old dn_rtmsg as3722 ad714x_i2c ad714x snd_soc_cs4265 hid_kensington panel_ilitek_ili9322 drm drm_panel_orientation_quirks ipack cdc_phonet usbcore phonet hid_jabra hid extcon_arizona can_dev industrialio_triggered_buffer kfifo_buf industrialio adm1031 i2c_mux_ltc4306 i2c_mux ipmi_msghandler mlxsw_core snd_soc_cs35l34 snd_soc_core snd_pcm_dmaengine snd_pcm snd_timer ac97_bus snd_compress snd soundcore gpio_da9055 uio ecdh_generic mdio_thunder of_mdio fixed_phy libphy mdio_cavium iptable_security iptable_raw iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun joydev mousedev ppdev tpm kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel ide_pci_generic piix aes_x86_64 crypto_simd cryptd ide_core glue_helper input_leds psmouse intel_agp intel_gtt serio_raw ata_generic i2c_piix4 agpgart pata_acpi parport_pc parport floppy rtc_cmos sch_fq_codel ip_tables x_tables sha1_ssse3 sha1_generic ipv6 [last unloaded: br_netfilter] Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace 68741688d5fbfe85 ]--- commit 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links") forgot to handle start_unregistering() case, while header->parent is NULL, it calls erase_header() and as seen in the above syzkaller call trace, accessing &header->parent->root will trigger a NULL pointer dereference. As that commit explained, there is also no need to call start_unregistering() if header->parent is NULL. Link: http://lkml.kernel.org/r/20190409153622.28112-1-yuehaibing@huawei.com Fixes: 23da9588037e ("fs/proc/proc_sysctl.c: fix NULL pointer dereference in put_links") Fixes: 0e47c99d7fe25 ("sysctl: Replace root_list with links between sysctl_table_sets") Signed-off-by: YueHaibing Reported-by: Hulk Robot Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Alexey Dobriyan Cc: Al Viro Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/proc_sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d65390727541..7325baa8f9d4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1626,9 +1626,11 @@ static void drop_sysctl_table(struct ctl_table_header *header) if (--header->nreg) return; - if (parent) + if (parent) { put_links(header); - start_unregistering(header); + start_unregistering(header); + } + if (!--header->count) kfree_rcu(header, rcu); -- cgit v1.2.3 From 94ad68a6e5704b4475f4cc95dd541d21227e749b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 30 Mar 2019 10:21:07 +0900 Subject: NFS: Forbid setting AF_INET6 to "struct sockaddr_in"->sin_family. commit 7c2bd9a39845bfb6d72ddb55ce737650271f6f96 upstream. syzbot is reporting uninitialized value at rpc_sockaddr2uaddr() [1]. This is because syzbot is setting AF_INET6 to "struct sockaddr_in"->sin_family (which is embedded into user-visible "struct nfs_mount_data" structure) despite nfs23_validate_mount_data() cannot pass sizeof(struct sockaddr_in6) bytes of AF_INET6 address to rpc_sockaddr2uaddr(). Since "struct nfs_mount_data" structure is user-visible, we can't change "struct nfs_mount_data" to use "struct sockaddr_storage". Therefore, assuming that everybody is using AF_INET family when passing address via "struct nfs_mount_data"->addr, reject if its sin_family is not AF_INET. [1] https://syzkaller.appspot.com/bug?id=599993614e7cbbf66bc2656a919ab2a95fb5d75c Reported-by: syzbot Signed-off-by: Tetsuo Handa Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6b666d187907..6df9b85caf20 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2052,7 +2052,8 @@ static int nfs23_validate_mount_data(void *options, memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); args->nfs_server.port = ntohs(data->addr.sin_port); - if (!nfs_verify_server_address(sap)) + if (sap->sa_family != AF_INET || + !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) -- cgit v1.2.3 From 9101cbe70ef64c7f35fb75552005a3a696cc288e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Nov 2018 16:44:07 +0100 Subject: aio: clear IOCB_HIPRI commit 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 upstream. No one is going to poll for aio (yet), so we must clear the HIPRI flag, as we would otherwise send it down the poll queues, where no one will be polling for completions. Signed-off-by: Christoph Hellwig IOCB_HIPRI, not RWF_HIPRI. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 45d5ef8dd0a8..78aa249070b1 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1438,8 +1438,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - fput(req->ki_filp); - return ret; + goto out_fput; } req->ki_ioprio = iocb->aio_reqprio; @@ -1448,7 +1447,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - fput(req->ki_filp); + goto out_fput; + + req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ + return 0; + +out_fput: + fput(req->ki_filp); return ret; } -- cgit v1.2.3 From b3373253f0bab538a7521537dfcb73e731b3d732 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Nov 2018 14:27:13 -0700 Subject: aio: use assigned completion handler commit bc9bff61624ac33b7c95861abea1af24ee7a94fc upstream. We know this is a read/write request, but in preparation for having different kinds of those, ensure that we call the assigned handler instead of assuming it's aio_complete_rq(). Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 78aa249070b1..3df3fb0678e5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1492,7 +1492,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; /*FALLTHRU*/ default: - aio_complete_rw(req, ret, 0); + req->ki_complete(req, ret, 0); } } -- cgit v1.2.3 From 730198c889d85db78058cfb57c1b41c65f55c94e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Nov 2018 15:57:42 -0700 Subject: aio: separate out ring reservation from req allocation commit 432c79978c33ecef91b1b04cea6936c20810da29 upstream. This is in preparation for certain types of IO not needing a ring reserveration. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3df3fb0678e5..b9e0df08277b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -902,7 +902,7 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) local_irq_restore(flags); } -static bool get_reqs_available(struct kioctx *ctx) +static bool __get_reqs_available(struct kioctx *ctx) { struct kioctx_cpu *kcpu; bool ret = false; @@ -994,6 +994,14 @@ static void user_refill_reqs_available(struct kioctx *ctx) spin_unlock_irq(&ctx->completion_lock); } +static bool get_reqs_available(struct kioctx *ctx) +{ + if (__get_reqs_available(ctx)) + return true; + user_refill_reqs_available(ctx); + return __get_reqs_available(ctx); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. @@ -1002,24 +1010,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; - if (!get_reqs_available(ctx)) { - user_refill_reqs_available(ctx); - if (!get_reqs_available(ctx)) - return NULL; - } - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) - goto out_put; + return NULL; percpu_ref_get(&ctx->reqs); INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 0); req->ki_ctx = ctx; return req; -out_put: - put_reqs_available(ctx, 1); - return NULL; } static struct kioctx *lookup_ioctx(unsigned long ctx_id) @@ -1813,9 +1812,13 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return -EINVAL; } + if (!get_reqs_available(ctx)) + return -EAGAIN; + + ret = -EAGAIN; req = aio_get_req(ctx); if (unlikely(!req)) - return -EAGAIN; + goto out_put_reqs_available; if (iocb.aio_flags & IOCB_FLAG_RESFD) { /* @@ -1878,11 +1881,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; return 0; out_put_req: - put_reqs_available(ctx, 1); percpu_ref_put(&ctx->reqs); if (req->ki_eventfd) eventfd_ctx_put(req->ki_eventfd); kmem_cache_free(kiocb_cachep, req); +out_put_reqs_available: + put_reqs_available(ctx, 1); return ret; } -- cgit v1.2.3 From ef529eead8cfc11c051b90d239e7137f7141ea94 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 4 Dec 2018 09:44:49 -0700 Subject: aio: don't zero entire aio_kiocb aio_get_req() commit 2bc4ca9bb600cbe36941da2b2a67189fc4302a04 upstream. It's 192 bytes, fairly substantial. Most items don't need to be cleared, especially not upfront. Clear the ones we do need to clear, and leave the other ones for setup when the iocb is prepared and submitted. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index b9e0df08277b..2547f17b4fef 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1010,14 +1010,15 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { struct aio_kiocb *req; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); if (unlikely(!req)) return NULL; percpu_ref_get(&ctx->reqs); + req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); refcount_set(&req->ki_refcnt, 0); - req->ki_ctx = ctx; + req->ki_eventfd = NULL; return req; } @@ -1738,6 +1739,10 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) if (unlikely(!req->file)) return -EBADF; + req->head = NULL; + req->woken = false; + req->cancelled = false; + apt.pt._qproc = aio_poll_queue_proc; apt.pt._key = req->events; apt.iocb = aiocb; -- cgit v1.2.3 From 4d677689742ab60d5be46e20708276368564427a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 24 Nov 2018 21:33:09 -0700 Subject: aio: use iocb_put() instead of open coding it commit 71ebc6fef0f53459f37fb39e1466792232fa52ee upstream. Replace the percpu_ref_put() + kmem_cache_free() with a call to iocb_put() instead. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 2547f17b4fef..e2b63ab28ecc 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1886,10 +1886,9 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; return 0; out_put_req: - percpu_ref_put(&ctx->reqs); if (req->ki_eventfd) eventfd_ctx_put(req->ki_eventfd); - kmem_cache_free(kiocb_cachep, req); + iocb_put(req); out_put_reqs_available: put_reqs_available(ctx, 1); return ret; -- cgit v1.2.3 From d384f8b855a573ea301fd7f5558cc64cb22107e6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 24 Nov 2018 14:46:14 -0700 Subject: aio: split out iocb copy from io_submit_one() commit 88a6f18b950e2e4dce57d31daa151105f4f3dcff upstream. In preparation of handing in iocbs in a different fashion as well. Also make it clear that the iocb being passed in isn't modified, by marking it const throughout. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 68 ++++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index e2b63ab28ecc..6e1da220f04b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1416,7 +1416,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) aio_complete(iocb, res, res2); } -static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; @@ -1457,7 +1457,7 @@ out_fput: return ret; } -static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec, +static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, bool vectored, bool compat, struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; @@ -1496,8 +1496,8 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) } } -static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1529,8 +1529,8 @@ out_fput: return ret; } -static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, - bool compat) +static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, + bool vectored, bool compat) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1585,7 +1585,8 @@ static void aio_fsync_work(struct work_struct *work) aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); } -static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync) +static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, + bool datasync) { if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)) @@ -1719,7 +1720,7 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head, add_wait_queue(head, &pt->iocb->poll.wait); } -static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb) +static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) { struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; @@ -1791,27 +1792,23 @@ out: return 0; } -static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - bool compat) +static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, + struct iocb __user *user_iocb, bool compat) { struct aio_kiocb *req; - struct iocb iocb; ssize_t ret; - if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) - return -EFAULT; - /* enforce forwards compatibility on users */ - if (unlikely(iocb.aio_reserved2)) { + if (unlikely(iocb->aio_reserved2)) { pr_debug("EINVAL: reserve field set\n"); return -EINVAL; } /* prevent overflows */ if (unlikely( - (iocb.aio_buf != (unsigned long)iocb.aio_buf) || - (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) || - ((ssize_t)iocb.aio_nbytes < 0) + (iocb->aio_buf != (unsigned long)iocb->aio_buf) || + (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || + ((ssize_t)iocb->aio_nbytes < 0) )) { pr_debug("EINVAL: overflow check\n"); return -EINVAL; @@ -1825,14 +1822,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) goto out_put_reqs_available; - if (iocb.aio_flags & IOCB_FLAG_RESFD) { + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an * instance of the file* now. The file descriptor must be * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; @@ -1847,32 +1844,32 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, } req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb.aio_data; + req->ki_user_data = iocb->aio_data; - switch (iocb.aio_lio_opcode) { + switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: - ret = aio_read(&req->rw, &iocb, false, compat); + ret = aio_read(&req->rw, iocb, false, compat); break; case IOCB_CMD_PWRITE: - ret = aio_write(&req->rw, &iocb, false, compat); + ret = aio_write(&req->rw, iocb, false, compat); break; case IOCB_CMD_PREADV: - ret = aio_read(&req->rw, &iocb, true, compat); + ret = aio_read(&req->rw, iocb, true, compat); break; case IOCB_CMD_PWRITEV: - ret = aio_write(&req->rw, &iocb, true, compat); + ret = aio_write(&req->rw, iocb, true, compat); break; case IOCB_CMD_FSYNC: - ret = aio_fsync(&req->fsync, &iocb, false); + ret = aio_fsync(&req->fsync, iocb, false); break; case IOCB_CMD_FDSYNC: - ret = aio_fsync(&req->fsync, &iocb, true); + ret = aio_fsync(&req->fsync, iocb, true); break; case IOCB_CMD_POLL: - ret = aio_poll(req, &iocb); + ret = aio_poll(req, iocb); break; default: - pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode); + pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode); ret = -EINVAL; break; } @@ -1894,6 +1891,17 @@ out_put_reqs_available: return ret; } +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, + bool compat) +{ + struct iocb iocb; + + if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb)))) + return -EFAULT; + + return __io_submit_one(ctx, &iocb, user_iocb, compat); +} + /* sys_io_submit: * Queue the nr iocbs pointed to by iocbpp for processing. Returns * the number of iocbs queued. May return -EINVAL if the aio_context -- cgit v1.2.3 From a812f7b68a3940e0369fd0fb24febec794a67623 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 20 Nov 2018 20:06:23 -0700 Subject: aio: abstract out io_event filler helper commit 875736bb3f3ded168469f6a14df7a938416a99d5 upstream. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 6e1da220f04b..f6ce01ca6903 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1059,6 +1059,15 @@ static inline void iocb_put(struct aio_kiocb *iocb) } } +static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, + long res, long res2) +{ + ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; + ev->data = iocb->ki_user_data; + ev->res = res; + ev->res2 = res2; +} + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1086,10 +1095,7 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_user_iocb; - event->data = iocb->ki_user_data; - event->res = res; - event->res2 = res2; + aio_fill_event(event, iocb, res, res2); kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); -- cgit v1.2.3 From 2afa01cd9186974051b38b7d1f31bb2407e41e3a Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Tue, 5 Feb 2019 14:13:35 -0500 Subject: aio: initialize kiocb private in case any filesystems expect it. commit ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 upstream. A recent optimization had left private uninitialized. Fixes: 2bc4ca9bb600 ("aio: don't zero entire aio_kiocb aio_get_req()") Reviewed-by: Christoph Hellwig Signed-off-by: Mike Marshall Signed-off-by: Jens Axboe Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index f6ce01ca6903..d74fc9e112ac 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1430,6 +1430,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) if (unlikely(!req->ki_filp)) return -EBADF; req->ki_complete = aio_complete_rw; + req->private = NULL; req->ki_pos = iocb->aio_offset; req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) -- cgit v1.2.3 From d6b2615f7d31d8e58b685d42dbafcc7dc1204bbd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Mar 2019 14:23:33 -0800 Subject: aio: simplify - and fix - fget/fput for io_submit() commit 84c4e1f89fefe70554da0ab33be72c9be7994379 upstream. Al Viro root-caused a race where the IOCB_CMD_POLL handling of fget/fput() could cause us to access the file pointer after it had already been freed: "In more details - normally IOCB_CMD_POLL handling looks so: 1) io_submit(2) allocates aio_kiocb instance and passes it to aio_poll() 2) aio_poll() resolves the descriptor to struct file by req->file = fget(iocb->aio_fildes) 3) aio_poll() sets ->woken to false and raises ->ki_refcnt of that aio_kiocb to 2 (bumps by 1, that is). 4) aio_poll() calls vfs_poll(). After sanity checks (basically, "poll_wait() had been called and only once") it locks the queue. That's what the extra reference to iocb had been for - we know we can safely access it. 5) With queue locked, we check if ->woken has already been set to true (by aio_poll_wake()) and, if it had been, we unlock the queue, drop a reference to aio_kiocb and bugger off - at that point it's a responsibility to aio_poll_wake() and the stuff called/scheduled by it. That code will drop the reference to file in req->file, along with the other reference to our aio_kiocb. 6) otherwise, we see whether we need to wait. If we do, we unlock the queue, drop one reference to aio_kiocb and go away - eventual wakeup (or cancel) will deal with the reference to file and with the other reference to aio_kiocb 7) otherwise we remove ourselves from waitqueue (still under the queue lock), so that wakeup won't get us. No async activity will be happening, so we can safely drop req->file and iocb ourselves. If wakeup happens while we are in vfs_poll(), we are fine - aio_kiocb won't get freed under us, so we can do all the checks and locking safely. And we don't touch ->file if we detect that case. However, vfs_poll() most certainly *does* touch the file it had been given. So wakeup coming while we are still in ->poll() might end up doing fput() on that file. That case is not too rare, and usually we are saved by the still present reference from descriptor table - that fput() is not the final one. But if another thread closes that descriptor right after our fget() and wakeup does happen before ->poll() returns, we are in trouble - final fput() done while we are in the middle of a method: Al also wrote a patch to take an extra reference to the file descriptor to fix this, but I instead suggested we just streamline the whole file pointer handling by submit_io() so that the generic aio submission code simply keeps the file pointer around until the aio has completed. Fixes: bfe4037e722e ("aio: implement IOCB_CMD_POLL") Acked-by: Al Viro Reported-by: syzbot+503d4cc169fcec1cb18c@syzkaller.appspotmail.com Signed-off-by: Linus Torvalds Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 72 ++++++++++++++++++++++++++-------------------------------------- 1 file changed, 29 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index d74fc9e112ac..46229e663b57 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -161,9 +161,13 @@ struct kioctx { unsigned id; }; +/* + * First field must be the file pointer in all the + * iocb unions! See also 'struct kiocb' in + */ struct fsync_iocb { - struct work_struct work; struct file *file; + struct work_struct work; bool datasync; }; @@ -177,8 +181,15 @@ struct poll_iocb { struct work_struct work; }; +/* + * NOTE! Each of the iocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'ki_filp' in this struct. + */ struct aio_kiocb { union { + struct file *ki_filp; struct kiocb rw; struct fsync_iocb fsync; struct poll_iocb poll; @@ -1054,6 +1065,8 @@ static inline void iocb_put(struct aio_kiocb *iocb) { if (refcount_read(&iocb->ki_refcnt) == 0 || refcount_dec_and_test(&iocb->ki_refcnt)) { + if (iocb->ki_filp) + fput(iocb->ki_filp); percpu_ref_put(&iocb->ki_ctx->reqs); kmem_cache_free(kiocb_cachep, iocb); } @@ -1418,7 +1431,6 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - fput(kiocb->ki_filp); aio_complete(iocb, res, res2); } @@ -1426,9 +1438,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) { int ret; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) - return -EBADF; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; @@ -1445,7 +1454,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) ret = ioprio_check_cap(iocb->aio_reqprio); if (ret) { pr_debug("aio ioprio check cap error: %d\n", ret); - goto out_fput; + return ret; } req->ki_ioprio = iocb->aio_reqprio; @@ -1454,14 +1463,10 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) - goto out_fput; + return ret; req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */ return 0; - -out_fput: - fput(req->ki_filp); - return ret; } static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, @@ -1515,24 +1520,19 @@ static ssize_t aio_read(struct kiocb *req, const struct iocb *iocb, if (ret) return ret; file = req->ki_filp; - - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_READ))) - goto out_fput; + return -EBADF; ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) aio_rw_done(req, call_read_iter(file, req, &iter)); kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } @@ -1549,16 +1549,14 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, return ret; file = req->ki_filp; - ret = -EBADF; if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out_fput; - ret = -EINVAL; + return -EBADF; if (unlikely(!file->f_op->write_iter)) - goto out_fput; + return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); if (ret) - goto out_fput; + return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { /* @@ -1576,9 +1574,6 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, aio_rw_done(req, call_write_iter(file, req, &iter)); } kfree(iovec); -out_fput: - if (unlikely(ret)) - fput(file); return ret; } @@ -1588,7 +1583,6 @@ static void aio_fsync_work(struct work_struct *work) int ret; ret = vfs_fsync(req->file, req->datasync); - fput(req->file); aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); } @@ -1599,13 +1593,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, iocb->aio_rw_flags)) return -EINVAL; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; - if (unlikely(!req->file->f_op->fsync)) { - fput(req->file); + if (unlikely(!req->file->f_op->fsync)) return -EINVAL; - } req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); @@ -1615,10 +1604,7 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) { - struct file *file = iocb->poll.file; - aio_complete(iocb, mangle_poll(mask), 0); - fput(file); } static void aio_poll_complete_work(struct work_struct *work) @@ -1743,9 +1729,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_WORK(&req->work, aio_poll_complete_work); req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; - req->file = fget(iocb->aio_fildes); - if (unlikely(!req->file)) - return -EBADF; req->head = NULL; req->woken = false; @@ -1788,10 +1771,8 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) spin_unlock_irq(&ctx->ctx_lock); out: - if (unlikely(apt.error)) { - fput(req->file); + if (unlikely(apt.error)) return apt.error; - } if (mask) aio_poll_complete(aiocb, mask); @@ -1829,6 +1810,11 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, if (unlikely(!req)) goto out_put_reqs_available; + req->ki_filp = fget(iocb->aio_fildes); + ret = -EBADF; + if (unlikely(!req->ki_filp)) + goto out_put_req; + if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an -- cgit v1.2.3 From c7f2525abfecf8a57a1417837b6a809df79b299e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 Mar 2019 20:22:54 -0500 Subject: pin iocb through aio. commit b53119f13a04879c3bf502828d99d13726639ead upstream. aio_poll() is not the only case that needs file pinned; worse, while aio_read()/aio_write() can live without pinning iocb itself, the proof is rather brittle and can easily break on later changes. Signed-off-by: Linus Torvalds Signed-off-by: Al Viro Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 46229e663b57..10e5a8f52dce 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1016,6 +1016,9 @@ static bool get_reqs_available(struct kioctx *ctx) /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. + * + * The refcount is initialized to 2 - one for the async op completion, + * one for the synchronous code that does this. */ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { @@ -1028,7 +1031,7 @@ static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) percpu_ref_get(&ctx->reqs); req->ki_ctx = ctx; INIT_LIST_HEAD(&req->ki_list); - refcount_set(&req->ki_refcnt, 0); + refcount_set(&req->ki_refcnt, 2); req->ki_eventfd = NULL; return req; } @@ -1061,15 +1064,18 @@ out: return ret; } +static inline void iocb_destroy(struct aio_kiocb *iocb) +{ + if (iocb->ki_filp) + fput(iocb->ki_filp); + percpu_ref_put(&iocb->ki_ctx->reqs); + kmem_cache_free(kiocb_cachep, iocb); +} + static inline void iocb_put(struct aio_kiocb *iocb) { - if (refcount_read(&iocb->ki_refcnt) == 0 || - refcount_dec_and_test(&iocb->ki_refcnt)) { - if (iocb->ki_filp) - fput(iocb->ki_filp); - percpu_ref_put(&iocb->ki_ctx->reqs); - kmem_cache_free(kiocb_cachep, iocb); - } + if (refcount_dec_and_test(&iocb->ki_refcnt)) + iocb_destroy(iocb); } static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, @@ -1743,9 +1749,6 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) INIT_LIST_HEAD(&req->wait.entry); init_waitqueue_func_entry(&req->wait, aio_poll_wake); - /* one for removal from waitqueue, one for this function */ - refcount_set(&aiocb->ki_refcnt, 2); - mask = vfs_poll(req->file, &apt.pt) & req->events; if (unlikely(!req->head)) { /* we did not manage to set up a waitqueue, done */ @@ -1776,7 +1779,6 @@ out: if (mask) aio_poll_complete(aiocb, mask); - iocb_put(aiocb); return 0; } @@ -1867,18 +1869,21 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, break; } + /* Done with the synchronous reference */ + iocb_put(req); + /* * If ret is 0, we'd either done aio_complete() ourselves or have * arranged for that to be done asynchronously. Anything non-zero * means that we need to destroy req ourselves. */ - if (ret) - goto out_put_req; - return 0; + if (!ret) + return 0; + out_put_req: if (req->ki_eventfd) eventfd_ctx_put(req->ki_eventfd); - iocb_put(req); + iocb_destroy(req); out_put_reqs_available: put_reqs_available(ctx, 1); return ret; -- cgit v1.2.3 From 592ea630b081a6c97ec56499b0e12f68fd2da2d8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 11 Mar 2019 19:00:36 -0400 Subject: aio: fold lookup_kiocb() into its sole caller commit 833f4154ed560232120bc475935ee1d6a20e159f upstream. Signed-off-by: Al Viro Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 10e5a8f52dce..cda193f6de76 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1992,24 +1992,6 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, } #endif -/* lookup_kiocb - * Finds a given iocb for cancellation. - */ -static struct aio_kiocb * -lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb) -{ - struct aio_kiocb *kiocb; - - assert_spin_locked(&ctx->ctx_lock); - - /* TODO: use a hash or array, this sucks. */ - list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) - return kiocb; - } - return NULL; -} - /* sys_io_cancel: * Attempts to cancel an iocb previously passed to io_submit. If * the operation is successfully cancelled, the resulting event is @@ -2038,10 +2020,13 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - kiocb = lookup_kiocb(ctx, iocb); - if (kiocb) { - ret = kiocb->ki_cancel(&kiocb->rw); - list_del_init(&kiocb->ki_list); + /* TODO: use a hash or array, this sucks. */ + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_user_iocb == iocb) { + ret = kiocb->ki_cancel(&kiocb->rw); + list_del_init(&kiocb->ki_list); + break; + } } spin_unlock_irq(&ctx->ctx_lock); -- cgit v1.2.3 From c20202c51d2b6703a4e539235f892f34daabd791 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Mar 2019 19:43:45 -0500 Subject: aio: keep io_event in aio_kiocb commit a9339b7855094ba11a97e8822ae038135e879e79 upstream. We want to separate forming the resulting io_event from putting it into the ring buffer. Signed-off-by: Al Viro Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index cda193f6de76..ec30f1bdac0c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -198,8 +198,7 @@ struct aio_kiocb { struct kioctx *ki_ctx; kiocb_cancel_fn *ki_cancel; - struct iocb __user *ki_user_iocb; /* user's aiocb */ - __u64 ki_user_data; /* user's data for completion */ + struct io_event ki_res; struct list_head ki_list; /* the aio core uses this * for cancellation */ @@ -1078,15 +1077,6 @@ static inline void iocb_put(struct aio_kiocb *iocb) iocb_destroy(iocb); } -static void aio_fill_event(struct io_event *ev, struct aio_kiocb *iocb, - long res, long res2) -{ - ev->obj = (u64)(unsigned long)iocb->ki_user_iocb; - ev->data = iocb->ki_user_data; - ev->res = res; - ev->res2 = res2; -} - /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1098,6 +1088,8 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) unsigned tail, pos, head; unsigned long flags; + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail @@ -1114,14 +1106,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - aio_fill_event(event, iocb, res, res2); + *event = iocb->ki_res; kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); - pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, - res, res2); + pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, + (void __user *)(unsigned long)iocb->ki_res.obj, + iocb->ki_res.data, iocb->ki_res.res, iocb->ki_res.res2); /* after flagging the request as done, we * must never even look at it again @@ -1838,8 +1830,10 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, goto out_put_req; } - req->ki_user_iocb = user_iocb; - req->ki_user_data = iocb->aio_data; + req->ki_res.obj = (u64)(unsigned long)user_iocb; + req->ki_res.data = iocb->aio_data; + req->ki_res.res = 0; + req->ki_res.res2 = 0; switch (iocb->aio_lio_opcode) { case IOCB_CMD_PREAD: @@ -2009,6 +2003,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct aio_kiocb *kiocb; int ret = -EINVAL; u32 key; + u64 obj = (u64)(unsigned long)iocb; if (unlikely(get_user(key, &iocb->aio_key))) return -EFAULT; @@ -2022,7 +2017,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, spin_lock_irq(&ctx->ctx_lock); /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { - if (kiocb->ki_user_iocb == iocb) { + if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); list_del_init(&kiocb->ki_list); break; -- cgit v1.2.3 From aab66dfb757aa5b211ec6b0c322b42f4ef5ab34f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Mar 2019 19:49:55 -0500 Subject: aio: store event at final iocb_put() commit 2bb874c0d873d13bd9b9b9c6d7b7c4edab18c8b4 upstream. Instead of having aio_complete() set ->ki_res.{res,res2}, do that explicitly in its callers, drop the reference (as aio_complete() used to do) and delay the rest until the final iocb_put(). Signed-off-by: Al Viro Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index ec30f1bdac0c..556ee620038f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1071,16 +1071,10 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } -static inline void iocb_put(struct aio_kiocb *iocb) -{ - if (refcount_dec_and_test(&iocb->ki_refcnt)) - iocb_destroy(iocb); -} - /* aio_complete * Called when the io request on the given iocb is complete. */ -static void aio_complete(struct aio_kiocb *iocb, long res, long res2) +static void aio_complete(struct aio_kiocb *iocb) { struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; @@ -1088,8 +1082,6 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) unsigned tail, pos, head; unsigned long flags; - iocb->ki_res.res = res; - iocb->ki_res.res2 = res2; /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail @@ -1155,7 +1147,14 @@ static void aio_complete(struct aio_kiocb *iocb, long res, long res2) if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); - iocb_put(iocb); +} + +static inline void iocb_put(struct aio_kiocb *iocb) +{ + if (refcount_dec_and_test(&iocb->ki_refcnt)) { + aio_complete(iocb); + iocb_destroy(iocb); + } } /* aio_read_events_ring @@ -1429,7 +1428,9 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) file_end_write(kiocb->ki_filp); } - aio_complete(iocb, res, res2); + iocb->ki_res.res = res; + iocb->ki_res.res2 = res2; + iocb_put(iocb); } static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) @@ -1577,11 +1578,10 @@ static ssize_t aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { - struct fsync_iocb *req = container_of(work, struct fsync_iocb, work); - int ret; + struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); - ret = vfs_fsync(req->file, req->datasync); - aio_complete(container_of(req, struct aio_kiocb, fsync), ret, 0); + iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + iocb_put(iocb); } static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, @@ -1602,7 +1602,8 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) { - aio_complete(iocb, mangle_poll(mask), 0); + iocb->ki_res.res = mangle_poll(mask); + iocb_put(iocb); } static void aio_poll_complete_work(struct work_struct *work) -- cgit v1.2.3 From e9e47779aaa7212ccb75f8d8d4d16ab188efb313 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Mar 2019 21:45:41 -0500 Subject: Fix aio_poll() races commit af5c72b1fc7a00aa484e90b0c4e0eeb582545634 upstream. aio_poll() has to cope with several unpleasant problems: * requests that might stay around indefinitely need to be made visible for io_cancel(2); that must not be done to a request already completed, though. * in cases when ->poll() has placed us on a waitqueue, wakeup might have happened (and request completed) before ->poll() returns. * worse, in some early wakeup cases request might end up re-added into the queue later - we can't treat "woken up and currently not in the queue" as "it's not going to stick around indefinitely" * ... moreover, ->poll() might have decided not to put it on any queues to start with, and that needs to be distinguished from the previous case * ->poll() might have tried to put us on more than one queue. Only the first will succeed for aio poll, so we might end up missing wakeups. OTOH, we might very well notice that only after the wakeup hits and request gets completed (all before ->poll() gets around to the second poll_wait()). In that case it's too late to decide that we have an error. req->woken was an attempt to deal with that. Unfortunately, it was broken. What we need to keep track of is not that wakeup has happened - the thing might come back after that. It's that async reference is already gone and won't come back, so we can't (and needn't) put the request on the list of cancellables. The easiest case is "request hadn't been put on any waitqueues"; we can tell by seeing NULL apt.head, and in that case there won't be anything async. We should either complete the request ourselves (if vfs_poll() reports anything of interest) or return an error. In all other cases we get exclusion with wakeups by grabbing the queue lock. If request is currently on queue and we have something interesting from vfs_poll(), we can steal it and complete the request ourselves. If it's on queue and vfs_poll() has not reported anything interesting, we either put it on the cancellable list, or, if we know that it hadn't been put on all queues ->poll() wanted it on, we steal it and return an error. If it's _not_ on queue, it's either been already dealt with (in which case we do nothing), or there's aio_poll_complete_work() about to be executed. In that case we either put it on the cancellable list, or, if we know it hadn't been put on all queues ->poll() wanted it on, simulate what cancel would've done. It's a lot more convoluted than I'd like it to be. Single-consumer APIs suck, and unfortunately aio is not an exception... Signed-off-by: Al Viro Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/aio.c | 90 +++++++++++++++++++++++++++++----------------------------------- 1 file changed, 40 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 556ee620038f..911e23087dfb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -175,7 +175,7 @@ struct poll_iocb { struct file *file; struct wait_queue_head *head; __poll_t events; - bool woken; + bool done; bool cancelled; struct wait_queue_entry wait; struct work_struct work; @@ -1600,12 +1600,6 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, return 0; } -static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask) -{ - iocb->ki_res.res = mangle_poll(mask); - iocb_put(iocb); -} - static void aio_poll_complete_work(struct work_struct *work) { struct poll_iocb *req = container_of(work, struct poll_iocb, work); @@ -1631,9 +1625,11 @@ static void aio_poll_complete_work(struct work_struct *work) return; } list_del_init(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; spin_unlock_irq(&ctx->ctx_lock); - aio_poll_complete(iocb, mask); + iocb_put(iocb); } /* assumes we are called with irqs disabled */ @@ -1661,31 +1657,27 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, __poll_t mask = key_to_poll(key); unsigned long flags; - req->woken = true; - /* for instances that support it check for an event match first: */ - if (mask) { - if (!(mask & req->events)) - return 0; + if (mask && !(mask & req->events)) + return 0; + list_del_init(&req->wait.entry); + + if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { /* * Try to complete the iocb inline if we can. Use * irqsave/irqrestore because not all filesystems (e.g. fuse) * call this function with IRQs disabled and because IRQs * have to be disabled before ctx_lock is obtained. */ - if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) { - list_del(&iocb->ki_list); - spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); - - list_del_init(&req->wait.entry); - aio_poll_complete(iocb, mask); - return 1; - } + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); + req->done = true; + spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags); + iocb_put(iocb); + } else { + schedule_work(&req->work); } - - list_del_init(&req->wait.entry); - schedule_work(&req->work); return 1; } @@ -1717,6 +1709,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) struct kioctx *ctx = aiocb->ki_ctx; struct poll_iocb *req = &aiocb->poll; struct aio_poll_table apt; + bool cancel = false; __poll_t mask; /* reject any unknown events outside the normal event mask. */ @@ -1730,7 +1723,7 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP; req->head = NULL; - req->woken = false; + req->done = false; req->cancelled = false; apt.pt._qproc = aio_poll_queue_proc; @@ -1743,36 +1736,33 @@ static ssize_t aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb) init_waitqueue_func_entry(&req->wait, aio_poll_wake); mask = vfs_poll(req->file, &apt.pt) & req->events; - if (unlikely(!req->head)) { - /* we did not manage to set up a waitqueue, done */ - goto out; - } - spin_lock_irq(&ctx->ctx_lock); - spin_lock(&req->head->lock); - if (req->woken) { - /* wake_up context handles the rest */ - mask = 0; + if (likely(req->head)) { + spin_lock(&req->head->lock); + if (unlikely(list_empty(&req->wait.entry))) { + if (apt.error) + cancel = true; + apt.error = 0; + mask = 0; + } + if (mask || apt.error) { + list_del_init(&req->wait.entry); + } else if (cancel) { + WRITE_ONCE(req->cancelled, true); + } else if (!req->done) { /* actually waiting for an event */ + list_add_tail(&aiocb->ki_list, &ctx->active_reqs); + aiocb->ki_cancel = aio_poll_cancel; + } + spin_unlock(&req->head->lock); + } + if (mask) { /* no async, we'd stolen it */ + aiocb->ki_res.res = mangle_poll(mask); apt.error = 0; - } else if (mask || apt.error) { - /* if we get an error or a mask we are done */ - WARN_ON_ONCE(list_empty(&req->wait.entry)); - list_del_init(&req->wait.entry); - } else { - /* actually waiting for an event */ - list_add_tail(&aiocb->ki_list, &ctx->active_reqs); - aiocb->ki_cancel = aio_poll_cancel; } - spin_unlock(&req->head->lock); spin_unlock_irq(&ctx->ctx_lock); - -out: - if (unlikely(apt.error)) - return apt.error; - if (mask) - aio_poll_complete(aiocb, mask); - return 0; + iocb_put(aiocb); + return apt.error; } static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb, -- cgit v1.2.3 From 0311ff82b70fa12e80d188635bff24029ec06ae1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 5 Apr 2019 14:02:10 -0700 Subject: fs: prevent page refcount overflow in pipe_buf_get commit 15fab63e1e57be9fdb5eec1bbc5916e9825e9acb upstream. Change pipe_buf_get() to return a bool indicating whether it succeeded in raising the refcount of the page (if the thing in the pipe is a page). This removes another mechanism for overflowing the page refcount. All callers converted to handle a failure. Reported-by: Jann Horn Signed-off-by: Matthew Wilcox Cc: stable@kernel.org Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 12 ++++++------ fs/pipe.c | 4 ++-- fs/splice.c | 12 ++++++++++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index baaed4d05b22..249de20f752a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1989,10 +1989,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; ret = -EINVAL; - if (rem < len) { - pipe_unlock(pipe); - goto out; - } + if (rem < len) + goto out_free; rem = len; while (rem) { @@ -2010,7 +2008,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); pipe->nrbufs--; } else { - pipe_buf_get(pipe, ibuf); + if (!pipe_buf_get(pipe, ibuf)) + goto out_free; + *obuf = *ibuf; obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->len = rem; @@ -2033,11 +2033,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, ret = fuse_dev_do_write(fud, &cs, len); pipe_lock(pipe); +out_free: for (idx = 0; idx < nbuf; idx++) pipe_buf_release(pipe, &bufs[idx]); pipe_unlock(pipe); -out: kvfree(bufs); return ret; } diff --git a/fs/pipe.c b/fs/pipe.c index c51750ed4011..2a297bce381f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -189,9 +189,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal); * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ -void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - get_page(buf->page); + return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); diff --git a/fs/splice.c b/fs/splice.c index c78e0e3ff6c4..485e409ef841 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1584,7 +1584,11 @@ retry: * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } *obuf = *ibuf; /* @@ -1658,7 +1662,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, * Get a reference to this pipe buffer, * so we can copy the contents over. */ - pipe_buf_get(ipipe, ibuf); + if (!pipe_buf_get(ipipe, ibuf)) { + if (ret == 0) + ret = -EFAULT; + break; + } obuf = opipe->bufs + nbuf; *obuf = *ibuf; -- cgit v1.2.3 From b989a3e9d260e2b567f5ae501fdf445f626911c1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 21 Mar 2019 17:57:56 -0400 Subject: NFS: Fix a typo in nfs_init_timeout_values() [ Upstream commit 5a698243930c441afccec04e4d5dc8febfd2b775 ] Specifying a retrans=0 mount parameter to a NFS/TCP mount, is inadvertently causing the NFS client to rewrite any specified timeout parameter to the default of 60 seconds. Fixes: a956beda19a6 ("NFS: Allow the mount option retrans=0") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin (Microsoft) --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 96d5f8135eb9..751ca65da8a3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -459,7 +459,7 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, case XPRT_TRANSPORT_RDMA: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_TCP_RETRANS; - if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0) + if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0) to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10; if (to->to_initval > NFS_MAX_TCP_TIMEOUT) to->to_initval = NFS_MAX_TCP_TIMEOUT; -- cgit v1.2.3 From 97c4f3a8853c50968a40e183dabd2ec27feba191 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2019 01:38:58 +0000 Subject: ceph: fix use-after-free on symlink traversal [ Upstream commit daf5cc27eed99afdea8d96e71b89ba41f5406ef6 ] free the symlink body after the same RCU delay we have for freeing the struct inode itself, so that traversal during RCU pathwalk wouldn't step into freed memory. Signed-off-by: Al Viro Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov Signed-off-by: Sasha Levin (Microsoft) --- fs/ceph/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4055ab4d5c52..3e518c2ae2bf 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -524,6 +524,7 @@ static void ceph_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct ceph_inode_info *ci = ceph_inode(inode); + kfree(ci->i_symlink); kmem_cache_free(ceph_inode_cachep, ci); } @@ -561,7 +562,6 @@ void ceph_destroy_inode(struct inode *inode) ceph_put_snap_realm(mdsc, realm); } - kfree(ci->i_symlink); while ((n = rb_first(&ci->i_fragtree)) != NULL) { frag = rb_entry(n, struct ceph_inode_frag, node); rb_erase(n, &ci->i_fragtree); -- cgit v1.2.3 From e22c11da0a8683d22011bbce18da493c079d67b3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2019 01:39:50 +0000 Subject: jffs2: fix use-after-free on symlink traversal [ Upstream commit 4fdcfab5b5537c21891e22e65996d4d0dd8ab4ca ] free the symlink body after the same RCU delay we have for freeing the struct inode itself, so that traversal during RCU pathwalk wouldn't step into freed memory. Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/jffs2/readinode.c | 5 ----- fs/jffs2/super.c | 5 ++++- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 389ea53ea487..bccfc40b3a74 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1414,11 +1414,6 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); - if (f->target) { - kfree(f->target); - f->target = NULL; - } - fds = f->dents; while(fds) { fd = fds; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb6ae387469f..05d892c79339 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -47,7 +47,10 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb) static void jffs2_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + + kfree(f->target); + kmem_cache_free(jffs2_inode_cachep, f); } static void jffs2_destroy_inode(struct inode *inode) -- cgit v1.2.3 From f0112b649525fda883ae77db7d0774cb4da47cb8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Mar 2019 01:43:37 +0000 Subject: debugfs: fix use-after-free on symlink traversal [ Upstream commit 93b919da64c15b90953f96a536e5e61df896ca57 ] symlink body shouldn't be freed without an RCU delay. Switch debugfs to ->destroy_inode() and use of call_rcu(); free both the inode and symlink body in the callback. Similar to solution for bpf, only here it's even more obvious that ->evict_inode() can be dropped. Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/debugfs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 41ef452c1fcf..e5126fad57c5 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -163,19 +163,24 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void debugfs_evict_inode(struct inode *inode) +static void debugfs_i_callback(struct rcu_head *head) { - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); + struct inode *inode = container_of(head, struct inode, i_rcu); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); + free_inode_nonrcu(inode); +} + +static void debugfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, debugfs_i_callback); } static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, - .evict_inode = debugfs_evict_inode, + .destroy_inode = debugfs_destroy_inode, }; static void debugfs_release_dentry(struct dentry *dentry) -- cgit v1.2.3 From b51fdcbe45d1f09b5ecea5d2b796d37b8e6cbb4c Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 5 Apr 2019 18:39:06 -0700 Subject: hugetlbfs: fix memory leak for resv_map [ Upstream commit 58b6e5e8f1addd44583d61b0a03c0f5519527e35 ] When mknod is used to create a block special file in hugetlbfs, it will allocate an inode and kmalloc a 'struct resv_map' via resv_map_alloc(). inode->i_mapping->private_data will point the newly allocated resv_map. However, when the device special file is opened bd_acquire() will set inode->i_mapping to bd_inode->i_mapping. Thus the pointer to the allocated resv_map is lost and the structure is leaked. Programs to reproduce: mount -t hugetlbfs nodev hugetlbfs mknod hugetlbfs/dev b 0 0 exec 30<> hugetlbfs/dev umount hugetlbfs/ resv_map structures are only needed for inodes which can have associated page allocations. To fix the leak, only allocate resv_map for those inodes which could possibly be associated with page allocations. Link: http://lkml.kernel.org/r/20190401213101.16476-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Andrew Morton Reported-by: Yufen Yu Suggested-by: Yufen Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/hugetlbfs/inode.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..a3a3d256fb0e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -741,11 +741,17 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev) { struct inode *inode; - struct resv_map *resv_map; + struct resv_map *resv_map = NULL; - resv_map = resv_map_alloc(); - if (!resv_map) - return NULL; + /* + * Reserve maps are only needed for inodes that can have associated + * page allocations. + */ + if (S_ISREG(mode) || S_ISLNK(mode)) { + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; + } inode = new_inode(sb); if (inode) { @@ -780,8 +786,10 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); - } else - kref_put(&resv_map->refs, resv_map_release); + } else { + if (resv_map) + kref_put(&resv_map->refs, resv_map_release); + } return inode; } -- cgit v1.2.3 From 04b4d5f75ab04858a9b6dff94d74e4fdf0b9f194 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 26 Mar 2019 22:20:43 +0000 Subject: fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock [ Upstream commit 10dce8af34226d90fa56746a934f8da5dcdba3df ] Commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") added locking for file.f_pos access and in particular made concurrent read and write not possible - now both those functions take f_pos lock for the whole run, and so if e.g. a read is blocked waiting for data, write will deadlock waiting for that read to complete. This caused regression for stream-like files where previously read and write could run simultaneously, but after that patch could not do so anymore. See e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") which fixes such regression for particular case of /proc/xen/xenbus. The patch that added f_pos lock in 2014 did so to guarantee POSIX thread safety for read/write/lseek and added the locking to file descriptors of all regular files. In 2014 that thread-safety problem was not new as it was already discussed earlier in 2006. However even though 2006'th version of Linus's patch was adding f_pos locking "only for files that are marked seekable with FMODE_LSEEK (thus avoiding the stream-like objects like pipes and sockets)", the 2014 version - the one that actually made it into the tree as 9c225f2655e3 - is doing so irregardless of whether a file is seekable or not. See https://lore.kernel.org/lkml/53022DB1.4070805@gmail.com/ https://lwn.net/Articles/180387 https://lwn.net/Articles/180396 for historic context. The reason that it did so is, probably, that there are many files that are marked non-seekable, but e.g. their read implementation actually depends on knowing current position to correctly handle the read. Some examples: kernel/power/user.c snapshot_read fs/debugfs/file.c u32_array_read fs/fuse/control.c fuse_conn_waiting_read + ... drivers/hwmon/asus_atk0110.c atk_debugfs_ggrp_read arch/s390/hypfs/inode.c hypfs_read_iter ... Despite that, many nonseekable_open users implement read and write with pure stream semantics - they don't depend on passed ppos at all. And for those cases where read could wait for something inside, it creates a situation similar to xenbus - the write could be never made to go until read is done, and read is waiting for some, potentially external, event, for potentially unbounded time -> deadlock. Besides xenbus, there are 14 such places in the kernel that I've found with semantic patch (see below): drivers/xen/evtchn.c:667:8-24: ERROR: evtchn_fops: .read() can deadlock .write() drivers/isdn/capi/capi.c:963:8-24: ERROR: capi_fops: .read() can deadlock .write() drivers/input/evdev.c:527:1-17: ERROR: evdev_fops: .read() can deadlock .write() drivers/char/pcmcia/cm4000_cs.c:1685:7-23: ERROR: cm4000_fops: .read() can deadlock .write() net/rfkill/core.c:1146:8-24: ERROR: rfkill_fops: .read() can deadlock .write() drivers/s390/char/fs3270.c:488:1-17: ERROR: fs3270_fops: .read() can deadlock .write() drivers/usb/misc/ldusb.c:310:1-17: ERROR: ld_usb_fops: .read() can deadlock .write() drivers/hid/uhid.c:635:1-17: ERROR: uhid_fops: .read() can deadlock .write() net/batman-adv/icmp_socket.c:80:1-17: ERROR: batadv_fops: .read() can deadlock .write() drivers/media/rc/lirc_dev.c:198:1-17: ERROR: lirc_fops: .read() can deadlock .write() drivers/leds/uleds.c:77:1-17: ERROR: uleds_fops: .read() can deadlock .write() drivers/input/misc/uinput.c:400:1-17: ERROR: uinput_fops: .read() can deadlock .write() drivers/infiniband/core/user_mad.c:985:7-23: ERROR: umad_fops: .read() can deadlock .write() drivers/gnss/core.c:45:1-17: ERROR: gnss_fops: .read() can deadlock .write() In addition to the cases above another regression caused by f_pos locking is that now FUSE filesystems that implement open with FOPEN_NONSEEKABLE flag, can no longer implement bidirectional stream-like files - for the same reason as above e.g. read can deadlock write locking on file.f_pos in the kernel. FUSE's FOPEN_NONSEEKABLE was added in 2008 in a7c1b990f715 ("fuse: implement nonseekable open") to support OSSPD. OSSPD implements /dev/dsp in userspace with FOPEN_NONSEEKABLE flag, with corresponding read and write routines not depending on current position at all, and with both read and write being potentially blocking operations: See https://github.com/libfuse/osspd https://lwn.net/Articles/308445 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1406 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1438-L1477 https://github.com/libfuse/osspd/blob/14a9cff0/osspd.c#L1479-L1510 Corresponding libfuse example/test also describes FOPEN_NONSEEKABLE as "somewhat pipe-like files ..." with read handler not using offset. However that test implements only read without write and cannot exercise the deadlock scenario: https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L124-L131 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L146-L163 https://github.com/libfuse/libfuse/blob/fuse-3.4.2-3-ga1bff7d/example/poll.c#L209-L216 I've actually hit the read vs write deadlock for real while implementing my FUSE filesystem where there is /head/watch file, for which open creates separate bidirectional socket-like stream in between filesystem and its user with both read and write being later performed simultaneously. And there it is semantically not easy to split the stream into two separate read-only and write-only channels: https://lab.nexedi.com/kirr/wendelin.core/blob/f13aa600/wcfs/wcfs.go#L88-169 Let's fix this regression. The plan is: 1. We can't change nonseekable_open to include &~FMODE_ATOMIC_POS - doing so would break many in-kernel nonseekable_open users which actually use ppos in read/write handlers. 2. Add stream_open() to kernel to open stream-like non-seekable file descriptors. Read and write on such file descriptors would never use nor change ppos. And with that property on stream-like files read and write will be running without taking f_pos lock - i.e. read and write could be running simultaneously. 3. With semantic patch search and convert to stream_open all in-kernel nonseekable_open users for which read and write actually do not depend on ppos and where there is no other methods in file_operations which assume @offset access. 4. Add FOPEN_STREAM to fs/fuse/ and open in-kernel file-descriptors via steam_open if that bit is present in filesystem open reply. It was tempting to change fs/fuse/ open handler to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. 5. Add stream_open and FOPEN_STREAM handling to stable kernels starting from v3.14+ (the kernel where 9c225f2655 first appeared). This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in their open handler and this way avoid the deadlock on all kernel versions. This should work because fs/fuse/ ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. This patch adds stream_open, converts /proc/xen/xenbus to it and adds semantic patch to automatically locate in-kernel places that are either required to be converted due to read vs write deadlock, or that are just safe to be converted because read and write do not use ppos and there are no other funky methods in file_operations. Regarding semantic patch I've verified each generated change manually - that it is correct to convert - and each other nonseekable_open instance left - that it is either not correct to convert there, or that it is not converted due to current stream_open.cocci limitations. The script also does not convert files that should be valid to convert, but that currently have .llseek = noop_llseek or generic_file_llseek for unknown reason despite file being opened with nonseekable_open (e.g. drivers/input/mousedev.c) Cc: Michael Kerrisk Cc: Yongzhi Pan Cc: Jonathan Corbet Cc: David Vrabel Cc: Juergen Gross Cc: Miklos Szeredi Cc: Tejun Heo Cc: Kirill Tkhai Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: Julia Lawall Cc: Nikolaus Rath Cc: Han-Wen Nienhuys Signed-off-by: Kirill Smelkov Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- fs/open.c | 18 ++++++++++++++++++ fs/read_write.c | 5 +++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index f1c2f855fd43..a00350018a47 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1215,3 +1215,21 @@ int nonseekable_open(struct inode *inode, struct file *filp) } EXPORT_SYMBOL(nonseekable_open); + +/* + * stream_open is used by subsystems that want stream-like file descriptors. + * Such file descriptors are not seekable and don't have notion of position + * (file.f_pos is always 0). Contrary to file descriptors of other regular + * files, .read() and .write() can run simultaneously. + * + * stream_open never fails and is marked to return int so that it could be + * directly used as file_operations.open . + */ +int stream_open(struct inode *inode, struct file *filp) +{ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS); + filp->f_mode |= FMODE_STREAM; + return 0; +} + +EXPORT_SYMBOL(stream_open); diff --git a/fs/read_write.c b/fs/read_write.c index 562974a0616c..85fd7a8ee29e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -560,12 +560,13 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ static inline loff_t file_pos_read(struct file *file) { - return file->f_pos; + return file->f_mode & FMODE_STREAM ? 0 : file->f_pos; } static inline void file_pos_write(struct file *file, loff_t pos) { - file->f_pos = pos; + if ((file->f_mode & FMODE_STREAM) == 0) + file->f_pos = pos; } ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) -- cgit v1.2.3 From e361ccccdd51f45a2d54f6454192acda06cb2598 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Tue, 16 Apr 2019 14:17:11 +0200 Subject: kernfs: fix barrier usage in __kernfs_new_node() commit 998267900cee901c5d1dfa029a6304d00acbc29f upstream. smp_mb__before_atomic() can not be applied to atomic_set(). Remove the barrier and rely on RELEASE synchronization. Fixes: ba16b2846a8c6 ("kernfs: add an API to get kernfs node from inode number") Cc: stable@vger.kernel.org Signed-off-by: Andrea Parri Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 4ca0b5c18192..853a69e493f5 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -650,11 +650,10 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->id.generation = gen; /* - * set ino first. This barrier is paired with atomic_inc_not_zero in + * set ino first. This RELEASE is paired with atomic_inc_not_zero in * kernfs_find_and_get_node_by_ino */ - smp_mb__before_atomic(); - atomic_set(&kn->count, 1); + atomic_set_release(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); -- cgit v1.2.3 From 58be7c109cea9ea619dde5ac504ef9e001a4c2c0 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Sat, 13 Apr 2019 08:37:37 +0100 Subject: afs: Unlock pages for __pagevec_release() [ Upstream commit 21bd68f196ca91fc0f3d9bd1b32f6e530e8c1c88 ] __pagevec_release() complains loudly if any page in the vector is still locked. The pages need to be locked for generic_error_remove_page(), but that function doesn't actually unlock them. Unlock the pages afterwards. Signed-off-by: Marc Dionne Signed-off-by: David Howells Tested-by: Jonathan Billings Signed-off-by: Sasha Levin --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 19c04caf3c01..e00461a6de9a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -253,6 +253,7 @@ static void afs_kill_pages(struct address_space *mapping, first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); + unlock_page(page); } __pagevec_release(&pv); -- cgit v1.2.3 From 58db3813680ef3fd74dea8154124831043dd3860 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 13 May 2019 17:15:33 -0700 Subject: mm/huge_memory: fix vmf_insert_pfn_{pmd, pud}() crash, handle unaligned addresses commit fce86ff5802bac3a7b19db171aa1949ef9caac31 upstream. Starting with c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()") vmf_insert_pfn_pmd() internally calls pmdp_set_access_flags(). That helper enforces a pmd aligned @address argument via VM_BUG_ON() assertion. Update the implementation to take a 'struct vm_fault' argument directly and apply the address alignment fixup internally to fix crash signatures like: kernel BUG at arch/x86/mm/pgtable.c:515! invalid opcode: 0000 [#1] SMP NOPTI CPU: 51 PID: 43713 Comm: java Tainted: G OE 4.19.35 #1 [..] RIP: 0010:pmdp_set_access_flags+0x48/0x50 [..] Call Trace: vmf_insert_pfn_pmd+0x198/0x350 dax_iomap_fault+0xe82/0x1190 ext4_dax_huge_fault+0x103/0x1f0 ? __switch_to_asm+0x40/0x70 __handle_mm_fault+0x3f6/0x1370 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 handle_mm_fault+0xda/0x200 __do_page_fault+0x249/0x4f0 do_page_fault+0x32/0x110 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 Link: http://lkml.kernel.org/r/155741946350.372037.11148198430068238140.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()") Signed-off-by: Dan Williams Reported-by: Piotr Balcer Tested-by: Yan Ma Tested-by: Pankaj Gupta Reviewed-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Aneesh Kumar K.V Cc: Chandan Rajendra Cc: Souptick Joarder Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/dax.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 09fa70683c41..004c8ac1117c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1660,8 +1660,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, } trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); - result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, - write); + result = vmf_insert_pfn_pmd(vmf, pfn, write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1775,8 +1774,7 @@ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, break; #ifdef CONFIG_FS_DAX_PMD case PE_SIZE_PMD: - ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, true); + ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); break; #endif default: -- cgit v1.2.3 From a3ccc156f365a9615890b9fbf99d8a663f2c4b56 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 May 2019 17:19:41 -0700 Subject: hugetlb: use same fault hash key for shared and private mappings commit 1b426bac66e6cc83c9f2d92b96e4e72acf43419a upstream. hugetlb uses a fault mutex hash table to prevent page faults of the same pages concurrently. The key for shared and private mappings is different. Shared keys off address_space and file index. Private keys off mm and virtual address. Consider a private mappings of a populated hugetlbfs file. A fault will map the page from the file and if needed do a COW to map a writable page. Hugetlbfs hole punch uses the fault mutex to prevent mappings of file pages. It uses the address_space file index key. However, private mappings will use a different key and could race with this code to map the file page. This causes problems (BUG) for the page cache remove code as it expects the page to be unmapped. A sample stack is: page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) kernel BUG at mm/filemap.c:169! ... RIP: 0010:unaccount_page_cache_page+0x1b8/0x200 ... Call Trace: __delete_from_page_cache+0x39/0x220 delete_from_page_cache+0x45/0x70 remove_inode_hugepages+0x13c/0x380 ? __add_to_page_cache_locked+0x162/0x380 hugetlbfs_fallocate+0x403/0x540 ? _cond_resched+0x15/0x30 ? __inode_security_revalidate+0x5d/0x70 ? selinux_file_permission+0x100/0x130 vfs_fallocate+0x13f/0x270 ksys_fallocate+0x3c/0x80 __x64_sys_fallocate+0x1a/0x20 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 There seems to be another potential COW issue/race with this approach of different private and shared keys as noted in commit 8382d914ebf7 ("mm, hugetlb: improve page-fault scalability"). Since every hugetlb mapping (even anon and private) is actually a file mapping, just use the address_space index key for all mappings. This results in potentially more hash collisions. However, this should not be the common case. Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5 Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: Joonsoo Kim Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hugetlbfs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a3a3d256fb0e..7a24f91af29e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -426,9 +426,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -625,8 +623,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, - index, addr); + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ -- cgit v1.2.3 From 3574bc98e2fe3f66090682dd91ebd9b96d0236f3 Mon Sep 17 00:00:00 2001 From: Shuning Zhang Date: Mon, 13 May 2019 17:15:56 -0700 Subject: ocfs2: fix ocfs2 read inode data panic in ocfs2_iget commit e091eab028f9253eac5c04f9141bbc9d170acab3 upstream. In some cases, ocfs2_iget() reads the data of inode, which has been deleted for some reason. That will make the system panic. So We should judge whether this inode has been deleted, and tell the caller that the inode is a bad inode. For example, the ocfs2 is used as the backed of nfs, and the client is nfsv3. This issue can be reproduced by the following steps. on the nfs server side, ..../patha/pathb Step 1: The process A was scheduled before calling the function fh_verify. Step 2: The process B is removing the 'pathb', and just completed the call to function dput. Then the dentry of 'pathb' has been deleted from the dcache, and all ancestors have been deleted also. The relationship of dentry and inode was deleted through the function hlist_del_init. The following is the call stack. dentry_iput->hlist_del_init(&dentry->d_u.d_alias) At this time, the inode is still in the dcache. Step 3: The process A call the function ocfs2_get_dentry, which get the inode from dcache. Then the refcount of inode is 1. The following is the call stack. nfsd3_proc_getacl->fh_verify->exportfs_decode_fh->fh_to_dentry(ocfs2_get_dentry) Step 4: Dirty pages are flushed by bdi threads. So the inode of 'patha' is evicted, and this directory was deleted. But the inode of 'pathb' can't be evicted, because the refcount of the inode was 1. Step 5: The process A keep running, and call the function reconnect_path(in exportfs_decode_fh), which call function ocfs2_get_parent of ocfs2. Get the block number of parent directory(patha) by the name of ... Then read the data from disk by the block number. But this inode has been deleted, so the system panic. Process A Process B 1. in nfsd3_proc_getacl | 2. | dput 3. fh_to_dentry(ocfs2_get_dentry) | 4. bdi flush dirty cache | 5. ocfs2_iget | [283465.542049] OCFS2: ERROR (device sdp): ocfs2_validate_inode_block: Invalid dinode #580640: OCFS2_VALID_FL not set [283465.545490] Kernel panic - not syncing: OCFS2: (device sdp): panic forced after error [283465.546889] CPU: 5 PID: 12416 Comm: nfsd Tainted: G W 4.1.12-124.18.6.el6uek.bug28762940v3.x86_64 #2 [283465.548382] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 09/21/2015 [283465.549657] 0000000000000000 ffff8800a56fb7b8 ffffffff816e839c ffffffffa0514758 [283465.550392] 000000000008dc20 ffff8800a56fb838 ffffffff816e62d3 0000000000000008 [283465.551056] ffff880000000010 ffff8800a56fb848 ffff8800a56fb7e8 ffff88005df9f000 [283465.551710] Call Trace: [283465.552516] [] dump_stack+0x63/0x81 [283465.553291] [] panic+0xcb/0x21b [283465.554037] [] ocfs2_handle_error+0xf0/0xf0 [ocfs2] [283465.554882] [] __ocfs2_error+0x67/0x70 [ocfs2] [283465.555768] [] ocfs2_validate_inode_block+0x229/0x230 [ocfs2] [283465.556683] [] ocfs2_read_blocks+0x46c/0x7b0 [ocfs2] [283465.557408] [] ? ocfs2_inode_cache_io_unlock+0x20/0x20 [ocfs2] [283465.557973] [] ocfs2_read_inode_block_full+0x3b/0x60 [ocfs2] [283465.558525] [] ocfs2_iget+0x4aa/0x880 [ocfs2] [283465.559082] [] ocfs2_get_parent+0x9e/0x220 [ocfs2] [283465.559622] [] reconnect_path+0xb5/0x300 [283465.560156] [] exportfs_decode_fh+0xf6/0x2b0 [283465.560708] [] ? nfsd_proc_getattr+0xa0/0xa0 [nfsd] [283465.561262] [] ? prepare_creds+0x26/0x110 [283465.561932] [] fh_verify+0x350/0x660 [nfsd] [283465.562862] [] ? nfsd_cache_lookup+0x44/0x630 [nfsd] [283465.563697] [] nfsd3_proc_getattr+0x69/0xf0 [nfsd] [283465.564510] [] nfsd_dispatch+0xe0/0x290 [nfsd] [283465.565358] [] ? svc_tcp_adjust_wspace+0x12/0x30 [sunrpc] [283465.566272] [] svc_process_common+0x412/0x6a0 [sunrpc] [283465.567155] [] svc_process+0x123/0x210 [sunrpc] [283465.568020] [] nfsd+0xff/0x170 [nfsd] [283465.568962] [] ? nfsd_destroy+0x80/0x80 [nfsd] [283465.570112] [] kthread+0xcb/0xf0 [283465.571099] [] ? kthread_create_on_node+0x180/0x180 [283465.572114] [] ret_from_fork+0x58/0x90 [283465.573156] [] ? kthread_create_on_node+0x180/0x180 Link: http://lkml.kernel.org/r/1554185919-3010-1-git-send-email-sunny.s.zhang@oracle.com Signed-off-by: Shuning Zhang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: piaojun Cc: "Gang He" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/export.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 4bf8d5854b27..af2888d23de3 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) u64 blkno; struct dentry *parent; struct inode *dir = d_inode(child); + int set; trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); + status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + parent = ERR_PTR(status); + goto bail; + } + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); parent = ERR_PTR(status); - goto bail; + goto unlock_nfs_sync; } status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); @@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } + status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + parent = ERR_PTR(status); + goto bail_unlock; + } + + trace_ocfs2_get_dentry_test_bit(status, set); + if (!set) { + status = -ESTALE; + parent = ERR_PTR(status); + goto bail_unlock; + } + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); bail_unlock: ocfs2_inode_unlock(dir, 0); +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1); + bail: trace_ocfs2_get_parent_end(parent); -- cgit v1.2.3 From 001fe0dab4eacd9e714b7c9de5f13af2bb3c3e1a Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Sat, 6 Apr 2019 18:57:40 -0400 Subject: jbd2: check superblock mapped prior to committing commit 742b06b5628f2cd23cb51a034cb54dc33c6162c5 upstream. We hit a BUG at fs/buffer.c:3057 if we detached the nbd device before unmounting ext4 filesystem. The typical chain of events leading to the BUG: jbd2_write_superblock submit_bh submit_bh_wbc BUG_ON(!buffer_mapped(bh)); The block device is removed and all the pages are invalidated. JBD2 was trying to write journal superblock to the block device which is no longer present. Fix this by checking the journal superblock's buffer head prior to submitting. Reported-by: Eric Ren Signed-off-by: Jiufei Xue Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/journal.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 88f2a49338a1..9e618777c699 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1366,6 +1366,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) journal_superblock_t *sb = journal->j_superblock; int ret; + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(bh)) + return -EIO; + trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); -- cgit v1.2.3 From 71478ef67d7c39ba735c07ce3e39eae21ad8681a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 6 Apr 2019 18:33:06 -0400 Subject: ext4: make sanity check in mballoc more strict commit 31562b954b60f02acb91b7349dc6432d3f8c3c5f upstream. The sanity check in mb_find_extent() only checked that returned extent does not extend past blocksize * 8, however it should not extend past EXT4_CLUSTERS_PER_GROUP(sb). This can happen when clusters_per_group < blocksize * 8 and the tail of the bitmap is not properly filled by 1s which happened e.g. when ancient kernels have grown the filesystem. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e29fce2fbf25..cc229f3357f7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1539,7 +1539,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, ex->fe_len += 1 << order; } - if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) { + if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent " -- cgit v1.2.3 From f0f805f8b9e72be1d77bc1cc6f85eab0c3fd637a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 10 Apr 2019 00:37:36 -0400 Subject: ext4: ignore e_value_offs for xattrs with value-in-ea-inode commit e5d01196c0428a206f307e9ee5f6842964098ff0 upstream. In other places in fs/ext4/xattr.c, if e_value_inum is non-zero, the code ignores the value in e_value_offs. The e_value_offs *should* be zero, but we shouldn't depend upon it, since it might not be true in a corrupted/fuzzed file system. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202897 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202877 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 006c277dc22e..f73fc90e5daa 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1700,7 +1700,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* No failures allowed past this point. */ - if (!s->not_found && here->e_value_size && here->e_value_offs) { + if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); -- cgit v1.2.3 From b12a8d80a46e8ab02eb9124a36562a93bbd7224b Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 25 Apr 2019 11:44:15 -0400 Subject: ext4: avoid drop reference to iloc.bh twice commit 8c380ab4b7b59c0c602743810be1b712514eaebc upstream. The reference to iloc.bh has been dropped in ext4_mark_iloc_dirty. However, the reference is dropped again if error occurs during ext4_handle_dirty_metadata, which may result in use-after-free bugs. Fixes: fb265c9cb49e("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases") Signed-off-by: Pan Bian Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e7ae26e36c9c..4d5c0fc9d23a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -874,6 +874,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); if (unlikely(err)) { ext4_std_error(sb, err); + iloc.bh = NULL; goto errout; } brelse(dind); -- cgit v1.2.3 From 2a18c9c76718f77a08177f1fc20007fab2fdafdd Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Thu, 25 Apr 2019 11:55:50 -0400 Subject: ext4: fix use-after-free race with debug_want_extra_isize commit 7bc04c5c2cc467c5b40f2b03ba08da174a0d5fa7 upstream. When remounting with debug_want_extra_isize, we were not performing the same checks that we do during a normal mount. That allowed us to set a value for s_want_extra_isize that reached outside the s_inode_size. Fixes: e2b911c53584 ("ext4: clean up feature test macros with predicate functions") Reported-by: syzbot+f584efa0ac7213c226b7@syzkaller.appspotmail.com Reviewed-by: Jan Kara Signed-off-by: Barret Rhoden Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 58 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index abba7ece78e9..a2437d352e09 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3514,6 +3514,37 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } +static void ext4_clamp_want_extra_isize(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + /* determine the minimum size of new large inodes, if present */ + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && + sbi->s_want_extra_isize == 0) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_want_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_want_extra_isize); + if (sbi->s_want_extra_isize < + le16_to_cpu(es->s_min_extra_isize)) + sbi->s_want_extra_isize = + le16_to_cpu(es->s_min_extra_isize); + } + } + /* Check if enough inode space is available */ + if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > + sbi->s_inode_size) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + ext4_msg(sb, KERN_INFO, + "required extra inode space not available"); + } +} + static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; @@ -4388,30 +4419,7 @@ no_journal: } else if (ret) goto failed_mount4a; - /* determine the minimum size of new large inodes, if present */ - if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && - sbi->s_want_extra_isize == 0) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - if (ext4_has_feature_extra_isize(sb)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size) { - sbi->s_want_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - ext4_msg(sb, KERN_INFO, "required extra inode space not" - "available"); - } + ext4_clamp_want_extra_isize(sb); ext4_set_resv_clusters(sb); @@ -5197,6 +5205,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + ext4_clamp_want_extra_isize(sb); + if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " -- cgit v1.2.3 From f795247578aaf398d2af4de902264d194c12222f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 25 Apr 2019 13:06:18 -0400 Subject: ext4: actually request zeroing of inode table after grow commit 310a997fd74de778b9a4848a64be9cda9f18764a upstream. It is never possible, that number of block groups decreases, since only online grow is supported. But after a growing occured, we have to zero inode tables for just created new block groups. Fixes: 19c5246d2516 ("ext4: add new online resize interface") Signed-off-by: Kirill Tkhai Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5f24fdc140ad..53d57cdf3c4d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -977,7 +977,7 @@ mext_out: if (err == 0) err = err2; mnt_drop_write_file(filp); - if (!err && (o_group > EXT4_SB(sb)->s_groups_count) && + if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, o_group); -- cgit v1.2.3 From 45123ae534e042e4e21d7e5882f2f22c92f35f46 Mon Sep 17 00:00:00 2001 From: Debabrata Banerjee Date: Tue, 30 Apr 2019 23:08:15 -0400 Subject: ext4: fix ext4_show_options for file systems w/o journal commit 50b29d8f033a7c88c5bc011abc2068b1691ab755 upstream. Instead of removing EXT4_MOUNT_JOURNAL_CHECKSUM from s_def_mount_opt as I assume was intended, all other options were blown away leading to _ext4_show_options() output being incorrect. Fixes: 1e381f60dad9 ("ext4: do not allow journal_opts for fs w/o journal") Signed-off-by: Debabrata Banerjee Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a2437d352e09..cac6ea956659 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4270,7 +4270,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "data=, fs mounted w/o journal"); goto failed_mount_wq; } - sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM; + sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); sbi->s_journal = NULL; -- cgit v1.2.3 From d8925a1fee71acb0fe1496a5fa80bda4c978d257 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 12 Mar 2019 17:10:40 +0800 Subject: btrfs: Check the first key and level for cached extent buffer commit 448de471cd4cab0cedd15770082567a69a784a11 upstream. [BUG] When reading a file from a fuzzed image, kernel can panic like: BTRFS warning (device loop0): csum failed root 5 ino 270 off 0 csum 0x98f94189 expected csum 0x00000000 mirror 1 assertion failed: !memcmp_extent_buffer(b, &disk_key, offsetof(struct btrfs_leaf, items[0].key), sizeof(disk_key)), file: fs/btrfs/ctree.c, line: 2544 ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.h:3500! invalid opcode: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:btrfs_search_slot.cold.24+0x61/0x63 [btrfs] Call Trace: btrfs_lookup_csum+0x52/0x150 [btrfs] __btrfs_lookup_bio_sums+0x209/0x640 [btrfs] btrfs_submit_bio_hook+0x103/0x170 [btrfs] submit_one_bio+0x59/0x80 [btrfs] extent_read_full_page+0x58/0x80 [btrfs] generic_file_read_iter+0x2f6/0x9d0 __vfs_read+0x14d/0x1a0 vfs_read+0x8d/0x140 ksys_read+0x52/0xc0 do_syscall_64+0x60/0x210 entry_SYSCALL_64_after_hwframe+0x49/0xbe [CAUSE] The fuzzed image has a corrupted leaf whose first key doesn't match its parent: checksum tree key (CSUM_TREE ROOT_ITEM 0) node 29741056 level 1 items 14 free 107 generation 19 owner CSUM_TREE fs uuid 3381d111-94a3-4ac7-8f39-611bbbdab7e6 chunk uuid 9af1c3c7-2af5-488b-8553-530bd515f14c ... key (EXTENT_CSUM EXTENT_CSUM 79691776) block 29761536 gen 19 leaf 29761536 items 1 free space 1726 generation 19 owner CSUM_TREE leaf 29761536 flags 0x1(WRITTEN) backref revision 1 fs uuid 3381d111-94a3-4ac7-8f39-611bbbdab7e6 chunk uuid 9af1c3c7-2af5-488b-8553-530bd515f14c item 0 key (EXTENT_CSUM EXTENT_CSUM 8798638964736) itemoff 1751 itemsize 2244 range start 8798638964736 end 8798641262592 length 2297856 When reading the above tree block, we have extent_buffer->refs = 2 in the context: - initial one from __alloc_extent_buffer() alloc_extent_buffer() |- __alloc_extent_buffer() |- atomic_set(&eb->refs, 1) - one being added to fs_info->buffer_radix alloc_extent_buffer() |- check_buffer_tree_ref() |- atomic_inc(&eb->refs) So if even we call free_extent_buffer() in read_tree_block or other similar situation, we only decrease the refs by 1, it doesn't reach 0 and won't be freed right now. The staled eb and its corrupted content will still be kept cached. Furthermore, we have several extra cases where we either don't do first key check or the check is not proper for all callers: - scrub We just don't have first key in this context. - shared tree block One tree block can be shared by several snapshot/subvolume trees. In that case, the first key check for one subvolume doesn't apply to another. So for the above reasons, a corrupted extent buffer can sneak into the buffer cache. [FIX] Call verify_level_key in read_block_for_search to do another verification. For that purpose the function is exported. Due to above reasons, although we can free corrupted extent buffer from cache, we still need the check in read_block_for_search(), for scrub and shared tree blocks. Link: https://bugzilla.kernel.org/show_bug.cgi?id=202755 Link: https://bugzilla.kernel.org/show_bug.cgi?id=202757 Link: https://bugzilla.kernel.org/show_bug.cgi?id=202759 Link: https://bugzilla.kernel.org/show_bug.cgi?id=202761 Link: https://bugzilla.kernel.org/show_bug.cgi?id=202767 Link: https://bugzilla.kernel.org/show_bug.cgi?id=202769 Reported-by: Yoon Jungyeon CC: stable@vger.kernel.org # 4.19+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.c | 10 ++++++++++ fs/btrfs/disk-io.c | 10 +++++----- fs/btrfs/disk-io.h | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 48ac8b7c43a5..79ac1ebabaf7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2436,6 +2436,16 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { + /* + * Do extra check for first_key, eb can be stale due to + * being cached, read from scrub, or have multiple + * parents (shared tree blocks). + */ + if (btrfs_verify_level_key(fs_info, tmp, + parent_level - 1, &first_key, gen)) { + free_extent_buffer(tmp); + return -EUCLEAN; + } *eb_ret = tmp; return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b4f61a3d560a..d67dbbd9e6ed 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -408,9 +408,9 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, return ret; } -static int verify_level_key(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int level, - struct btrfs_key *first_key, u64 parent_transid) +int btrfs_verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid) { int found_level; struct btrfs_key found_key; @@ -487,8 +487,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, if (verify_parent_transid(io_tree, eb, parent_transid, 0)) ret = -EIO; - else if (verify_level_key(fs_info, eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(fs_info, eb, level, + first_key, parent_transid)) ret = -EUCLEAN; else break; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4cccba22640f..7a4a60f26dbf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -39,6 +39,9 @@ static inline u64 btrfs_sb_offset(int mirror) struct btrfs_device; struct btrfs_fs_devices; +int btrfs_verify_level_key(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int level, + struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid, int level, struct btrfs_key *first_key); -- cgit v1.2.3 From 87dcf0c61985e4e6b44d9a95de54373438eb31cd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 14 Mar 2019 09:52:35 +0200 Subject: btrfs: Correctly free extent buffer in case btree_read_extent_buffer_pages fails commit 537f38f019fa0b762dbb4c0fc95d7fcce9db8e2d upstream. If a an eb fails to be read for whatever reason - it's corrupted on disk and parent transid/key validations fail or IO for eb pages fail then this buffer must be removed from the buffer cache. Currently the code calls free_extent_buffer if an error occurs. Unfortunately this doesn't achieve the desired behavior since btrfs_find_create_tree_block returns with eb->refs == 2. On the other hand free_extent_buffer will only decrement the refs once leaving it added to the buffer cache radix tree. This enables later code to look up the buffer from the cache and utilize it potentially leading to a crash. The correct way to free the buffer is call free_extent_buffer_stale. This function will correctly call atomic_dec explicitly for the buffer and subsequently call release_extent_buffer which will decrement the final reference thus correctly remove the invalid buffer from buffer cache. This change affects only newly allocated buffers since they have eb->refs == 2. Link: https://bugzilla.kernel.org/show_bug.cgi?id=202755 Reported-by: Jungyeon CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d67dbbd9e6ed..b2dc613ebed2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -995,13 +995,18 @@ void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = fs_info->btree_inode; + int ret; buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return; - read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, - buf, WAIT_NONE, 0); - free_extent_buffer(buf); + + ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, + WAIT_NONE, 0); + if (ret < 0) + free_extent_buffer_stale(buf); + else + free_extent_buffer(buf); } int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -1021,12 +1026,12 @@ int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK, mirror_num); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ret; } if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return -EIO; } else if (extent_buffer_uptodate(buf)) { *eb = buf; @@ -1080,7 +1085,7 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid, level, first_key); if (ret) { - free_extent_buffer(buf); + free_extent_buffer_stale(buf); return ERR_PTR(ret); } return buf; -- cgit v1.2.3 From 8b13bb911f0c0c77d41e5ddc41ad3c127c356b8a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 25 Mar 2019 14:31:21 +0200 Subject: btrfs: Honour FITRIM range constraints during free space trim commit c2d1b3aae33605a61cbab445d8ae1c708ccd2698 upstream. Up until now trimming the freespace was done irrespective of what the arguments of the FITRIM ioctl were. For example fstrim's -o/-l arguments will be entirely ignored. Fix it by correctly handling those paramter. This requires breaking if the found freespace extent is after the end of the passed range as well as completing trim after trimming fstrim_range::len bytes. Fixes: 499f377f49f0 ("btrfs: iterate over unused chunk space in FITRIM") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0db7785cede..809c2c307c64 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10789,9 +10789,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, - u64 minlen, u64 *trimmed) + struct fstrim_range *range, u64 *trimmed) { - u64 start = 0, len = 0; + u64 start = range->start, len = 0; int ret; *trimmed = 0; @@ -10834,8 +10834,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, if (!trans) up_read(&fs_info->commit_root_sem); - ret = find_free_dev_extent_start(trans, device, minlen, start, - &start, &len); + ret = find_free_dev_extent_start(trans, device, range->minlen, + start, &start, &len); if (trans) { up_read(&fs_info->commit_root_sem); btrfs_put_transaction(trans); @@ -10848,6 +10848,16 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, break; } + /* If we are out of the passed range break */ + if (start > range->start + range->len - 1) { + mutex_unlock(&fs_info->chunk_mutex); + ret = 0; + break; + } + + start = max(range->start, start); + len = min(range->len, len); + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); mutex_unlock(&fs_info->chunk_mutex); @@ -10857,6 +10867,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, start += len; *trimmed += bytes; + /* We've trimmed enough */ + if (*trimmed >= range->len) + break; + if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -10940,8 +10954,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) mutex_lock(&fs_info->fs_devices->device_list_mutex); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - ret = btrfs_trim_free_extents(device, range->minlen, - &group_trimmed); + ret = btrfs_trim_free_extents(device, range, &group_trimmed); if (ret) { dev_failed++; dev_ret = ret; -- cgit v1.2.3 From 74ca0a7671ccbaf5ed93e3d601c645d3f9335ad7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Apr 2019 09:29:36 +0100 Subject: Btrfs: send, flush dellaloc in order to avoid data loss commit 9f89d5de8631c7930898a601b6612e271aa2261c upstream. When we set a subvolume to read-only mode we do not flush dellaloc for any of its inodes (except if the filesystem is mounted with -o flushoncommit), since it does not affect correctness for any subsequent operations - except for a future send operation. The send operation will not be able to see the delalloc data since the respective file extent items, inode item updates, backreferences, etc, have not hit yet the subvolume and extent trees. Effectively this means data loss, since the send stream will not contain any data from existing delalloc. Another problem from this is that if the writeback starts and finishes while the send operation is in progress, we have the subvolume tree being being modified concurrently which can result in send failing unexpectedly with EIO or hitting runtime errors, assertion failures or hitting BUG_ONs, etc. Simple reproducer: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ btrfs subvolume create /mnt/sv $ xfs_io -f -c "pwrite -S 0xea 0 108K" /mnt/sv/foo $ btrfs property set /mnt/sv ro true $ btrfs send -f /tmp/send.stream /mnt/sv $ od -t x1 -A d /mnt/sv/foo 0000000 ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea ea * 0110592 $ umount /mnt $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ btrfs receive -f /tmp/send.stream /mnt $ echo $? 0 $ od -t x1 -A d /mnt/sv/foo 0000000 # ---> empty file Since this a problem that affects send only, fix it in send by flushing dellaloc for all the roots used by the send operation before send starts to process the commit roots. This is a problem that affects send since it was introduced (commit 31db9f7c23fbf7 ("Btrfs: introduce BTRFS_IOC_SEND for btrfs send/receive")) but backporting it to older kernels has some dependencies: - For kernels between 3.19 and 4.20, it depends on commit 3cd24c698004d2 ("btrfs: use tagged writepage to mitigate livelock of snapshot") because the function btrfs_start_delalloc_snapshot() does not exist before that commit. So one has to either pick that commit or replace the calls to btrfs_start_delalloc_snapshot() in this patch with calls to btrfs_start_delalloc_inodes(). - For kernels older than 3.19 it also requires commit e5fa8f865b3324 ("Btrfs: ensure send always works on roots without orphans") because it depends on the function ensure_commit_roots_uptodate() which that commits introduced. - No dependencies for 5.0+ kernels. A test case for fstests follows soon. CC: stable@vger.kernel.org # 3.19+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 84cb6e5ef36c..635e419f2a2d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6583,6 +6583,38 @@ commit_trans: return btrfs_commit_transaction(trans); } +/* + * Make sure any existing dellaloc is flushed for any root used by a send + * operation so that we do not miss any data and we do not race with writeback + * finishing and changing a tree while send is using the tree. This could + * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and + * a send operation then uses the subvolume. + * After flushing delalloc ensure_commit_roots_uptodate() must be called. + */ +static int flush_delalloc_roots(struct send_ctx *sctx) +{ + struct btrfs_root *root = sctx->parent_root; + int ret; + int i; + + if (root) { + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + for (i = 0; i < sctx->clone_roots_cnt; i++) { + root = sctx->clone_roots[i].root; + ret = btrfs_start_delalloc_snapshot(root); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); + } + + return 0; +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -6807,6 +6839,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) NULL); sort_clone_roots = 1; + ret = flush_delalloc_roots(sctx); + if (ret) + goto out; + ret = ensure_commit_roots_uptodate(sctx); if (ret) goto out; -- cgit v1.2.3 From 0388d45afc5097c8141303d356b3bc1aaad80509 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 15 Apr 2019 14:50:51 +0100 Subject: Btrfs: do not start a transaction during fiemap commit 03628cdbc64db6262e50d0357960a4e9562676a1 upstream. During fiemap, for regular extents (non inline) we need to check if they are shared and if they are, set the shared bit. Checking if an extent is shared requires checking the delayed references of the currently running transaction, since some reference might have not yet hit the extent tree and be only in the in-memory delayed references. However we were using a transaction join for this, which creates a new transaction when there is no transaction currently running. That means that two more potential failures can happen: creating the transaction and committing it. Further, if no write activity is currently happening in the system, and fiemap calls keep being done, we end up creating and committing transactions that do nothing. In some extreme cases this can result in the commit of the transaction created by fiemap to fail with ENOSPC when updating the root item of a subvolume tree because a join does not reserve any space, leading to a trace like the following: heisenberg kernel: ------------[ cut here ]------------ heisenberg kernel: BTRFS: Transaction aborted (error -28) heisenberg kernel: WARNING: CPU: 0 PID: 7137 at fs/btrfs/root-tree.c:136 btrfs_update_root+0x22b/0x320 [btrfs] (...) heisenberg kernel: CPU: 0 PID: 7137 Comm: btrfs-transacti Not tainted 4.19.0-4-amd64 #1 Debian 4.19.28-2 heisenberg kernel: Hardware name: FUJITSU LIFEBOOK U757/FJNB2A5, BIOS Version 1.21 03/19/2018 heisenberg kernel: RIP: 0010:btrfs_update_root+0x22b/0x320 [btrfs] (...) heisenberg kernel: RSP: 0018:ffffb5448828bd40 EFLAGS: 00010286 heisenberg kernel: RAX: 0000000000000000 RBX: ffff8ed56bccef50 RCX: 0000000000000006 heisenberg kernel: RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8ed6bda166a0 heisenberg kernel: RBP: 00000000ffffffe4 R08: 00000000000003df R09: 0000000000000007 heisenberg kernel: R10: 0000000000000000 R11: 0000000000000001 R12: ffff8ed63396a078 heisenberg kernel: R13: ffff8ed092d7c800 R14: ffff8ed64f5db028 R15: ffff8ed6bd03d068 heisenberg kernel: FS: 0000000000000000(0000) GS:ffff8ed6bda00000(0000) knlGS:0000000000000000 heisenberg kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 heisenberg kernel: CR2: 00007f46f75f8000 CR3: 0000000310a0a002 CR4: 00000000003606f0 heisenberg kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 heisenberg kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 heisenberg kernel: Call Trace: heisenberg kernel: commit_fs_roots+0x166/0x1d0 [btrfs] heisenberg kernel: ? _cond_resched+0x15/0x30 heisenberg kernel: ? btrfs_run_delayed_refs+0xac/0x180 [btrfs] heisenberg kernel: btrfs_commit_transaction+0x2bd/0x870 [btrfs] heisenberg kernel: ? start_transaction+0x9d/0x3f0 [btrfs] heisenberg kernel: transaction_kthread+0x147/0x180 [btrfs] heisenberg kernel: ? btrfs_cleanup_transaction+0x530/0x530 [btrfs] heisenberg kernel: kthread+0x112/0x130 heisenberg kernel: ? kthread_bind+0x30/0x30 heisenberg kernel: ret_from_fork+0x35/0x40 heisenberg kernel: ---[ end trace 05de912e30e012d9 ]--- Since fiemap (and btrfs_check_shared()) is a read-only operation, do not do a transaction join to avoid the overhead of creating a new transaction (if there is currently no running transaction) and introducing a potential point of failure when the new transaction gets committed, instead use a transaction attach to grab a handle for the currently running transaction if any. Reported-by: Christoph Anton Mitterer Link: https://lore.kernel.org/linux-btrfs/b2a668d7124f1d3e410367f587926f622b3f03a4.camel@scientia.net/ Fixes: afce772e87c36c ("btrfs: fix check_shared for fiemap ioctl") CC: stable@vger.kernel.org # 4.14+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/backref.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ae750b1574a2..73e715d1a24e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1452,8 +1452,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * callers (such as fiemap) which want to know whether the extent is * shared but do not need a ref count. * - * This attempts to allocate a transaction in order to account for - * delayed refs, but continues on even when the alloc fails. + * This attempts to attach to the running transaction in order to account for + * delayed refs, but continues on even when no running transaction exists. * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ @@ -1476,13 +1476,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) tmp = ulist_alloc(GFP_NOFS); roots = ulist_alloc(GFP_NOFS); if (!tmp || !roots) { - ulist_free(tmp); - ulist_free(roots); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) { + ret = PTR_ERR(trans); + goto out; + } trans = NULL; down_read(&fs_info->commit_root_sem); } else { @@ -1515,6 +1518,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) } else { up_read(&fs_info->commit_root_sem); } +out: ulist_free(tmp); ulist_free(roots); return ret; -- cgit v1.2.3 From 8a8f671b3dadf878967d917682563e9f0028da53 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 17 Apr 2019 11:30:30 +0100 Subject: Btrfs: do not start a transaction at iterate_extent_inodes() commit bfc61c36260ca990937539cd648ede3cd749bc10 upstream. When finding out which inodes have references on a particular extent, done by backref.c:iterate_extent_inodes(), from the BTRFS_IOC_LOGICAL_INO (both v1 and v2) ioctl and from scrub we use the transaction join API to grab a reference on the currently running transaction, since in order to give accurate results we need to inspect the delayed references of the currently running transaction. However, if there is currently no running transaction, the join operation will create a new transaction. This is inefficient as the transaction will eventually be committed, doing unnecessary IO and introducing a potential point of failure that will lead to a transaction abort due to -ENOSPC, as recently reported [1]. That's because the join, creates the transaction but does not reserve any space, so when attempting to update the root item of the root passed to btrfs_join_transaction(), during the transaction commit, we can end up failling with -ENOSPC. Users of a join operation are supposed to actually do some filesystem changes and reserve space by some means, which is not the case of iterate_extent_inodes(), it is a read-only operation for all contextes from which it is called. The reported [1] -ENOSPC failure stack trace is the following: heisenberg kernel: ------------[ cut here ]------------ heisenberg kernel: BTRFS: Transaction aborted (error -28) heisenberg kernel: WARNING: CPU: 0 PID: 7137 at fs/btrfs/root-tree.c:136 btrfs_update_root+0x22b/0x320 [btrfs] (...) heisenberg kernel: CPU: 0 PID: 7137 Comm: btrfs-transacti Not tainted 4.19.0-4-amd64 #1 Debian 4.19.28-2 heisenberg kernel: Hardware name: FUJITSU LIFEBOOK U757/FJNB2A5, BIOS Version 1.21 03/19/2018 heisenberg kernel: RIP: 0010:btrfs_update_root+0x22b/0x320 [btrfs] (...) heisenberg kernel: RSP: 0018:ffffb5448828bd40 EFLAGS: 00010286 heisenberg kernel: RAX: 0000000000000000 RBX: ffff8ed56bccef50 RCX: 0000000000000006 heisenberg kernel: RDX: 0000000000000007 RSI: 0000000000000092 RDI: ffff8ed6bda166a0 heisenberg kernel: RBP: 00000000ffffffe4 R08: 00000000000003df R09: 0000000000000007 heisenberg kernel: R10: 0000000000000000 R11: 0000000000000001 R12: ffff8ed63396a078 heisenberg kernel: R13: ffff8ed092d7c800 R14: ffff8ed64f5db028 R15: ffff8ed6bd03d068 heisenberg kernel: FS: 0000000000000000(0000) GS:ffff8ed6bda00000(0000) knlGS:0000000000000000 heisenberg kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 heisenberg kernel: CR2: 00007f46f75f8000 CR3: 0000000310a0a002 CR4: 00000000003606f0 heisenberg kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 heisenberg kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 heisenberg kernel: Call Trace: heisenberg kernel: commit_fs_roots+0x166/0x1d0 [btrfs] heisenberg kernel: ? _cond_resched+0x15/0x30 heisenberg kernel: ? btrfs_run_delayed_refs+0xac/0x180 [btrfs] heisenberg kernel: btrfs_commit_transaction+0x2bd/0x870 [btrfs] heisenberg kernel: ? start_transaction+0x9d/0x3f0 [btrfs] heisenberg kernel: transaction_kthread+0x147/0x180 [btrfs] heisenberg kernel: ? btrfs_cleanup_transaction+0x530/0x530 [btrfs] heisenberg kernel: kthread+0x112/0x130 heisenberg kernel: ? kthread_bind+0x30/0x30 heisenberg kernel: ret_from_fork+0x35/0x40 heisenberg kernel: ---[ end trace 05de912e30e012d9 ]--- So fix that by using the attach API, which does not create a transaction when there is currently no running transaction. [1] https://lore.kernel.org/linux-btrfs/b2a668d7124f1d3e410367f587926f622b3f03a4.camel@scientia.net/ Reported-by: Zygo Blaxell CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/backref.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 73e715d1a24e..2a4f52c7be22 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1908,13 +1908,19 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, extent_item_objectid); if (!search_commit_root) { - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + trans = btrfs_attach_transaction(fs_info->extent_root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT && + PTR_ERR(trans) != -EROFS) + return PTR_ERR(trans); + trans = NULL; + } + } + + if (trans) btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - } else { + else down_read(&fs_info->commit_root_sem); - } ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, @@ -1947,7 +1953,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: - if (!search_commit_root) { + if (trans) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); btrfs_end_transaction(trans); } else { -- cgit v1.2.3 From 986d3453bee4e932b76261679e65881021eb2a8b Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Fri, 17 May 2019 14:31:44 -0700 Subject: fs/writeback.c: use rcu_barrier() to wait for inflight wb switches going into workqueue when umount commit ec084de929e419e51bcdafaafe567d9e7d0273b7 upstream. synchronize_rcu() didn't wait for call_rcu() callbacks, so inode wb switch may not go to the workqueue after synchronize_rcu(). Thus previous scheduled switches was not finished even flushing the workqueue, which will cause a NULL pointer dereferenced followed below. VFS: Busy inodes after unmount of vdd. Self-destruct in 5 seconds. Have a nice day... BUG: unable to handle kernel NULL pointer dereference at 0000000000000278 evict+0xb3/0x180 iput+0x1b0/0x230 inode_switch_wbs_work_fn+0x3c0/0x6a0 worker_thread+0x4e/0x490 ? process_one_work+0x410/0x410 kthread+0xe6/0x100 ret_from_fork+0x39/0x50 Replace the synchronize_rcu() call with a rcu_barrier() to wait for all pending callbacks to finish. And inc isw_nr_in_flight after call_rcu() in inode_switch_wbs() to make more sense. Link: http://lkml.kernel.org/r/20190429024108.54150-1-jiufei.xue@linux.alibaba.com Signed-off-by: Jiufei Xue Acked-by: Tejun Heo Suggested-by: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 82ce6d4f7e31..9544e2f8b79f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -530,8 +530,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inode = inode; - atomic_inc(&isw_nr_in_flight); - /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page @@ -539,6 +537,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + + atomic_inc(&isw_nr_in_flight); + goto out_unlock; out_free: @@ -908,7 +909,11 @@ restart: void cgroup_writeback_umount(void) { if (atomic_read(&isw_nr_in_flight)) { - synchronize_rcu(); + /* + * Use rcu_barrier() to wait for all pending callbacks to + * ensure that all in-flight wb switches are in the workqueue. + */ + rcu_barrier(); flush_workqueue(isw_wq); } } -- cgit v1.2.3 From 25d010f4e0ece1ddf0d8d57942c0b0f1568fe498 Mon Sep 17 00:00:00 2001 From: Sriram Rajagopalan Date: Fri, 10 May 2019 19:28:06 -0400 Subject: ext4: zero out the unused memory region in the extent tree block commit 592acbf16821288ecdc4192c47e3774a4c48bb64 upstream. This commit zeroes out the unused memory region in the buffer_head corresponding to the extent metablock after writing the extent header and the corresponding extent node entries. This is done to prevent random uninitialized data from getting into the filesystem when the extent block is synced. This fixes CVE-2019-11833. Signed-off-by: Sriram Rajagopalan Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 72a361d5ef74..45aea792d22a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1035,6 +1035,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ int err = 0; + size_t ext_size = 0; /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ @@ -1126,6 +1127,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, le16_add_cpu(&neh->eh_entries, m); } + /* zero out unused area in the extent block */ + ext_size = sizeof(struct ext4_extent_header) + + sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries); + memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1205,6 +1210,11 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent_idx) * m); le16_add_cpu(&neh->eh_entries, m); } + /* zero out unused area in the extent block */ + ext_size = sizeof(struct ext4_extent_header) + + (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries)); + memset(bh->b_data + ext_size, 0, + inode->i_sb->s_blocksize - ext_size); ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -1270,6 +1280,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock, goal = 0; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; + size_t ext_size = 0; /* Try to prepend new index to old one */ if (ext_depth(inode)) @@ -1295,9 +1306,11 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, goto out; } + ext_size = sizeof(EXT4_I(inode)->i_data); /* move top-level index/leaf into new block */ - memmove(bh->b_data, EXT4_I(inode)->i_data, - sizeof(EXT4_I(inode)->i_data)); + memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size); + /* zero out unused area in the extent block */ + memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size); /* set size of new block */ neh = ext_block_hdr(bh); -- cgit v1.2.3 From 0db24122bd7f06e9a7ae24dbda6411b477f5b9dd Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 10 May 2019 21:45:33 -0400 Subject: ext4: fix data corruption caused by overlapping unaligned and aligned IO commit 57a0da28ced8707cb9f79f071a016b9d005caf5a upstream. Unaligned AIO must be serialized because the zeroing of partial blocks of unaligned AIO can result in data corruption in case it's overlapping another in flight IO. Currently we wait for all unwritten extents before we submit unaligned AIO which protects data in case of unaligned AIO is following overlapping IO. However if a unaligned AIO is followed by overlapping aligned AIO we can still end up corrupting data. To fix this, we must make sure that the unaligned AIO is the only IO in flight by waiting for unwritten extents conversion not just before the IO submission, but right after it as well. This problem can be reproduced by xfstest generic/538 Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 98ec11f69cd4..2c5baa5e8291 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -264,6 +264,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); + /* + * Unaligned direct AIO must be the only IO in flight. Otherwise + * overlapping aligned IO after unaligned might result in data + * corruption. + */ + if (ret == -EIOCBQUEUED && unaligned_aio) + ext4_unwritten_wait(inode); inode_unlock(inode); if (ret > 0) -- cgit v1.2.3 From c19db366c0a809027289c470051fba31fbddbe76 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Fri, 10 May 2019 22:00:33 -0400 Subject: ext4: fix use-after-free in dx_release() commit 08fc98a4d6424af66eb3ac4e2cedd2fc927ed436 upstream. The buffer_head (frames[0].bh) and it's corresping page can be potentially free'd once brelse() is done inside the for loop but before the for loop exits in dx_release(). It can be free'd in another context, when the page cache is flushed via drop_caches_sysctl_handler(). This results into below data abort when accessing info->indirect_levels in dx_release(). Unable to handle kernel paging request at virtual address ffffffc17ac3e01e Call trace: dx_release+0x70/0x90 ext4_htree_fill_tree+0x2d4/0x300 ext4_readdir+0x244/0x6f8 iterate_dir+0xbc/0x160 SyS_getdents64+0x94/0x174 Signed-off-by: Sahitya Tummala Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4f8de2b9e87e..4c5aa5df6573 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -871,12 +871,15 @@ static void dx_release(struct dx_frame *frames) { struct dx_root_info *info; int i; + unsigned int indirect_levels; if (frames[0].bh == NULL) return; info = &((struct dx_root *)frames[0].bh->b_data)->info; - for (i = 0; i <= info->indirect_levels; i++) { + /* save local copy, "info" may be freed after brelse() */ + indirect_levels = info->indirect_levels; + for (i = 0; i <= indirect_levels; i++) { if (frames[i].bh == NULL) break; brelse(frames[i].bh); -- cgit v1.2.3 From 316063bf7d11c6a6feb8aba51987cf23181544aa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 17 May 2019 17:37:18 -0400 Subject: ext4: avoid panic during forced reboot due to aborted journal commit 2c1d0e3631e5732dba98ef49ac0bec1388776793 upstream. Handling of aborted journal is a special code path different from standard ext4_error() one and it can call panic() as well. Commit 1dc1097ff60e ("ext4: avoid panic during forced reboot") forgot to update this path so fix that omission. Fixes: 1dc1097ff60e ("ext4: avoid panic during forced reboot") Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org # 5.1 Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cac6ea956659..07d73fb22b60 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -698,7 +698,7 @@ void __ext4_abort(struct super_block *sb, const char *function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); } - if (test_opt(sb, ERRORS_PANIC)) { + if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { if (EXT4_SB(sb)->s_journal && !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) return; -- cgit v1.2.3 From 5b8567682489dbbb0742c9fc3f6711bb6a01f540 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 10 May 2019 21:15:47 -0400 Subject: jbd2: fix potential double free commit 0d52154bb0a700abb459a2cbce0a30fc2549b67e upstream. When failing from creating cache jbd2_inode_cache, we will destroy the previously created cache jbd2_handle_cache twice. This patch fixes this by moving each cache initialization/destruction to its own separate, individual function. Signed-off-by: Chengguang Xu Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/journal.c | 49 +++++++++++++++++++++++++++++++------------------ fs/jbd2/revoke.c | 32 ++++++++++++++++++++------------ fs/jbd2/transaction.c | 8 +++++--- 3 files changed, 56 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9e618777c699..e9cf88f0bc29 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2389,22 +2389,19 @@ static struct kmem_cache *jbd2_journal_head_cache; static atomic_t nr_journal_heads = ATOMIC_INIT(0); #endif -static int jbd2_journal_init_journal_head_cache(void) +static int __init jbd2_journal_init_journal_head_cache(void) { - int retval; - - J_ASSERT(jbd2_journal_head_cache == NULL); + J_ASSERT(!jbd2_journal_head_cache); jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ - retval = 0; if (!jbd2_journal_head_cache) { - retval = -ENOMEM; printk(KERN_EMERG "JBD2: no memory for journal_head cache\n"); + return -ENOMEM; } - return retval; + return 0; } static void jbd2_journal_destroy_journal_head_cache(void) @@ -2650,28 +2647,38 @@ static void __exit jbd2_remove_jbd_stats_proc_entry(void) struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache; +static int __init jbd2_journal_init_inode_cache(void) +{ + J_ASSERT(!jbd2_inode_cache); + jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); + if (!jbd2_inode_cache) { + pr_emerg("JBD2: failed to create inode cache\n"); + return -ENOMEM; + } + return 0; +} + static int __init jbd2_journal_init_handle_cache(void) { + J_ASSERT(!jbd2_handle_cache); jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY); - if (jbd2_handle_cache == NULL) { + if (!jbd2_handle_cache) { printk(KERN_EMERG "JBD2: failed to create handle cache\n"); return -ENOMEM; } - jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0); - if (jbd2_inode_cache == NULL) { - printk(KERN_EMERG "JBD2: failed to create inode cache\n"); - kmem_cache_destroy(jbd2_handle_cache); - return -ENOMEM; - } return 0; } +static void jbd2_journal_destroy_inode_cache(void) +{ + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; +} + static void jbd2_journal_destroy_handle_cache(void) { kmem_cache_destroy(jbd2_handle_cache); jbd2_handle_cache = NULL; - kmem_cache_destroy(jbd2_inode_cache); - jbd2_inode_cache = NULL; } /* @@ -2682,11 +2689,15 @@ static int __init journal_init_caches(void) { int ret; - ret = jbd2_journal_init_revoke_caches(); + ret = jbd2_journal_init_revoke_record_cache(); + if (ret == 0) + ret = jbd2_journal_init_revoke_table_cache(); if (ret == 0) ret = jbd2_journal_init_journal_head_cache(); if (ret == 0) ret = jbd2_journal_init_handle_cache(); + if (ret == 0) + ret = jbd2_journal_init_inode_cache(); if (ret == 0) ret = jbd2_journal_init_transaction_cache(); return ret; @@ -2694,9 +2705,11 @@ static int __init journal_init_caches(void) static void jbd2_journal_destroy_caches(void) { - jbd2_journal_destroy_revoke_caches(); + jbd2_journal_destroy_revoke_record_cache(); + jbd2_journal_destroy_revoke_table_cache(); jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); + jbd2_journal_destroy_inode_cache(); jbd2_journal_destroy_transaction_cache(); jbd2_journal_destroy_slabs(); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index a1143e57a718..69b9bc329964 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -178,33 +178,41 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } -void jbd2_journal_destroy_revoke_caches(void) +void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; +} + +void jbd2_journal_destroy_revoke_table_cache(void) +{ kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } -int __init jbd2_journal_init_revoke_caches(void) +int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); - J_ASSERT(!jbd2_revoke_table_cache); - jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); - if (!jbd2_revoke_record_cache) - goto record_cache_failure; + if (!jbd2_revoke_record_cache) { + pr_emerg("JBD2: failed to create revoke_record cache\n"); + return -ENOMEM; + } + return 0; +} + +int __init jbd2_journal_init_revoke_table_cache(void) +{ + J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); - if (!jbd2_revoke_table_cache) - goto table_cache_failure; - return 0; -table_cache_failure: - jbd2_journal_destroy_revoke_caches(); -record_cache_failure: + if (!jbd2_revoke_table_cache) { + pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; + } + return 0; } static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 914e725c82c4..e20a6703531f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -42,9 +42,11 @@ int __init jbd2_journal_init_transaction_cache(void) 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); - if (transaction_cache) - return 0; - return -ENOMEM; + if (!transaction_cache) { + pr_emerg("JBD2: failed to create transaction cache\n"); + return -ENOMEM; + } + return 0; } void jbd2_journal_destroy_transaction_cache(void) -- cgit v1.2.3 From f4bf101be366d1075767f21a10b1bf587ce2a52f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Oct 2018 11:17:42 -0700 Subject: pstore: Centralize init/exit routines commit cb095afd44768bf495894b9ad063bd078e4bb201 upstream. In preparation for having additional actions during init/exit, this moves the init/exit into platform.c, centralizing the logic to make call outs to the fs init/exit. Signed-off-by: Kees Cook Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/pstore/inode.c | 11 ++--------- fs/pstore/internal.h | 5 +++-- fs/pstore/platform.c | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 5fcb845b9fec..8cf2218b46a7 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -482,12 +482,10 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; -static int __init init_pstore_fs(void) +int __init pstore_init_fs(void) { int err; - pstore_choose_compression(); - /* Create a convenient mount point for people to access pstore */ err = sysfs_create_mount_point(fs_kobj, "pstore"); if (err) @@ -500,14 +498,9 @@ static int __init init_pstore_fs(void) out: return err; } -module_init(init_pstore_fs) -static void __exit exit_pstore_fs(void) +void __exit pstore_exit_fs(void) { unregister_filesystem(&pstore_fs_type); sysfs_remove_mount_point(fs_kobj, "pstore"); } -module_exit(exit_pstore_fs) - -MODULE_AUTHOR("Tony Luck "); -MODULE_LICENSE("GPL"); diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index fb767e28aeb2..7062ea4bc57c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -37,7 +37,8 @@ extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); -/* Called during module_init() */ -extern void __init pstore_choose_compression(void); +/* Called during pstore init/exit. */ +int __init pstore_init_fs(void); +void __exit pstore_exit_fs(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 15e99d5a681d..d61e26812af6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -780,8 +780,31 @@ void __init pstore_choose_compression(void) } } +static int __init pstore_init(void) +{ + int ret; + + pstore_choose_compression(); + + ret = pstore_init_fs(); + if (ret) + return ret; + + return 0; +} +module_init(pstore_init) + +static void __exit pstore_exit(void) +{ + pstore_exit_fs(); +} +module_exit(pstore_exit) + module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "Pstore compression to use"); module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "Pstore backend to use"); + +MODULE_AUTHOR("Tony Luck "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From fea8b84765a12c5c2aef32602381d9ffcc6d3507 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 17 Oct 2018 03:13:55 -0700 Subject: pstore: Allocate compression during late_initcall() commit 416031653eb55f844e3547fb8f8576399a800da0 upstream. ramoops's call of pstore_register() was recently moved to run during late_initcall() because the crypto backend may not have been ready during postcore_initcall(). This meant early-boot crash dumps were not getting caught by pstore any more. Instead, lets allow calls to pstore_register() earlier, and once crypto is ready we can initialize the compression. Reported-by: Sai Prakash Ranjan Signed-off-by: Joel Fernandes (Google) Tested-by: Sai Prakash Ranjan Fixes: cb3bee0369bc ("pstore: Use crypto compress API") [kees: trivial rebase] Signed-off-by: Kees Cook Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/pstore/platform.c | 10 +++++++++- fs/pstore/ram.c | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index d61e26812af6..578f178a695f 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -786,13 +786,21 @@ static int __init pstore_init(void) pstore_choose_compression(); + /* + * Check if any pstore backends registered earlier but did not + * initialize compression because crypto was not ready. If so, + * initialize compression now. + */ + if (psinfo && !tfm) + allocate_buf_for_compression(); + ret = pstore_init_fs(); if (ret) return ret; return 0; } -module_init(pstore_init) +late_initcall(pstore_init); static void __exit pstore_exit(void) { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index eb67bb7f04de..44ed6b193d2e 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -956,7 +956,7 @@ static int __init ramoops_init(void) return ret; } -late_initcall(ramoops_init); +postcore_initcall(ramoops_init); static void __exit ramoops_exit(void) { -- cgit v1.2.3 From 953e826e8d0f24904241e7cd3633a54af511b42a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Oct 2018 14:00:12 -0700 Subject: pstore: Refactor compression initialization commit 95047b0519c17a28e09df5f38750f5354e3db4c4 upstream. This refactors compression initialization slightly to better handle getting potentially called twice (via early pstore_register() calls and later pstore_init()) and improves the comments and reporting to be more verbose. Signed-off-by: Kees Cook Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- fs/pstore/platform.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 578f178a695f..b821054ca3ed 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -274,36 +274,56 @@ static int pstore_decompress(void *in, void *out, static void allocate_buf_for_compression(void) { + struct crypto_comp *ctx; + int size; + char *buf; + + /* Skip if not built-in or compression backend not selected yet. */ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) return; + /* Skip if no pstore backend yet or compression init already done. */ + if (!psinfo || tfm) + return; + if (!crypto_has_comp(zbackend->name, 0, 0)) { - pr_err("No %s compression\n", zbackend->name); + pr_err("Unknown compression: %s\n", zbackend->name); return; } - big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize); - if (big_oops_buf_sz <= 0) + size = zbackend->zbufsize(psinfo->bufsize); + if (size <= 0) { + pr_err("Invalid compression size for %s: %d\n", + zbackend->name, size); return; + } - big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); - if (!big_oops_buf) { - pr_err("allocate compression buffer error!\n"); + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("Failed %d byte compression buffer allocation for: %s\n", + size, zbackend->name); return; } - tfm = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - kfree(big_oops_buf); - big_oops_buf = NULL; - pr_err("crypto_alloc_comp() failed!\n"); + ctx = crypto_alloc_comp(zbackend->name, 0, 0); + if (IS_ERR_OR_NULL(ctx)) { + kfree(buf); + pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, + PTR_ERR(ctx)); return; } + + /* A non-NULL big_oops_buf indicates compression is available. */ + tfm = ctx; + big_oops_buf_sz = size; + big_oops_buf = buf; + + pr_info("Using compression: %s\n", zbackend->name); } static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm)) + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) crypto_free_comp(tfm); kfree(big_oops_buf); big_oops_buf = NULL; @@ -774,7 +794,6 @@ void __init pstore_choose_compression(void) for (step = zbackends; step->name; step++) { if (!strcmp(compress, step->name)) { zbackend = step; - pr_info("using %s compression\n", zbackend->name); return; } } @@ -791,8 +810,7 @@ static int __init pstore_init(void) * initialize compression because crypto was not ready. If so, * initialize compression now. */ - if (psinfo && !tfm) - allocate_buf_for_compression(); + allocate_buf_for_compression(); ret = pstore_init_fs(); if (ret) -- cgit v1.2.3 From 6172ae55a187378b091282c7a5b4895635e63371 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 21 Feb 2019 11:29:10 -0500 Subject: ext4: fix compile error when using BUFFER_TRACE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ddccb6dbe780d68133191477571cb7c69e17bb8c upstream. Fix compile error below when using BUFFER_TRACE. fs/ext4/inode.c: In function ‘ext4_expand_extra_isize’: fs/ext4/inode.c:5979:19: error: request for member ‘bh’ in something not a structure or union BUFFER_TRACE(iloc.bh, "get_write_access"); Fixes: c03b45b853f58 ("ext4, project: expand inode extra size if possible") Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c43c5b92229..18884f39610f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5971,7 +5971,7 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); - BUFFER_TRACE(iloc.bh, "get_write_access"); + BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); -- cgit v1.2.3 From e8816d3bc5956a3d4d0f00651138a8a3b36a07bf Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Thu, 14 Feb 2019 17:52:18 -0500 Subject: ext4: don't update s_rev_level if not required commit c9e716eb9b3455a83ed7c5f5a81256a3da779a95 upstream. Don't update the superblock s_rev_level during mount if it isn't actually necessary, only if superblock features are being set by the kernel. This was originally added for ext3 since it always set the INCOMPAT_RECOVER and HAS_JOURNAL features during mount, but this is not needed since no journal mode was added to ext4. That will allow Geert to mount his 20-year-old ext2 rev 0.0 m68k filesystem, as a testament of the backward compatibility of ext4. Fixes: 0390131ba84f ("ext4: Allow ext4 to run without a journal") Signed-off-by: Andreas Dilger Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 6 +++++- fs/ext4/inode.c | 1 - fs/ext4/super.c | 1 - 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2ddf7833350d..1ee51d3a978a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1670,6 +1670,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +extern void ext4_update_dynamic_rev(struct super_block *sb); + #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ @@ -1678,6 +1680,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ @@ -1695,6 +1698,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ @@ -1712,6 +1716,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ @@ -2679,7 +2684,6 @@ do { \ #endif -extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 18884f39610f..67e8aa35197e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5320,7 +5320,6 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; - ext4_update_dynamic_rev(sb); ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 07d73fb22b60..a270391228af 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2259,7 +2259,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); -- cgit v1.2.3 From 7928396df91eab24047ab5d8fb105adfe9f50daa Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 19 Apr 2019 14:55:12 -0400 Subject: proc: prevent changes to overridden credentials commit 35a196bef449b5824033865b963ed9a43fb8c730 upstream. Prevent userspace from changing the the /proc/PID/attr values if the task's credentials are currently overriden. This not only makes sense conceptually, it also prevents some really bizarre error cases caused when trying to commit credentials to a task with overridden credentials. Cc: Reported-by: "chengjian (D)" Signed-off-by: Paul Moore Acked-by: John Johansen Acked-by: James Morris Acked-by: Casey Schaufler Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 81d77b15b347..f999e8bd3771 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2542,6 +2542,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, rcu_read_unlock(); return -EACCES; } + /* Prevent changes to overridden credentials. */ + if (current_cred() != current_real_cred()) { + rcu_read_unlock(); + return -EBUSY; + } rcu_read_unlock(); if (count > PAGE_SIZE) -- cgit v1.2.3 From c939121b5435e920d2cec3c664163eb81d41b233 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Mar 2019 22:23:19 -0400 Subject: dcache: sort the freeing-without-RCU-delay mess for good. commit 5467a68cbf6884c9a9d91e2a89140afb1839c835 upstream. For lockless accesses to dentries we don't have pinned we rely (among other things) upon having an RCU delay between dropping the last reference and actually freeing the memory. On the other hand, for things like pipes and sockets we neither do that kind of lockless access, nor want to deal with the overhead of an RCU delay every time a socket gets closed. So delay was made optional - setting DCACHE_RCUACCESS in ->d_flags made sure it would happen. We tried to avoid setting it unless we knew we need it. Unfortunately, that had led to recurring class of bugs, in which we missed the need to set it. We only really need it for dentries that are created by d_alloc_pseudo(), so let's not bother with trying to be smart - just make having an RCU delay the default. The ones that do *not* get it set the replacement flag (DCACHE_NORCU) and we'd better use that sparingly. d_alloc_pseudo() is the only such user right now. FWIW, the race that finally prompted that switch had been between __lock_parent() of immediate subdirectory of what's currently the root of a disconnected tree (e.g. from open-by-handle in progress) racing with d_splice_alias() elsewhere picking another alias for the same inode, either on outright corrupted fs image, or (in case of open-by-handle on NFS) that subdirectory having been just moved on server. It's not easy to hit, so the sky is not falling, but that's not the first race on similar missed cases and the logics for settinf DCACHE_RCUACCESS has gotten ridiculously convoluted. Cc: stable@vger.kernel.org Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 24 +++++++++++++----------- fs/nsfs.c | 3 +-- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index cb515f183482..6e0022326afe 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -344,7 +344,7 @@ static void dentry_free(struct dentry *dentry) } } /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) + if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); @@ -1694,7 +1694,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1719,7 +1718,7 @@ struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { - dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR; + dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; @@ -1732,10 +1731,17 @@ struct dentry *d_alloc_cursor(struct dentry * parent) * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. + * This is used for pipes, sockets et.al. - the stuff that should + * never be anyone's children or parents. Unlike all other + * dentries, these will not have RCU delay between dropping the + * last reference and freeing them. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { - return __d_alloc(sb, name); + struct dentry *dentry = __d_alloc(sb, name); + if (likely(dentry)) + dentry->d_flags |= DCACHE_NORCU; + return dentry; } EXPORT_SYMBOL(d_alloc_pseudo); @@ -1899,12 +1905,10 @@ struct dentry *d_make_root(struct inode *root_inode) if (root_inode) { res = d_alloc_anon(root_inode->i_sb); - if (res) { - res->d_flags |= DCACHE_RCUACCESS; + if (res) d_instantiate(res, root_inode); - } else { + else iput(root_inode); - } } return res; } @@ -2769,9 +2773,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; - if (dentry == old_parent) - dentry->d_flags |= DCACHE_RCUACCESS; - else + if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; diff --git a/fs/nsfs.c b/fs/nsfs.c index 60702d677bd4..30d150a4f0c6 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -85,13 +85,12 @@ slow: inode->i_fop = &ns_file_operations; inode->i_private = ns; - dentry = d_alloc_pseudo(mnt->mnt_sb, &empty_name); + dentry = d_alloc_anon(mnt->mnt_sb); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); - dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { -- cgit v1.2.3 From 939db6fdbea6a84ef8f574a2e7f3a71a8b9507e5 Mon Sep 17 00:00:00 2001 From: Christoph Probst Date: Tue, 7 May 2019 17:16:40 +0200 Subject: cifs: fix strcat buffer overflow and reduce raciness in smb21_set_oplock_level() commit 6a54b2e002c9d00b398d35724c79f9fe0d9b38fb upstream. Change strcat to strncpy in the "None" case to fix a buffer overflow when cinode->oplock is reset to 0 by another thread accessing the same cinode. It is never valid to append "None" to any other message. Consolidate multiple writes to cinode->oplock to reduce raciness. Signed-off-by: Christoph Probst Reviewed-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2ops.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2001184afe70..0ccf8f9b63a2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2348,26 +2348,28 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { char message[5] = {0}; + unsigned int new_oplock = 0; oplock &= 0xFF; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; - cinode->oplock = 0; if (oplock & SMB2_LEASE_READ_CACHING_HE) { - cinode->oplock |= CIFS_CACHE_READ_FLG; + new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); } if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) { - cinode->oplock |= CIFS_CACHE_HANDLE_FLG; + new_oplock |= CIFS_CACHE_HANDLE_FLG; strcat(message, "H"); } if (oplock & SMB2_LEASE_WRITE_CACHING_HE) { - cinode->oplock |= CIFS_CACHE_WRITE_FLG; + new_oplock |= CIFS_CACHE_WRITE_FLG; strcat(message, "W"); } - if (!cinode->oplock) - strcat(message, "None"); + if (!new_oplock) + strncpy(message, "None", sizeof(message)); + + cinode->oplock = new_oplock; cifs_dbg(FYI, "%s Lease granted on inode %p\n", message, &cinode->vfs_inode); } -- cgit v1.2.3 From d3dd6057d2d61e0963a9be9cb80343add5457279 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Mon, 6 May 2019 11:57:03 +0800 Subject: NFS4: Fix v4.0 client state corruption when mount commit f02f3755dbd14fb935d24b14650fff9ba92243b8 upstream. stat command with soft mount never return after server is stopped. When alloc a new client, the state of the client will be set to NFS4CLNT_LEASE_EXPIRED. When the server is stopped, the state manager will work, and accord the state to recover. But the state is NFS4CLNT_LEASE_EXPIRED, it will drain the slot table and lead other task to wait queue, until the client recovered. Then the stat command is hung. When discover server trunking, the client will renew the lease, but check the client state, it lead the client state corruption. So, we need to call state manager to recover it when detect server ip trunking. Signed-off-by: ZhangXiaoxu Cc: stable@vger.kernel.org Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d2f645d34eb1..3ba2087469ac 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -159,6 +159,10 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); + + /* If the client state need to recover, do it. */ + if (clp->cl_state) + nfs4_schedule_state_manager(clp); } out: return status; -- cgit v1.2.3 From 04f34b76368f19f4ade154a0190b664e91bfa0df Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 7 May 2019 13:41:49 -0400 Subject: PNFS fallback to MDS if no deviceid found commit b1029c9bc078a6f1515f55dd993b507dcc7e3440 upstream. If we fail to find a good deviceid while trying to pnfs instead of propogating an error back fallback to doing IO to the MDS. Currently, code with fals the IO with EINVAL. Signed-off-by: Olga Kornievskaia Fixes: 8d40b0f14846f ("NFS filelayout:call GETDEVICEINFO after pnfs_layout_process completes" Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/filelayout/filelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d175724ff566..2478a69da0f0 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -904,7 +904,7 @@ fl_pnfs_update_layout(struct inode *ino, status = filelayout_check_deviceid(lo, fl, gfp_flags); if (status) { pnfs_put_lseg(lseg); - lseg = ERR_PTR(status); + lseg = NULL; } out: return lseg; -- cgit v1.2.3 From a452f733f93e943d991fd98da1fda20e4cec77d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 24 Apr 2019 17:05:06 +0200 Subject: fuse: fix writepages on 32bit commit 9de5be06d0a89ca97b5ab902694d42dfd2bb77d2 upstream. Writepage requests were cropped to i_size & 0xffffffff, which meant that mmaped writes to any file larger than 4G might be silently discarded. Fix by storing the file size in a properly sized variable (loff_t instead of size_t). Reported-by: Antonio SJ Musumeci Fixes: 6eaf4782eb09 ("fuse: writepages: crop secondary requests") Cc: # v3.13 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bd500c3b7858..0ada6e91cc38 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1526,7 +1526,7 @@ __acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - size_t crop = i_size_read(inode); + loff_t crop = i_size_read(inode); struct fuse_req *req; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { -- cgit v1.2.3 From 979d2433b873cde6d2d360644754d16fe9c5481b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 18 Apr 2019 04:04:41 +0800 Subject: fuse: honor RLIMIT_FSIZE in fuse_file_fallocate commit 0cbade024ba501313da3b7e5dd2a188a6bc491b5 upstream. fstests generic/228 reported this failure that fuse fallocate does not honor what 'ulimit -f' has set. This adds the necessary inode_newsize_ok() check. Signed-off-by: Liu Bo Fixes: 05ba1f082300 ("fuse: add FALLOCATE operation") Cc: # v3.5 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0ada6e91cc38..21fd510183c3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2975,6 +2975,13 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } } + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + length > i_size_read(inode)) { + err = inode_newsize_ok(inode, offset + length); + if (err) + return err; + } + if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); -- cgit v1.2.3 From a9676c96e7e06f3f90e9f2e7413b949dfc4d2df5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 22 Jan 2019 07:01:39 +0200 Subject: ovl: fix missing upper fs freeze protection on copy up for ioctl commit 3428030da004a1128cbdcf93dc03e16f184d845b upstream. Generalize the helper ovl_open_maybe_copy_up() and use it to copy up file with data before FS_IOC_SETFLAGS ioctl. The FS_IOC_SETFLAGS ioctl is a bit of an odd ball in vfs, which probably caused the confusion. File may be open O_RDONLY, but ioctl modifies the file. VFS does not call mnt_want_write_file() nor lock inode mutex, but fs-specific code for FS_IOC_SETFLAGS does. So ovl_ioctl() calls mnt_want_write_file() for the overlay file, and fs-specific code calls mnt_want_write_file() for upper fs file, but there was no call for ovl_want_write() for copy up duration which prevents overlayfs from copying up on a frozen upper fs. Fixes: dab5ca8fd9dd ("ovl: add lsattr/chattr support") Cc: # v4.19 Signed-off-by: Amir Goldstein Acked-by: Vivek Goyal Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/copy_up.c | 6 +++--- fs/overlayfs/file.c | 5 ++--- fs/overlayfs/overlayfs.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 75eeee08d848..ffc73600216b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -878,14 +878,14 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags) return true; } -int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) +int ovl_maybe_copy_up(struct dentry *dentry, int flags) { int err = 0; - if (ovl_open_need_copy_up(dentry, file_flags)) { + if (ovl_open_need_copy_up(dentry, flags)) { err = ovl_want_write(dentry); if (!err) { - err = ovl_copy_up_flags(dentry, file_flags); + err = ovl_copy_up_flags(dentry, flags); ovl_drop_write(dentry); } } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 986313da0c88..0c810f20f778 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -116,11 +116,10 @@ static int ovl_real_fdget(const struct file *file, struct fd *real) static int ovl_open(struct inode *inode, struct file *file) { - struct dentry *dentry = file_dentry(file); struct file *realfile; int err; - err = ovl_open_maybe_copy_up(dentry, file->f_flags); + err = ovl_maybe_copy_up(file_dentry(file), file->f_flags); if (err) return err; @@ -390,7 +389,7 @@ static long ovl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (ret) return ret; - ret = ovl_copy_up_with_data(file_dentry(file)); + ret = ovl_maybe_copy_up(file_dentry(file), O_WRONLY); if (!ret) { ret = ovl_real_ioctl(file, cmd, arg); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d9c16ceebfe7..80fb66426760 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -411,7 +411,7 @@ extern const struct file_operations ovl_file_operations; int ovl_copy_up(struct dentry *dentry); int ovl_copy_up_with_data(struct dentry *dentry); int ovl_copy_up_flags(struct dentry *dentry, int flags); -int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); +int ovl_maybe_copy_up(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper); -- cgit v1.2.3 From 77ca91441696be994644adf53233766994b60317 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 May 2019 09:20:54 -0400 Subject: ceph: flush dirty inodes before proceeding with remount commit 00abf69dd24f4444d185982379c5cc3bb7b6d1fc upstream. xfstest generic/452 was triggering a "Busy inodes after umount" warning. ceph was allowing the mount to go read-only without first flushing out dirty inodes in the cache. Ensure we sync out the filesystem before allowing a remount to proceed. Cc: stable@vger.kernel.org Link: http://tracker.ceph.com/issues/39571 Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eab1359d0553..c5cf46e43f2e 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -819,6 +819,12 @@ static void ceph_umount_begin(struct super_block *sb) return; } +static int ceph_remount(struct super_block *sb, int *flags, char *data) +{ + sync_filesystem(sb); + return 0; +} + static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .destroy_inode = ceph_destroy_inode, @@ -826,6 +832,7 @@ static const struct super_operations ceph_super_ops = { .drop_inode = ceph_drop_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, + .remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, -- cgit v1.2.3 From fea685000caf196f676fc5a813599f63634c2c60 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Wed, 24 Apr 2019 07:13:57 +0000 Subject: fuse: Add FOPEN_STREAM to use stream_open() commit bbd84f33652f852ce5992d65db4d020aba21f882 upstream. Starting from commit 9c225f2655e3 ("vfs: atomic f_pos accesses as per POSIX") files opened even via nonseekable_open gate read and write via lock and do not allow them to be run simultaneously. This can create read vs write deadlock if a filesystem is trying to implement a socket-like file which is intended to be simultaneously used for both read and write from filesystem client. See commit 10dce8af3422 ("fs: stream_open - opener for stream-like files so that read and write can run simultaneously without deadlock") for details and e.g. commit 581d21a2d02a ("xenbus: fix deadlock on writes to /proc/xen/xenbus") for a similar deadlock example on /proc/xen/xenbus. To avoid such deadlock it was tempting to adjust fuse_finish_open to use stream_open instead of nonseekable_open on just FOPEN_NONSEEKABLE flags, but grepping through Debian codesearch shows users of FOPEN_NONSEEKABLE, and in particular GVFS which actually uses offset in its read and write handlers https://codesearch.debian.net/search?q=-%3Enonseekable+%3D https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1080 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1247-1346 https://gitlab.gnome.org/GNOME/gvfs/blob/1.40.0-6-gcbc54396/client/gvfsfusedaemon.c#L1399-1481 so if we would do such a change it will break a real user. Add another flag (FOPEN_STREAM) for filesystem servers to indicate that the opened handler is having stream-like semantics; does not use file position and thus the kernel is free to issue simultaneous read and write request on opened file handle. This patch together with stream_open() should be added to stable kernels starting from v3.14+. This will allow to patch OSSPD and other FUSE filesystems that provide stream-like files to return FOPEN_STREAM | FOPEN_NONSEEKABLE in open handler and this way avoid the deadlock on all kernel versions. This should work because fuse_finish_open ignores unknown open flags returned from a filesystem and so passing FOPEN_STREAM to a kernel that is not aware of this flag cannot hurt. In turn the kernel that is not aware of FOPEN_STREAM will be < v3.14 where just FOPEN_NONSEEKABLE is sufficient to implement streams without read vs write deadlock. Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Kirill Smelkov Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 21fd510183c3..59e8bb72dc14 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -179,7 +179,9 @@ void fuse_finish_open(struct inode *inode, struct file *file) file->f_op = &fuse_direct_io_file_operations; if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); - if (ff->open_flags & FOPEN_NONSEEKABLE) + if (ff->open_flags & FOPEN_STREAM) + stream_open(inode, file); + else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); -- cgit v1.2.3 From a06fdd99a3393032d1c145ffb991c4cd916a8ba4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 May 2019 22:46:11 -0400 Subject: ufs: fix braino in ufs_get_inode_gid() for solaris UFS flavour [ Upstream commit 4e9036042fedaffcd868d7f7aa948756c48c637d ] To choose whether to pick the GID from the old (16bit) or new (32bit) field, we should check if the old gid field is set to 0xffff. Mainline checks the old *UID* field instead - cut'n'paste from the corresponding code in ufs_get_inode_uid(). Fixes: 252e211e90ce Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/ufs/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1fd3011ea623..7fd4802222b8 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -229,7 +229,7 @@ ufs_get_inode_gid(struct super_block *sb, struct ufs_inode *inode) case UFS_UID_44BSD: return fs32_to_cpu(sb, inode->ui_u3.ui_44.ui_gid); case UFS_UID_EFT: - if (inode->ui_u1.oldids.ui_suid == 0xFFFF) + if (inode->ui_u1.oldids.ui_sgid == 0xFFFF) return fs32_to_cpu(sb, inode->ui_u3.ui_sun.ui_gid); /* Fall through */ default: -- cgit v1.2.3 From 71e430fd593b2627baaa67e808ff27131b1418e0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 May 2019 23:35:28 -0400 Subject: ext4: do not delete unlinked inode from orphan list on failed truncate commit ee0ed02ca93ef1ecf8963ad96638795d55af2c14 upstream. It is possible that unlinked inode enters ext4_setattr() (e.g. if somebody calls ftruncate(2) on unlinked but still open file). In such case we should not delete the inode from the orphan list if truncate fails. Note that this is mostly a theoretical concern as filesystem is corrupted if we reach this path anyway but let's be consistent in our orphan handling. Reviewed-by: Ira Weiny Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 67e8aa35197e..82dee8addb4b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5596,7 +5596,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) { - if (orphan) + if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); goto err_out; } -- cgit v1.2.3 From 5220582c427bb377c8c4b28ed911f545e7da859d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 May 2019 23:07:08 -0400 Subject: ext4: wait for outstanding dio during truncate in nojournal mode commit 82a25b027ca48d7ef197295846b352345853dfa8 upstream. We didn't wait for outstanding direct IO during truncate in nojournal mode (as we skip orphan handling in that case). This can lead to fs corruption or stale data exposure if truncate ends up freeing blocks and these get reallocated before direct IO finishes. Fix the condition determining whether the wait is necessary. CC: stable@vger.kernel.org Fixes: 1c9114f9c0f1 ("ext4: serialize unlocked dio reads with truncate") Reviewed-by: Ira Weiny Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 82dee8addb4b..05dc5a4ba481 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5601,20 +5601,17 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) goto err_out; } } - if (!shrink) + if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); - - /* - * Blocks are going to be removed from the inode. Wait - * for dio in flight. Temporarily disable - * dioread_nolock to prevent livelock. - */ - if (orphan) { - if (!ext4_should_journal_data(inode)) { - inode_dio_wait(inode); - } else - ext4_wait_for_tail_page_commit(inode); + } else { + /* + * Blocks are going to be removed from the inode. Wait + * for dio in flight. + */ + inode_dio_wait(inode); } + if (orphan && ext4_should_journal_data(inode)) + ext4_wait_for_tail_page_commit(inode); down_write(&EXT4_I(inode)->i_mmap_sem); rc = ext4_break_layouts(inode); -- cgit v1.2.3 From 70d33cce97f01c79f2aa951684cf2116ec9ba1c5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 16 Mar 2019 09:13:06 +0900 Subject: f2fs: Fix use of number of devices commit 0916878da355650d7e77104a7ac0fa1784eca852 upstream. For a single device mount using a zoned block device, the zone information for the device is stored in the sbi->devs single entry array and sbi->s_ndevs is set to 1. This differs from a single device mount using a regular block device which does not allocate sbi->devs and sets sbi->s_ndevs to 0. However, sbi->s_devs == 0 condition is used throughout the code to differentiate a single device mount from a multi-device mount where sbi->s_ndevs is always larger than 1. This results in problems with single zoned block device volumes as these are treated as multi-device mounts but do not have the start_blk and end_blk information set. One of the problem observed is skipping of zone discard issuing resulting in write commands being issued to full zones or unaligned to a zone write pointer. Fix this problem by simply treating the cases sbi->s_ndevs == 0 (single regular block device mount) and sbi->s_ndevs == 1 (single zoned block device mount) in the same manner. This is done by introducing the helper function f2fs_is_multi_device() and using this helper in place of direct tests of sbi->s_ndevs value, improving code readability. Fixes: 7bb3a371d199 ("f2fs: Fix zoned block device support") Cc: Signed-off-by: Damien Le Moal Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/data.c | 17 +++++++++++------ fs/f2fs/f2fs.h | 13 ++++++++++++- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 2 +- fs/f2fs/segment.c | 13 +++++++------ 5 files changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 08314fb42652..4d02e76b648a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -197,12 +197,14 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, struct block_device *bdev = sbi->sb->s_bdev; int i; - for (i = 0; i < sbi->s_ndevs; i++) { - if (FDEV(i).start_blk <= blk_addr && - FDEV(i).end_blk >= blk_addr) { - blk_addr -= FDEV(i).start_blk; - bdev = FDEV(i).bdev; - break; + if (f2fs_is_multi_device(sbi)) { + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } } } if (bio) { @@ -216,6 +218,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) { int i; + if (!f2fs_is_multi_device(sbi)) + return 0; + for (i = 0; i < sbi->s_ndevs; i++) if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) return i; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1f5d5f62bb77..a4b6eacf22ea 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1336,6 +1336,17 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) } #endif +/* + * Test if the mounted volume is a multi-device volume. + * - For a single regular disk volume, sbi->s_ndevs is 0. + * - For a single zoned disk volume, sbi->s_ndevs is 1. + * - For a multi-device volume, sbi->s_ndevs is always 2 or more. + */ +static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) +{ + return sbi->s_ndevs > 1; +} + /* For write statistics. Suppose sector size is 512 bytes, * and the return value is in kbytes. s is of struct f2fs_sb_info. */ @@ -3455,7 +3466,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) { return (f2fs_post_read_required(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || - F2FS_I_SB(inode)->s_ndevs); + f2fs_is_multi_device(F2FS_I_SB(inode))); } #ifdef CONFIG_F2FS_FAULT_INJECTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b3f46e3bec17..8d1eb8dec605 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2539,7 +2539,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; - if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num || sbi->segs_per_sec != 1) { f2fs_msg(sbi->sb, KERN_WARNING, "Can't flush %u in %d for segs_per_sec %u != 1\n", diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c8d00422237..d44b57a363ff 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1256,7 +1256,7 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi) sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES; /* give warm/cold data area from slower device */ - if (sbi->s_ndevs && sbi->segs_per_sec == 1) + if (f2fs_is_multi_device(sbi) && sbi->segs_per_sec == 1) SIT_I(sbi)->last_victim[ALLOC_NEXT] = GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ac038563273d..03fa2c4d3d79 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -574,7 +574,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) int ret = 0; int i; - if (!sbi->s_ndevs) + if (!f2fs_is_multi_device(sbi)) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { @@ -640,7 +640,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) return ret; } - if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) { + if (atomic_inc_return(&fcc->issing_flush) == 1 || + f2fs_is_multi_device(sbi)) { ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->issing_flush); @@ -746,7 +747,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) { int ret = 0, i; - if (!sbi->s_ndevs) + if (!f2fs_is_multi_device(sbi)) return 0; for (i = 1; i < sbi->s_ndevs; i++) { @@ -1289,7 +1290,7 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, trace_f2fs_queue_discard(bdev, blkstart, blklen); - if (sbi->s_ndevs) { + if (f2fs_is_multi_device(sbi)) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; @@ -1638,7 +1639,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, block_t lblkstart = blkstart; int devi = 0; - if (sbi->s_ndevs) { + if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } @@ -2971,7 +2972,7 @@ static void update_device_state(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; unsigned int devidx; - if (!sbi->s_ndevs) + if (!f2fs_is_multi_device(sbi)) return; devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); -- cgit v1.2.3 From fdc78eedc54d3d4eb0c3cd747226e5e9948fa860 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 17 May 2019 19:18:43 +0100 Subject: gfs2: Fix sign extension bug in gfs2_update_stats commit 5a5ec83d6ac974b12085cd99b196795f14079037 upstream. Commit 4d207133e9c3 changed the types of the statistic values in struct gfs2_lkstats from s64 to u64. Because of that, what should be a signed value in gfs2_update_stats turned into an unsigned value. When shifted right, we end up with a large positive value instead of a small negative value, which results in an incorrect variance estimate. Fixes: 4d207133e9c3 ("gfs2: Make statistics unsigned, suitable for use with do_div()") Signed-off-by: Andreas Gruenbacher Cc: stable@vger.kernel.org # v4.4+ Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/lock_dlm.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index ac7caa267ed6..62edf8f5615f 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -31,9 +31,10 @@ * @delta is the difference between the current rtt sample and the * running average srtt. We add 1/8 of that to the srtt in order to * update the current srtt estimate. The variance estimate is a bit - * more complicated. We subtract the abs value of the @delta from - * the current variance estimate and add 1/4 of that to the running - * total. + * more complicated. We subtract the current variance estimate from + * the abs value of the @delta and add 1/4 of that to the running + * total. That's equivalent to 3/4 of the current variance + * estimate plus 1/4 of the abs of @delta. * * Note that the index points at the array entry containing the smoothed * mean value, and the variance is always in the following entry @@ -49,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, s64 delta = sample - s->stats[index]; s->stats[index] += (delta >> 3); index++; - s->stats[index] += ((abs(delta) - s->stats[index]) >> 2); + s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2; } /** -- cgit v1.2.3 From ce21e6586eece6502da23a9c275a60f2fdf8b8b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 3 May 2019 11:10:06 -0400 Subject: btrfs: don't double unlock on error in btrfs_punch_hole commit 8fca955057b9c58467d1b231e43f19c4cf26ae8c upstream. If we have an error writing out a delalloc range in btrfs_punch_hole_lock_range we'll unlock the inode and then goto out_only_mutex, where we will again unlock the inode. This is bad, don't do this. Fixes: f27451f22996 ("Btrfs: add support for fallocate's zero range operation") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ca4902c66dc4..ed67aa91914c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2565,10 +2565,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); - if (ret) { - inode_unlock(inode); + if (ret) goto out_only_mutex; - } path = btrfs_alloc_path(); if (!path) { -- cgit v1.2.3 From 7ec747c811ab16deab4eb85ee2d67cb2f89cc384 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 29 Apr 2019 13:08:14 +0100 Subject: Btrfs: do not abort transaction at btrfs_update_root() after failure to COW path commit 72bd2323ec87722c115a5906bc6a1b31d11e8f54 upstream. Currently when we fail to COW a path at btrfs_update_root() we end up always aborting the transaction. However all the current callers of btrfs_update_root() are able to deal with errors returned from it, many do end up aborting the transaction themselves (directly or not, such as the transaction commit path), other BUG_ON() or just gracefully cancel whatever they were doing. When syncing the fsync log, we call btrfs_update_root() through tree-log.c:update_log_root(), and if it returns an -ENOSPC error, the log sync code does not abort the transaction, instead it gracefully handles the error and returns -EAGAIN to the fsync handler, so that it falls back to a transaction commit. Any other error different from -ENOSPC, makes the log sync code abort the transaction. So remove the transaction abort from btrfs_update_log() when we fail to COW a path to update the root item, so that if an -ENOSPC failure happens we avoid aborting the current transaction and have a chance of the fsync succeeding after falling back to a transaction commit. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203413 Fixes: 79787eaab46121 ("btrfs: replace many BUG_ONs with proper error handling") Cc: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: Anand Jain Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/root-tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 65bda0682928..fb205ebeca39 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -132,10 +132,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, 0, 1); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); + if (ret < 0) goto out; - } if (ret != 0) { btrfs_print_leaf(path->nodes[0]); -- cgit v1.2.3 From 4f9a774dda97c0a07ae1e475182f29889b109cba Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 May 2019 16:43:51 +0100 Subject: Btrfs: avoid fallback to transaction commit during fsync of files with holes commit ebb929060aeb162417b4c1307e63daee47b208d9 upstream. When we are doing a full fsync (bit BTRFS_INODE_NEEDS_FULL_SYNC set) of a file that has holes and has file extent items spanning two or more leafs, we can end up falling to back to a full transaction commit due to a logic bug that leads to failure to insert a duplicate file extent item that is meant to represent a hole between the last file extent item of a leaf and the first file extent item in the next leaf. The failure (EEXIST error) leads to a transaction commit (as most errors when logging an inode do). For example, we have the two following leafs: Leaf N: ----------------------------------------------- | ..., ..., ..., (257, FILE_EXTENT_ITEM, 64K) | ----------------------------------------------- The file extent item at the end of leaf N has a length of 4Kb, representing the file range from 64K to 68K - 1. Leaf N + 1: ----------------------------------------------- | (257, FILE_EXTENT_ITEM, 72K), ..., ..., ... | ----------------------------------------------- The file extent item at the first slot of leaf N + 1 has a length of 4Kb too, representing the file range from 72K to 76K - 1. During the full fsync path, when we are at tree-log.c:copy_items() with leaf N as a parameter, after processing the last file extent item, that represents the extent at offset 64K, we take a look at the first file extent item at the next leaf (leaf N + 1), and notice there's a 4K hole between the two extents, and therefore we insert a file extent item representing that hole, starting at file offset 68K and ending at offset 72K - 1. However we don't update the value of *last_extent, which is used to represent the end offset (plus 1, non-inclusive end) of the last file extent item inserted in the log, so it stays with a value of 68K and not with a value of 72K. Then, when copy_items() is called for leaf N + 1, because the value of *last_extent is smaller then the offset of the first extent item in the leaf (68K < 72K), we look at the last file extent item in the previous leaf (leaf N) and see it there's a 4K gap between it and our first file extent item (again, 68K < 72K), so we decide to insert a file extent item representing the hole, starting at file offset 68K and ending at offset 72K - 1, this insertion will fail with -EEXIST being returned from btrfs_insert_file_extent() because we already inserted a file extent item representing a hole for this offset (68K) in the previous call to copy_items(), when processing leaf N. The -EEXIST error gets propagated to the fsync callback, btrfs_sync_file(), which falls back to a full transaction commit. Fix this by adjusting *last_extent after inserting a hole when we had to look at the next leaf. Fixes: 4ee3fad34a9c ("Btrfs: fix fsync after hole punching when using no-holes feature") Cc: stable@vger.kernel.org # 4.14+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2f4f0958e5f2..75051d36dc1a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4121,6 +4121,7 @@ fill_holes: *last_extent, 0, 0, len, 0, len, 0, 0, 0); + *last_extent += len; } } } -- cgit v1.2.3 From 92f907d7d63b178da6d01a6870dea7831e6abf3e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 May 2019 16:44:02 +0100 Subject: Btrfs: fix race between ranged fsync and writeback of adjacent ranges commit 0c713cbab6200b0ab6473b50435e450a6e1de85d upstream. When we do a full fsync (the bit BTRFS_INODE_NEEDS_FULL_SYNC is set in the inode) that happens to be ranged, which happens during a msync() or writes for files opened with O_SYNC for example, we can end up with a corrupt log, due to different file extent items representing ranges that overlap with each other, or hit some assertion failures. When doing a ranged fsync we only flush delalloc and wait for ordered exents within that range. If while we are logging items from our inode ordered extents for adjacent ranges complete, we end up in a race that can make us insert the file extent items that overlap with others we logged previously and the assertion failures. For example, if tree-log.c:copy_items() receives a leaf that has the following file extents items, all with a length of 4K and therefore there is an implicit hole in the range 68K to 72K - 1: (257 EXTENT_ITEM 64K), (257 EXTENT_ITEM 72K), (257 EXTENT_ITEM 76K), ... It copies them to the log tree. However due to the need to detect implicit holes, it may release the path, in order to look at the previous leaf to detect an implicit hole, and then later it will search again in the tree for the first file extent item key, with the goal of locking again the leaf (which might have changed due to concurrent changes to other inodes). However when it locks again the leaf containing the first key, the key corresponding to the extent at offset 72K may not be there anymore since there is an ordered extent for that range that is finishing (that is, somewhere in the middle of btrfs_finish_ordered_io()), and it just removed the file extent item but has not yet replaced it with a new file extent item, so the part of copy_items() that does hole detection will decide that there is a hole in the range starting from 68K to 76K - 1, and therefore insert a file extent item to represent that hole, having a key offset of 68K. After that we now have a log tree with 2 different extent items that have overlapping ranges: 1) The file extent item copied before copy_items() released the path, which has a key offset of 72K and a length of 4K, representing the file range 72K to 76K - 1. 2) And a file extent item representing a hole that has a key offset of 68K and a length of 8K, representing the range 68K to 76K - 1. This item was inserted after releasing the path, and overlaps with the extent item inserted before. The overlapping extent items can cause all sorts of unpredictable and incorrect behaviour, either when replayed or if a fast (non full) fsync happens later, which can trigger a BUG_ON() when calling btrfs_set_item_key_safe() through __btrfs_drop_extents(), producing a trace like the following: [61666.783269] ------------[ cut here ]------------ [61666.783943] kernel BUG at fs/btrfs/ctree.c:3182! [61666.784644] invalid opcode: 0000 [#1] PREEMPT SMP (...) [61666.786253] task: ffff880117b88c40 task.stack: ffffc90008168000 [61666.786253] RIP: 0010:btrfs_set_item_key_safe+0x7c/0xd2 [btrfs] [61666.786253] RSP: 0018:ffffc9000816b958 EFLAGS: 00010246 [61666.786253] RAX: 0000000000000000 RBX: 000000000000000f RCX: 0000000000030000 [61666.786253] RDX: 0000000000000000 RSI: ffffc9000816ba4f RDI: ffffc9000816b937 [61666.786253] RBP: ffffc9000816b998 R08: ffff88011dae2428 R09: 0000000000001000 [61666.786253] R10: 0000160000000000 R11: 6db6db6db6db6db7 R12: ffff88011dae2418 [61666.786253] R13: ffffc9000816ba4f R14: ffff8801e10c4118 R15: ffff8801e715c000 [61666.786253] FS: 00007f6060a18700(0000) GS:ffff88023f5c0000(0000) knlGS:0000000000000000 [61666.786253] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [61666.786253] CR2: 00007f6060a28000 CR3: 0000000213e69000 CR4: 00000000000006e0 [61666.786253] Call Trace: [61666.786253] __btrfs_drop_extents+0x5e3/0xaad [btrfs] [61666.786253] ? time_hardirqs_on+0x9/0x14 [61666.786253] btrfs_log_changed_extents+0x294/0x4e0 [btrfs] [61666.786253] ? release_extent_buffer+0x38/0xb4 [btrfs] [61666.786253] btrfs_log_inode+0xb6e/0xcdc [btrfs] [61666.786253] ? lock_acquire+0x131/0x1c5 [61666.786253] ? btrfs_log_inode_parent+0xee/0x659 [btrfs] [61666.786253] ? arch_local_irq_save+0x9/0xc [61666.786253] ? btrfs_log_inode_parent+0x1f5/0x659 [btrfs] [61666.786253] btrfs_log_inode_parent+0x223/0x659 [btrfs] [61666.786253] ? arch_local_irq_save+0x9/0xc [61666.786253] ? lockref_get_not_zero+0x2c/0x34 [61666.786253] ? rcu_read_unlock+0x3e/0x5d [61666.786253] btrfs_log_dentry_safe+0x60/0x7b [btrfs] [61666.786253] btrfs_sync_file+0x317/0x42c [btrfs] [61666.786253] vfs_fsync_range+0x8c/0x9e [61666.786253] SyS_msync+0x13c/0x1c9 [61666.786253] entry_SYSCALL_64_fastpath+0x18/0xad A sample of a corrupt log tree leaf with overlapping extents I got from running btrfs/072: item 14 key (295 108 200704) itemoff 2599 itemsize 53 extent data disk bytenr 0 nr 0 extent data offset 0 nr 458752 ram 458752 item 15 key (295 108 659456) itemoff 2546 itemsize 53 extent data disk bytenr 4343541760 nr 770048 extent data offset 606208 nr 163840 ram 770048 item 16 key (295 108 663552) itemoff 2493 itemsize 53 extent data disk bytenr 4343541760 nr 770048 extent data offset 610304 nr 155648 ram 770048 item 17 key (295 108 819200) itemoff 2440 itemsize 53 extent data disk bytenr 4334788608 nr 4096 extent data offset 0 nr 4096 ram 4096 The file extent item at offset 659456 (item 15) ends at offset 823296 (659456 + 163840) while the next file extent item (item 16) starts at offset 663552. Another different problem that the race can trigger is a failure in the assertions at tree-log.c:copy_items(), which expect that the first file extent item key we found before releasing the path exists after we have released path and that the last key we found before releasing the path also exists after releasing the path: $ cat -n fs/btrfs/tree-log.c 4080 if (need_find_last_extent) { 4081 /* btrfs_prev_leaf could return 1 without releasing the path */ 4082 btrfs_release_path(src_path); 4083 ret = btrfs_search_slot(NULL, inode->root, &first_key, 4084 src_path, 0, 0); 4085 if (ret < 0) 4086 return ret; 4087 ASSERT(ret == 0); (...) 4103 if (i >= btrfs_header_nritems(src_path->nodes[0])) { 4104 ret = btrfs_next_leaf(inode->root, src_path); 4105 if (ret < 0) 4106 return ret; 4107 ASSERT(ret == 0); 4108 src = src_path->nodes[0]; 4109 i = 0; 4110 need_find_last_extent = true; 4111 } (...) The second assertion implicitly expects that the last key before the path release still exists, because the surrounding while loop only stops after we have found that key. When this assertion fails it produces a stack like this: [139590.037075] assertion failed: ret == 0, file: fs/btrfs/tree-log.c, line: 4107 [139590.037406] ------------[ cut here ]------------ [139590.037707] kernel BUG at fs/btrfs/ctree.h:3546! [139590.038034] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI [139590.038340] CPU: 1 PID: 31841 Comm: fsstress Tainted: G W 5.0.0-btrfs-next-46 #1 (...) [139590.039354] RIP: 0010:assfail.constprop.24+0x18/0x1a [btrfs] (...) [139590.040397] RSP: 0018:ffffa27f48f2b9b0 EFLAGS: 00010282 [139590.040730] RAX: 0000000000000041 RBX: ffff897c635d92c8 RCX: 0000000000000000 [139590.041105] RDX: 0000000000000000 RSI: ffff897d36a96868 RDI: ffff897d36a96868 [139590.041470] RBP: ffff897d1b9a0708 R08: 0000000000000000 R09: 0000000000000000 [139590.041815] R10: 0000000000000008 R11: 0000000000000000 R12: 0000000000000013 [139590.042159] R13: 0000000000000227 R14: ffff897cffcbba88 R15: 0000000000000001 [139590.042501] FS: 00007f2efc8dee80(0000) GS:ffff897d36a80000(0000) knlGS:0000000000000000 [139590.042847] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [139590.043199] CR2: 00007f8c064935e0 CR3: 0000000232252002 CR4: 00000000003606e0 [139590.043547] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [139590.043899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [139590.044250] Call Trace: [139590.044631] copy_items+0xa3f/0x1000 [btrfs] [139590.045009] ? generic_bin_search.constprop.32+0x61/0x200 [btrfs] [139590.045396] btrfs_log_inode+0x7b3/0xd70 [btrfs] [139590.045773] btrfs_log_inode_parent+0x2b3/0xce0 [btrfs] [139590.046143] ? do_raw_spin_unlock+0x49/0xc0 [139590.046510] btrfs_log_dentry_safe+0x4a/0x70 [btrfs] [139590.046872] btrfs_sync_file+0x3b6/0x440 [btrfs] [139590.047243] btrfs_file_write_iter+0x45b/0x5c0 [btrfs] [139590.047592] __vfs_write+0x129/0x1c0 [139590.047932] vfs_write+0xc2/0x1b0 [139590.048270] ksys_write+0x55/0xc0 [139590.048608] do_syscall_64+0x60/0x1b0 [139590.048946] entry_SYSCALL_64_after_hwframe+0x49/0xbe [139590.049287] RIP: 0033:0x7f2efc4be190 (...) [139590.050342] RSP: 002b:00007ffe743243a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [139590.050701] RAX: ffffffffffffffda RBX: 0000000000008d58 RCX: 00007f2efc4be190 [139590.051067] RDX: 0000000000008d58 RSI: 00005567eca0f370 RDI: 0000000000000003 [139590.051459] RBP: 0000000000000024 R08: 0000000000000003 R09: 0000000000008d60 [139590.051863] R10: 0000000000000078 R11: 0000000000000246 R12: 0000000000000003 [139590.052252] R13: 00000000003d3507 R14: 00005567eca0f370 R15: 0000000000000000 (...) [139590.055128] ---[ end trace 193f35d0215cdeeb ]--- So fix this race between a full ranged fsync and writeback of adjacent ranges by flushing all delalloc and waiting for all ordered extents to complete before logging the inode. This is the simplest way to solve the problem because currently the full fsync path does not deal with ranges at all (it assumes a full range from 0 to LLONG_MAX) and it always needs to look at adjacent ranges for hole detection. For use cases of ranged fsyncs this can make a few fsyncs slower but on the other hand it can make some following fsyncs to other ranges do less work or no need to do anything at all. A full fsync is rare anyway and happens only once after loading/creating an inode and once after less common operations such as a shrinking truncate. This is an issue that exists for a long time, and was often triggered by generic/127, because it does mmap'ed writes and msync (which triggers a ranged fsync). Adding support for the tree checker to detect overlapping extents (next patch in the series) and trigger a WARN() when such cases are found, and then calling btrfs_check_leaf_full() at the end of btrfs_insert_file_extent() made the issue much easier to detect. Running btrfs/072 with that change to the tree checker and making fsstress open files always with O_SYNC made it much easier to trigger the issue (as triggering it with generic/127 is very rare). CC: stable@vger.kernel.org # 3.16+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ed67aa91914c..de4d3baec161 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2058,6 +2058,18 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; u64 len; + /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + /* * The range length can be represented by u64, we have to do the typecasts * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() -- cgit v1.2.3 From 946ad2ecef611c86166fbf3eeaf6d4d12e3d4305 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 13:39:11 +1000 Subject: btrfs: sysfs: Fix error path kobject memory leak commit 450ff8348808a89cc27436771aa05c2b90c0eef1 upstream. If a call to kobject_init_and_add() fails we must call kobject_put() otherwise we leak memory. Calling kobject_put() when kobject_init_and_add() fails drops the refcount back to 0 and calls the ktype release method (which in turn calls the percpu destroy and kfree). Add call to kobject_put() in the error path of call to kobject_init_and_add(). Cc: stable@vger.kernel.org # v4.4+ Reviewed-by: Greg Kroah-Hartman Signed-off-by: Tobin C. Harding Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 809c2c307c64..6d155d7d439d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3911,8 +3911,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) info->space_info_kobj, "%s", alloc_name(space_info->flags)); if (ret) { - percpu_counter_destroy(&space_info->total_bytes_pinned); - kfree(space_info); + kobject_put(&space_info->kobj); return ret; } -- cgit v1.2.3 From 94e1f96667b4d4cdc3a06592f109422d088eb352 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 13:39:12 +1000 Subject: btrfs: sysfs: don't leak memory when failing add fsid commit e32773357d5cc271b1d23550b3ed026eb5c2a468 upstream. A failed call to kobject_init_and_add() must be followed by a call to kobject_put(). Currently in the error path when adding fs_devices we are missing this call. This could be fixed by calling btrfs_sysfs_remove_fsid() if btrfs_sysfs_add_fsid() returns an error or by adding a call to kobject_put() directly in btrfs_sysfs_add_fsid(). Here we choose the second option because it prevents the slightly unusual error path handling requirements of kobject from leaking out into btrfs functions. Add a call to kobject_put() in the error path of kobject_add_and_init(). This causes the release method to be called if kobject_init_and_add() fails. open_tree() is the function that calls btrfs_sysfs_add_fsid() and the error code in this function is already written with the assumption that the release method is called during the error path of open_tree() (as seen by the call to btrfs_sysfs_remove_fsid() under the fail_fsdev_sysfs label). Cc: stable@vger.kernel.org # v4.4+ Reviewed-by: Greg Kroah-Hartman Signed-off-by: Tobin C. Harding Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/sysfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3717c864ba23..aefb0169d46d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -811,7 +811,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, fs_devs->fsid_kobj.kset = btrfs_kset; error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); - return error; + if (error) { + kobject_put(&fs_devs->fsid_kobj); + return error; + } + + return 0; } int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From e1eed6928b3ee66a1ac63c7d3eebc63066d59730 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 11 Jan 2019 19:04:44 -0500 Subject: NFSv4.2 fix unnecessary retry in nfs4_copy_file_range commit 45ac486ecf2dc998e25cf32f0cabf2deaad875be upstream. Currently nfs42_proc_copy_file_range() can not return EAGAIN. Fixes: e4648aa4f98a ("NFS recover from destination server reboot for copies") Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker Cc: Yu Xu Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4file.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4288a6ecaf75..31dac3f2033f 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,15 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { - ssize_t ret; - if (file_inode(file_in) == file_inode(file_out)) return -EINVAL; -retry: - ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); - if (ret == -EAGAIN) - goto retry; - return ret; + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) -- cgit v1.2.3 From cc1afc1050a94be73f85c81889941f1fa7cda0e4 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 11 Apr 2019 14:34:18 -0400 Subject: NFSv4.1 fix incorrect return value in copy_file_range commit 0769663b4f580566ef6cdf366f3073dbe8022c39 upstream. According to the NFSv4.2 spec if the input and output file is the same file, operation should fail with EINVAL. However, linux copy_file_range() system call has no such restrictions. Therefore, in such case let's return EOPNOTSUPP and allow VFS to fallback to doing do_splice_direct(). Also when copy_file_range is called on an NFSv4.0 or 4.1 mount (ie., a server that doesn't support COPY functionality), we also need to return EOPNOTSUPP and fallback to a regular copy. Fixes xfstest generic/075, generic/091, generic/112, generic/263 for all NFSv4.x versions. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust Cc: Yu Xu Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs42proc.c | 3 --- fs/nfs/nfs4file.c | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index fed06fd9998d..94f98e190e63 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -329,9 +329,6 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, }; ssize_t err, err2; - if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) - return -EOPNOTSUPP; - src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) return PTR_ERR(src_lock); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 31dac3f2033f..134858507268 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,8 +133,10 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) + return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) - return -EINVAL; + return -EOPNOTSUPP; return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); } -- cgit v1.2.3 From 9c0339dd381db5b06afd377b97c7713d9157cae9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 16 Jan 2019 11:00:57 -0500 Subject: btrfs: honor path->skip_locking in backref code commit 38e3eebff643db725633657d1d87a3be019d1018 upstream. Qgroups will do the old roots lookup at delayed ref time, which could be while walking down the extent root while running a delayed ref. This should be fine, except we specifically lock eb's in the backref walking code irrespective of path->skip_locking, which deadlocks the system. Fix up the backref code to honor path->skip_locking, nobody will be modifying the commit_root when we're searching so it's completely safe to do. This happens since fb235dc06fac ("btrfs: qgroup: Move half of the qgroup accounting time out of commit trans"), kernel may lockup with quota enabled. There is one backref trace triggered by snapshot dropping along with write operation in the source subvolume. The example can be reliably reproduced: btrfs-cleaner D 0 4062 2 0x80000000 Call Trace: schedule+0x32/0x90 btrfs_tree_read_lock+0x93/0x130 [btrfs] find_parent_nodes+0x29b/0x1170 [btrfs] btrfs_find_all_roots_safe+0xa8/0x120 [btrfs] btrfs_find_all_roots+0x57/0x70 [btrfs] btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs] btrfs_qgroup_trace_leaf_items+0x10b/0x140 [btrfs] btrfs_qgroup_trace_subtree+0xc8/0xe0 [btrfs] do_walk_down+0x541/0x5e3 [btrfs] walk_down_tree+0xab/0xe7 [btrfs] btrfs_drop_snapshot+0x356/0x71a [btrfs] btrfs_clean_one_deleted_snapshot+0xb8/0xf0 [btrfs] cleaner_kthread+0x12b/0x160 [btrfs] kthread+0x112/0x130 ret_from_fork+0x27/0x50 When dropping snapshots with qgroup enabled, we will trigger backref walk. However such backref walk at that timing is pretty dangerous, as if one of the parent nodes get WRITE locked by other thread, we could cause a dead lock. For example: FS 260 FS 261 (Dropped) node A node B / \ / \ node C node D node E / \ / \ / \ leaf F|leaf G|leaf H|leaf I|leaf J|leaf K The lock sequence would be: Thread A (cleaner) | Thread B (other writer) ----------------------------------------------------------------------- write_lock(B) | write_lock(D) | ^^^ called by walk_down_tree() | | write_lock(A) | write_lock(D) << Stall read_lock(H) << for backref walk | read_lock(D) << lock owner is | the same thread A | so read lock is OK | read_lock(A) << Stall | So thread A hold write lock D, and needs read lock A to unlock. While thread B holds write lock A, while needs lock D to unlock. This will cause a deadlock. This is not only limited to snapshot dropping case. As the backref walk, even only happens on commit trees, is breaking the normal top-down locking order, makes it deadlock prone. Fixes: fb235dc06fac ("btrfs: qgroup: Move half of the qgroup accounting time out of commit trans") CC: stable@vger.kernel.org # 4.14+ Reported-and-tested-by: David Sterba Reported-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: Filipe Manana [ rebase to latest branch and fix lock assert bug in btrfs/007 ] [ backport to linux-4.19.y branch, solve minor conflicts ] Signed-off-by: Qu Wenruo [ copy logs and deadlock analysis from Qu's patch ] Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/backref.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2a4f52c7be22..ac6c383d6314 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -710,7 +710,7 @@ out: * read tree blocks and add keys where required. */ static int add_missing_keys(struct btrfs_fs_info *fs_info, - struct preftrees *preftrees) + struct preftrees *preftrees, bool lock) { struct prelim_ref *ref; struct extent_buffer *eb; @@ -735,12 +735,14 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_extent_buffer(eb); return -EIO; } - btrfs_tree_read_lock(eb); + if (lock) + btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); else btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); - btrfs_tree_read_unlock(eb); + if (lock) + btrfs_tree_read_unlock(eb); free_extent_buffer(eb); prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); cond_resched(); @@ -1225,7 +1227,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees); + ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; @@ -1286,11 +1288,14 @@ again: ret = -EIO; goto out; } - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + if (!path->skip_locking) { + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie, ignore_offset); - btrfs_tree_read_unlock_blocking(eb); + if (!path->skip_locking) + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; -- cgit v1.2.3 From 86c43c40fe05c78eae5b6a7d3665220457920224 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 28 Mar 2019 17:38:29 +0200 Subject: ovl: relax WARN_ON() for overlapping layers use case commit acf3062a7e1ccf67c6f7e7c28671a6708fde63b0 upstream. This nasty little syzbot repro: https://syzkaller.appspot.com/x/repro.syz?x=12c7a94f400000 Creates overlay mounts where the same directory is both in upper and lower layers. Simplified example: mkdir foo work mount -t overlay none foo -o"lowerdir=.,upperdir=foo,workdir=work" The repro runs several threads in parallel that attempt to chdir into foo and attempt to symlink/rename/exec/mkdir the file bar. The repro hits a WARN_ON() I placed in ovl_instantiate(), which suggests that an overlay inode already exists in cache and is hashed by the pointer of the real upper dentry that ovl_create_real() has just created. At the point of the WARN_ON(), for overlay dir inode lock is held and upper dir inode lock, so at first, I did not see how this was possible. On a closer look, I see that after ovl_create_real(), because of the overlapping upper and lower layers, a lookup by another thread can find the file foo/bar that was just created in upper layer, at overlay path foo/foo/bar and hash the an overlay inode with the new real dentry as lower dentry. This is possible because the overlay directory foo/foo is not locked and the upper dentry foo/bar is in dcache, so ovl_lookup() can find it without taking upper dir inode shared lock. Overlapping layers is considered a wrong setup which would result in unexpected behavior, but it shouldn't crash the kernel and it shouldn't trigger WARN_ON() either, so relax this WARN_ON() and leave a pr_warn() instead to cover all cases of failure to get an overlay inode. The error returned from failure to insert new inode to cache with inode_insert5() was changed to -EEXIST, to distinguish from the error -ENOMEM returned on failure to get/allocate inode with iget5_locked(). Reported-by: syzbot+9c69c282adc4edd2b540@syzkaller.appspotmail.com Fixes: 01b39dcc9568 ("ovl: use inode_insert5() to hash a newly...") Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/overlayfs/dir.c | 2 +- fs/overlayfs/inode.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index b2aadd3e1fec..336f04da80ed 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -260,7 +260,7 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, * hashed directory inode aliases. */ inode = ovl_get_inode(dentry->d_sb, &oip); - if (WARN_ON(IS_ERR(inode))) + if (IS_ERR(inode)) return PTR_ERR(inode); } else { WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 3b7ed5d2279c..b48273e846ad 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -832,7 +832,7 @@ struct inode *ovl_get_inode(struct super_block *sb, int fsid = bylower ? oip->lowerpath->layer->fsid : 0; bool is_dir, metacopy = false; unsigned long ino = 0; - int err = -ENOMEM; + int err = oip->newinode ? -EEXIST : -ENOMEM; if (!realinode) realinode = d_inode(lowerdentry); @@ -917,6 +917,7 @@ out: return inode; out_err: + pr_warn_ratelimited("overlayfs: failed to get inode (%i)\n", err); inode = ERR_PTR(err); goto out; } -- cgit v1.2.3 From 7c2bcb3cca0339b73f6e1883c14f1166f233c8d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Apr 2019 21:04:13 -0400 Subject: acct_on(): don't mess with freeze protection commit 9419a3191dcb27f24478d288abaab697228d28e6 upstream. What happens there is that we are replacing file->path.mnt of a file we'd just opened with a clone and we need the write count contribution to be transferred from original mount to new one. That's it. We do *NOT* want any kind of freeze protection for the duration of switchover. IOW, we should just use __mnt_{want,drop}_write() for that switchover; no need to bother with mnt_{want,drop}_write() there. Tested-by: Amir Goldstein Reported-by: syzbot+2a73a6ea9507b7112141@syzkaller.appspotmail.com Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/internal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index d410186bc369..d109665b9e50 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -80,9 +80,7 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); /* -- cgit v1.2.3 From 06a67c0f4abb34784cc296fc3d242fcd03459ca2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 29 May 2019 19:25:44 +0200 Subject: Revert "btrfs: Honour FITRIM range constraints during free space trim" This reverts commit 8b13bb911f0c0c77d41e5ddc41ad3c127c356b8a. There is currently no corresponding patch in master due to additional changes that would be significantly different from plain revert in the respective stable branch. The range argument was not handled correctly and could cause trim to overlap allocated areas or reach beyond the end of the device. The address space that fitrim normally operates on is in logical coordinates, while the discards are done on the physical device extents. This distinction cannot be made with the current ioctl interface and caused the confusion. The bug depends on the layout of block groups and does not always happen. The whole-fs trim (run by default by the fstrim tool) is not affected. Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6d155d7d439d..0cc800d22a08 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -10788,9 +10788,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, * held back allocations. */ static int btrfs_trim_free_extents(struct btrfs_device *device, - struct fstrim_range *range, u64 *trimmed) + u64 minlen, u64 *trimmed) { - u64 start = range->start, len = 0; + u64 start = 0, len = 0; int ret; *trimmed = 0; @@ -10833,8 +10833,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, if (!trans) up_read(&fs_info->commit_root_sem); - ret = find_free_dev_extent_start(trans, device, range->minlen, - start, &start, &len); + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); if (trans) { up_read(&fs_info->commit_root_sem); btrfs_put_transaction(trans); @@ -10847,16 +10847,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, break; } - /* If we are out of the passed range break */ - if (start > range->start + range->len - 1) { - mutex_unlock(&fs_info->chunk_mutex); - ret = 0; - break; - } - - start = max(range->start, start); - len = min(range->len, len); - ret = btrfs_issue_discard(device->bdev, start, len, &bytes); mutex_unlock(&fs_info->chunk_mutex); @@ -10866,10 +10856,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, start += len; *trimmed += bytes; - /* We've trimmed enough */ - if (*trimmed >= range->len) - break; - if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -10953,7 +10939,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) mutex_lock(&fs_info->fs_devices->device_list_mutex); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - ret = btrfs_trim_free_extents(device, range, &group_trimmed); + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); if (ret) { dev_failed++; dev_ret = ret; -- cgit v1.2.3 From bac8520892812dada1ac93f27d96317470d24b1f Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Wed, 27 Mar 2019 17:09:17 +0000 Subject: gfs2: Fix lru_count going negative [ Upstream commit 7881ef3f33bb80f459ea6020d1e021fc524a6348 ] Under certain conditions, lru_count may drop below zero resulting in a large amount of log spam like this: vmscan: shrink_slab: gfs2_dump_glock+0x3b0/0x630 [gfs2] \ negative objects to delete nr=-1 This happens as follows: 1) A glock is moved from lru_list to the dispose list and lru_count is decremented. 2) The dispose function calls cond_resched() and drops the lru lock. 3) Another thread takes the lru lock and tries to add the same glock to lru_list, checking if the glock is on an lru list. 4) It is on a list (actually the dispose list) and so it avoids incrementing lru_count. 5) The glock is moved to lru_list. 5) The original thread doesn't dispose it because it has been re-added to the lru list but the lru_count has still decreased by one. Fix by checking if the LRU flag is set on the glock rather than checking if the glock is on some list and rearrange the code so that the LRU flag is added/removed precisely when the glock is added/removed from lru_list. Signed-off-by: Ross Lagerwall Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin --- fs/gfs2/glock.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9d566e62684c..775256141e9f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -183,15 +183,19 @@ static int demote_ok(const struct gfs2_glock *gl) void gfs2_glock_add_to_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); - if (!list_empty(&gl->gl_lru)) - list_del_init(&gl->gl_lru); - else + list_del(&gl->gl_lru); + list_add_tail(&gl->gl_lru, &lru_list); + + if (!test_bit(GLF_LRU, &gl->gl_flags)) { + set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); + } - list_add_tail(&gl->gl_lru, &lru_list); - set_bit(GLF_LRU, &gl->gl_flags); spin_unlock(&lru_lock); } @@ -201,7 +205,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) return; spin_lock(&lru_lock); - if (!list_empty(&gl->gl_lru)) { + if (test_bit(GLF_LRU, &gl->gl_flags)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); @@ -1158,8 +1162,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } - if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) && - (glops->go_flags & GLOF_LRU)) + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); @@ -1455,6 +1458,7 @@ __acquires(&lru_lock) if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_add(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); continue; } @@ -1462,7 +1466,6 @@ add_back_to_lru: spin_unlock(&gl->gl_lockref.lock); goto add_back_to_lru; } - clear_bit(GLF_LRU, &gl->gl_flags); gl->gl_lockref.count++; if (demote_ok(gl)) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1497,6 +1500,7 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); freed++; continue; } -- cgit v1.2.3 From 36296b0034ae720d50206cf6a3e729780d59cf6b Mon Sep 17 00:00:00 2001 From: Roberto Bergantinos Corpas Date: Thu, 25 Apr 2019 15:36:51 +0200 Subject: NFS: make nfs_match_client killable [ Upstream commit 950a578c6128c2886e295b9c7ecb0b6b22fcc92b ] Actually we don't do anything with return value from nfs_wait_client_init_complete in nfs_match_client, as a consequence if we get a fatal signal and client is not fully initialised, we'll loop to "again" label This has been proven to cause soft lockups on some scenarios (no-carrier but configured network interfaces) Signed-off-by: Roberto Bergantinos Corpas Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/client.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 751ca65da8a3..846d45cb1a3c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -290,6 +290,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat struct nfs_client *clp; const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); + int error; again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { @@ -302,8 +303,10 @@ again: if (clp->cl_cons_state > NFS_CS_READY) { refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); - nfs_wait_client_init_complete(clp); + error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); + if (error < 0) + return ERR_PTR(error); spin_lock(&nn->nfs_client_lock); goto again; } @@ -413,6 +416,8 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) clp = nfs_match_client(cl_init); if (clp) { spin_unlock(&nn->nfs_client_lock); + if (IS_ERR(clp)) + return clp; if (new) new->rpc_ops->free_client(new); return nfs_found_client(cl_init, clp); -- cgit v1.2.3 From c4b51dbcccfcd305b5bb3e108c908274ad1e442c Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 4 Apr 2019 21:11:11 +0100 Subject: gfs2: Fix occasional glock use-after-free [ Upstream commit 9287c6452d2b1f24ea8e84bd3cf6f3c6f267f712 ] This patch has to do with the life cycle of glocks and buffers. When gfs2 metadata or journaled data is queued to be written, a gfs2_bufdata object is assigned to track the buffer, and that is queued to various lists, including the glock's gl_ail_list to indicate it's on the active items list. Once the page associated with the buffer has been written, it is removed from the ail list, but its life isn't over until a revoke has been successfully written. So after the block is written, its bufdata object is moved from the glock's gl_ail_list to a file-system-wide list of pending revokes, sd_log_le_revoke. At that point the glock still needs to track how many revokes it contributed to that list (in gl_revokes) so that things like glock go_sync can ensure all the metadata has been not only written, but also revoked before the glock is granted to a different node. This is to guarantee journal replay doesn't replay the block once the glock has been granted to another node. Ross Lagerwall recently discovered a race in which an inode could be evicted, and its glock freed after its ail list had been synced, but while it still had unwritten revokes on the sd_log_le_revoke list. The evict decremented the glock reference count to zero, which allowed the glock to be freed. After the revoke was written, function revoke_lo_after_commit tried to adjust the glock's gl_revokes counter and clear its GLF_LFLUSH flag, at which time it referenced the freed glock. This patch fixes the problem by incrementing the glock reference count in gfs2_add_revoke when the glock's first bufdata object is moved from the glock to the global revokes list. Later, when the glock's last such bufdata object is freed, the reference count is decremented. This guarantees that whichever process finishes last (the revoke writing or the evict) will properly free the glock, and neither will reference the glock after it has been freed. Reported-by: Ross Lagerwall Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Sasha Levin --- fs/gfs2/glock.c | 1 + fs/gfs2/log.c | 3 ++- fs/gfs2/lops.c | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 775256141e9f..ccdd8c821abd 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -140,6 +140,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + BUG_ON(atomic_read(&gl->gl_revokes)); rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ee20ea42e7b5..cd85092723de 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -604,7 +604,8 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; - atomic_inc(&gl->gl_revokes); + if (atomic_inc_return(&gl->gl_revokes) == 1) + gfs2_glock_hold(gl); set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_le_revoke); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f2567f958d00..8f99b395d7bf 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -662,8 +662,10 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; - atomic_dec(&gl->gl_revokes); - clear_bit(GLF_LFLUSH, &gl->gl_flags); + if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } kmem_cache_free(gfs2_bufdata_cachep, bd); } } -- cgit v1.2.3 From 1084fc9afbe382150899d3c58afe52861a08f484 Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Tue, 26 Mar 2019 11:56:11 +0800 Subject: Btrfs: fix data bytes_may_use underflow with fallocate due to failed quota reserve [ Upstream commit 39ad317315887c2cb9a4347a93a8859326ddf136 ] When doing fallocate, we first add the range to the reserve_list and then reserve the quota. If quota reservation fails, we'll release all reserved parts of reserve_list. However, cur_offset is not updated to indicate that this range is already been inserted into the list. Therefore, the same range is freed twice. Once at list_for_each_entry loop, and once at the end of the function. This will result in WARN_ON on bytes_may_use when we free the remaining space. At the end, under the 'out' label we have a call to: btrfs_free_reserved_data_space(inode, data_reserved, alloc_start, alloc_end - cur_offset); The start offset, third argument, should be cur_offset. Everything from alloc_start to cur_offset was freed by the list_for_each_entry_safe_loop. Fixes: 18513091af94 ("btrfs: update btrfs_space_info's bytes_may_use timely") Reviewed-by: Filipe Manana Signed-off-by: Robbie Ko Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index de4d3baec161..e24c0a69ff5d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3161,6 +3161,7 @@ static long btrfs_fallocate(struct file *file, int mode, ret = btrfs_qgroup_reserve_data(inode, &data_reserved, cur_offset, last_byte - cur_offset); if (ret < 0) { + cur_offset = last_byte; free_extent_map(em); break; } @@ -3210,7 +3211,7 @@ out: /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(inode, data_reserved, - alloc_start, alloc_end - cur_offset); + cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } -- cgit v1.2.3 From 431cbaec1287772d66baea828eec23546e5c40dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 25 Feb 2019 11:14:45 -0500 Subject: btrfs: fix panic during relocation after ENOSPC before writeback happens [ Upstream commit ff612ba7849964b1898fd3ccd1f56941129c6aab ] We've been seeing the following sporadically throughout our fleet panic: kernel BUG at fs/btrfs/relocation.c:4584! netversion: 5.0-0 Backtrace: #0 [ffffc90003adb880] machine_kexec at ffffffff81041da8 #1 [ffffc90003adb8c8] __crash_kexec at ffffffff8110396c #2 [ffffc90003adb988] crash_kexec at ffffffff811048ad #3 [ffffc90003adb9a0] oops_end at ffffffff8101c19a #4 [ffffc90003adb9c0] do_trap at ffffffff81019114 #5 [ffffc90003adba00] do_error_trap at ffffffff810195d0 #6 [ffffc90003adbab0] invalid_op at ffffffff81a00a9b [exception RIP: btrfs_reloc_cow_block+692] RIP: ffffffff8143b614 RSP: ffffc90003adbb68 RFLAGS: 00010246 RAX: fffffffffffffff7 RBX: ffff8806b9c32000 RCX: ffff8806aad00690 RDX: ffff880850b295e0 RSI: ffff8806b9c32000 RDI: ffff88084f205bd0 RBP: ffff880849415000 R8: ffffc90003adbbe0 R9: ffff88085ac90000 R10: ffff8805f7369140 R11: 0000000000000000 R12: ffff880850b295e0 R13: ffff88084f205bd0 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffffc90003adbbb0] __btrfs_cow_block at ffffffff813bf1cd #8 [ffffc90003adbc28] btrfs_cow_block at ffffffff813bf4b3 #9 [ffffc90003adbc78] btrfs_search_slot at ffffffff813c2e6c The way relocation moves data extents is by creating a reloc inode and preallocating extents in this inode and then copying the data into these preallocated extents. Once we've done this for all of our extents, we'll write out these dirty pages, which marks the extent written, and goes into btrfs_reloc_cow_block(). From here we get our current reloc_control, which _should_ match the reloc_control for the current block group we're relocating. However if we get an ENOSPC in this path at some point we'll bail out, never initiating writeback on this inode. Not a huge deal, unless we happen to be doing relocation on a different block group, and this block group is now rc->stage == UPDATE_DATA_PTRS. This trips the BUG_ON() in btrfs_reloc_cow_block(), because we expect to be done modifying the data inode. We are in fact done modifying the metadata for the data inode we're currently using, but not the one from the failed block group, and thus we BUG_ON(). (This happens when writeback finishes for extents from the previous group, when we are at btrfs_finish_ordered_io() which updates the data reloc tree (inode item, drops/adds extent items, etc).) Fix this by writing out the reloc data inode always, and then breaking out of the loop after that point to keep from tripping this BUG_ON() later. Signed-off-by: Josef Bacik Reviewed-by: Filipe Manana [ add note from Filipe ] Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/relocation.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0526b6c473c7..5d57ed629345 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4289,27 +4289,36 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) { + if (ret < 0) err = ret; - goto out; - } - - if (rc->extents_found == 0) - break; - - btrfs_info(fs_info, "found %llu extents", rc->extents_found); + /* + * We may have gotten ENOSPC after we already dirtied some + * extents. If writeout happens while we're relocating a + * different block group we could end up hitting the + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in + * btrfs_reloc_cow_block. Make sure we write everything out + * properly so we don't trip over this problem, and then break + * out of the loop if we hit an error. + */ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { ret = btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); - if (ret) { + if (ret) err = ret; - goto out; - } invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } + + if (err < 0) + goto out; + + if (rc->extents_found == 0) + break; + + btrfs_info(fs_info, "found %llu extents", rc->extents_found); + } WARN_ON(rc->block_group->pinned > 0); -- cgit v1.2.3 From bd3d8f4cb956980c5fa08b043919a5e056cf8a41 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 26 Feb 2019 16:33:56 +0800 Subject: btrfs: Don't panic when we can't find a root key [ Upstream commit 7ac1e464c4d473b517bb784f30d40da1f842482e ] When we failed to find a root key in btrfs_update_root(), we just panic. That's definitely not cool, fix it by outputting an unique error message, aborting current transaction and return -EUCLEAN. This should not normally happen as the root has been used by the callers in some way. Reviewed-by: Filipe Manana Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/root-tree.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index fb205ebeca39..3228d3b3084a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -135,11 +135,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (ret < 0) goto out; - if (ret != 0) { - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, "unable to update root key %llu %u %llu", - key->objectid, key->type, key->offset); - BUG_ON(1); + if (ret > 0) { + btrfs_crit(fs_info, + "unable to find root key (%llu %u %llu) in tree %llu", + key->objectid, key->type, key->offset, + root->root_key.objectid); + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + goto out; } l = path->nodes[0]; -- cgit v1.2.3 From 65ec64f28a882096d9976963e2ccefd85085fb1a Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 15 Feb 2019 20:27:11 +0800 Subject: chardev: add additional check for minor range overlap [ Upstream commit de36e16d1557a0b6eb328bc3516359a12ba5c25c ] Current overlap checking cannot correctly handle a case which is baseminor < existing baseminor && baseminor + minorct > existing baseminor + minorct. Signed-off-by: Chengguang Xu Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- fs/char_dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index a279c58fe360..8a63cfa29005 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -159,6 +159,12 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, ret = -EBUSY; goto out; } + + if (new_min < old_min && new_max > old_max) { + ret = -EBUSY; + goto out; + } + } cd->next = *cp; -- cgit v1.2.3 From 26433652f0e429b7a641eb32927b683819ca5eb0 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 9 May 2019 07:25:21 -0400 Subject: NFS: Fix a double unlock from nfs_match,get_client [ Upstream commit c260121a97a3e4df6536edbc2f26e166eff370ce ] Now that nfs_match_client drops the nfs_client_lock, we should be careful to always return it in the same condition: locked. Fixes: 950a578c6128 ("NFS: make nfs_match_client killable") Reported-by: syzbot+228a82b263b5da91883d@syzkaller.appspotmail.com Signed-off-by: Benjamin Coddington Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 846d45cb1a3c..c092661147b3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -305,9 +305,9 @@ again: spin_unlock(&nn->nfs_client_lock); error = nfs_wait_client_init_complete(clp); nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); if (error < 0) return ERR_PTR(error); - spin_lock(&nn->nfs_client_lock); goto again; } -- cgit v1.2.3 From 7301bbeae98f265dbd3ba5571b27bdd39fbf4797 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 May 2019 16:02:47 +0100 Subject: Btrfs: fix wrong ctime and mtime of a directory after log replay commit 5338e43abbab13791144d37fd8846847062351c6 upstream. When replaying a log that contains a new file or directory name that needs to be added to its parent directory, we end up updating the mtime and the ctime of the parent directory to the current time after we have set their values to the correct ones (set at fsync time), efectivelly losing them. Sample reproducer: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/dir $ touch /mnt/dir/file # fsync of the directory is optional, not needed $ xfs_io -c fsync /mnt/dir $ xfs_io -c fsync /mnt/dir/file $ stat -c %Y /mnt/dir 1557856079 $ sleep 3 $ mount /dev/sdb /mnt $ stat -c %Y /mnt/dir 1557856082 --> should have been 1557856079, the mtime is updated to the current time when replaying the log Fix this by not updating the mtime and ctime to the current time at btrfs_add_link() when we are replaying a log tree. This could be triggered by my recent fsync fuzz tester for fstests, for which an fstests patch exists titled "fstests: generic, fsync fuzz tester with fsstress". Fixes: e02119d5a7b43 ("Btrfs: Add a write ahead tree log to optimize synchronous operations") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59f361f7d0c1..c1cd3fe2b295 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6426,8 +6426,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name_len * 2); inode_inc_iversion(&parent_inode->vfs_inode); - parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime = - current_time(&parent_inode->vfs_inode); + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { + struct timespec64 now = current_time(&parent_inode->vfs_inode); + + parent_inode->vfs_inode.i_mtime = now; + parent_inode->vfs_inode.i_ctime = now; + } ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode); if (ret) btrfs_abort_transaction(trans, ret); -- cgit v1.2.3 From 37fe038328a20980f45ba7a37c9fa64708a374e7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 May 2019 16:03:17 +0100 Subject: Btrfs: fix race updating log root item during fsync commit 06989c799f04810f6876900d4760c0edda369cf7 upstream. When syncing the log, the final phase of a fsync operation, we need to either create a log root's item or update the existing item in the log tree of log roots, and that depends on the current value of the log root's log_transid - if it's 1 we need to create the log root item, otherwise it must exist already and we update it. Since there is no synchronization between updating the log_transid and checking it for deciding whether the log root's item needs to be created or updated, we end up with a tiny race window that results in attempts to update the item to fail because the item was not yet created: CPU 1 CPU 2 btrfs_sync_log() lock root->log_mutex set log root's log_transid to 1 unlock root->log_mutex btrfs_sync_log() lock root->log_mutex sets log root's log_transid to 2 unlock root->log_mutex update_log_root() sees log root's log_transid with a value of 2 calls btrfs_update_root(), which fails with -EUCLEAN and causes transaction abort Until recently the race lead to a BUG_ON at btrfs_update_root(), but after the recent commit 7ac1e464c4d47 ("btrfs: Don't panic when we can't find a root key") we just abort the current transaction. A sample trace of the BUG_ON() on a SLE12 kernel: ------------[ cut here ]------------ kernel BUG at ../fs/btrfs/root-tree.c:157! Oops: Exception in kernel mode, sig: 5 [#1] SMP NR_CPUS=2048 NUMA pSeries (...) Supported: Yes, External CPU: 78 PID: 76303 Comm: rtas_errd Tainted: G X 4.4.156-94.57-default #1 task: c00000ffa906d010 ti: c00000ff42b08000 task.ti: c00000ff42b08000 NIP: d000000036ae5cdc LR: d000000036ae5cd8 CTR: 0000000000000000 REGS: c00000ff42b0b860 TRAP: 0700 Tainted: G X (4.4.156-94.57-default) MSR: 8000000002029033 CR: 22444484 XER: 20000000 CFAR: d000000036aba66c SOFTE: 1 GPR00: d000000036ae5cd8 c00000ff42b0bae0 d000000036bda220 0000000000000054 GPR04: 0000000000000001 0000000000000000 c00007ffff8d37c8 0000000000000000 GPR08: c000000000e19c00 0000000000000000 0000000000000000 3736343438312079 GPR12: 3930373337303434 c000000007a3a800 00000000007fffff 0000000000000023 GPR16: c00000ffa9d26028 c00000ffa9d261f8 0000000000000010 c00000ffa9d2ab28 GPR20: c00000ff42b0bc48 0000000000000001 c00000ff9f0d9888 0000000000000001 GPR24: c00000ffa9d26000 c00000ffa9d261e8 c00000ffa9d2a800 c00000ff9f0d9888 GPR28: c00000ffa9d26028 c00000ffa9d2aa98 0000000000000001 c00000ffa98f5b20 NIP [d000000036ae5cdc] btrfs_update_root+0x25c/0x4e0 [btrfs] LR [d000000036ae5cd8] btrfs_update_root+0x258/0x4e0 [btrfs] Call Trace: [c00000ff42b0bae0] [d000000036ae5cd8] btrfs_update_root+0x258/0x4e0 [btrfs] (unreliable) [c00000ff42b0bba0] [d000000036b53610] btrfs_sync_log+0x2d0/0xc60 [btrfs] [c00000ff42b0bce0] [d000000036b1785c] btrfs_sync_file+0x44c/0x4e0 [btrfs] [c00000ff42b0bd80] [c00000000032e300] vfs_fsync_range+0x70/0x120 [c00000ff42b0bdd0] [c00000000032e44c] do_fsync+0x5c/0xb0 [c00000ff42b0be10] [c00000000032e8dc] SyS_fdatasync+0x2c/0x40 [c00000ff42b0be30] [c000000000009488] system_call+0x3c/0x100 Instruction dump: 7f43d378 4bffebb9 60000000 88d90008 3d220000 e8b90000 3b390009 e87a01f0 e8898e08 e8f90000 4bfd48e5 60000000 <0fe00000> e95b0060 39200004 394a0ea0 ---[ end trace 8f2dc8f919cabab8 ]--- So fix this by doing the check of log_transid and updating or creating the log root's item while holding the root's log_mutex. Fixes: 7237f1833601d ("Btrfs: fix tree logs parallel sync") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 75051d36dc1a..f79855cee88b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3037,6 +3037,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; + /* + * Update or create log root item under the root's log_mutex to prevent + * races with concurrent log syncs that can lead to failure to update + * log root item because it was not created yet. + */ + ret = update_log_root(trans, log); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -3056,8 +3062,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); - ret = update_log_root(trans, log); - mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ -- cgit v1.2.3 From a81071110d25a83e749b2f58f0e337ac6782c12c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 16 May 2019 15:48:55 +0100 Subject: Btrfs: fix fsync not persisting changed attributes of a directory commit 60d9f50308e5df19bc18c2fefab0eba4a843900a upstream. While logging an inode we follow its ancestors and for each one we mark it as logged in the current transaction, even if we have not logged it. As a consequence if we change an attribute of an ancestor, such as the UID or GID for example, and then explicitly fsync it, we end up not logging the inode at all despite returning success to user space, which results in the attribute being lost if a power failure happens after the fsync. Sample reproducer: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/dir $ chown 6007:6007 /mnt/dir $ sync $ chown 9003:9003 /mnt/dir $ touch /mnt/dir/file $ xfs_io -c fsync /mnt/dir/file # fsync our directory after fsync'ing the new file, should persist the # new values for the uid and gid. $ xfs_io -c fsync /mnt/dir $ mount /dev/sdb /mnt $ stat -c %u:%g /mnt/dir 6007:6007 --> should be 9003:9003, the uid and gid were not persisted, despite the explicit fsync on the directory prior to the power failure Fix this by not updating the logged_trans field of ancestor inodes when logging an inode, since we have not logged them. Let only future calls to btrfs_log_inode() to mark inodes as logged. This could be triggered by my recent fsync fuzz tester for fstests, for which an fstests patch exists titled "fstests: generic, fsync fuzz tester with fsstress". Fixes: 12fcfd22fe5b ("Btrfs: tree logging unlink/rename fixes") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f79855cee88b..0d5840d20efc 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5312,7 +5312,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct dentry *old_parent = NULL; - struct btrfs_inode *orig_inode = inode; /* * for regular files, if its inode is already on disk, we don't @@ -5332,16 +5331,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, } while (1) { - /* - * If we are logging a directory then we start with our inode, - * not our parent's inode, so we need to skip setting the - * logged_trans so that further down in the log code we don't - * think this inode has already been logged. - */ - if (inode != orig_inode) - inode->logged_trans = trans->transid; - smp_mb(); - if (btrfs_must_commit_transaction(trans, inode)) { ret = 1; break; @@ -6070,7 +6059,6 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * if this directory was already logged any new * names for this file/dir will get recorded */ - smp_mb(); if (dir->logged_trans == trans->transid) return; -- cgit v1.2.3 From 8a652fd142c38d03f6e83a054cff40f2e6878beb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 May 2019 09:55:42 +0100 Subject: Btrfs: incremental send, fix file corruption when no-holes feature is enabled commit 6b1f72e5b82a5c2a4da4d1ebb8cc01913ddbea21 upstream. When using the no-holes feature, if we have a file with prealloc extents with a start offset beyond the file's eof, doing an incremental send can cause corruption of the file due to incorrect hole detection. Such case requires that the prealloc extent(s) exist in both the parent and send snapshots, and that a hole is punched into the file that covers all its extents that do not cross the eof boundary. Example reproducer: $ mkfs.btrfs -f -O no-holes /dev/sdb $ mount /dev/sdb /mnt/sdb $ xfs_io -f -c "pwrite -S 0xab 0 500K" /mnt/sdb/foobar $ xfs_io -c "falloc -k 1200K 800K" /mnt/sdb/foobar $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/base $ btrfs send -f /tmp/base.snap /mnt/sdb/base $ xfs_io -c "fpunch 0 500K" /mnt/sdb/foobar $ btrfs subvolume snapshot -r /mnt/sdb /mnt/sdb/incr $ btrfs send -p /mnt/sdb/base -f /tmp/incr.snap /mnt/sdb/incr $ md5sum /mnt/sdb/incr/foobar 816df6f64deba63b029ca19d880ee10a /mnt/sdb/incr/foobar $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt/sdc $ btrfs receive -f /tmp/base.snap /mnt/sdc $ btrfs receive -f /tmp/incr.snap /mnt/sdc $ md5sum /mnt/sdc/incr/foobar cf2ef71f4a9e90c2f6013ba3b2257ed2 /mnt/sdc/incr/foobar --> Different checksum, because the prealloc extent beyond the file's eof confused the hole detection code and it assumed a hole starting at offset 0 and ending at the offset of the prealloc extent (1200Kb) instead of ending at the offset 500Kb (the file's size). Fix this by ensuring we never cross the file's size when issuing the write operations for a hole. Fixes: 16e7549f045d33 ("Btrfs: incompatible format change to remove hole extents") CC: stable@vger.kernel.org # 3.14+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 635e419f2a2d..258392b75048 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5021,6 +5021,12 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (offset >= sctx->cur_inode_size) return 0; + /* + * Don't go beyond the inode's i_size due to prealloc extents that start + * after the i_size. + */ + end = min_t(u64, end, sctx->cur_inode_size); + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); -- cgit v1.2.3 From 32d57c0c063cdab6639da074e8b0f28517862e41 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 17 May 2019 09:12:33 +0100 Subject: cifs: fix memory leak of pneg_inbuf on -EOPNOTSUPP ioctl case commit 210782038b54ec8e9059a3c12d6f6ae173efa3a9 upstream. Currently in the case where SMB2_ioctl returns the -EOPNOTSUPP error there is a memory leak of pneg_inbuf. Fix this by returning via the out_free_inbuf exit path that will perform the relevant kfree. Addresses-Coverity: ("Resource leak") Fixes: 969ae8e8d4ee ("cifs: Accept validate negotiate if server return NT_STATUS_NOT_SUPPORTED") CC: Stable # v5.1+ Signed-off-by: Colin Ian King Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 33afb637e6f8..c181f1621e1a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -887,7 +887,8 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) * not supported error. Client should accept it. */ cifs_dbg(VFS, "Server does not support validate negotiate\n"); - return 0; + rc = 0; + goto out_free_inbuf; } else if (rc != 0) { cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc); rc = -EIO; -- cgit v1.2.3 From 297a251062c0d616e5d6644d4a041532c77d601f Mon Sep 17 00:00:00 2001 From: Roberto Bergantinos Corpas Date: Tue, 28 May 2019 09:38:14 +0200 Subject: CIFS: cifs_read_allocate_pages: don't iterate through whole page array on ENOMEM commit 31fad7d41e73731f05b8053d17078638cf850fa6 upstream. In cifs_read_allocate_pages, in case of ENOMEM, we go through whole rdata->pages array but we have failed the allocation before nr_pages, therefore we may end up calling put_page with NULL pointer, causing oops Signed-off-by: Roberto Bergantinos Corpas Acked-by: Pavel Shilovsky Signed-off-by: Steve French CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d6b45682833b..23cee91ed442 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2988,7 +2988,9 @@ cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages) } if (rc) { - for (i = 0; i < nr_pages; i++) { + unsigned int nr_page_failed = i; + + for (i = 0; i < nr_page_failed; i++) { put_page(rdata->pages[i]); rdata->pages[i] = NULL; } -- cgit v1.2.3 From 873041930dab5e03e9ef6f6adbee9f0a3e9dc034 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 20 May 2019 10:33:07 -0400 Subject: Revert "lockd: Show pid of lockd for remote locks" commit 141731d15d6eb2fd9aaefbf9b935ce86ae243074 upstream. This reverts most of commit b8eee0e90f97 ("lockd: Show pid of lockd for remote locks"), which caused remote locks to not be differentiated between remote processes for NLM. We retain the fixup for setting the client's fl_pid to a negative value. Fixes: b8eee0e90f97 ("lockd: Show pid of lockd for remote locks") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Coddington Reviewed-by: XueWei Zhang Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/lockd/xdr.c | 4 ++-- fs/lockd/xdr4.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 9846f7e95282..7147e4aebecc 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -127,7 +127,7 @@ nlm_decode_lock(__be32 *p, struct nlm_lock *lock) locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = current->tgid; + fl->fl_pid = (pid_t)lock->svid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ start = ntohl(*p++); @@ -269,7 +269,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32) 0; - lock->fl.fl_pid = current->tgid; + lock->fl.fl_pid = (pid_t)lock->svid; if (!(p = nlm_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 70154f376695..7ed9edf9aed4 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -119,7 +119,7 @@ nlm4_decode_lock(__be32 *p, struct nlm_lock *lock) locks_init_lock(fl); fl->fl_owner = current->files; - fl->fl_pid = current->tgid; + fl->fl_pid = (pid_t)lock->svid; fl->fl_flags = FL_POSIX; fl->fl_type = F_RDLCK; /* as good as anything else */ p = xdr_decode_hyper(p, &start); @@ -266,7 +266,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p) memset(lock, 0, sizeof(*lock)); locks_init_lock(&lock->fl); lock->svid = ~(u32) 0; - lock->fl.fl_pid = current->tgid; + lock->fl.fl_pid = (pid_t)lock->svid; if (!(p = nlm4_decode_cookie(p, &argp->cookie)) || !(p = xdr_decode_string_inplace(p, &lock->caller, -- cgit v1.2.3 From ea0327b47754467e0190212dac1ec578262b8e70 Mon Sep 17 00:00:00 2001 From: Yihao Wu Date: Wed, 22 May 2019 01:57:10 +0800 Subject: NFSv4.1: Again fix a race where CB_NOTIFY_LOCK fails to wake a waiter commit 52b042ab9948cc367b61f9ca9c18603aa7813c3a upstream. Commit b7dbcc0e433f "NFSv4.1: Fix a race where CB_NOTIFY_LOCK fails to wake a waiter" found this bug. However it didn't fix it. This commit replaces schedule_timeout() with wait_woken() and default_wake_function() with woken_wake_function() in function nfs4_retry_setlk() and nfs4_wake_lock_waiter(). wait_woken() uses memory barriers in its implementation to avoid potential race condition when putting a process into sleeping state and then waking it up. Fixes: a1d617d8f134 ("nfs: allow blocking locks to be awoken by lock callbacks") Cc: stable@vger.kernel.org #4.9+ Signed-off-by: Yihao Wu Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 580e37bc3fe2..06d9548a14d0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6850,7 +6850,6 @@ struct nfs4_lock_waiter { struct task_struct *task; struct inode *inode; struct nfs_lowner *owner; - bool notified; }; static int @@ -6872,13 +6871,13 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo /* Make sure it's for the right inode */ if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) return 0; - - waiter->notified = true; } /* override "private" so we can use default_wake_function */ wait->private = waiter->task; - ret = autoremove_wake_function(wait, mode, flags, key); + ret = woken_wake_function(wait, mode, flags, key); + if (ret) + list_del_init(&wait->entry); wait->private = waiter; return ret; } @@ -6887,7 +6886,6 @@ static int nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { int status = -ERESTARTSYS; - unsigned long flags; struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; struct nfs_server *server = NFS_SERVER(state->inode); struct nfs_client *clp = server->nfs_client; @@ -6897,8 +6895,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) .s_dev = server->s_dev }; struct nfs4_lock_waiter waiter = { .task = current, .inode = state->inode, - .owner = &owner, - .notified = false }; + .owner = &owner}; wait_queue_entry_t wait; /* Don't bother with waitqueue if we don't expect a callback */ @@ -6911,21 +6908,14 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) add_wait_queue(q, &wait); while(!signalled()) { - waiter.notified = false; status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) break; status = -ERESTARTSYS; - spin_lock_irqsave(&q->lock, flags); - if (waiter.notified) { - spin_unlock_irqrestore(&q->lock, flags); - continue; - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&q->lock, flags); - - freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); + freezer_do_not_count(); + wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT); + freezer_count(); } finish_wait(q, &wait); -- cgit v1.2.3 From 56e3f73e838ae6315a17b70c99d1fe8bab3aca2d Mon Sep 17 00:00:00 2001 From: Yihao Wu Date: Mon, 13 May 2019 14:58:22 +0800 Subject: NFSv4.1: Fix bug only first CB_NOTIFY_LOCK is handled commit ba851a39c9703f09684a541885ed176f8fb7c868 upstream. When a waiter is waked by CB_NOTIFY_LOCK, it will retry nfs4_proc_setlk(). The waiter may fail to nfs4_proc_setlk() and sleep again. However, the waiter is already removed from clp->cl_lock_waitq when handling CB_NOTIFY_LOCK in nfs4_wake_lock_waiter(). So any subsequent CB_NOTIFY_LOCK won't wake this waiter anymore. We should put the waiter back to clp->cl_lock_waitq before retrying. Cc: stable@vger.kernel.org #4.9+ Signed-off-by: Yihao Wu Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 06d9548a14d0..53cf8599a46e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6905,20 +6905,22 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) init_wait(&wait); wait.private = &waiter; wait.func = nfs4_wake_lock_waiter; - add_wait_queue(q, &wait); while(!signalled()) { + add_wait_queue(q, &wait); status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) + if ((status != -EAGAIN) || IS_SETLK(cmd)) { + finish_wait(q, &wait); break; + } status = -ERESTARTSYS; freezer_do_not_count(); wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT); freezer_count(); + finish_wait(q, &wait); } - finish_wait(q, &wait); return status; } #else /* !CONFIG_NFS_V4_1 */ -- cgit v1.2.3 From a3b8b4ad6db7eea67a67eb88cc5f6e101f89dbc1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 May 2019 11:42:07 +0200 Subject: fuse: fallocate: fix return with locked inode commit 35d6fcbb7c3e296a52136347346a698a35af3fda upstream. Do the proper cleanup in case the size check fails. Tested with xfstests:generic/228 Reported-by: kbuild test robot Reported-by: Dan Carpenter Fixes: 0cbade024ba5 ("fuse: honor RLIMIT_FSIZE in fuse_file_fallocate") Cc: Liu Bo Cc: # v3.5 Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 59e8bb72dc14..9a22aa580fe7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2981,7 +2981,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, offset + length > i_size_read(inode)) { err = inode_newsize_ok(inode, offset + length); if (err) - return err; + goto out; } if (!(mode & FALLOC_FL_KEEP_SIZE)) -- cgit v1.2.3 From c63ce7166dafd82c679873107ec649ff9f768d1e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 1 Nov 2018 14:08:07 -0700 Subject: pstore: Remove needless lock during console writes commit b77fa617a2ff4d6beccad3d3d4b3a1f2d10368aa upstream. Since the console writer does not use the preallocated crash dump buffer any more, there is no reason to perform locking around it. Fixes: 70ad35db3321 ("pstore: Convert console write to use ->write_buf") Signed-off-by: Kees Cook Reviewed-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- fs/pstore/platform.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b821054ca3ed..805283b5385d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -476,31 +476,14 @@ static void pstore_unregister_kmsg(void) #ifdef CONFIG_PSTORE_CONSOLE static void pstore_console_write(struct console *con, const char *s, unsigned c) { - const char *e = s + c; + struct pstore_record record; - while (s < e) { - struct pstore_record record; - unsigned long flags; - - pstore_record_init(&record, psinfo); - record.type = PSTORE_TYPE_CONSOLE; - - if (c > psinfo->bufsize) - c = psinfo->bufsize; + pstore_record_init(&record, psinfo); + record.type = PSTORE_TYPE_CONSOLE; - if (oops_in_progress) { - if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) - break; - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - } - record.buf = (char *)s; - record.size = c; - psinfo->write(&record); - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - s += c; - c = e - s; - } + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); } static struct console pstore_console = { -- cgit v1.2.3 From d4128a1b580ca949e829fd919c2579dcaa9138d4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 30 Nov 2018 14:36:58 -0800 Subject: pstore: Convert buf_lock to semaphore commit ea84b580b95521644429cc6748b6c2bf27c8b0f3 upstream. Instead of running with interrupts disabled, use a semaphore. This should make it easier for backends that may need to sleep (e.g. EFI) when performing a write: |BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 |in_atomic(): 1, irqs_disabled(): 1, pid: 2236, name: sig-xstate-bum |Preemption disabled at: |[] pstore_dump+0x72/0x330 |CPU: 26 PID: 2236 Comm: sig-xstate-bum Tainted: G D 4.20.0-rc3 #45 |Call Trace: | dump_stack+0x4f/0x6a | ___might_sleep.cold.91+0xd3/0xe4 | __might_sleep+0x50/0x90 | wait_for_completion+0x32/0x130 | virt_efi_query_variable_info+0x14e/0x160 | efi_query_variable_store+0x51/0x1a0 | efivar_entry_set_safe+0xa3/0x1b0 | efi_pstore_write+0x109/0x140 | pstore_dump+0x11c/0x330 | kmsg_dump+0xa4/0xd0 | oops_exit+0x22/0x30 ... Reported-by: Sebastian Andrzej Siewior Fixes: 21b3ddd39fee ("efi: Don't use spinlocks for efi vars") Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/platform.c | 44 +++++++++++++++++++++++--------------------- fs/pstore/ram.c | 1 - 2 files changed, 23 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 805283b5385d..b2d63bd05e74 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -124,26 +124,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) } } -bool pstore_cannot_block_path(enum kmsg_dump_reason reason) +/* + * Should pstore_dump() wait for a concurrent pstore_dump()? If + * not, the current pstore_dump() will report a failure to dump + * and return. + */ +static bool pstore_cannot_wait(enum kmsg_dump_reason reason) { - /* - * In case of NMI path, pstore shouldn't be blocked - * regardless of reason. - */ + /* In NMI path, pstore shouldn't block regardless of reason. */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked by spin lock. */ + /* Emergency restart shouldn't be blocked. */ case KMSG_DUMP_EMERG: return true; default: return false; } } -EXPORT_SYMBOL_GPL(pstore_cannot_block_path); #if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) static int zbufsize_deflate(size_t size) @@ -378,23 +379,23 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; - unsigned long flags = 0; - int is_locked; int ret; why = get_reason_str(reason); - if (pstore_cannot_block_path(reason)) { - is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); - if (!is_locked) { - pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" - , in_nmi() ? "NMI" : why); + if (down_trylock(&psinfo->buf_lock)) { + /* Failed to acquire lock: give up if we cannot wait. */ + if (pstore_cannot_wait(reason)) { + pr_err("dump skipped in %s path: may corrupt error record\n", + in_nmi() ? "NMI" : why); + return; + } + if (down_interruptible(&psinfo->buf_lock)) { + pr_err("could not grab semaphore?!\n"); return; } - } else { - spin_lock_irqsave(&psinfo->buf_lock, flags); - is_locked = 1; } + oopscount++; while (total < kmsg_bytes) { char *dst; @@ -411,7 +412,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { dst = big_oops_buf; dst_size = big_oops_buf_sz; } else { @@ -429,7 +430,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, dst_size, &dump_size)) break; - if (big_oops_buf && is_locked) { + if (big_oops_buf) { zipped_len = pstore_compress(dst, psinfo->buf, header_size + dump_size, psinfo->bufsize); @@ -452,8 +453,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - if (is_locked) - spin_unlock_irqrestore(&psinfo->buf_lock, flags); + + up(&psinfo->buf_lock); } static struct kmsg_dumper pstore_dumper = { @@ -572,6 +573,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); + sema_init(&psinfo->buf_lock, 1); spin_unlock(&pstore_lock); if (owner && !try_module_get(owner)) { diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 44ed6b193d2e..77050b472c49 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -814,7 +814,6 @@ static int ramoops_probe(struct platform_device *pdev) err = -ENOMEM; goto fail_clear; } - spin_lock_init(&cxt->pstore.buf_lock); cxt->pstore.flags = PSTORE_FLAGS_DMESG; if (cxt->console_size) -- cgit v1.2.3 From aa73a3b205a4043ed3146445f28f09536dadf67c Mon Sep 17 00:00:00 2001 From: Pi-Hsun Shih Date: Mon, 20 May 2019 14:51:19 +0800 Subject: pstore: Set tfm to NULL on free_buf_for_compression commit a9fb94a99bb515d8720ba8440ce3aba84aec80f8 upstream. Set tfm to NULL on free_buf_for_compression() after crypto_free_comp(). This avoid a use-after-free when allocate_buf_for_compression() and free_buf_for_compression() are called twice. Although free_buf_for_compression() freed the tfm, allocate_buf_for_compression() won't reinitialize the tfm since the tfm pointer is not NULL. Fixes: 95047b0519c1 ("pstore: Refactor compression initialization") Signed-off-by: Pi-Hsun Shih Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/platform.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b2d63bd05e74..3212cd36e2e7 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -324,8 +324,10 @@ static void allocate_buf_for_compression(void) static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { crypto_free_comp(tfm); + tfm = NULL; + } kfree(big_oops_buf); big_oops_buf = NULL; big_oops_buf_sz = 0; -- cgit v1.2.3 From f4d0227ff170b7ec365f75d435705b28295f08bd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 30 May 2019 23:37:29 -0700 Subject: pstore/ram: Run without kernel crash dump region commit 8880fa32c557600f5f624084152668ed3c2ea51e upstream. The ram pstore backend has always had the crash dumper frontend enabled unconditionally. However, it was possible to effectively disable it by setting a record_size=0. All the machinery would run (storing dumps to the temporary crash buffer), but 0 bytes would ultimately get stored due to there being no przs allocated for dumps. Commit 89d328f637b9 ("pstore/ram: Correctly calculate usable PRZ bytes"), however, assumed that there would always be at least one allocated dprz for calculating the size of the temporary crash buffer. This was, of course, not the case when record_size=0, and would lead to a NULL deref trying to find the dprz buffer size: BUG: unable to handle kernel NULL pointer dereference at (null) ... IP: ramoops_probe+0x285/0x37e (fs/pstore/ram.c:808) cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; Instead, we need to only enable the frontends based on the success of the prz initialization and only take the needed actions when those zones are available. (This also fixes a possible error in detecting if the ftrace frontend should be enabled.) Reported-and-tested-by: Yaro Slav Fixes: 89d328f637b9 ("pstore/ram: Correctly calculate usable PRZ bytes") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- fs/pstore/platform.c | 3 ++- fs/pstore/ram.c | 36 +++++++++++++++++++++++------------- 2 files changed, 25 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 3212cd36e2e7..4bae3f4fe829 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -583,7 +583,8 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - allocate_buf_for_compression(); + if (psi->flags & PSTORE_FLAGS_DMESG) + allocate_buf_for_compression(); if (pstore_is_mounted()) pstore_get_records(0); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 77050b472c49..316c16463b20 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -803,26 +803,36 @@ static int ramoops_probe(struct platform_device *pdev) cxt->pstore.data = cxt; /* - * Since bufsize is only used for dmesg crash dumps, it - * must match the size of the dprz record (after PRZ header - * and ECC bytes have been accounted for). + * Prepare frontend flags based on which areas are initialized. + * For ramoops_init_przs() cases, the "max count" variable tells + * if there are regions present. For ramoops_init_prz() cases, + * the single region size is how to check. */ - cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; - cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); - if (!cxt->pstore.buf) { - pr_err("cannot allocate pstore crash dump buffer\n"); - err = -ENOMEM; - goto fail_clear; - } - - cxt->pstore.flags = PSTORE_FLAGS_DMESG; + cxt->pstore.flags = 0; + if (cxt->max_dump_cnt) + cxt->pstore.flags |= PSTORE_FLAGS_DMESG; if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; - if (cxt->ftrace_size) + if (cxt->max_ftrace_cnt) cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; if (cxt->pmsg_size) cxt->pstore.flags |= PSTORE_FLAGS_PMSG; + /* + * Since bufsize is only used for dmesg crash dumps, it + * must match the size of the dprz record (after PRZ header + * and ECC bytes have been accounted for). + */ + if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) { + cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + if (!cxt->pstore.buf) { + pr_err("cannot allocate pstore crash dump buffer\n"); + err = -ENOMEM; + goto fail_clear; + } + } + err = pstore_register(&cxt->pstore); if (err) { pr_err("registering with pstore failed\n"); -- cgit v1.2.3