From 7e767aae0ed129f6e67f5fec09fa870be452788c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 21 Nov 2013 15:41:06 +1100 Subject: xfs: growfs overruns AGFL buffer on V4 filesystems commit f94c44573e7c22860e2c3dfe349c45f72ba35ad3 upstream. This loop in xfs_growfs_data_private() is incorrect for V4 superblocks filesystems: for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); For V4 filesystems, we don't have a agfl header structure, and so XFS_AGFL_SIZE() returns an entire sector's worth of entries, which we then index from an offset into the sector. Hence: buffer overrun. This problem was introduced in 3.10 by commit 77c95bba ("xfs: add CRC checks to the AGFL") which changed the AGFL structure but failed to update the growfs code to handle the different structures. Fix it by using the correct offset into the buffer for both V4 and V5 filesystems. Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Ben Myers Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_fsops.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3c3644ea825b..2288db4e1784 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -216,6 +216,8 @@ xfs_growfs_data_private( */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* * AG freespace header block */ @@ -275,8 +277,10 @@ xfs_growfs_data_private( agfl->agfl_seqno = cpu_to_be32(agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) - agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); xfs_buf_relse(bp); -- cgit v1.2.3 From f75eb9d4085192dc58c30a9384cf4496194be851 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 31 Oct 2013 21:00:10 +0300 Subject: xfs: underflow bug in xfs_attrlist_by_handle() commit 31978b5cc66b8ba8a7e8eef60b12395d41b7b890 upstream. If we allocate less than sizeof(struct attrlist) then we end up corrupting memory or doing a ZERO_PTR_SIZE dereference. This can only be triggered with CAP_SYS_ADMIN. Reported-by: Nico Golde Reported-by: Fabian Yamaguchi Signed-off-by: Dan Carpenter Reviewed-by: Dave Chinner Signed-off-by: Ben Myers Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ioctl.c | 3 ++- fs/xfs/xfs_ioctl32.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ca01d830e989..83dfe6e73235 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -409,7 +409,8 @@ xfs_attrlist_by_handle( return -XFS_ERROR(EPERM); if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c0c66259cc91..68799d7f02cc 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -359,7 +359,8 @@ xfs_compat_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) return -XFS_ERROR(EINVAL); /* -- cgit v1.2.3 From fbaa929d862503b59110081efb57a40213193a6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 2 Dec 2013 15:26:19 -0500 Subject: nfsd: when reusing an existing repcache entry, unhash it first commit 781c2a5a5f75eacc04663aced0f0f1a648d4f308 upstream. The DRC code will attempt to reuse an existing, expired cache entry in preference to allocating a new one. It'll then search the cache, and if it gets a hit it'll then free the cache entry that it was going to reuse. The cache code doesn't unhash the entry that it's going to reuse however, so it's possible for it end up designating an entry for reuse and then subsequently freeing the same entry after it finds it. This leads it to a later use-after-free situation and usually some list corruption warnings or an oops. Fix this by simply unhashing the entry that we intend to reuse. That will mean that it's not findable via a search and should prevent this situation from occurring. Reported-by: Christoph Hellwig Reported-by: g. artim Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfscache.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index e76244edd748..ec8d97ddc635 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -128,6 +128,13 @@ nfsd_reply_cache_alloc(void) return rp; } +static void +nfsd_reply_cache_unhash(struct svc_cacherep *rp) +{ + hlist_del_init(&rp->c_hash); + list_del_init(&rp->c_lru); +} + static void nfsd_reply_cache_free_locked(struct svc_cacherep *rp) { @@ -403,7 +410,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); if (nfsd_cache_entry_expired(rp) || num_drc_entries >= max_drc_entries) { - lru_put_end(rp); + nfsd_reply_cache_unhash(rp); prune_cache_entries(); goto search_cache; } -- cgit v1.2.3 From 6b047827d4cdd57ac7c4f9da2d779d811649566f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 10 Jan 2013 03:57:25 -0500 Subject: Btrfs: fix access_ok() check in btrfs_ioctl_send() commit 700ff4f095d78af0998953e922e041d75254518b upstream. The closing parenthesis is in the wrong place. We want to check "sizeof(*arg->clone_sources) * arg->clone_sources_count" instead of "sizeof(*arg->clone_sources * arg->clone_sources_count)". Signed-off-by: Dan Carpenter Reviewed-by: Jie Liu Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 09ea0bdde65f..256a9a46d544 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4623,8 +4623,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } if (!access_ok(VERIFY_READ, arg->clone_sources, - sizeof(*arg->clone_sources * - arg->clone_sources_count))) { + sizeof(*arg->clone_sources) * + arg->clone_sources_count)) { ret = -EFAULT; goto out; } -- cgit v1.2.3 From 89ec75229a591cd7a3f57c330dc0791b8646b112 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 6 Dec 2013 17:51:32 +0100 Subject: btrfs: call mnt_drop_write after interrupted subvol deletion commit e43f998e47bae27e37e159915625e8d4b130153b upstream. If btrfs_ioctl_snap_destroy blocks on the mutex and the process is killed, mnt_write count is unbalanced and leads to unmountable filesystem. Signed-off-by: David Sterba Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8dedf4019672..145b2c75ab83 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2093,7 +2093,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); if (err == -EINTR) - goto out; + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2235,6 +2235,7 @@ out_dput: dput(dentry); out_unlock_dir: mutex_unlock(&dir->i_mutex); +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); -- cgit v1.2.3 From 0f8285ad902ab61b7631f059f0ffdf7a23b0601f Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 2 Dec 2013 19:59:31 +0100 Subject: nfs: fix do_div() warning by instead using sector_div() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3873d064b8538686bbbd4b858dc8a07db1f7f43a upstream. When compiling a 32bit kernel with CONFIG_LBDAF=n the compiler complains like shown below. Fix this warning by instead using sector_div() which is provided by the kernel.h header file. fs/nfs/blocklayout/extents.c: In function ‘normalize’: include/asm-generic/div64.h:43:28: warning: comparison of distinct pointer types lacks a cast [enabled by default] fs/nfs/blocklayout/extents.c:47:13: note: in expansion of macro ‘do_div’ nfs/blocklayout/extents.c:47:2: warning: right shift count >= width of type [enabled by default] fs/nfs/blocklayout/extents.c:47:2: warning: passing argument 1 of ‘__div64_32’ from incompatible pointer type [enabled by default] include/asm-generic/div64.h:35:17: note: expected ‘uint64_t *’ but argument is of type ‘sector_t *’ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor); Signed-off-by: Helge Deller Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/blocklayout/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 9c3e117c3ed1..4d0161442565 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -44,7 +44,7 @@ static inline sector_t normalize(sector_t s, int base) { sector_t tmp = s; /* Since do_div modifies its argument */ - return s - do_div(tmp, base); + return s - sector_div(tmp, base); } static inline sector_t normalize_up(sector_t s, int base) -- cgit v1.2.3 From 39be1c3dd03ee291a006189fe1cf43a5322a6eaf Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Fri, 15 Nov 2013 16:36:16 -0500 Subject: NFSv4 wait on recovery for async session errors commit 4a82fd7c4e78a1b7a224f9ae8bb7e1fd95f670e0 upstream. When the state manager is processing the NFS4CLNT_DELEGRETURN flag, session draining is off, but DELEGRETURN can still get a session error. The async handler calls nfs4_schedule_session_recovery returns -EAGAIN, and the DELEGRETURN done then restarts the RPC task in the prepare state. With the state manager still processing the NFS4CLNT_DELEGRETURN flag with session draining off, these DELEGRETURNs will cycle with errors filling up the session slots. This prevents OPEN reclaims (from nfs_delegation_claim_opens) required by the NFS4CLNT_DELEGRETURN state manager processing from completing, hanging the state manager in the __rpc_wait_for_completion_task in nfs4_run_open_task as seen in this kernel thread dump: kernel: 4.12.32.53-ma D 0000000000000000 0 3393 2 0x00000000 kernel: ffff88013995fb60 0000000000000046 ffff880138cc5400 ffff88013a9df140 kernel: ffff8800000265c0 ffffffff8116eef0 ffff88013fc10080 0000000300000001 kernel: ffff88013a4ad058 ffff88013995ffd8 000000000000fbc8 ffff88013a4ad058 kernel: Call Trace: kernel: [] ? cache_alloc_refill+0x1c0/0x240 kernel: [] ? rpc_wait_bit_killable+0x0/0xa0 [sunrpc] kernel: [] rpc_wait_bit_killable+0x42/0xa0 [sunrpc] kernel: [] __wait_on_bit+0x5f/0x90 kernel: [] ? rpc_wait_bit_killable+0x0/0xa0 [sunrpc] kernel: [] out_of_line_wait_on_bit+0x78/0x90 kernel: [] ? wake_bit_function+0x0/0x50 kernel: [] __rpc_wait_for_completion_task+0x2d/0x30 [sunrpc] kernel: [] nfs4_run_open_task+0x11c/0x160 [nfs] kernel: [] nfs4_open_recover_helper+0x87/0x120 [nfs] kernel: [] nfs4_open_recover+0xc6/0x150 [nfs] kernel: [] ? nfs4_open_recoverdata_alloc+0x2f/0x60 [nfs] kernel: [] nfs4_open_delegation_recall+0x6a/0xa0 [nfs] kernel: [] nfs_end_delegation_return+0x120/0x2e0 [nfs] kernel: [] ? queue_work+0x1f/0x30 kernel: [] nfs_client_return_marked_delegations+0xd7/0x110 [nfs] kernel: [] nfs4_run_state_manager+0x548/0x620 [nfs] kernel: [] ? nfs4_run_state_manager+0x0/0x620 [nfs] kernel: [] kthread+0x96/0xa0 kernel: [] child_rip+0xa/0x20 kernel: [] ? kthread+0x0/0xa0 kernel: [] ? child_rip+0x0/0x20 The state manager can not therefore process the DELEGRETURN session errors. Change the async handler to wait for recovery on session errors. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 23881380dc6f..75e49d42a7fb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4222,8 +4222,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); - task->tk_status = 0; - return -EAGAIN; + goto wait_on_recovery; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); -- cgit v1.2.3 From fb5834ff2b2823e1eaecd920e2ade3e07f4a42f9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 29 Sep 2013 10:33:16 +0800 Subject: Btrfs: fix memory leak of chunks' extent map commit 7d3d1744f8a7d62e4875bd69cc2192a939813880 upstream. As we're hold a ref on looking up the extent map, we need to drop the ref before returning to callers. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8bffb9174afb..b6c23c4abae2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4248,6 +4248,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got " "%Lu-%Lu\n", logical, logical+len, em->start, em->start + em->len); + free_extent_map(em); return 1; } @@ -4429,6 +4430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, " "found %Lu-%Lu\n", logical, em->start, em->start + em->len); + free_extent_map(em); return -EINVAL; } -- cgit v1.2.3 From b395193ecf087a36e98b97c396f839b3fe9a4d19 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Oct 2013 17:23:08 -0400 Subject: Btrfs: fix hole check in log_one_extent commit ed9e8af88e2551aaa6bf51d8063a2493e2d71597 upstream. I added an assert to make sure we were looking up aligned offsets for csums and I tripped it when running xfstests. This is because log_one_extent was checking if block_start == 0 for a hole instead of EXTENT_MAP_HOLE. This worked out fine in practice it seems, but it adds a lot of extra work that is uneeded. With this fix I'm no longer tripping my assert. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cf68596b51fb..bca436330681 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3314,7 +3314,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_token_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG, &token); - if (em->block_start == 0) + if (em->block_start == EXTENT_MAP_HOLE) skip_csum = true; } -- cgit v1.2.3 From 9436cf971ed1aa296e6ca36489f5240d3868ff84 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Tue, 15 Oct 2013 18:44:00 +0100 Subject: Btrfs: fix incorrect inode acl reset commit 8185554d3eb09d23a805456b6fa98dcbb34aa518 upstream. When a directory has a default ACL and a subdirectory is created under that directory, btrfs_init_acl() is called when the subdirectory's inode is created to initialize the inode's ACL (inherited from the parent directory) but it was clearing the ACL from the inode after setting it if posix_acl_create() returned success, instead of clearing it only if it returned an error. To reproduce this issue: $ mkfs.btrfs -f /dev/loop0 $ mount /dev/loop0 /mnt $ mkdir /mnt/acl $ setfacl -d --set u::rwx,g::rwx,o::- /mnt/acl $ getfacl /mnt/acl user::rwx group::rwx other::r-x default:user::rwx default:group::rwx default:other::--- $ mkdir /mnt/acl/dir1 $ getfacl /mnt/acl/dir1 user::rwx group::rwx other::--- After unmounting and mounting again the filesystem, fgetacl returned the expected ACL: $ umount /mnt/acl $ mount /dev/loop0 /mnt $ getfacl /mnt/acl/dir1 user::rwx group::rwx other::--- default:user::rwx default:group::rwx default:other::--- Meaning that the underlying xattr was persisted. Reported-by: Giuseppe Fierro Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index e15d2b0d8d3b..0890c83643e9 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -229,7 +229,7 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, if (ret > 0) { /* we need an acl */ ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS); - } else { + } else if (ret < 0) { cache_no_acl(inode); } } else { -- cgit v1.2.3 From f5749e3720c038fe92dc9ffb5e6fe7b4d8809ea1 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Oct 2013 10:45:05 +0800 Subject: Btrfs: do not run snapshot-aware defragment on error commit 6f519564d7d978c00351d9ab6abac3deeac31621 upstream. If something wrong happens in write endio, running snapshot-aware defragment can end up with undefined results, maybe a crash, so we should avoid it. In order to share similar code, this also adds a helper to free the struct for snapshot-aware defrag. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1e2288dc5346..0bcee78cde16 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2419,10 +2419,23 @@ out_unlock: return ret; } +static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) +{ + struct old_sa_defrag_extent *old, *tmp; + + if (!new) + return; + + list_for_each_entry_safe(old, tmp, &new->head, list) { + list_del(&old->list); + kfree(old); + } + kfree(new); +} + static void relink_file_extents(struct new_sa_defrag_extent *new) { struct btrfs_path *path; - struct old_sa_defrag_extent *old, *tmp; struct sa_defrag_extent_backref *backref; struct sa_defrag_extent_backref *prev = NULL; struct inode *inode; @@ -2465,16 +2478,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new) kfree(prev); btrfs_free_path(path); - - list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); - kfree(old); - } out: + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); wake_up(&root->fs_info->transaction_wait); - - kfree(new); } static struct new_sa_defrag_extent * @@ -2484,7 +2492,7 @@ record_old_file_extents(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct btrfs_key key; - struct old_sa_defrag_extent *old, *tmp; + struct old_sa_defrag_extent *old; struct new_sa_defrag_extent *new; int ret; @@ -2532,7 +2540,7 @@ record_old_file_extents(struct inode *inode, if (slot >= btrfs_header_nritems(l)) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out_free_list; + goto out_free_path; else if (ret > 0) break; continue; @@ -2561,7 +2569,7 @@ record_old_file_extents(struct inode *inode, old = kmalloc(sizeof(*old), GFP_NOFS); if (!old) - goto out_free_list; + goto out_free_path; offset = max(new->file_pos, key.offset); end = min(new->file_pos + new->len, key.offset + num_bytes); @@ -2583,15 +2591,10 @@ next: return new; -out_free_list: - list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); - kfree(old); - } out_free_path: btrfs_free_path(path); out_kfree: - kfree(new); + free_sa_defrag_extent(new); return NULL; } @@ -2743,8 +2746,14 @@ out: btrfs_remove_ordered_extent(inode, ordered_extent); /* for snapshot-aware defrag */ - if (new) - relink_file_extents(new); + if (new) { + if (ret) { + free_sa_defrag_extent(new); + atomic_dec(&root->fs_info->defrag_running); + } else { + relink_file_extents(new); + } + } /* once for us */ btrfs_put_ordered_extent(ordered_extent); -- cgit v1.2.3 From 2c460e835ceecda4b327aacb6423b54f9a89042f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 26 Sep 2013 14:25:36 +0800 Subject: ceph: cleanup aborted requests when re-sending requests. commit eb1b8af33c2e42a9a57fc0a7588f4a7b255d2e79 upstream. Aborted requests usually get cleared when the reply is received. If MDS crashes, no reply will be received. So we need to cleanup aborted requests when re-sending requests. Signed-off-by: Yan, Zheng Reviewed-by: Greg Farnum Signed-off-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mds_client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4d2920304be8..0af8e298d5b9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1840,8 +1840,11 @@ static int __do_request(struct ceph_mds_client *mdsc, int mds = -1; int err = -EAGAIN; - if (req->r_err || req->r_got_result) + if (req->r_err || req->r_got_result) { + if (req->r_aborted) + __unregister_request(mdsc, req); goto out; + } if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { -- cgit v1.2.3 From d58dd25632be5ec0250630ccddfea7e9bd913ecb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 31 Oct 2013 09:10:47 +0800 Subject: ceph: wake up 'safe' waiters when unregistering request commit fc55d2c9448b34218ca58733a6f51fbede09575b upstream. We also need to wake up 'safe' waiters if error occurs or request aborted. Otherwise sync(2)/fsync(2) may hang forever. Signed-off-by: Yan, Zheng Signed-off-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mds_client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0af8e298d5b9..1ce14c18c468 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -639,6 +639,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc, req->r_unsafe_dir = NULL; } + complete_all(&req->r_safe_completion); + ceph_mdsc_put_request(req); } @@ -2154,7 +2156,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (head->safe) { req->r_got_safe = true; __unregister_request(mdsc, req); - complete_all(&req->r_safe_completion); if (req->r_got_unsafe) { /* -- cgit v1.2.3 From 6b8588219a59ae34f99e20f17bc9756f831a9b2e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Dec 2013 09:31:36 -0500 Subject: ext4: call ext4_error_inode() if jbd2_journal_dirty_metadata() fails commit ae1495b12df1897d4f42842a7aa7276d920f6290 upstream. While it's true that errors can only happen if there is a bug in jbd2_journal_dirty_metadata(), if a bug does happen, we need to halt the kernel or remount the file system read-only in order to avoid further data loss. The ext4_journal_abort_handle() function doesn't do any of this, and while it's likely that this call (since it doesn't adjust refcounts) will likely result in the file system eventually deadlocking since the current transaction will never be able to close, it's much cleaner to call let ext4's error handling system deal with this situation. There's a separate bug here which is that if certain jbd2 errors errors occur and file system is mounted errors=continue, the file system will probably eventually end grind to a halt as described above. But things have been this way in a long time, and usually when we have these sorts of errors it's pretty much a disaster --- and that's why the jbd2 layer aggressively retries memory allocations, which is the most likely cause of these jbd2 errors. Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 1c88061da526..1be3996b5942 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -223,6 +223,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "journal_dirty_metadata failed: " + "handle type %u started at line %u, " + "credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); } } else { if (inode) -- cgit v1.2.3 From e696abfcc89a087afef75ebca0848f37fb9877ae Mon Sep 17 00:00:00 2001 From: Junho Ryu Date: Tue, 3 Dec 2013 18:10:28 -0500 Subject: ext4: fix use-after-free in ext4_mb_new_blocks commit 4e8d2139802ce4f41936a687f06c560b12115247 upstream. ext4_mb_put_pa should hold pa->pa_lock before accessing pa->pa_count. While ext4_mb_use_preallocated checks pa->pa_deleted first and then increments pa->count later, ext4_mb_put_pa decrements pa->pa_count before holding pa->pa_lock and then sets pa->pa_deleted. * Free sequence ext4_mb_put_pa (1): atomic_dec_and_test pa->pa_count ext4_mb_put_pa (2): lock pa->pa_lock ext4_mb_put_pa (3): check pa->pa_deleted ext4_mb_put_pa (4): set pa->pa_deleted=1 ext4_mb_put_pa (5): unlock pa->pa_lock ext4_mb_put_pa (6): remove pa from a list ext4_mb_pa_callback: free pa * Use sequence ext4_mb_use_preallocated (1): iterate over preallocation ext4_mb_use_preallocated (2): lock pa->pa_lock ext4_mb_use_preallocated (3): check pa->pa_deleted ext4_mb_use_preallocated (4): increase pa->pa_count ext4_mb_use_preallocated (5): unlock pa->pa_lock ext4_mb_release_context: access pa * Use-after-free sequence [initial status] pa_deleted = 0, pa_count = 1> ext4_mb_use_preallocated (1): iterate over preallocation ext4_mb_use_preallocated (2): lock pa->pa_lock ext4_mb_use_preallocated (3): check pa->pa_deleted ext4_mb_put_pa (1): atomic_dec_and_test pa->pa_count [pa_count decremented] pa_deleted = 0, pa_count = 0> ext4_mb_use_preallocated (4): increase pa->pa_count [pa_count incremented] pa_deleted = 0, pa_count = 1> ext4_mb_use_preallocated (5): unlock pa->pa_lock ext4_mb_put_pa (2): lock pa->pa_lock ext4_mb_put_pa (3): check pa->pa_deleted ext4_mb_put_pa (4): set pa->pa_deleted=1 [race condition!] pa_deleted = 1, pa_count = 1> ext4_mb_put_pa (5): unlock pa->pa_lock ext4_mb_put_pa (6): remove pa from a list ext4_mb_pa_callback: free pa ext4_mb_release_context: access pa AddressSanitizer has detected use-after-free in ext4_mb_new_blocks Bug report: http://goo.gl/rG1On3 Signed-off-by: Junho Ryu Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 59c6750b894f..c1f58e0f26c3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3423,6 +3423,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); + + BUG_ON(atomic_read(&pa->pa_count)); + BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } @@ -3436,11 +3439,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_group_t grp; ext4_fsblk_t grp_blk; - if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) - return; - /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); + if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { + spin_unlock(&pa->pa_lock); + return; + } + if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; -- cgit v1.2.3 From ae21dda05193c441bde106a4bbf88c185a68fbed Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 3 Dec 2013 21:22:21 -0500 Subject: ext4: check for overlapping extents in ext4_valid_extent_entries() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5946d089379a35dda0e531710b48fca05446a196 upstream. A corrupted ext4 may have out of order leaf extents, i.e. extent: lblk 0--1023, len 1024, pblk 9217, flags: LEAF UNINIT extent: lblk 1000--2047, len 1024, pblk 10241, flags: LEAF UNINIT ^^^^ overlap with previous extent Reading such extent could hit BUG_ON() in ext4_es_cache_extent(). BUG_ON(end < lblk); The problem is that __read_extent_tree_block() tries to cache holes as well but assumes 'lblk' is greater than 'prev' and passes underflowed length to ext4_es_cache_extent(). Fix it by checking for overlapping extents in ext4_valid_extent_entries(). I hit this when fuzz testing ext4, and am able to reproduce it by modifying the on-disk extent by hand. Also add the check for (ee_block + len - 1) in ext4_valid_extent() to make sure the value is not overflow. Ran xfstests on patched ext4 and no regression. Cc: Lukáš Czerner Signed-off-by: Eryu Guan Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index dc1e03047226..d480d49d2696 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); + ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); + ext4_lblk_t last = lblock + len - 1; - if (len == 0) + if (lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t pblock = 0; + ext4_lblk_t lblock = 0; + ext4_lblk_t prev = 0; + int len = 0; while (entries) { if (!ext4_valid_extent(inode, ext)) return 0; + + /* Check for overlapping extents */ + lblock = le32_to_cpu(ext->ee_block); + len = ext4_ext_get_actual_len(ext); + if ((lblock <= prev) && prev) { + pblock = ext4_ext_pblock(ext); + es->s_last_error_block = cpu_to_le64(pblock); + return 0; + } ext++; entries--; + prev = lblock + len - 1; } } else { struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh); -- cgit v1.2.3 From b35e443037c2e162a015a5f46009e8c21e673b63 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 8 Dec 2013 21:11:59 -0500 Subject: ext4: Do not reserve clusters when fs doesn't support extents commit 30fac0f75da24dd5bb43c9e911d2039a984ac815 upstream. When the filesystem doesn't support extents (like in ext2/3 compatibility modes), there is no need to reserve any clusters. Space estimates for writing are exact, hole punching doesn't need new metadata, and there are no unwritten extents to convert. This fixes a problem when filesystem still having some free space when accessed with a native ext2/3 driver suddently reports ENOSPC when accessed with ext4 driver. Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Lukas Czerner Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f7c39e6d097..e4923b6a9e39 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3213,10 +3213,18 @@ int ext4_calculate_overhead(struct super_block *sb) } -static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) +static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; + /* + * There's no need to reserve anything when we aren't using extents. + * The space estimates are exact, there are no unwritten extents, + * hole punching doesn't need new metadata... This is needed especially + * to keep ext2/3 backward compatibility. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + return 0; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run @@ -3225,7 +3233,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi) * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits; + resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> + EXT4_SB(sb)->s_cluster_bits; do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); @@ -3969,10 +3978,10 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi)); + err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sbi)); + "reserved pool", ext4_calculate_resv_clusters(sb)); goto failed_mount4a; } -- cgit v1.2.3 From ad403b48be4d1777fdfb65966cefee9e2d8a8bcd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 18 Dec 2013 00:44:44 -0500 Subject: ext4: fix deadlock when writing in ENOSPC conditions commit 34cf865d54813aab3497838132fb1bbd293f4054 upstream. Akira-san has been reporting rare deadlocks of his machine when running xfstests test 269 on ext4 filesystem. The problem turned out to be in ext4_da_reserve_metadata() and ext4_da_reserve_space() which called ext4_should_retry_alloc() while holding i_data_sem. Since ext4_should_retry_alloc() can force a transaction commit, this is a lock ordering violation and leads to deadlocks. Fix the problem by just removing the retry loops. These functions should just report ENOSPC to the caller (e.g. ext4_da_write_begin()) and that function must take care of retrying after dropping all necessary locks. Reported-and-tested-by: Akira Fujita Reviewed-by: Zheng Liu Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 904ca1a21dce..cb2bdc7ccb05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1263,7 +1263,6 @@ static int ext4_journalled_write_end(struct file *file, */ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1275,7 +1274,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1295,10 +1293,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } return -ENOSPC; } ei->i_reserved_meta_blocks += md_needed; @@ -1312,7 +1306,6 @@ repeat: */ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) { - int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); unsigned int md_needed; @@ -1334,7 +1327,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) * in order to allocate nrblocks * worse case is one extent per block */ -repeat: spin_lock(&ei->i_block_reservation_lock); /* * ext4_calc_metadata_amount() has side effects, which we have @@ -1354,10 +1346,6 @@ repeat: ei->i_da_metadata_calc_len = save_len; ei->i_da_metadata_calc_last_lblock = save_last_lblock; spin_unlock(&ei->i_block_reservation_lock); - if (ext4_should_retry_alloc(inode->i_sb, &retries)) { - cond_resched(); - goto repeat; - } dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } -- cgit v1.2.3 From cd7442730bdf3c8ecdd9a0ae8d77750b359fe60e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 20 Dec 2013 09:29:35 -0500 Subject: ext4: add explicit casts when masking cluster sizes commit f5a44db5d2d677dfbf12deee461f85e9ec633961 upstream. The missing casts can cause the high 64-bits of the physical blocks to be lost. Set up new macros which allows us to make sure the right thing happen, even if at some point we end up supporting larger logical block numbers. Thanks to the Emese Revfy and the PaX security team for reporting this issue. Reported-by: PaX Team Reported-by: Emese Revfy Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 10 ++++++++++ fs/ext4/extents.c | 24 +++++++++++------------- fs/ext4/mballoc.c | 6 +++--- 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5aae3d12d400..7bb2e2e55123 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -280,6 +280,16 @@ struct ext4_io_submit { /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) +/* Mask out the low bits to get the starting block of the cluster */ +#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ + ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ + ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Get the cluster offset */ +#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ + ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) +#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ + ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d480d49d2696..ecabf00829e2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1772,8 +1772,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, depth = ext_depth(inode); if (!path[depth].p_ext) goto out; - b2 = le32_to_cpu(path[depth].p_ext->ee_block); - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block)); /* * get the next allocated block if the extent in the path @@ -1783,7 +1782,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi, b2 = ext4_ext_next_allocated_block(path); if (b2 == EXT_MAX_BLOCKS) goto out; - b2 &= ~(sbi->s_cluster_ratio - 1); + b2 = EXT4_LBLK_CMASK(sbi, b2); } /* check for wrap through zero on extent logical start block*/ @@ -2444,7 +2443,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * truncate operation has removed all of the blocks in * the cluster. */ - if (pblk & (sbi->s_cluster_ratio - 1) && + if (EXT4_PBLK_COFF(sbi, pblk) && (ee_len == num)) *partial_cluster = EXT4_B2C(sbi, pblk); else @@ -3675,7 +3674,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t lblk_start, lblk_end; - lblk_start = lblk & (~(sbi->s_cluster_ratio - 1)); + lblk_start = EXT4_LBLK_CMASK(sbi, lblk); lblk_end = lblk_start + sbi->s_cluster_ratio - 1; return ext4_find_delalloc_range(inode, lblk_start, lblk_end); @@ -3734,9 +3733,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks); /* Check towards left side */ - c_offset = lblk_start & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start); if (c_offset) { - lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1)); + lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start); lblk_to = lblk_from + c_offset - 1; if (ext4_find_delalloc_range(inode, lblk_from, lblk_to)) @@ -3744,7 +3743,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } /* Now check towards right. */ - c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1); + c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks); if (allocated_clusters && c_offset) { lblk_from = lblk_start + num_blks; lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1; @@ -3952,7 +3951,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, struct ext4_ext_path *path) { struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ext4_lblk_t ex_cluster_start, ex_cluster_end; ext4_lblk_t rr_cluster_start; ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); @@ -3970,8 +3969,7 @@ static int get_implied_cluster_alloc(struct super_block *sb, (rr_cluster_start == ex_cluster_start)) { if (rr_cluster_start == ex_cluster_end) ee_start += ee_len - 1; - map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) + - c_offset; + map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset; map->m_len = min(map->m_len, (unsigned) sbi->s_cluster_ratio - c_offset); /* @@ -4125,7 +4123,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1); + cluster_offset = EXT4_LBLK_CMASK(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned @@ -4193,7 +4191,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * needed so that future calls to get_implied_cluster_alloc() * work correctly. */ - offset = map->m_lblk & (sbi->s_cluster_ratio - 1); + offset = EXT4_LBLK_COFF(sbi, map->m_lblk); ar.len = EXT4_NUM_B2C(sbi, offset+allocated); ar.goal -= offset; ar.logical -= offset; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c1f58e0f26c3..4c0e3e023fc8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4107,7 +4107,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ - ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1); + ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; @@ -4644,7 +4644,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ - overflow = block & (sbi->s_cluster_ratio - 1); + overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; @@ -4658,7 +4658,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += overflow; } } - overflow = count & (sbi->s_cluster_ratio - 1); + overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) -- cgit v1.2.3 From 8164ddf1f6f9fca6020143a259a9f9667d53c1e4 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 30 Oct 2013 11:10:52 -0400 Subject: ext4: fix FITRIM in no journal mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8f9ff189205a6817aee5a1f996f876541f86e07c upstream. When using FITRIM ioctl on a file system without journal it will only trim the block group once, no matter how many times you invoke FITRIM ioctl and how many block you release from the block group. It is because we only clear EXT4_GROUP_INFO_WAS_TRIMMED_BIT in journal callback. Fix this by clearing the bit in no journal mode as well. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reported-by: Jorge Fábregas Signed-off-by: Greg Kroah-Hartman --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4c0e3e023fc8..fba960ee26de 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4771,8 +4771,8 @@ do_more: " group:%d block:%d count:%lu failed" " with %d", block_group, bit, count, err); - } - + } else + EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); -- cgit v1.2.3 From 88285f766c96205ccf30afd3d0621401701ff26c Mon Sep 17 00:00:00 2001 From: Li Wang Date: Wed, 13 Nov 2013 15:22:14 +0800 Subject: ceph: Avoid data inconsistency due to d-cache aliasing in readpage() commit 56f91aad69444d650237295f68c195b74d888d95 upstream. If the length of data to be read in readpage() is exactly PAGE_CACHE_SIZE, the original code does not flush d-cache for data consistency after finishing reading. This patches fixes this. Signed-off-by: Li Wang Signed-off-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/addr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3e68ac101040..5da06f020986 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -213,9 +213,13 @@ static int readpage_nounlock(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); goto out; - } else if (err < PAGE_CACHE_SIZE) { + } else { + if (err < PAGE_CACHE_SIZE) { /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, err, PAGE_CACHE_SIZE); + } else { + flush_dcache_page(page); + } } SetPageUptodate(page); -- cgit v1.2.3 From 21b6291b88c534d8c6018ad483e232abab63f644 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Dec 2013 11:52:34 +0000 Subject: GFS2: don't hold s_umount over blkdev_put commit dfe5b9ad83a63180f358b27d1018649a27b394a9 upstream. This is a GFS2 version of Tejun's patch: 4f331f01b9c43bf001d3ffee578a97a1e0633eac vfs: don't hold s_umount over close_bdev_exclusive() call In this case its blkdev_put itself that is the issue and this patch uses the same solution of dropping and retaking s_umount. Reported-by: Tejun Heo Reported-by: Al Viro Signed-off-by: Steven Whitehouse Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/ops_fstype.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 60ede2a0f43f..f7dd3b4f8ab0 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1317,8 +1317,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; -- cgit v1.2.3 From 8553459e73c6da3e5b9da9239dd8ef017181252a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 18 Dec 2013 14:14:52 +0000 Subject: GFS2: Fix incorrect invalidation for DIO/buffered I/O commit dfd11184d894cd0a92397b25cac18831a1a6a5bc upstream. In patch 209806aba9d540dde3db0a5ce72307f85f33468f we allowed local deferred locks to be granted against a cached exclusive lock. That opened up a corner case which this patch now fixes. The solution to the problem is to check whether we have cached pages each time we do direct I/O and if so to unmap, flush and invalidate those pages. Since the glock state machine normally does that for us, mostly the code will be a no-op. Signed-off-by: Steven Whitehouse Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/aops.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0bad69ed6336..76251600cbea 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -999,6 +999,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1019,6 +1020,35 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_length(iov, nr_segs); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + return rv; + truncate_inode_pages_range(mapping, lstart, end); + } + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, gfs2_get_block_direct, NULL, NULL, 0); -- cgit v1.2.3 From ceed0859339443bc4fa22b9e0a825028ba8cd330 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 Dec 2013 21:12:59 -0500 Subject: jbd2: don't BUG but return ENOSPC if a handle runs out of space commit f6c07cad081ba222d63623d913aafba5586c1d2c upstream. If a handle runs out of space, we currently stop the kernel with a BUG in jbd2_journal_dirty_metadata(). This makes it hard to figure out what might be going on. So return an error of ENOSPC, so we can let the file system layer figure out what is going on, to make it more likely we can get useful debugging information). This should make it easier to debug problems such as the one which was reported by: https://bugzilla.kernel.org/show_bug.cgi?id=44731 The only two callers of this function are ext4_handle_dirty_metadata() and ocfs2_journal_dirty(). The ocfs2 function will trigger a BUG_ON(), which means there will be no change in behavior. The ext4 function will call ext4_error_inode() which will print the useful debugging information and then handle the situation using ext4's error handling mechanisms (i.e., which might mean halting the kernel or remounting the file system read-only). Also, since both file systems already call WARN_ON(), drop the WARN_ON from jbd2_journal_dirty_metadata() to avoid two stack traces from being displayed. Signed-off-by: "Theodore Ts'o" Cc: ocfs2-devel@oss.oracle.com Acked-by: Joel Becker Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/transaction.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e0c0bc275924..a6917125f215 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1151,7 +1151,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * once a transaction -bzzz */ jh->b_modified = 1; - J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + if (handle->h_buffer_credits <= 0) { + ret = -ENOSPC; + goto out_unlock_bh; + } handle->h_buffer_credits--; } @@ -1234,7 +1237,6 @@ out_unlock_bh: jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); - WARN_ON(ret); /* All errors are bugs, so dump the stack */ return ret; } -- cgit v1.2.3 From f1b6559cfa8c280249b52112223459b00dcdb6e9 Mon Sep 17 00:00:00 2001 From: Emil Goode Date: Tue, 28 May 2013 16:59:00 +0200 Subject: ceph: improve error handling in ceph_mdsmap_decode commit c213b50b7dcbf06abcfbf1e4eee5b76586718bd9 upstream. This patch makes the following improvements to the error handling in the ceph_mdsmap_decode function: - Add a NULL check for return value from kcalloc - Make use of the variable err Signed-off-by: Emil Goode Signed-off-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mdsmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 9278dec9e940..d4d38977dcbb 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -138,6 +138,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info[mds].export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); + if (m->m_info[mds].export_targets == NULL) + goto badmem; for (j = 0; j < num_export_targets; j++) m->m_info[mds].export_targets[j] = ceph_decode_32(&pexport_targets); @@ -170,7 +172,7 @@ bad: DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); ceph_mdsmap_destroy(m); - return ERR_PTR(-EINVAL); + return ERR_PTR(err); } void ceph_mdsmap_destroy(struct ceph_mdsmap *m) -- cgit v1.2.3 From d5f9a684f9f21324fa58e97be127589db6c06a56 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 25 Jun 2013 14:48:19 +0800 Subject: ceph: Free mdsc if alloc mdsc->mdsmap failed. commit fb3101b6f0db9ae3f35dc8e6ec908d0af8cdf12e upstream. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mds_client.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 1ce14c18c468..63c789eb1e3e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3044,8 +3044,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) + if (mdsc->mdsmap == NULL) { + kfree(mdsc); return -ENOMEM; + } init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); -- cgit v1.2.3 From 724184c7a1fc44955f8900c382f18545423bc486 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 1 Jul 2013 18:33:39 -0400 Subject: ceph: avoid accessing invalid memory commit 5446429630257f4723829409337a26c076907d5d upstream. when mounting ceph with a dev name that starts with a slash, ceph would attempt to access the character before that slash. Since we don't actually own that byte of memory, we would trigger an invalid access: [ 43.499934] BUG: unable to handle kernel paging request at ffff880fa3a97fff [ 43.500984] IP: [] parse_mount_options+0x1a4/0x300 [ 43.501491] PGD 743b067 PUD 10283c4067 PMD 10282a6067 PTE 8000000fa3a97060 [ 43.502301] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 43.503006] Dumping ftrace buffer: [ 43.503596] (ftrace buffer empty) [ 43.504046] CPU: 0 PID: 10879 Comm: mount Tainted: G W 3.10.0-sasha #1129 [ 43.504851] task: ffff880fa625b000 ti: ffff880fa3412000 task.ti: ffff880fa3412000 [ 43.505608] RIP: 0010:[] [] parse_mount_options$ [ 43.506552] RSP: 0018:ffff880fa3413d08 EFLAGS: 00010286 [ 43.507133] RAX: ffff880fa3a98000 RBX: ffff880fa3a98000 RCX: 0000000000000000 [ 43.507893] RDX: ffff880fa3a98001 RSI: 000000000000002f RDI: ffff880fa3a98000 [ 43.508610] RBP: ffff880fa3413d58 R08: 0000000000001f99 R09: ffff880fa3fe64c0 [ 43.509426] R10: ffff880fa3413d98 R11: ffff880fa38710d8 R12: ffff880fa3413da0 [ 43.509792] R13: ffff880fa3a97fff R14: 0000000000000000 R15: ffff880fa3413d90 [ 43.509792] FS: 00007fa9c48757e0(0000) GS:ffff880fd2600000(0000) knlGS:000000000000$ [ 43.509792] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 43.509792] CR2: ffff880fa3a97fff CR3: 0000000fa3bb9000 CR4: 00000000000006b0 [ 43.509792] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 43.509792] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 43.509792] Stack: [ 43.509792] 0000e5180000000e ffffffff85ca1900 ffff880fa38710d8 ffff880fa3413d98 [ 43.509792] 0000000000000120 0000000000000000 ffff880fa3a98000 0000000000000000 [ 43.509792] ffffffff85cf32a0 0000000000000000 ffff880fa3413dc8 ffffffff818f3c72 [ 43.509792] Call Trace: [ 43.509792] [] ceph_mount+0xa2/0x390 [ 43.509792] [] ? pcpu_alloc+0x334/0x3c0 [ 43.509792] [] mount_fs+0x8d/0x1a0 [ 43.509792] [] ? __alloc_percpu+0x10/0x20 [ 43.509792] [] vfs_kern_mount+0x79/0x100 [ 43.509792] [] do_new_mount+0xcd/0x1c0 [ 43.509792] [] do_mount+0x15d/0x210 [ 43.509792] [] ? strndup_user+0x45/0x60 [ 43.509792] [] SyS_mount+0x9d/0xe0 [ 43.509792] [] tracesys+0xdd/0xe2 [ 43.509792] Code: 4c 8b 5d c0 74 0a 48 8d 50 01 49 89 14 24 eb 17 31 c0 48 83 c9 ff $ [ 43.509792] RIP [] parse_mount_options+0x1a4/0x300 [ 43.509792] RSP [ 43.509792] CR2: ffff880fa3a97fff [ 43.509792] ---[ end trace 22469cd81e93af51 ]--- Signed-off-by: Sasha Levin Reviewed-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7d377c9a5e35..6627b26a800c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -357,7 +357,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ - if (*dev_name_end != ':') { + if (dev_name_end < dev_name || *dev_name_end != ':') { pr_err("device name is missing path (no : separator in %s)\n", dev_name); goto out; -- cgit v1.2.3 From acb2e930c72a1d38504d6d9c1f338562bbb9aa55 Mon Sep 17 00:00:00 2001 From: Nathaniel Yazdani Date: Sun, 4 Aug 2013 21:04:30 -0700 Subject: ceph: fix null pointer dereference commit c338c07c51e3106711fad5eb599e375eadb6855d upstream. When register_session() is given an out-of-range argument for mds, ceph_mdsmap_get_addr() will return a null pointer, which would be given to ceph_con_open() & be dereferenced, causing a kernel oops. This fixes bug #4685 in the Ceph bug tracker . Signed-off-by: Nathaniel Yazdani Reviewed-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/mds_client.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 63c789eb1e3e..d6a536886472 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -414,6 +414,9 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; + if (mds >= mdsc->mdsmap->m_max_mds) + return ERR_PTR(-EINVAL); + s = kzalloc(sizeof(*s), GFP_NOFS); if (!s) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 10debbda102fe52eb75ba531b8ee79e98d3b8774 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 23 Jul 2013 16:48:01 +0300 Subject: ceph: cleanup types in striped_read() commit 688bac461ba3e9d221a879ab40b687f5d7b5b19c upstream. We pass in a u64 value for "len" and then immediately truncate away the upper 32 bits. Signed-off-by: Dan Carpenter Reviewed-by: Sage Weil Reviewed-by: Alex Elder Signed-off-by: Greg Kroah-Hartman --- fs/ceph/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 656e16907430..5b9baf5a867f 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -313,9 +313,9 @@ static int striped_read(struct inode *inode, { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len; + u64 pos, this_len, left; int io_align, page_align; - int left, pages_left; + int pages_left; int read; struct page **page_pos; int ret; @@ -346,7 +346,7 @@ more: ret = 0; hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); if (ret > 0) { @@ -378,7 +378,7 @@ more: if (pos + left > inode->i_size) left = inode->i_size - pos; - dout("zero tail %d\n", left); + dout("zero tail %llu\n", left); ceph_zero_page_vector_range(page_align + read, left, pages); read += left; -- cgit v1.2.3 From 01bb1b04a1dd3736d20b859a9bcf3d7509566c1f Mon Sep 17 00:00:00 2001 From: majianpeng Date: Fri, 2 Aug 2013 18:14:48 +0800 Subject: ceph: Add check returned value on func ceph_calc_ceph_pg. commit 2fbcbff1d6b9243ef71c64a8ab993bc3c7bb7af1 upstream. Func ceph_calc_ceph_pg maybe failed.So add check for returned value. Signed-off-by: Jianpeng Ma Reviewed-by: Sage Weil Signed-off-by: Sage Weil Signed-off-by: Greg Kroah-Hartman --- fs/ceph/ioctl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index a5ce62eb7806..669622fd1ae3 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -211,8 +211,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", ceph_ino(inode), dl.object_no); - ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, - ceph_file_layout_pg_pool(ci->i_layout)); + r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap, + ceph_file_layout_pg_pool(ci->i_layout)); + if (r < 0) { + up_read(&osdc->map_sem); + return r; + } dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); if (dl.osd >= 0) { -- cgit v1.2.3 From 533bc2950e085ee99225655f60276760c00af31f Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 6 Aug 2013 16:20:38 +0800 Subject: ceph: fix bugs about handling short-read for sync read mode. commit 02ae66d8b229708fd94b764f6c17ead1c7741fcf upstream. cephfs . show_layout >layyout.data_pool: 0 >layout.object_size: 4194304 >layout.stripe_unit: 4194304 >layout.stripe_count: 1 TestA: >dd if=/dev/urandom of=test bs=1M count=2 oflag=direct >dd if=/dev/urandom of=test bs=1M count=2 seek=4 oflag=direct >dd if=test of=/dev/null bs=6M count=1 iflag=direct The messages from func striped_read are: ceph: file.c:350 : striped_read 0~6291456 (read 0) got 2097152 HITSTRIPE SHORT ceph: file.c:350 : striped_read 2097152~4194304 (read 2097152) got 0 HITSTRIPE SHORT ceph: file.c:381 : zero tail 4194304 ceph: file.c:390 : striped_read returns 6291456 The hole of file is from 2M--4M.But actualy it zero the last 4M include the last 2M area which isn't a hole. Using this patch, the messages are: ceph: file.c:350 : striped_read 0~6291456 (read 0) got 2097152 HITSTRIPE SHORT ceph: file.c:358 : zero gap 2097152 to 4194304 ceph: file.c:350 : striped_read 4194304~2097152 (read 4194304) got 2097152 ceph: file.c:384 : striped_read returns 6291456 TestB: >echo majianpeng > test >dd if=test of=/dev/null bs=2M count=1 iflag=direct The messages are: ceph: file.c:350 : striped_read 0~6291456 (read 0) got 11 HITSTRIPE SHORT ceph: file.c:350 : striped_read 11~6291445 (read 11) got 0 HITSTRIPE SHORT ceph: file.c:390 : striped_read returns 11 For this case,it did once more striped_read.It's no meaningless. Using this patch, the message are: ceph: file.c:350 : striped_read 0~6291456 (read 0) got 11 HITSTRIPE SHORT ceph: file.c:384 : striped_read returns 11 Big thanks to Yan Zheng for the patch. Reviewed-by: Yan, Zheng Signed-off-by: Jianpeng Ma Signed-off-by: Greg Kroah-Hartman --- fs/ceph/file.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5b9baf5a867f..1ac0a59867dc 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -349,44 +349,37 @@ more: dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); - if (ret > 0) { - int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; - - if (read < pos - off) { - dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_align + read, - pos - off - read, pages); + if (ret >= 0) { + int didpages; + if (was_short && (pos + ret < inode->i_size)) { + u64 tmp = min(this_len - ret, + inode->i_size - pos - ret); + dout(" zero gap %llu to %llu\n", + pos + ret, pos + ret + tmp); + ceph_zero_page_vector_range(page_align + read + ret, + tmp, pages); + ret += tmp; } + + didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; pos += ret; read = pos - off; left -= ret; page_pos += didpages; pages_left -= didpages; - /* hit stripe? */ - if (left && hit_stripe) + /* hit stripe and need continue*/ + if (left && hit_stripe && pos < inode->i_size) goto more; } - if (was_short) { + if (ret >= 0) { + ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) *checkeof = 1; - - /* zero trailing bytes (inside i_size) */ - if (left > 0 && pos < inode->i_size) { - if (pos + left > inode->i_size) - left = inode->i_size - pos; - - dout("zero tail %llu\n", left); - ceph_zero_page_vector_range(page_align + read, left, - pages); - read += left; - } } - if (ret >= 0) - ret = read; dout("striped_read returns %d\n", ret); return ret; } -- cgit v1.2.3 From 30379f4553bae684a928c6bdd9dc606fd0ef6093 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Wed, 21 Aug 2013 15:02:51 +0800 Subject: ceph: allow sync_read/write return partial successed size of read/write. commit ee7289bfadda5f4ef60884547ebc9989c8fb314a upstream. For sync_read/write, it may do multi stripe operations.If one of those met erro, we return the former successed size rather than a error value. There is a exception for write-operation met -EOLDSNAPC.If this occur,we retry the whole write again. Signed-off-by: Jianpeng Ma Signed-off-by: Greg Kroah-Hartman --- fs/ceph/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 1ac0a59867dc..5de16f5ac7e9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -373,7 +373,7 @@ more: goto more; } - if (ret >= 0) { + if (read > 0) { ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) @@ -611,6 +611,8 @@ out: if (check_caps) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); + } else if (ret != -EOLDSNAPC && written > 0) { + ret = written; } return ret; } -- cgit v1.2.3 From 70139b6643db61809db76dd642af87979421d41d Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Mon, 6 Jan 2014 14:00:23 -0500 Subject: ext4: fix bigalloc regression commit d0abafac8c9162f39c4f6b2f8141b772a09b3770 upstream. Commit f5a44db5d2 introduced a regression on filesystems created with the bigalloc feature (cluster size > blocksize). It causes xfstests generic/006 and /013 to fail with an unexpected JBD2 failure and transaction abort that leaves the test file system in a read only state. Other xfstests run on bigalloc file systems are likely to fail as well. The cause is the accidental use of a cluster mask where a cluster offset was needed in ext4_ext_map_blocks(). Signed-off-by: Eric Whitney Cc: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ecabf00829e2..a2b625e279db 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4123,7 +4123,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); - cluster_offset = EXT4_LBLK_CMASK(sbi, map->m_lblk); + cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); /* * If we are doing bigalloc, check to see if the extent returned -- cgit v1.2.3 From 350f737ed039d2b36329ddcbcdf78e12c6d8c758 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 6 Jan 2014 17:16:01 -0500 Subject: GFS2: Increase i_writecount during gfs2_setattr_chown commit 62e96cf81988101fe9e086b2877307b6adda5197 upstream. This patch calls get_write_access in function gfs2_setattr_chown, which merely increases inode->i_writecount for the duration of the function. That will ensure that any file closes won't delete the inode's multi-block reservation while the function is running. It also ensures that a multi-block reservation exists when needed for quota change operations during the chown. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/inode.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 62b484e4a9e4..bc5dac400125 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1536,10 +1536,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_quota_lock(ip, nuid, ngid); + error = get_write_access(inode); if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1566,6 +1578,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } -- cgit v1.2.3 From 3717eee3c4d1d44ded3bd84bf5f6437ec2aa5496 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 Nov 2013 16:31:29 -0800 Subject: vfs: In d_path don't call d_dname on a mount point commit f48cfddc6729ef133933062320039808bafa6f45 upstream. Aditya Kali (adityakali@google.com) wrote: > Commit bf056bfa80596a5d14b26b17276a56a0dcb080e5: > "proc: Fix the namespace inode permission checks." converted > the namespace files into symlinks. The same commit changed > the way namespace bind mounts appear in /proc/mounts: > $ mount --bind /proc/self/ns/ipc /mnt/ipc > Originally: > $ cat /proc/mounts | grep ipc > proc /mnt/ipc proc rw,nosuid,nodev,noexec 0 0 > > After commit bf056bfa80596a5d14b26b17276a56a0dcb080e5: > $ cat /proc/mounts | grep ipc > proc ipc:[4026531839] proc rw,nosuid,nodev,noexec 0 0 > > This breaks userspace which expects the 2nd field in > /proc/mounts to be a valid path. The symlink /proc//ns/{ipc,mnt,net,pid,user,uts} point to dentries allocated with d_alloc_pseudo that we can mount, and that have interesting names printed out with d_dname. When these files are bind mounted /proc/mounts is not currently displaying the mount point correctly because d_dname is called instead of just displaying the path where the file is mounted. Solve this by adding an explicit check to distinguish mounted pseudo inodes and unmounted pseudo inodes. Unmounted pseudo inodes always use mount of their filesstem as the mnt_root in their path making these two cases easy to distinguish. Acked-by: Serge Hallyn Reported-by: Aditya Kali Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/dcache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index da89cdfb21ab..9a59653d3449 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2686,8 +2686,13 @@ char *d_path(const struct path *path, char *buf, int buflen) * thus don't need to be hashed. They also don't need a name until a * user wants to identify the object in /proc/pid/fd/. The little hack * below allows us to generate a name for these objects on demand: + * + * Some pseudo inodes are mountable. When they are mounted + * path->dentry == path->mnt->mnt_root. In that case don't call d_dname + * and instead have d_path return the mounted path. */ - if (path->dentry->d_op && path->dentry->d_op->d_dname) + if (path->dentry->d_op && path->dentry->d_op->d_dname && + (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root)) return path->dentry->d_op->d_dname(path->dentry, buf, buflen); get_fs_root(current->fs, &root); -- cgit v1.2.3 From cc46cb337cf614b652b0068e6e388102bf8a9b54 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 14 Dec 2013 04:21:26 +0800 Subject: writeback: Fix data corruption on NFS commit f9b0e058cbd04ada76b13afffa7e1df830543c24 upstream. Commit 4f8ad655dbc8 "writeback: Refactor writeback_single_inode()" added a condition to skip clean inode. However this is wrong in WB_SYNC_ALL mode because there we also want to wait for outstanding writeback on possibly clean inode. This was causing occasional data corruption issues on NFS because it uses sync_inode() to make sure all outstanding writes are flushed to the server before truncating the inode and with sync_inode() returning prematurely file was sometimes extended back by an outstanding write after it was truncated. So modify the test to also check for pages under writeback in WB_SYNC_ALL mode. Fixes: 4f8ad655dbc82cf05d2edc11e66b78a42d38bf93 Reported-and-tested-by: Dan Duval Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3be57189efd5..e3ab1e4dc442 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -505,13 +505,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean. We don't want to mess with writeback - * lists in this function since flusher thread may be doing for example - * sync in parallel and if we move the inode, it could get skipped. So - * here we make sure inode is on some writeback list and leave it there - * unless we have completely cleaned the inode. + * Skip inode if it is clean and we have no outstanding writeback in + * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this + * function since flusher thread may be doing for example sync in + * parallel and if we move the inode, it could get skipped. So here we + * make sure inode is on some writeback list and leave it there unless + * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY) && + (wbc->sync_mode != WB_SYNC_ALL || + !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 4cb1e59ffcbee368f32c37ca5872fd73163c7783 Mon Sep 17 00:00:00 2001 From: Andreas Rohner Date: Tue, 14 Jan 2014 17:56:36 -0800 Subject: nilfs2: fix segctor bug that causes file system corruption commit 70f2fe3a26248724d8a5019681a869abdaf3e89a upstream. There is a bug in the function nilfs_segctor_collect, which results in active data being written to a segment, that is marked as clean. It is possible, that this segment is selected for a later segment construction, whereby the old data is overwritten. The problem shows itself with the following kernel log message: nilfs_sufile_do_cancel_free: segment 6533 must be clean Usually a few hours later the file system gets corrupted: NILFS: bad btree node (blocknr=8748107): level = 0, flags = 0x0, nchildren = 0 NILFS error (device sdc1): nilfs_bmap_last_key: broken bmap (inode number=114660) The issue can be reproduced with a file system that is nearly full and with the cleaner running, while some IO intensive task is running. Although it is quite hard to reproduce. This is what happens: 1. The cleaner starts the segment construction 2. nilfs_segctor_collect is called 3. sc_stage is on NILFS_ST_SUFILE and segments are freed 4. sc_stage is on NILFS_ST_DAT current segment is full 5. nilfs_segctor_extend_segments is called, which allocates a new segment 6. The new segment is one of the segments freed in step 3 7. nilfs_sufile_cancel_freev is called and produces an error message 8. Loop around and the collection starts again 9. sc_stage is on NILFS_ST_SUFILE and segments are freed including the newly allocated segment, which will contain active data and can be allocated at a later time 10. A few hours later another segment construction allocates the segment and causes file system corruption This can be prevented by simply reordering the statements. If nilfs_sufile_cancel_freev is called before nilfs_segctor_extend_segments the freed segments are marked as dirty and cannot be allocated any more. Signed-off-by: Andreas Rohner Reviewed-by: Ryusuke Konishi Tested-by: Andreas Rohner Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/segment.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index cbd66188a28b..958a5b57ed4a 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci, nilfs_clear_logs(&sci->sc_segbufs); - err = nilfs_segctor_extend_segments(sci, nilfs, nadd); - if (unlikely(err)) - return err; - if (sci->sc_stage.flags & NILFS_CF_SUFREED) { err = nilfs_sufile_cancel_freev(nilfs->ns_sufile, sci->sc_freesegs, sci->sc_nfreesegs, NULL); WARN_ON(err); /* do not happen */ + sci->sc_stage.flags &= ~NILFS_CF_SUFREED; } + + err = nilfs_segctor_extend_segments(sci, nilfs, nadd); + if (unlikely(err)) + return err; + nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA); sci->sc_stage = prev_stage; } -- cgit v1.2.3 From 209bd086e423d98ddf2fd52a6f1afda15b5758b1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 7 Jan 2014 12:58:19 -0500 Subject: ext4: avoid clearing beyond i_blocks when truncating an inline data file commit 09c455aaa8f47a94d5bafaa23d58365768210507 upstream. A missing cast means that when we are truncating a file which is less than 60 bytes, we don't clear the correct area of memory, and in fact we can end up truncating the next inode in the inode table, or worse yet, some other kernel data structure. Addresses-Coverity-Id: #751987 Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inline.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 33331b4c2178..e350be6c7ac6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1957,9 +1957,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline) } /* Clear the content within i_blocks. */ - if (i_size < EXT4_MIN_INLINE_DATA_SIZE) - memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0, - EXT4_MIN_INLINE_DATA_SIZE - i_size); + if (i_size < EXT4_MIN_INLINE_DATA_SIZE) { + void *p = (void *) ext4_raw_inode(&is.iloc)->i_block; + memset(p + i_size, 0, + EXT4_MIN_INLINE_DATA_SIZE - i_size); + } EXT4_I(inode)->i_inline_size = i_size < EXT4_MIN_INLINE_DATA_SIZE ? -- cgit v1.2.3 From 9a8bd082503f4f91176d8fe0b91a23c5b768008e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Jan 2014 15:26:15 -0800 Subject: vfs: Is mounted should be testing mnt_ns for NULL or error. commit 260a459d2e39761fbd39803497205ce1690bc7b1 upstream. A bug was introduced with the is_mounted helper function in commit f7a99c5b7c8bd3d3f533c8b38274e33f3da9096e Author: Al Viro Date: Sat Jun 9 00:59:08 2012 -0400 get rid of ->mnt_longterm it's enough to set ->mnt_ns of internal vfsmounts to something distinct from all struct mnt_namespace out there; then we can just use the check for ->mnt_ns != NULL in the fast path of mntput_no_expire() Signed-off-by: Al Viro The intent was to test if the real_mount(vfsmount)->mnt_ns was NULL_OR_ERR but the code is actually testing real_mount(vfsmount) and always returning true. The result is d_absolute_path returning paths it should be hiding. Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/mount.h b/fs/mount.h index 64a858143ff9..68d80bdcd081 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -73,7 +73,7 @@ static inline int mnt_has_parent(struct mount *mnt) static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ - return !IS_ERR_OR_NULL(real_mount(mnt)); + return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int); -- cgit v1.2.3 From 5c61a3d3ff81ddb53006bbc84aca8363b4f2841b Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 7 Jan 2014 17:26:58 +0800 Subject: Btrfs: handle EAGAIN case properly in btrfs_drop_snapshot() commit 90515e7f5d7d24cbb2a4038a3f1b5cfa2921aa17 upstream. We may return early in btrfs_drop_snapshot(), we shouldn't call btrfs_std_err() for this case, fix it. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3b6d20bc2388..bbafa05519da 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7491,7 +7491,7 @@ out: */ if (root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); return err; } -- cgit v1.2.3 From f0cea52a481c749ac0ed4cf0fd8a00f90306ebd5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Jan 2014 18:15:52 +0100 Subject: btrfs: restrict snapshotting to own subvolumes commit d024206133ce21936b3d5780359afc00247655b7 upstream. Currently, any user can snapshot any subvolume if the path is accessible and thus indirectly create and keep files he does not own under his direcotries. This is not possible with traditional directories. In security context, a user can snapshot root filesystem and pin any potentially buggy binaries, even if the updates are applied. All the snapshots are visible to the administrator, so it's possible to verify if there are suspicious snapshots. Another more practical problem is that any user can pin the space used by eg. root and cause ENOSPC. Original report: https://bugs.launchpad.net/ubuntu/+source/apparmor/+bug/484786 Signed-off-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 145b2c75ab83..783906c687b5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1528,6 +1528,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, printk(KERN_INFO "btrfs: Snapshot src from " "another FS\n"); ret = -EINVAL; + } else if (!inode_owner_or_capable(src_inode)) { + /* + * Subvolume creation is not restricted, but snapshots + * are limited to own subvolumes only + */ + ret = -EPERM; } else { ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, -- cgit v1.2.3 From d840f9899a6bc8bd88b3c87cd067ddf39a6ada45 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 22 Jan 2014 19:36:57 +0100 Subject: fuse: fix pipe_buf_operations commit 28a625cbc2a14f17b83e47ef907b2658576a32aa upstream. Having this struct in module memory could Oops when if the module is unloaded while the buffer still persists in a pipe. Since sock_pipe_buf_ops is essentially the same as fuse_dev_pipe_buf_steal merge them into nosteal_pipe_buf_ops (this is the same as default_pipe_buf_ops except stealing the page from the buffer is not allowed). Reported-by: Al Viro Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 22 +++++----------------- fs/splice.c | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1d55f9465400..23bf1a52a5da 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); } -static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - return 1; -} - -static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { - .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, - .confirm = generic_pipe_buf_confirm, - .release = generic_pipe_buf_release, - .steal = fuse_dev_pipe_buf_steal, - .get = generic_pipe_buf_get, -}; - static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, buf->page = bufs[page_nr].page; buf->offset = bufs[page_nr].offset; buf->len = bufs[page_nr].len; - buf->ops = &fuse_dev_pipe_buf_ops; + /* + * Need to be careful about this. Having buf->ops in module + * code can Oops if the buffer persists after module unload. + */ + buf->ops = &nosteal_pipe_buf_ops; pipe->nrbufs++; page_nr++; diff --git a/fs/splice.c b/fs/splice.c index d37431dd60a1..4b5a5fac3383 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -555,6 +555,24 @@ static const struct pipe_buf_operations default_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +/* Pipe buffer operations for a socket and similar. */ +const struct pipe_buf_operations nosteal_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_nosteal, + .get = generic_pipe_buf_get, +}; +EXPORT_SYMBOL(nosteal_pipe_buf_ops); + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, unsigned long vlen, loff_t offset) { -- cgit v1.2.3 From 060168af7cbbade31561f3a1efbc1bac91ee2166 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 27 Jan 2014 17:07:19 -0800 Subject: compat: fix sys_fanotify_mark commit 592f6b842f64e416c7598a1b97c649b34241e22d upstream. Commit 91c2e0bcae72 ("unify compat fanotify_mark(2), switch to COMPAT_SYSCALL_DEFINE") added a new unified compat fanotify_mark syscall to be used by all architectures. Unfortunately the unified version merges the split mask parameter in a wrong way: the lower and higher word got swapped. This was discovered with glibc's tst-fanotify test case. Signed-off-by: Heiko Carstens Reported-by: Andreas Krebbel Cc: "James E.J. Bottomley" Acked-by: "David S. Miller" Acked-by: Al Viro Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/notify/fanotify/fanotify_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 77cc85dd0db0..f1680cdbd88b 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -867,9 +867,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark, { return sys_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN - ((__u64)mask1 << 32) | mask0, -#else ((__u64)mask0 << 32) | mask1, +#else + ((__u64)mask1 << 32) | mask0, #endif dfd, pathname); } -- cgit v1.2.3 From 9580348ce827b7026c12f2fde1da7d064c338dc6 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Jan 2014 14:05:44 -0800 Subject: fs/compat: fix parameter handling for compat readv/writev syscalls commit dfd948e32af2e7b28bcd7a490c0a30d4b8df2a36 upstream. We got a report that the pwritev syscall does not work correctly in compat mode on s390. It turned out that with commit 72ec35163f9f ("switch compat readv/writev variants to COMPAT_SYSCALL_DEFINE") we lost the zero extension of a couple of syscall parameters because the some parameter types haven't been converted from unsigned long to compat_ulong_t. This is needed for architectures where the ABI requires that the caller of a function performed zero and/or sign extension to 64 bit of all parameters. Signed-off-by: Heiko Carstens Cc: Al Viro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Hendrik Brueckner Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/read_write.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 2cefa417be34..f6b7c600eb7f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -947,9 +947,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -983,9 +983,9 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_preadv64(fd, vec, vlen, pos); @@ -1013,9 +1013,9 @@ out: return ret; } -COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, - unsigned long, vlen) + compat_ulong_t, vlen) { struct fd f = fdget(fd); ssize_t ret; @@ -1049,9 +1049,9 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, return ret; } -COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd, +COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, u32, pos_low, u32, pos_high) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; return compat_sys_pwritev64(fd, vec, vlen, pos); -- cgit v1.2.3 From 7462402eae491665f58f434dbf45bd964cbf8b79 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Jan 2014 14:05:46 -0800 Subject: fs/compat: fix lookup_dcookie() parameter handling commit d8d14bd09cddbaf0168d61af638455a26bd027ff upstream. Commit d5dc77bfeeab ("consolidate compat lookup_dcookie()") coverted all architectures to the new compat_sys_lookup_dcookie() syscall. The "len" paramater of the new compat syscall must have the type compat_size_t in order to enforce zero extension for architectures where the ABI requires that the caller of a function performed zero and/or sign extension to 64 bit of all parameters. Signed-off-by: Heiko Carstens Cc: Al Viro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Hendrik Brueckner Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/dcookies.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcookies.c b/fs/dcookies.c index ab5954b50267..ac44a69fbea9 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -204,7 +204,7 @@ out: } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len) +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); -- cgit v1.2.3 From ed6161470c6c8c45d97211d2426eb050dc382a4a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 21 Nov 2013 17:58:08 +0200 Subject: ore: Fix wrong math in allocation of per device BIO commit aad560b7f63b495f48a7232fd086c5913a676e6f upstream. At IO preparation we calculate the max pages at each device and allocate a BIO per device of that size. The calculation was wrong on some unaligned corner cases offset/length combination and would make prepare return with -ENOMEM. This would be bad for pnfs-objects that would in that case IO through MDS. And fatal for exofs were it would fail writes with EIO. Fix it by doing the proper math, that will work in all cases. (I ran a test with all possible offset/length combinations this time round). Also when reading we do not need to allocate for the parity units since we jump over them. Also lower the max_io_length to take into account the parity pages so not to allocate BIOs bigger than PAGE_SIZE Signed-off-by: Boaz Harrosh Signed-off-by: Greg Kroah-Hartman --- fs/exofs/ore.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index b74422888604..85cde3e76290 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -103,7 +103,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) layout->max_io_length = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * - layout->group_width; + (layout->group_width - layout->parity); if (layout->parity) { unsigned stripe_length = (layout->group_width - layout->parity) * @@ -286,7 +286,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, if (length) { ore_calc_stripe_info(layout, offset, length, &ios->si); ios->length = ios->si.length; - ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; if (layout->parity) _ore_post_alloc_raid_stuff(ios); } @@ -536,6 +537,7 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, u64 H = LmodS - G * T; u32 N = div_u64(H, U); + u32 Nlast; /* "H - (N * U)" is just "H % U" so it's bound to u32 */ u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; @@ -568,6 +570,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, si->length = T - H; if (si->length > length) si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + si->M = M; } EXPORT_SYMBOL(ore_calc_stripe_info); @@ -583,13 +589,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, int ret; if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned nr_pages = ios->nr_pages * ios->layout->group_width / - (ios->layout->group_width - - ios->layout->parity); - unsigned bio_size = (nr_pages + pages_in_stripe) / - ios->layout->group_width; + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); if (unlikely(!per_dev->bio)) { @@ -609,8 +618,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) { - ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", - per_dev->bio->bi_vcnt); + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); ret = -ENOMEM; goto out; } @@ -1098,7 +1111,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc, size_attr->attr = g_attr_logical_length; size_attr->attr.val_ptr = &size_attr->newsize; - ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", _LLU(oc->comps->obj.id), _LLU(obj_size), i); ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, &size_attr->attr); -- cgit v1.2.3 From afce3609db524d801a8fc4096810fa6844062f65 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Dec 2013 17:39:23 -0500 Subject: NFSv4: OPEN must handle the NFS4ERR_IO return code correctly commit c7848f69ec4a8c03732cde5c949bd2aa711a9f4b upstream. decode_op_hdr() cannot distinguish between an XDR decoding error and the perfectly valid errorcode NFS4ERR_IO. This is normally not a problem, but for the particular case of OPEN, we need to be able to increment the NFSv4 open sequence id when the server returns a valid response. Reported-by: J Bruce Fields Link: http://lkml.kernel.org/r/20131204210356.GA19452@fieldses.org Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4xdr.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4be8d135ed61..988efb4caac0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3002,7 +3002,8 @@ out_overflow: return -EIO; } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) { __be32 *p; uint32_t opnum; @@ -3012,19 +3013,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) if (unlikely(!p)) goto out_overflow; opnum = be32_to_cpup(p++); - if (opnum != expected) { - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); - return -EIO; - } + if (unlikely(opnum != expected)) + goto out_bad_operation; nfserr = be32_to_cpup(p); - if (nfserr != NFS_OK) - return nfs4_stat_to_errno(nfserr); - return 0; + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; } /* Dummy routine */ @@ -4842,11 +4856,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t savewords, bmlen, i; int status; - status = decode_op_hdr(xdr, OP_OPEN); - if (status != -EIO) - nfs_increment_open_seqid(status, res->seqid); - if (!status) - status = decode_stateid(xdr, &res->stateid); + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); if (unlikely(status)) return status; -- cgit v1.2.3 From c19422192bfb29a3c9ed00961f317b6f46f14923 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 13 Jan 2014 16:54:45 -0500 Subject: nfs4.1: properly handle ENOTSUP in SECINFO_NO_NAME commit 78b19bae0813bd6f921ca58490196abd101297bd upstream. Don't check for -NFS4ERR_NOTSUPP, it's already been mapped to -ENOTSUPP by nfs4_stat_to_errno. This allows the client to mount v4.1 servers that don't support SECINFO_NO_NAME by falling back to the "guess and check" method of nfs4_find_root_sec. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75e49d42a7fb..5aca7b400ad9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6683,7 +6683,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, switch (err) { case 0: case -NFS4ERR_WRONGSEC: - case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: goto out; default: err = nfs4_handle_exception(server, err, &exception); @@ -6715,7 +6715,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ - if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { err = nfs4_find_root_sec(server, fhandle, info); goto out_freepage; } -- cgit v1.2.3 From 14ff66ce27e91666bf55e415104fdec9f9b18668 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Jan 2014 17:03:41 -0500 Subject: NFSv4.1: Handle errors correctly in nfs41_walk_client_list commit 64590daa9e0dfb3aad89e3ab9230683b76211d5b upstream. Both nfs41_walk_client_list and nfs40_walk_client_list expect the 'status' variable to be set to the value -NFS4ERR_STALE_CLIENTID if the loop fails to find a match. The problem is that the 'pos->cl_cons_state > NFS_CS_READY' changes the value of 'status', and sets it either to the value '0' (which indicates success), or to the value EINTR. Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4client.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 4cbad5d6b276..3afae9059222 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -324,9 +324,10 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); } if (pos->cl_cons_state != NFS_CS_READY) continue; @@ -464,7 +465,8 @@ int nfs41_walk_client_list(struct nfs_client *new, } spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + break; + status = -NFS4ERR_STALE_CLIENTID; } if (pos->cl_cons_state != NFS_CS_READY) continue; -- cgit v1.2.3 From daab6e7df44ba7d4281379c6ac5780e10d133286 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Sun, 19 Jan 2014 22:45:36 -0500 Subject: nfs4: fix discover_server_trunking use after free commit abad2fa5ba67725a3f9c376c8cfe76fbe94a3041 upstream. If clp is new (cl_count = 1) and it matches another client in nfs4_discover_server_trunking, the nfs_put_client will free clp before ->cl_preserve_clid is set. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4client.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 3afae9059222..02773aab43c5 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -240,13 +240,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - nfs_put_client(clp); - if (clp != old) { - clp->cl_preserve_clid = true; - clp = old; - } - return clp; + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; error: nfs_mark_client_ready(clp, error); -- cgit v1.2.3 From 305a7c624df026dbdde23eb880549db7df0ad7b9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 22 Jan 2014 20:34:54 +0200 Subject: pnfs: Proper delay for NFS4ERR_RECALLCONFLICT in layout_get_done commit ed7e5423014ad89720fcf315c0b73f2c5d0c7bd2 upstream. An NFS4ERR_RECALLCONFLICT is returned by server from a GET_LAYOUT only when a Server Sent a RECALL do to that GET_LAYOUT, or the RECALL and GET_LAYOUT crossed on the wire. In any way this means we want to wait at most until in-flight IO is finished and the RECALL can be satisfied. So a proper wait here is more like 1/10 of a second, not 15 seconds like we have now. In case of a server bug we delay exponentially longer on each retry. Current code totally craps out performance of very large files on most pnfs-objects layouts, because of how the map changes when the file has grown into the next raid group. [Stable: This will patch back to 3.9. If there are earlier still maintained trees, please tell me I'll send a patch] Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5aca7b400ad9..26e71bdb5b33 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6232,9 +6232,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; - unsigned long timeo, giveup; + unsigned long timeo, now, giveup; - dprintk("--> %s\n", __func__); + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; @@ -6242,12 +6242,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ case -NFS4ERR_RECALLCONFLICT: timeo = rpc_get_timeout(task->tk_client); giveup = lgp->args.timestamp + timeo; - if (time_after(giveup, jiffies)) - task->tk_status = -NFS4ERR_DELAY; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: + * - Not less then NFS4_POLL_RETRY_MIN. + * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: -- cgit v1.2.3 From a082f872743e09fe2cdf870b2d8f1b5dccd36cf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 29 Jan 2014 16:05:30 -0500 Subject: Btrfs: disable snapshot aware defrag for now commit 8101c8dbf6243ba517aab58d69bf1bc37d8b7b9c upstream. It's just broken and it's taking a lot of effort to fix it, so for now just disable it so people can defrag in peace. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0bcee78cde16..25e6a8e1014e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2655,7 +2655,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) EXTENT_DEFRAG, 1, cached_state); if (ret) { u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (last_snapshot >= BTRFS_I(inode)->generation) + if (0 && last_snapshot >= BTRFS_I(inode)->generation) /* the inode is shared */ new = record_old_file_extents(inode, ordered_extent); -- cgit v1.2.3 From 7e405afc255142ceb5d8cc3d97a7cffac46f0e4e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 6 Feb 2014 12:04:28 -0800 Subject: mm: __set_page_dirty uses spin_lock_irqsave instead of spin_lock_irq commit 227d53b397a32a7614667b3ecaf1d89902fb6c12 upstream. To use spin_{un}lock_irq is dangerous if caller disabled interrupt. During aio buffer migration, we have a possibility to see the following call stack. aio_migratepage [disable interrupt] migrate_page_copy clear_page_dirty_for_io set_page_dirty __set_page_dirty_buffers __set_page_dirty spin_lock_irq This mean, current aio migration is a deadlockable. spin_lock_irqsave is a safer alternative and we should use it. Signed-off-by: KOSAKI Motohiro Reported-by: David Rientjes rientjes@google.com> Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index d2a4d1bb2d57..75964d734444 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -620,14 +620,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - spin_lock_irq(&mapping->tree_lock); + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } -- cgit v1.2.3 From 434672c0c50a3ef988c9d582c0148c9067e6961c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Feb 2014 14:25:41 -0800 Subject: fs/file.c:fdtable: avoid triggering OOMs from alloc_fdmem commit 96c7a2ff21501691587e1ae969b83cbec8b78e08 upstream. Recently due to a spike in connections per second memcached on 3 separate boxes triggered the OOM killer from accept. At the time the OOM killer was triggered there was 4GB out of 36GB free in zone 1. The problem was that alloc_fdtable was allocating an order 3 page (32KiB) to hold a bitmap, and there was sufficient fragmentation that the largest page available was 8KiB. I find the logic that PAGE_ALLOC_COSTLY_ORDER can't fail pretty dubious but I do agree that order 3 allocations are very likely to succeed. There are always pathologies where order > 0 allocations can fail when there are copious amounts of free memory available. Using the pigeon hole principle it is easy to show that it requires 1 page more than 50% of the pages being free to guarantee an order 1 (8KiB) allocation will succeed, 1 page more than 75% of the pages being free to guarantee an order 2 (16KiB) allocation will succeed and 1 page more than 87.5% of the pages being free to guarantee an order 3 allocate will succeed. A server churning memory with a lot of small requests and replies like memcached is a common case that if anything can will skew the odds against large pages being available. Therefore let's not give external applications a practical way to kill linux server applications, and specify __GFP_NORETRY to the kmalloc in alloc_fdmem. Unless I am misreading the code and by the time the code reaches should_alloc_retry in __alloc_pages_slowpath (where __GFP_NORETRY becomes signification). We have already tried everything reasonable to allocate a page and the only thing left to do is wait. So not waiting and falling back to vmalloc immediately seems like the reasonable thing to do even if there wasn't a chance of triggering the OOM killer. Signed-off-by: "Eric W. Biederman" Cc: Eric Dumazet Acked-by: David Rientjes Cc: Cong Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index 4a78f981557a..9de20265a78c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -34,7 +34,7 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN); + void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); if (data != NULL) return data; } -- cgit v1.2.3 From 53fecc53969681726e39275576fc75af39e7e7af Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 26 Jan 2014 23:53:43 -0600 Subject: CIFS: Fix SMB2 mounts so they don't try to set or get xattrs via cifs commit 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 upstream. When mounting with smb2 (or smb2.1 or smb3) we need to check to make sure that attempts to query or set extended attributes do not attempt to send the request with the older cifs protocol instead (eventually we also need to add the support in SMB2 to query/set extended attributes but this patch prevents us from using the wrong protocol for extended attribute operations). Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 6 ++++++ fs/cifs/xattr.c | 49 ++++++++++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ea3a0b3018a5..56ac89fe957e 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -370,6 +370,12 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *fid); int (*calc_signature)(struct smb_rqst *rqst, struct TCP_Server_Info *server); + ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, + const unsigned char *, const unsigned char *, char *, + size_t, const struct nls_table *, int); + int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, + const char *, const void *, const __u16, + const struct nls_table *, int); }; struct smb_version_values { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 09afda4cc58e..95c43bb20335 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) goto remove_ea_exit; ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL, - (__u16)0, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, NULL, (__u16)0, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } remove_ea_exit: kfree(full_path); @@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, cifs_dbg(FYI, "attempt to set cifs inode metadata\n"); ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto set_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, - (__u16)value_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->set_EA) + rc = pTcon->ses->server->ops->set_EA(xid, pTcon, + full_path, ea_name, ea_value, (__u16)value_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { #ifdef CONFIG_CIFS_ACL @@ -272,17 +278,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, /* revalidate/getattr then populate from inode */ } /* BB add else when above is implemented */ ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, ea_name, ea_value, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX @@ -400,11 +410,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ - rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data, - buf_size, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (pTcon->ses->server->ops->query_all_EAs) + rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, + full_path, NULL, data, buf_size, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); list_ea_exit: kfree(full_path); free_xid(xid); -- cgit v1.2.3 From 367d96446cc94044d4bb7858d9cadabb69cf3030 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 1 Feb 2014 23:27:18 -0600 Subject: Add protocol specific operation for CIFS xattrs commit d979f3b0a1f0b5499ab85e68cdf02b56852918b6 upstream. Changeset 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 added protocol operations for get/setxattr to avoid calling cifs operations on smb2/smb3 mounts for xattr operations and this changeset adds the calls to cifs specific protocol operations for xattrs (in order to reenable cifs support for xattrs which was temporarily disabled by the previous changeset. We do not have SMB2/SMB3 worker function for setting xattrs yet so this only enables it for cifs. CCing stable since without these two small changsets (its small coreq 666753c3ef8fc88b0ddd5be4865d0aa66428ac35 is also needed) calling getfattr/setfattr on smb2/smb3 mounts causes problems. Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar Signed-off-by: Greg Kroah-Hartman --- fs/cifs/inode.c | 13 +++++++++---- fs/cifs/smb1ops.c | 4 ++++ 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 449b6cf09b09..9d463501348f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -490,10 +490,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS", - ea_value, 4 /* size of buf */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tcon->ses->server->ops->query_all_EAs == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, + "SETFILEBITS", ea_value, 4 /* size of buf */, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 3efdb9d5c0b8..4a61ba7f5e0d 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -948,6 +948,10 @@ struct smb_version_operations smb1_operations = { .mand_lock = cifs_mand_lock, .mand_unlock_range = cifs_unlock_range, .push_mand_locks = cifs_push_mandatory_locks, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = CIFSSMBQAllEAs, + .set_EA = CIFSSMBSetEA, +#endif /* CIFS_XATTR */ }; struct smb_version_values smb1_values = { -- cgit v1.2.3 From 6bd8c8508282b9db05b9bb5a9f76e051f247a618 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 2 Feb 2014 23:31:47 -0600 Subject: retrieving CIFS ACLs when mounted with SMB2 fails dropping session commit 83e3bc23ef9ce7c03b7b4e5d3d790246ea59db3e upstream. The get/set ACL xattr support for CIFS ACLs attempts to send old cifs dialect protocol requests even when mounted with SMB2 or later dialects. Sending cifs requests on an smb2 session causes problems - the server drops the session due to the illegal request. This patch makes CIFS ACL operations protocol specific to fix that. Attempting to query/set CIFS ACLs for SMB2 will now return EOPNOTSUPP (until we add worker routines for sending query ACL requests via SMB2) instead of sending invalid (cifs) requests. A separate followon patch will be needed to fix cifs_acl_to_fattr (which takes a cifs specific u16 fid so can't be abstracted to work with SMB2 until that is changed) and will be needed to fix mount problems when "cifsacl" is specified on mount with e.g. vers=2.1 Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsacl.c | 28 ++++++++++++++++++++++++---- fs/cifs/cifsglob.h | 4 ++++ fs/cifs/smb1ops.c | 4 ++++ fs/cifs/xattr.c | 15 +++++++++++---- 4 files changed, 43 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 51f5e0ee7237..494b68349667 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1027,15 +1027,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); + struct cifs_tcon *tcon; + + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + tcon = tlink_tcon(tlink); cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ - pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); + + if (tcon->ses->server->ops->get_acl == NULL) { + cifs_put_tlink(tlink); + return -EOPNOTSUPP; + } + + pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path, + &secdesclen); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); - goto out; + cifs_put_tlink(tlink); + return rc; } /* @@ -1048,6 +1063,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); + cifs_put_tlink(tlink); return -ENOMEM; } @@ -1056,14 +1072,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); + if (tcon->ses->server->ops->set_acl == NULL) + rc = -EOPNOTSUPP; + if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode, + path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } + cifs_put_tlink(tlink); kfree(pnntsd); kfree(pntsd); -out: return rc; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 56ac89fe957e..e2c2d96491fa 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -376,6 +376,10 @@ struct smb_version_operations { int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, const struct nls_table *, int); + struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, + const char *, u32 *); + int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, + int); }; struct smb_version_values { diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 4a61ba7f5e0d..4885a40f3210 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -952,6 +952,10 @@ struct smb_version_operations smb1_operations = { .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, #endif /* CIFS_XATTR */ +#ifdef CONFIG_CIFS_ACL + .get_acl = get_cifs_acl, + .set_acl = set_cifs_acl, +#endif /* CIFS_ACL */ }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 95c43bb20335..5ac836a86b18 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -176,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = -ENOMEM; } else { memcpy(pacl, ea_value, value_size); - rc = set_cifs_acl(pacl, value_size, - direntry->d_inode, full_path, CIFS_ACL_DACL); + if (pTcon->ses->server->ops->set_acl) + rc = pTcon->ses->server->ops->set_acl(pacl, + value_size, direntry->d_inode, + full_path, CIFS_ACL_DACL); + else + rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ CIFS_I(direntry->d_inode)->time = 0; kfree(pacl); @@ -323,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, u32 acllen; struct cifs_ntsd *pacl; - pacl = get_cifs_acl(cifs_sb, direntry->d_inode, - full_path, &acllen); + if (pTcon->ses->server->ops->get_acl == NULL) + goto get_ea_exit; /* rc already EOPNOTSUPP */ + + pacl = pTcon->ses->server->ops->get_acl(cifs_sb, + direntry->d_inode, full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", -- cgit v1.2.3 From 5de64260830aafce062b99e17cb3963ecc4291bf Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Feb 2014 17:10:26 +1100 Subject: lockd: send correct lock when granting a delayed lock. commit 2ec197db1a56c9269d75e965f14c344b58b2a4f6 upstream. If an NFS client attempts to get a lock (using NLM) and the lock is not available, the server will remember the request and when the lock becomes available it will send a GRANT request to the client to provide the lock. If the client already held an adjacent lock, the GRANT callback will report the union of the existing and new locks, which can confuse the client. This happens because __posix_lock_file (called by vfs_lock_file) updates the passed-in file_lock structure when adjacent or over-lapping locks are found. To avoid this problem we take a copy of the two fields that can be changed (fl_start and fl_end) before the call and restore them afterwards. An alternate would be to allocate a 'struct file_lock', initialise it, use locks_copy_lock() to take a copy, then locks_release_private() after the vfs_lock_file() call. But that is a lot more work. Reported-by: Olaf Kirch Signed-off-by: NeilBrown Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman -- v1 had a couple of issues (large on-stack struct and didn't really work properly). This version is much better tested. Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 8ebd3f551e0c..ffc4045fc62e 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -767,6 +767,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; int error; + loff_t fl_start, fl_end; dprintk("lockd: grant blocked lock %p\n", block); @@ -784,9 +785,16 @@ nlmsvc_grant_blocked(struct nlm_block *block) } /* Try the lock operation again */ + /* vfs_lock_file() can mangle fl_start and fl_end, but we need + * them unchanged for the GRANT_MSG + */ lock->fl.fl_flags |= FL_SLEEP; + fl_start = lock->fl.fl_start; + fl_end = lock->fl.fl_end; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; + lock->fl.fl_start = fl_start; + lock->fl.fl_end = fl_end; switch (error) { case 0: -- cgit v1.2.3 From caaeac355477b49d6b50791fb6f0ced3d2b4e93b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 6 Feb 2014 15:14:13 -0500 Subject: block: Fix nr_vecs for inline integrity vectors commit 087787959ce851d7bbb19f10f6e9241b7f85a3ca upstream. Commit 9f060e2231ca changed the way we handle allocations for the integrity vectors. When the vectors are inline there is no associated slab and consequently bvec_nr_vecs() returns 0. Ensure that we check against BIP_INLINE_VECS in that case. Reported-by: David Milburn Tested-by: David Milburn Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/bio-integrity.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 45e944fe52a6..8dccf73025b3 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -114,6 +114,14 @@ void bio_integrity_free(struct bio *bio) } EXPORT_SYMBOL(bio_integrity_free); +static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip) +{ + if (bip->bip_slab == BIO_POOL_NONE) + return BIP_INLINE_VECS; + + return bvec_nr_vecs(bip->bip_slab); +} + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update @@ -129,7 +137,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv; - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { + if (bip->bip_vcnt >= bip_integrity_vecs(bip)) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } -- cgit v1.2.3 From 4687eb1ebf2129cf46e636ec197173b99a4959e2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 12 Feb 2014 11:48:31 -0500 Subject: ext4: fix error paths in swap_inode_boot_loader() commit 30d29b119ef01776e0a301444ab24defe8d8bef3 upstream. In swap_inode_boot_loader() we forgot to release ->i_mutex and resume unlocked dio for inode and inode_bl if there is an error starting the journal handle. This commit fixes this issue. Reported-by: Ahmed Tamrawi Cc: Andreas Dilger Cc: Dr. Tilmann Bubeck Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index c0427e2f6648..42624a995b00 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -145,7 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto swap_boot_out; + goto journal_err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -203,6 +203,7 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_double_up_write_data_sem(inode, inode_bl); +journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); -- cgit v1.2.3 From f13ea132887a8649cc6d45942e2be27f4362351f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 12 Feb 2014 12:16:04 -0500 Subject: ext4: don't try to modify s_flags if the the file system is read-only commit 23301410972330c0ae9a8afc379ba2005e249cc6 upstream. If an ext4 file system is created by some tool other than mke2fs (perhaps by someone who has a pathalogical fear of the GPL) that doesn't set one or the other of the EXT2_FLAGS_{UN}SIGNED_HASH flags, and that file system is then mounted read-only, don't try to modify the s_flags field. Otherwise, if dm_verity is in use, the superblock will change, causing an dm_verity failure. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e4923b6a9e39..a7a5f7ea74db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3592,16 +3592,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { + if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ - es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; #else - es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); + if (!(sb->s_flags & MS_RDONLY)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif + } } /* Handle clustersize */ -- cgit v1.2.3 From 94a197c6bc4976fdec2ea37a0612df226839c541 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 15 Feb 2014 21:33:13 -0500 Subject: ext4: fix online resize with very large inode tables commit b93c95353413041a8cebad915a8109619f66bcc6 upstream. If a file system has a large number of inodes per block group, all of the metadata blocks in a flex_bg may be larger than what can fit in a single block group. Unfortunately, ext4_alloc_group_tables() in resize.c was never tested to see if it would handle this case correctly, and there were a large number of bugs which caused the following sequence to result in a BUG_ON: kernel bug at fs/ext4/resize.c:409! ... call trace: [] ext4_flex_group_add+0x1448/0x1830 [] ext4_resize_fs+0x7b2/0xe80 [] ext4_ioctl+0xbf0/0xf00 [] do_vfs_ioctl+0x2dd/0x4b0 [] ? final_putname+0x22/0x50 [] sys_ioctl+0x81/0xa0 [] system_call_fastpath+0x16/0x1b code: c8 4c 89 df e8 41 96 f8 ff 44 89 e8 49 01 c4 44 29 6d d4 0 rip [] set_flexbg_block_bitmap+0x171/0x180 This can be reproduced with the following command sequence: mke2fs -t ext4 -i 4096 /dev/vdd 1G mount -t ext4 /dev/vdd /vdd resize2fs /dev/vdd 8G To fix this, we need to make sure the right thing happens when a block group's inode table straddles two block groups, which means the following bugs had to be fixed: 1) Not clearing the BLOCK_UNINIT flag in the second block group in ext4_alloc_group_tables --- the was proximate cause of the BUG_ON. 2) Incorrectly determining how many block groups contained contiguous free blocks in ext4_alloc_group_tables(). 3) Incorrectly setting the start of the next block range to be marked in use after a discontinuity in setup_new_flex_group_blocks(). Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 49d3c01eabf8..6bb79434861f 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -238,6 +238,7 @@ static int ext4_alloc_group_tables(struct super_block *sb, ext4_group_t group; ext4_group_t last_group; unsigned overhead; + __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0; BUG_ON(flex_gd->count == 0 || group_data == NULL); @@ -261,7 +262,7 @@ next_group: src_group++; for (; src_group <= last_group; src_group++) { overhead = ext4_group_overhead_blocks(sb, src_group); - if (overhead != 0) + if (overhead == 0) last_blk += group_data[src_group - group].blocks_count; else break; @@ -275,8 +276,7 @@ next_group: group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode bitmaps */ @@ -287,22 +287,30 @@ next_group: group = ext4_get_group_number(sb, start_blk - 1); group -= group_data[0].group; group_data[group].free_blocks_count--; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + flex_gd->bg_flags[group] &= uninit_mask; } /* Allocate inode tables */ for (; it_index < flex_gd->count; it_index++) { - if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk) + unsigned int itb = EXT4_SB(sb)->s_itb_per_group; + ext4_fsblk_t next_group_start; + + if (start_blk + itb > last_blk) goto next_group; group_data[it_index].inode_table = start_blk; - group = ext4_get_group_number(sb, start_blk - 1); + group = ext4_get_group_number(sb, start_blk); + next_group_start = ext4_group_first_block_no(sb, group + 1); group -= group_data[0].group; - group_data[group].free_blocks_count -= - EXT4_SB(sb)->s_itb_per_group; - if (flexbg_size > 1) - flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT; + if (start_blk + itb > next_group_start) { + flex_gd->bg_flags[group + 1] &= uninit_mask; + overhead = start_blk + itb - next_group_start; + group_data[group + 1].free_blocks_count -= overhead; + itb -= overhead; + } + + group_data[group].free_blocks_count -= itb; + flex_gd->bg_flags[group] &= uninit_mask; start_blk += EXT4_SB(sb)->s_itb_per_group; } @@ -615,7 +623,7 @@ handle_ib: if (err) goto out; count = group_table_count[j]; - start = group_data[i].block_bitmap; + start = (&group_data[i].block_bitmap)[j]; block = start; } -- cgit v1.2.3 From d6b55805ac4ffbcd63bf74ce572cc7a94f15be9e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 15 Feb 2014 22:42:25 -0500 Subject: ext4: fix online resize with a non-standard blocks per group setting commit 3d2660d0c9c2f296837078c189b68a47f6b2e3b5 upstream. The set_flexbg_block_bitmap() function assumed that the number of blocks in a blockgroup was sb->blocksize * 8, which is normally true, but not always! Use EXT4_BLOCKS_PER_GROUP(sb) instead, to fix block bitmap corruption after: mke2fs -t ext4 -g 3072 -i 4096 /dev/vdd 1G mount -t ext4 /dev/vdd /vdd resize2fs /dev/vdd 8G Signed-off-by: "Theodore Ts'o" Reported-by: Jon Bernard Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6bb79434861f..c503850a61a8 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -404,7 +404,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, start = ext4_group_first_block_no(sb, group); group -= flex_gd->groups[0].group; - count2 = sb->s_blocksize * 8 - (block - start); + count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start); if (count2 > count) count2 = count; -- cgit v1.2.3 From 0c9f21983490feb5637990d719e729bf77b99bd1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 16 Feb 2014 19:29:32 -0500 Subject: ext4: don't leave i_crtime.tv_sec uninitialized commit 19ea80603715d473600cd993b9987bc97d042e02 upstream. If the i_crtime field is not present in the inode, don't leave the field uninitialized. Fixes: ef7f38359 ("ext4: Add nanosecond timestamps") Reported-by: Vegard Nossum Tested-by: Vegard Nossum Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7bb2e2e55123..790b14c5f262 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -774,6 +774,8 @@ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime.tv_sec = \ (signed)le32_to_cpu((raw_inode)->xtime); \ + else \ + (einode)->xtime.tv_sec = 0; \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ ext4_decode_extra_time(&(einode)->xtime, \ raw_inode->xtime ## _extra); \ -- cgit v1.2.3 From 9f0afafeb7bdd6c2c2b27a93c803fff975a232be Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 14 Feb 2014 07:20:35 -0500 Subject: cifs: ensure that uncached writes handle unmapped areas correctly commit 5d81de8e8667da7135d3a32a964087c0faf5483f upstream. It's possible for userland to pass down an iovec via writev() that has a bogus user pointer in it. If that happens and we're doing an uncached write, then we can end up getting less bytes than we expect from the call to iov_iter_copy_from_user. This is CVE-2014-0069 cifs_iovec_write isn't set up to handle that situation however. It'll blindly keep chugging through the page array and not filling those pages with anything useful. Worse yet, we'll later end up with a negative number in wdata->tailsz, which will confuse the sending routines and cause an oops at the very least. Fix this by having the copy phase of cifs_iovec_write stop copying data in this situation and send the last write as a short one. At the same time, we want to avoid sending a zero-length write to the server, so break out of the loop and set rc to -EFAULT if that happens. This also allows us to handle the case where no address in the iovec is valid. [Note: Marking this for stable on v3.4+ kernels, but kernels as old as v2.6.38 may have a similar problem and may need similar fix] Reviewed-by: Pavel Shilovsky Reported-by: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c2934f8701da..8b0c656f2ab2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2353,7 +2353,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *poffset) { unsigned long nr_pages, i; - size_t copied, len, cur_len; + size_t bytes, copied, len, cur_len; ssize_t total_written = 0; loff_t offset; struct iov_iter it; @@ -2408,14 +2408,45 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, save_len = cur_len; for (i = 0; i < nr_pages; i++) { - copied = min_t(const size_t, cur_len, PAGE_SIZE); + bytes = min_t(const size_t, cur_len, PAGE_SIZE); copied = iov_iter_copy_from_user(wdata->pages[i], &it, - 0, copied); + 0, bytes); cur_len -= copied; iov_iter_advance(&it, copied); + /* + * If we didn't copy as much as we expected, then that + * may mean we trod into an unmapped area. Stop copying + * at that point. On the next pass through the big + * loop, we'll likely end up getting a zero-length + * write and bailing out of it. + */ + if (copied < bytes) + break; } cur_len = save_len - cur_len; + /* + * If we have no data to send, then that probably means that + * the copy above failed altogether. That's most likely because + * the address in the iovec was bogus. Set the rc to -EFAULT, + * free anything we allocated and bail out. + */ + if (!cur_len) { + for (i = 0; i < nr_pages; i++) + put_page(wdata->pages[i]); + kfree(wdata); + rc = -EFAULT; + break; + } + + /* + * i + 1 now represents the number of pages we actually used in + * the copy phase above. Bring nr_pages down to that, and free + * any pages that we didn't use. + */ + for ( ; nr_pages > i + 1; nr_pages--) + put_page(wdata->pages[nr_pages - 1]); + wdata->sync_mode = WB_SYNC_ALL; wdata->nr_pages = nr_pages; wdata->offset = (__u64)offset; -- cgit v1.2.3 From 32c36d8888008fb30c8215d0fc189a288285bc12 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 14 Feb 2014 13:31:02 +0400 Subject: CIFS: Fix too big maxBuf size for SMB3 mounts commit 2365c4eaf077c48574ab6f143960048fc0f31518 upstream. SMB3 servers can respond with MaxTransactSize of more than 4M that can cause a memory allocation error returned from kmalloc in a lock codepath. Also the client doesn't support multicredit requests now and allows buffer sizes of 65536 bytes only. Set MaxTransactSize to this maximum supported value. Signed-off-by: Pavel Shilovsky Acked-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2glob.h | 3 +++ fs/cifs/smb2ops.c | 14 ++++---------- fs/cifs/smb2pdu.c | 3 +++ 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 7c0e2143e775..cc592ef6584a 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -55,4 +55,7 @@ #define SMB2_NTLMV2_SESSKEY_SIZE (16) #define SMB2_HMACSHA256_SIZE (32) +/* Maximum buffer size value we can send with 1 credit */ +#define SMB2_MAX_BUFFER_SIZE 65536 + #endif /* _SMB2_GLOB_H */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f2e76f3b0c61..e2756bb40b4d 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -181,11 +181,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified wsize, or default */ wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); - /* - * limit write size to 2 ** 16, because we don't support multicredit - * requests now. - */ - wsize = min_t(unsigned int, wsize, 2 << 15); + /* set it to the maximum buffer size value we can send with 1 credit */ + wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); return wsize; } @@ -199,11 +196,8 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) /* start with specified rsize, or default */ rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); - /* - * limit write size to 2 ** 16, because we don't support multicredit - * requests now. - */ - rsize = min_t(unsigned int, rsize, 2 << 15); + /* set it to the maximum buffer size value we can send with 1 credit */ + rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); return rsize; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2b95ce2b54e8..c7a6fd87bb6e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -408,6 +408,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) server->dialect = le16_to_cpu(rsp->DialectRevision); server->maxBuf = le32_to_cpu(rsp->MaxTransactSize); + /* set it to the maximum buffer size value we can send with 1 credit */ + server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize), + SMB2_MAX_BUFFER_SIZE); server->max_read = le32_to_cpu(rsp->MaxReadSize); server->max_write = le32_to_cpu(rsp->MaxWriteSize); /* BB Do we need to validate the SecurityMode? */ -- cgit v1.2.3 From d5685be1332f2dde176de463b0eebea875118a8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2013 07:17:07 -0800 Subject: fs: fix iversion handling commit dff6efc326a4d5f305797d4a6bba14f374fdd633 upstream. Currently notify_change directly updates i_version for size updates, which not only is counter to how all other fields are updated through struct iattr, but also breaks XFS, which need inode updates to happen under its own lock, and synchronized to the structure that gets written to the log. Remove the update in the common code, and it to btrfs and ext4, XFS already does a proper updaste internally and currently gets a double update with the existing code. IMHO this is 3.13 and -stable material and should go in through the XFS tree. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Dilger Acked-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Chris Mason Signed-off-by: Ben Myers Signed-off-by: Greg Kroah-Hartman --- fs/attr.c | 5 ----- fs/btrfs/inode.c | 8 ++++++-- fs/ext4/inode.c | 4 ++++ 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 1449adb14ef6..8dd5825ec708 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -182,11 +182,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr) return -EPERM; } - if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) { - if (attr->ia_size != inode->i_size) - inode_inc_iversion(inode); - } - if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; /* Flag setting protected by i_mutex */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 25e6a8e1014e..8fcd2424e7f9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4527,8 +4527,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ - if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize != oldsize) { + inode_inc_iversion(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) + inode->i_ctime = inode->i_mtime = + current_fs_time(inode->i_sb); + } if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb2bdc7ccb05..21dff8f236f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4704,6 +4704,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + + if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) + inode_inc_iversion(inode); + if (S_ISREG(inode->i_mode) && (attr->ia_size < inode->i_size)) { if (ext4_should_order_data(inode)) { -- cgit v1.2.3 From 4480120a6cf3c834cc73431d615bb716be586f2f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 20 Feb 2014 17:02:27 +0100 Subject: quota: Fix race between dqput() and dquot_scan_active() commit 1362f4ea20fa63688ba6026e586d9746ff13a846 upstream. Currently last dqput() can race with dquot_scan_active() causing it to call callback for an already deactivated dquot. The race is as follows: CPU1 CPU2 dqput() spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { - not taken if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); ->release_dquot(dquot); if (atomic_read(&dquot->dq_count) > 1) - not taken dquot_scan_active() spin_lock(&dq_list_lock); if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) - not taken atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); - proceeds to release dquot ret = fn(dquot, priv); - called for inactive dquot Fix the problem by making sure possible ->release_dquot() is finished by the time we call the callback and new calls to it will notice reference dquot_scan_active() has taken and bail out. Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/quota/dquot.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3e64169ef527..38802d683969 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb, dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; - ret = fn(dquot, priv); - if (ret < 0) - goto out; + /* + * ->release_dquot() can be racing with us. Our reference + * protects us from new calls to it so just wait for any + * outstanding call and recheck the DQ_ACTIVE_B after that. + */ + wait_on_dquot(dquot); + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + ret = fn(dquot, priv); + if (ret < 0) + goto out; + } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ -- cgit v1.2.3