From 73ba39ab9307340dc98ec3622891314bbc09cc2e Mon Sep 17 00:00:00 2001
From: Pan Bian
Date: Sun, 4 Dec 2016 12:51:53 +0800
Subject: btrfs: return the actual error value from btrfs_uuid_tree_iterate

In function btrfs_uuid_tree_iterate(), errno is assigned to variable ret
on errors. However, the function then returns 0 regardless. Return ret
instead. This patch also removes the warning, because the caller already
prints a warning.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188731
Signed-off-by: Pan Bian
Reviewed-by: Omar Sandoval
[ edited subject ]
Signed-off-by: David Sterba
---
 fs/btrfs/uuid-tree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 161342b73ce5..726f928238d0 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -352,7 +352,5 @@ skip:
 out:
         btrfs_free_path(path);
-        if (ret)
-                btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
-        return 0;
+        return ret;
 }
--
cgit v1.2.3

From aa7c8da35d1905d80e840d075f07d26ec90144b5 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney
Date: Tue, 20 Dec 2016 13:28:27 -0500
Subject: btrfs: fix error handling when run_delayed_extent_op fails

In __btrfs_run_delayed_refs, the error path when run_delayed_extent_op
fails sets locked_ref->processing = 0 but doesn't re-increment
delayed_refs->num_heads_ready. As a result, we end up triggering the
WARN_ON in btrfs_select_ref_head.

Fixes: d7df2c796d7 (Btrfs: attach delayed ref updates to delayed ref heads)
Reported-by: Jon Nelson
Signed-off-by: Jeff Mahoney
Reviewed-by: Liu Bo
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e97302f437a1..5366e50c84c6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2572,7 +2572,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                          */
                         if (must_insert_reserved)
                                 locked_ref->must_insert_reserved = 1;
+                        spin_lock(&delayed_refs->lock);
                         locked_ref->processing = 0;
+                        delayed_refs->num_heads_ready++;
+                        spin_unlock(&delayed_refs->lock);
                         btrfs_debug(fs_info,
                                     "run_delayed_extent_op returned %d",
                                     ret);
--
cgit v1.2.3

From d0280996437081dd12ed1e982ac8aeaa62835ec4 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney
Date: Tue, 20 Dec 2016 13:28:28 -0500
Subject: btrfs: fix locking when we put back a delayed ref that's too new

In __btrfs_run_delayed_refs, when we put back a delayed ref that's too
new, we have already dropped the lock on locked_ref when we set
->processing = 0. This patch keeps the lock to cover that assignment.
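Both of these fixes restore the same invariant around btrfs_select_ref_head():
selecting a head for processing consumes a num_heads_ready slot, so any path
that puts the head back must restore the counter while still holding the lock
that protects both fields. A minimal userspace model of that rule (an
illustrative sketch, not the kernel code):

        #include <pthread.h>

        struct ref_root {
                pthread_mutex_t lock;
                int num_heads_ready;    /* consumed when a head is selected */
        };

        struct ref_head {
                int processing;
        };

        /* Undo a selection: both fields must change under the same lock,
         * otherwise a concurrent caller can see processing == 0 while the
         * ready counter is still stale. */
        static void put_back_head(struct ref_root *root, struct ref_head *head)
        {
                pthread_mutex_lock(&root->lock);
                head->processing = 0;
                root->num_heads_ready++;
                pthread_mutex_unlock(&root->lock);
        }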
Fixes: d7df2c796d7 (Btrfs: attach delayed ref updates to delayed ref heads)
Signed-off-by: Jeff Mahoney
Reviewed-by: Liu Bo
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5366e50c84c6..ac7e6713033c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2522,11 +2522,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 if (ref && ref->seq &&
                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
                         spin_unlock(&locked_ref->lock);
-                        btrfs_delayed_ref_unlock(locked_ref);
                         spin_lock(&delayed_refs->lock);
                         locked_ref->processing = 0;
                         delayed_refs->num_heads_ready++;
                         spin_unlock(&delayed_refs->lock);
+                        btrfs_delayed_ref_unlock(locked_ref);
                         locked_ref = NULL;
                         cond_resched();
                         count++;
--
cgit v1.2.3

From e321f8a801d7b4c40da8005257b05b9c2b51b072 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Wed, 30 Nov 2016 16:11:04 -0800
Subject: Btrfs: use down_read_nested to make lockdep silent

If @block_group is not @used_bg, it'll try to get @used_bg's lock without
dropping @block_group's lock, and lockdep has thrown a scary deadlock
warning about it. Fix it by using down_read_nested.

Signed-off-by: Liu Bo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ac7e6713033c..dcd2e798767e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7387,7 +7387,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
                 spin_unlock(&cluster->refill_lock);

-                down_read(&used_bg->data_rwsem);
+                /* We should only have one-level nested. */
+                down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

                 spin_lock(&cluster->refill_lock);
                 if (used_bg == cluster->block_group)
--
cgit v1.2.3

From 781feef7e6befafd4d9787d1f7ada1f9ccd504e4 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Wed, 30 Nov 2016 16:20:25 -0800
Subject: Btrfs: fix lockdep warning about log_mutex

While checking INODE_REF/INODE_EXTREF for a corner case, we may acquire a
different inode's log_mutex while holding the current inode's log_mutex,
and lockdep has complained about this with a possible deadlock warning.
Fix this by using mutex_lock_nested() when processing the other inode's
log_mutex.
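The annotation used in both lockdep fixes above is the generic nesting API;
a minimal sketch of the pattern (simplified context, not the exact btrfs
code):

        #include <linux/mutex.h>
        #include <linux/lockdep.h>

        /* Two mutexes of the same lock class: a plain nested mutex_lock()
         * would make lockdep report a possible deadlock. Annotating the
         * inner acquisition with SINGLE_DEPTH_NESTING declares that the
         * code guarantees exactly one level of nesting. */
        static void lock_both(struct mutex *cur, struct mutex *other)
        {
                mutex_lock(cur);
                mutex_lock_nested(other, SINGLE_DEPTH_NESTING);
                /* ... work on both objects ... */
                mutex_unlock(other);
                mutex_unlock(cur);
        }

down_read_nested() plays the same role for rwsems. Note that the annotation
only silences lockdep; it is correct only because the code orders the two
acquisitions consistently.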
Reviewed-by: Filipe Manana
Signed-off-by: Liu Bo
Signed-off-by: David Sterba
---
 fs/btrfs/tree-log.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f10bf5213ed8..eeffff84f280 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -37,6 +37,7 @@
  */
 #define LOG_INODE_ALL 0
 #define LOG_INODE_EXISTS 1
+#define LOG_OTHER_INODE 2

 /*
  * directory trouble cases
@@ -4641,7 +4642,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
         if (S_ISDIR(inode->i_mode) ||
             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags) &&
-             inode_only == LOG_INODE_EXISTS))
+             inode_only >= LOG_INODE_EXISTS))
                 max_key.type = BTRFS_XATTR_ITEM_KEY;
         else
                 max_key.type = (u8)-1;
@@ -4665,7 +4666,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                 return ret;
         }

-        mutex_lock(&BTRFS_I(inode)->log_mutex);
+        if (inode_only == LOG_OTHER_INODE) {
+                inode_only = LOG_INODE_EXISTS;
+                mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
+                                  SINGLE_DEPTH_NESTING);
+        } else {
+                mutex_lock(&BTRFS_I(inode)->log_mutex);
+        }

         /*
          * a brute force approach to making sure we get the most uptodate
@@ -4817,7 +4824,7 @@ again:
                                  * unpin it.
                                  */
                                 err = btrfs_log_inode(trans, root, other_inode,
-                                                      LOG_INODE_EXISTS,
+                                                      LOG_OTHER_INODE,
                                                       0, LLONG_MAX, ctx);
                                 iput(other_inode);
                                 if (err)
--
cgit v1.2.3

From c2931667c83ded6504b3857e99cc45b21fa496fb Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Thu, 22 Dec 2016 17:13:54 -0800
Subject: Btrfs: adjust outstanding_extents counter properly when dio write is split

Currently btrfs dio does not handle a split dio write well. If a dio
write is split into several segments due to the lack of contiguous space,
a large dio write like 'dd bs=1G count=1' can end up with an incorrect
outstanding_extents counter, and endio will complain loudly with an
assertion.

This fixes the problem by compensating the outstanding_extents counter in
the inode if a large dio write gets split.

Reported-by: Anand Jain
Tested-by: Anand Jain
Signed-off-by: Liu Bo
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a713d9d324b0..81b9d9d0450c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7623,11 +7623,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode,
          * within our reservation, otherwise we need to adjust our inode
          * counter appropriately.
          */
-        if (dio_data->outstanding_extents) {
+        if (dio_data->outstanding_extents >= num_extents) {
                 dio_data->outstanding_extents -= num_extents;
         } else {
+                /*
+                 * If dio write length has been split due to no large enough
+                 * contiguous space, we need to compensate our inode counter
+                 * appropriately.
+                 */
+                u64 num_needed = num_extents - dio_data->outstanding_extents;
+
                 spin_lock(&BTRFS_I(inode)->lock);
-                BTRFS_I(inode)->outstanding_extents += num_extents;
+                BTRFS_I(inode)->outstanding_extents += num_needed;
                 spin_unlock(&BTRFS_I(inode)->lock);
         }
 }
--
cgit v1.2.3

From ac0c7cf8be00f269f82964cf7b144ca3edc5dbc4 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 6 Jan 2017 14:12:51 +0100
Subject: btrfs: fix crash when tracepoint arguments are freed by wq callbacks

Enabling btrfs tracepoints leads to an instant crash, as reported. The wq
callbacks could free the memory and the tracepoints started to
dereference the members to get to fs_info.

The proposed fix

  https://marc.info/?l=linux-btrfs&m=148172436722606&w=2

removed the tracepoints, but we could preserve them by passing only the
required data in a safe way.
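The bug class here is a use-after-free in the trace path; a minimal sketch
of the safe pattern (simplified types and a hypothetical trace_all_work_done()
hook, not the btrfs code):

        struct fs_info;
        /* hypothetical tracepoint stand-in */
        void trace_all_work_done(struct fs_info *fs_info, void *wtag);

        struct work_item {
                struct fs_info *fs_info;
                void (*ordered_free)(struct work_item *work);
        };

        static void finish_work(struct work_item *work)
        {
                /* Copy out everything the tracepoint needs first ... */
                struct fs_info *fs_info = work->fs_info;
                void *wtag = work;        /* opaque tag, never dereferenced */

                work->ordered_free(work); /* may free 'work' */
                /* ... because 'work' must not be dereferenced past here. */
                trace_all_work_done(fs_info, wtag);
        }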
Fixes: bc074524e123 ("btrfs: prefix fsid to all trace events")
CC: stable@vger.kernel.org # 4.8+
Reported-by: Sebastian Andrzej Siewior
Reviewed-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/async-thread.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 63d197724519..ff0b0be92d61 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -273,6 +273,8 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
         unsigned long flags;

         while (1) {
+                void *wtag;
+
                 spin_lock_irqsave(lock, flags);
                 if (list_empty(list))
                         break;
@@ -299,11 +301,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
                 spin_unlock_irqrestore(lock, flags);

                 /*
-                 * we don't want to call the ordered free functions
-                 * with the lock held though
+                 * We don't want to call the ordered free functions with the
+                 * lock held though. Save the work as tag for the trace event,
+                 * because the callback could free the structure.
                  */
+                wtag = work;
                 work->ordered_free(work);
-                trace_btrfs_all_work_done(work);
+                trace_btrfs_all_work_done(wq->fs_info, wtag);
         }
         spin_unlock_irqrestore(lock, flags);
 }
@@ -311,6 +315,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 static void normal_work_helper(struct btrfs_work *work)
 {
         struct __btrfs_workqueue *wq;
+        void *wtag;
         int need_order = 0;

         /*
@@ -324,6 +329,8 @@ static void normal_work_helper(struct btrfs_work *work)
         if (work->ordered_func)
                 need_order = 1;
         wq = work->wq;
+        /* Safe for tracepoints in case work gets freed by the callback */
+        wtag = work;

         trace_btrfs_work_sched(work);
         thresh_exec_hook(wq);
@@ -333,7 +340,7 @@ static void normal_work_helper(struct btrfs_work *work)
                 run_ordered_work(wq);
         }
         if (!need_order)
-                trace_btrfs_all_work_done(work);
+                trace_btrfs_all_work_done(wq->fs_info, wtag);
 }

 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
--
cgit v1.2.3

From 92a1bf76a89ad338f00eb9a2c7689a3907fbcaad Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Thu, 17 Nov 2016 15:00:50 -0800
Subject: Btrfs: add 'inode' for extent map tracepoint

'inode' is an important field for btrfs_get_extent, let's trace it.

Signed-off-by: Liu Bo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a713d9d324b0..fab189c67eff 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7059,7 +7059,7 @@ insert:
         write_unlock(&em_tree->lock);
 out:

-        trace_btrfs_get_extent(root, em);
+        trace_btrfs_get_extent(root, inode, em);

         btrfs_free_path(path);
         if (trans) {
--
cgit v1.2.3

From 5149fd327f16e393c1d04fa5325ab072c32472bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 9 Jan 2017 13:36:30 -0800
Subject: xfs: bump up reserved blocks in xfs_alloc_set_aside

Setting aside 4 blocks globally for bmbt splits isn't all that useful, as
different threads can allocate space in parallel. Bump it to 4 blocks per
AG to allow each thread that is currently doing an allocation to dip into
it separately. Without that we may not have enough reserved blocks if
there are enough parallel transactions in an almost-out-of-space file
system that all run into bmap btree splits.
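For a concrete sense of the change (assuming XFS_ALLOC_AGFL_RESERVE is 4,
as it is in the headers of this era): on a filesystem with agcount = 4, the
old formula reserved 4 + 4 * 4 = 20 blocks in total, with a single 4-block
bmbt-split cushion shared by all allocating threads; the new formula
reserves 4 * (4 + 4) = 32 blocks, giving each AG its own 4-block split
cushion.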
Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 5050056a0b06..0a46f8488b8d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -95,10 +95,7 @@ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - unsigned int blocks; - - blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - return blocks; + return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); } /* -- cgit v1.2.3 From 255c516278175a6dc7037d1406307f35237d8688 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:36:19 -0800 Subject: xfs: fix bogus minleft manipulations We can't just set minleft to 0 when we're low on space - that's exactly what we need minleft for: to protect space in the AG for btree block allocations when we are low on free space. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 24 +++++++----------------- fs/xfs/libxfs/xfs_bmap.c | 3 --- fs/xfs/libxfs/xfs_bmap_btree.c | 3 +-- 3 files changed, 8 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 0a46f8488b8d..fe925702c955 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2635,12 +2635,10 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ - xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ xfs_alloctype_t type; /* input allocation type */ int bump_rotor = 0; - int no_min = 0; xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ mp = args->mp; @@ -2669,7 +2667,6 @@ xfs_alloc_vextent( trace_xfs_alloc_vextent_badargs(args); return 0; } - minleft = args->minleft; switch (type) { case XFS_ALLOCTYPE_THIS_AG: @@ -2680,9 +2677,7 @@ xfs_alloc_vextent( */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); args->pag = xfs_perag_get(mp, args->agno); - args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2747,9 +2742,7 @@ xfs_alloc_vextent( */ for (;;) { args->pag = xfs_perag_get(mp, args->agno); - if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2789,20 +2782,17 @@ xfs_alloc_vextent( * or switch to non-trylock mode. 
                 */
                if (args->agno == sagno) {
-                        if (no_min == 1) {
+                        if (flags == 0) {
                                args->agbno = NULLAGBLOCK;
                                trace_xfs_alloc_vextent_allfailed(args);
                                break;
                        }
-                        if (flags == 0) {
-                                no_min = 1;
-                        } else {
-                                flags = 0;
-                                if (type == XFS_ALLOCTYPE_START_BNO) {
-                                        args->agbno = XFS_FSB_TO_AGBNO(mp,
-                                                args->fsbno);
-                                        args->type = XFS_ALLOCTYPE_NEAR_BNO;
-                                }
+
+                        flags = 0;
+                        if (type == XFS_ALLOCTYPE_START_BNO) {
+                                args->agbno = XFS_FSB_TO_AGBNO(mp,
+                                        args->fsbno);
+                                args->type = XFS_ALLOCTYPE_NEAR_BNO;
                        }
                }
                xfs_perag_put(args->pag);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2760bc3b2536..44773c9eb957 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3812,7 +3812,6 @@ xfs_bmap_btalloc(
                args.fsbno = 0;
                args.type = XFS_ALLOCTYPE_FIRST_AG;
                args.total = ap->minlen;
-                args.minleft = 0;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
                ap->dfops->dop_low = true;
@@ -4344,8 +4343,6 @@ xfs_bmapi_allocate(
        if (error)
                return error;

-        if (bma->dfops->dop_low)
-                bma->minleft = 0;
        if (bma->cur)
                bma->cur->bc_private.b.firstblock = *bma->firstblock;
        if (bma->blkno == NULLFSBLOCK)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d6330c297ca0..d9be241fc86f 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -502,12 +502,11 @@ try_another_ag:
        if (args.fsbno == NULLFSBLOCK && args.minleft) {
                /*
                 * Could not find an AG with enough free space to satisfy
-                 * a full btree split. Try again without minleft and if
+                 * a full btree split. Try again and if
                 * successful activate the lowspace algorithm.
                 */
                args.fsbno = 0;
                args.type = XFS_ALLOCTYPE_FIRST_AG;
-                args.minleft = 0;
                error = xfs_alloc_vextent(&args);
                if (error)
                        goto error0;
--
cgit v1.2.3

From 54fee133ad59c87ab01dd84ab3e9397134b32acb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 9 Jan 2017 13:44:30 -0800
Subject: xfs: adjust allocation length in xfs_alloc_space_available

We must decide in xfs_alloc_fix_freelist whether an allocation from a
given AG is possible or not based on the available space, and should not
fail the allocation past that point on a healthy file system.

But currently we have two additional places that second-guess
xfs_alloc_fix_freelist: xfs_alloc_ag_vextent tries to adjust the maxlen
parameter to remove the reservation before doing the allocation (but
ignores the various minimum freespace requirements), and
xfs_alloc_fix_minleft tries to fix up the allocated length after we've
found an extent, but ignores the reservations and also doesn't take the
AGFL into account (and thus fails allocations for not matching minlen in
some cases).

Remove all these later fixups and just correct the maxlen argument inside
xfs_alloc_fix_freelist once we have the AGF buffer locked.

Signed-off-by: Christoph Hellwig
Reviewed-by: Brian Foster
Signed-off-by: Darrick J. Wong
---
 fs/xfs/libxfs/xfs_alloc.c | 81 ++++++++++------------------------------------
 fs/xfs/libxfs/xfs_alloc.h | 2 +-
 2 files changed, 18 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index fe925702c955..f2e7eb6e5243 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -362,35 +362,11 @@ xfs_alloc_fix_len(
                return;
        ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
        ASSERT(rlen % args->prod == args->mod);
+        ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+                rlen + args->minleft);
        args->len = rlen;
 }

-/*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up. - */ -STATIC int -xfs_alloc_fix_minleft( - xfs_alloc_arg_t *args) /* allocation argument structure */ -{ - xfs_agf_t *agf; /* a.g. freelist header */ - int diff; /* free space difference */ - - if (args->minleft == 0) - return 1; - agf = XFS_BUF_TO_AGF(args->agbp); - diff = be32_to_cpu(agf->agf_freeblks) - - args->len - args->minleft; - if (diff >= 0) - return 1; - args->len += diff; /* shrink the allocated space */ - /* casts to (int) catch length underflows */ - if ((int)args->len >= (int)args->minlen) - return 1; - args->agbno = NULLAGBLOCK; - return 0; -} - /* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the @@ -686,8 +662,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; - xfs_extlen_t reservation; - xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -695,20 +669,6 @@ xfs_alloc_ag_vextent( ASSERT(args->mod < args->prod); ASSERT(args->alignment > 0); - /* - * Clamp maxlen to the amount of free space minus any reservations - * that have been made. - */ - oldmax = args->maxlen; - reservation = xfs_ag_resv_needed(args->pag, args->resv); - if (args->maxlen > args->pag->pagf_freeblks - reservation) - args->maxlen = args->pag->pagf_freeblks - reservation; - if (args->maxlen == 0) { - args->agbno = NULLAGBLOCK; - args->maxlen = oldmax; - return 0; - } - /* * Branch to correct routine based on the type. */ @@ -728,8 +688,6 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } - args->maxlen = oldmax; - if (error || args->agbno == NULLAGBLOCK) return error; @@ -838,9 +796,6 @@ xfs_alloc_ag_vextent_exact( args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto not_found; - ASSERT(args->agbno + args->len <= tend); /* @@ -1146,12 +1101,7 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_near_nominleft(args); - return 0; - } - blen = args->len; + /* * We are allocating starting at bnew for blen blocks. */ @@ -1343,12 +1293,6 @@ restart: */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - trace_xfs_alloc_near_nominleft(args); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - return 0; - } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, args->datatype, ltbnoa, ltlena, <new); @@ -1550,8 +1494,6 @@ restart: } xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* @@ -2070,10 +2012,20 @@ xfs_alloc_space_available( /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - reservation - min_free - args->total); - if (available < (int)args->minleft || available <= 0) + reservation - min_free - args->minleft); + if (available < (int)args->total) return false; + /* + * Clamp maxlen to the amount of free space available for the actual + * extent allocation. 
+ */ + if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { + args->maxlen = available; + ASSERT(args->maxlen > 0); + ASSERT(args->maxlen >= args->minlen); + } + return true; } @@ -2119,7 +2071,8 @@ xfs_alloc_fix_freelist( } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, flags | + XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7c404a6b0ae3..1d0f48a501a3 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ - +#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ /* * Argument structure for xfs_alloc routines. -- cgit v1.2.3 From 12ef830198b0d71668eb9b59f9ba69d32951a48a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 9 Jan 2017 13:39:35 -0800 Subject: xfs: don't rely on ->total in xfs_alloc_space_available ->total is a bit of an odd parameter passed down to the low-level allocator all the way from the high-level callers. It's supposed to contain the maximum number of blocks to be allocated for the whole transaction [1]. But in xfs_iomap_write_allocate we only convert existing delayed allocations and thus only have a minimal block reservation for the current transaction, so xfs_alloc_space_available can't use it for the allocation decisions. Use the maximum of args->total and the calculated block requirement to make a decision. We probably should get rid of args->total eventually and instead apply ->minleft more broadly, but that will require some extensive changes all over. [1] which creates lots of confusion as most callers don't decrement it once doing a first allocation. But that's for a separate series. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f2e7eb6e5243..9f06a211e157 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1995,7 +1995,7 @@ xfs_alloc_space_available( int flags) { struct xfs_perag *pag = args->pag; - xfs_extlen_t longest; + xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; @@ -2005,15 +2005,16 @@ xfs_alloc_space_available( reservation = xfs_ag_resv_needed(pag, args->resv); /* do we have enough contiguous free space for the allocation? */ + alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, reservation); - if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + if (longest < alloc_len) return false; /* do we have enough free space remaining for the allocation? 
         */
        available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
                          reservation - min_free - args->minleft);
-        if (available < (int)args->total)
+        if (available < (int)max(args->total, alloc_len))
                return false;

        /*
--
cgit v1.2.3

From 84a4620cfe97c9d57e39b2369bfb77faff55063d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 9 Jan 2017 13:41:33 -0800
Subject: xfs: don't print warnings when xfs_log_force fails

There are only two reasons for xfs_log_force / xfs_log_force_lsn to fail:
one is an I/O error, for which xlog_bdstrat already logs a warning, and
the second is an already shutdown log due to previous I/O errors. In the
latter case we'll already have a previous indication for the actual
error, but the large stream of misleading warnings from xfs_log_force
will probably scroll it out of the message buffer.

Simply removing the warnings thus makes the XFS log reporting
significantly better.

Signed-off-by: Christoph Hellwig
Reviewed-by: Carlos Maiolino
Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_log.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c39ac14ff540..b1469f0a91a6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3317,12 +3317,8 @@ xfs_log_force(
        xfs_mount_t     *mp,
        uint            flags)
 {
-        int     error;
-
        trace_xfs_log_force(mp, 0, _RET_IP_);
-        error = _xfs_log_force(mp, flags, NULL);
-        if (error)
-                xfs_warn(mp, "%s: error %d returned.", __func__, error);
+        _xfs_log_force(mp, flags, NULL);
 }

 /*
@@ -3466,12 +3462,8 @@ xfs_log_force_lsn(
        xfs_lsn_t       lsn,
        uint            flags)
 {
-        int     error;
-
        trace_xfs_log_force(mp, lsn, _RET_IP_);
-        error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
-        if (error)
-                xfs_warn(mp, "%s: error %d returned.", __func__, error);
+        _xfs_log_force_lsn(mp, lsn, flags, NULL);
 }

 /*
--
cgit v1.2.3

From 3895dbf8985f656675b5bde610723a29cbce3fa7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Tue, 3 Jan 2017 14:18:43 +1300
Subject: mnt: Protect the mountpoint hashtable with mount_lock

Protecting the mountpoint hashtable with namespace_sem was sufficient
until a call to umount_mnt was added to mntput_no_expire, at which point
it became possible for multiple calls of put_mountpoint on the same hash
chain to happen at the same time.

Krister Johansen reported:
> This can cause a panic when simultaneous callers of put_mountpoint
> attempt to free the same mountpoint. This occurs because some callers
> hold the mount_hash_lock, while others hold the namespace lock. Some
> even hold both.
>
> In this submitter's case, the panic manifested itself as a GP fault in
> put_mountpoint() when it called hlist_del() and attempted to dereference
> a m_hash.pprev that had been poisoned by another thread.

Al Viro observed that the simple fix is to switch from using the
namespace_sem to the mount_lock to protect the mountpoint hash table.

I have taken Al's suggested patch, moved put_mountpoint in pivot_root
(instead of taking mount_lock an additional time), and have replaced
new_mountpoint with get_mountpoint, a function that does the hash table
lookup and addition under the mount_lock. The introduction of
get_mountpoint ensures that only the mount_lock is needed to manipulate
the mountpoint hashtable.

d_set_mounted is modified to only set DCACHE_MOUNTED if it is not already
set. This allows get_mountpoint to use the setting of DCACHE_MOUNTED to
ensure adding a struct mountpoint for a dentry happens exactly once.
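Distilled from the diff that follows, the control flow of get_mountpoint()
is a lookup-or-create loop in which only the hash-table steps need
mount_lock, and d_set_mounted()'s new -EBUSY return arbitrates racing
creators (a skeleton with details elided, not the full function):

        retry:
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);  /* hash walk under mount_lock */
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        return mp;               /* already added by someone */

                ret = d_set_mounted(dentry);     /* exactly one caller gets 0 */
                if (ret == -EBUSY)
                        goto retry;              /* lost the race; find the winner's */
                if (ret)
                        return ERR_PTR(ret);     /* dentry unusable as a mountpoint */

                read_seqlock_excl(&mount_lock);
                hlist_add_head(&new->m_hash, mp_hash(dentry));  /* publish */
                read_sequnlock_excl(&mount_lock);
                return new;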
Cc: stable@vger.kernel.org Fixes: ce07d891a089 ("mnt: Honor MNT_LOCKED when detaching mounts") Reported-by: Krister Johansen Suggested-by: Al Viro Acked-by: Al Viro Signed-off-by: "Eric W. Biederman" --- fs/dcache.c | 7 +++++-- fs/namespace.c | 64 +++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 50 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 769903dbc19d..95d71eda8142 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1336,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry) } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { - dentry->d_flags |= DCACHE_MOUNTED; - ret = 0; + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } } spin_unlock(&dentry->d_lock); out: diff --git a/fs/namespace.c b/fs/namespace.c index b5b1259e064f..487ba30bb5c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -742,26 +742,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) return NULL; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *get_mountpoint(struct dentry *dentry) { - struct hlist_head *chain = mp_hash(dentry); - struct mountpoint *mp; + struct mountpoint *mp, *new = NULL; int ret; - mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!mp) + if (d_mountpoint(dentry)) { +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; + } + + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) return ERR_PTR(-ENOMEM); + + /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); - if (ret) { - kfree(mp); - return ERR_PTR(ret); - } - mp->m_dentry = dentry; - mp->m_count = 1; - hlist_add_head(&mp->m_hash, chain); - INIT_HLIST_HEAD(&mp->m_list); + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? 
 */
+        mp = ERR_PTR(ret);
+        if (ret)
+                goto done;
+
+        /* Add the new mountpoint to the hash table */
+        read_seqlock_excl(&mount_lock);
+        new->m_dentry = dentry;
+        new->m_count = 1;
+        hlist_add_head(&new->m_hash, mp_hash(dentry));
+        INIT_HLIST_HEAD(&new->m_list);
+        read_sequnlock_excl(&mount_lock);
+
+        mp = new;
+        new = NULL;
+done:
+        kfree(new);
        return mp;
 }

@@ -1595,11 +1619,11 @@ void __detach_mounts(struct dentry *dentry)
        struct mount *mnt;

        namespace_lock();
+        lock_mount_hash();
        mp = lookup_mountpoint(dentry);
        if (IS_ERR_OR_NULL(mp))
                goto out_unlock;

-        lock_mount_hash();
        event++;
        while (!hlist_empty(&mp->m_list)) {
                mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1609,9 +1633,9 @@ void __detach_mounts(struct dentry *dentry)
                } else umount_tree(mnt, UMOUNT_CONNECTED);
        }
-        unlock_mount_hash();
        put_mountpoint(mp);
 out_unlock:
+        unlock_mount_hash();
        namespace_unlock();
 }

@@ -2038,9 +2062,7 @@ retry:
        namespace_lock();
        mnt = lookup_mnt(path);
        if (likely(!mnt)) {
-                struct mountpoint *mp = lookup_mountpoint(dentry);
-                if (!mp)
-                        mp = new_mountpoint(dentry);
+                struct mountpoint *mp = get_mountpoint(dentry);
                if (IS_ERR(mp)) {
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
@@ -2059,7 +2081,11 @@ retry:
 static void unlock_mount(struct mountpoint *where)
 {
        struct dentry *dentry = where->m_dentry;
+
+        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
+        read_sequnlock_excl(&mount_lock);
+
        namespace_unlock();
        inode_unlock(dentry->d_inode);
 }
@@ -3135,9 +3161,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        /* A moved mount should not expire automatically */
        list_del_init(&new_mnt->mnt_expire);
+        put_mountpoint(root_mp);
        unlock_mount_hash();
        chroot_fs_refs(&root, &new);
-        put_mountpoint(root_mp);
        error = 0;
 out4:
        unlock_mount(old_mp);
--
cgit v1.2.3

From 75422726b0f717d67db3283c2eb5bc14fa2619c5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman"
Date: Wed, 4 Jan 2017 17:37:27 +1300
Subject: libfs: Modify mount_pseudo_xattr to be clear it is not a userspace mount

Add MS_KERNMOUNT to the flags that are passed. Use sget_userns and force
&init_user_ns instead of calling sget so that even if called from a weird
context the internal filesystem will be considered to be in the initial
user namespace.

Luis Ressel reported that the failure to pass MS_KERNMOUNT into
mount_pseudo broke his in-development graphics driver that uses the
generic drm infrastructure. I am not certain the driver was bug-free in
its usage of that infrastructure, but since mount_pseudo_xattr can never
be triggered by userspace it is clearer, less error prone, and less
problematic for the code to be explicit.

Reported-by: Luis Ressel
Tested-by: Luis Ressel
Acked-by: Al Viro
Signed-off-by: "Eric W. Biederman"
---
 fs/libfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index e973cd51f126..28d6f35feed6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
        struct inode *root;
        struct qstr d_name = QSTR_INIT(name, strlen(name));

-        s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL);
+        s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER,
+                        &init_user_ns, NULL);
        if (IS_ERR(s))
                return ERR_CAST(s);
--
cgit v1.2.3

From 93362fa47fe98b62e4a34ab408c4a418432e7939 Mon Sep 17 00:00:00 2001
From: Zhou Chengming
Date: Fri, 6 Jan 2017 09:32:32 +0800
Subject: sysctl: Drop reference added by grab_header in proc_sys_readdir

Fixes CVE-2016-9191: proc_sys_readdir doesn't drop the reference added by
grab_header when returning from the !dir_emit_dots path. This can cause
any path that calls unregister_sysctl_table to wait forever.

The calltrace of CVE-2016-9191:

[ 5535.960522] Call Trace:
[ 5535.963265]  [] schedule+0x3f/0xa0
[ 5535.968817]  [] schedule_timeout+0x3db/0x6f0
[ 5535.975346]  [] ? wait_for_completion+0x45/0x130
[ 5535.982256]  [] wait_for_completion+0xc3/0x130
[ 5535.988972]  [] ? wake_up_q+0x80/0x80
[ 5535.994804]  [] drop_sysctl_table+0xc4/0xe0
[ 5536.001227]  [] drop_sysctl_table+0x77/0xe0
[ 5536.007648]  [] unregister_sysctl_table+0x4d/0xa0
[ 5536.014654]  [] unregister_sysctl_table+0x7f/0xa0
[ 5536.021657]  [] unregister_sched_domain_sysctl+0x15/0x40
[ 5536.029344]  [] partition_sched_domains+0x44/0x450
[ 5536.036447]  [] ? __mutex_unlock_slowpath+0x111/0x1f0
[ 5536.043844]  [] rebuild_sched_domains_locked+0x64/0xb0
[ 5536.051336]  [] update_flag+0x11d/0x210
[ 5536.057373]  [] ? mutex_lock_nested+0x2df/0x450
[ 5536.064186]  [] ? cpuset_css_offline+0x1b/0x60
[ 5536.070899]  [] ? trace_hardirqs_on+0xd/0x10
[ 5536.077420]  [] ? mutex_lock_nested+0x2df/0x450
[ 5536.084234]  [] ? css_killed_work_fn+0x25/0x220
[ 5536.091049]  [] cpuset_css_offline+0x35/0x60
[ 5536.097571]  [] css_killed_work_fn+0x5c/0x220
[ 5536.104207]  [] process_one_work+0x1df/0x710
[ 5536.110736]  [] ? process_one_work+0x160/0x710
[ 5536.117461]  [] worker_thread+0x12b/0x4a0
[ 5536.123697]  [] ? process_one_work+0x710/0x710
[ 5536.130426]  [] kthread+0xfe/0x120
[ 5536.135991]  [] ret_from_fork+0x1f/0x40
[ 5536.142041]  [] ? kthread_create_on_node+0x230/0x230

One cgroup maintainer mentioned that "cgroup is trying to offline a
cpuset css, which takes place under cgroup_mutex. The offlining ends up
trying to drain active usages of a sysctl table which apparently is not
happening."

The real reason is that proc_sys_readdir doesn't drop the reference added
by grab_header when returning from the !dir_emit_dots path, so this
cpuset offline path will wait here forever.

See here for details: http://www.openwall.com/lists/oss-security/2016/11/04/13

Fixes: f0c3b5093add ("[readdir] convert procfs")
Cc: stable@vger.kernel.org
Reported-by: CAI Qian
Tested-by: Yang Shukui
Signed-off-by: Zhou Chengming
Acked-by: Al Viro
Signed-off-by: Eric W. Biederman
---
 fs/proc/proc_sysctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 55313d994895..d4e37acd4821 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -709,7 +709,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
        ctl_dir = container_of(head, struct ctl_dir, header);

        if (!dir_emit_dots(file, ctx))
-                return 0;
+                goto out;

        pos = 2;

@@ -719,6 +719,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
                        break;
                }
        }
+out:
        sysctl_head_finish(head);
        return 0;
 }
--
cgit v1.2.3

From 497de07d89c1410d76a15bec2bb41f24a2a89f31 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Mon, 9 Jan 2017 09:34:48 +0800
Subject: tmpfs: clear S_ISGID when setting posix ACLs

The tmpfs modification was missed in the CVE-2016-7097 fix, commit
073931017b49 ("posix_acl: Clear SGID bit when setting file permissions").

It can be tested with xfstests generic/375, which fails to clear the
setgid bit in the following test case on tmpfs:

  touch $testfile
  chown 100:100 $testfile
  chmod 2755 $testfile
  _runas -u 100 -g 101 -- setfacl -m u::rwx,g::rwx,o::rwx $testfile

Signed-off-by: Gu Zheng
Signed-off-by: Al Viro
---
 fs/posix_acl.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 595522022aca..c9d48dc78495 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
        int error;

        if (type == ACL_TYPE_ACCESS) {
-                error = posix_acl_equiv_mode(acl, &inode->i_mode);
-                if (error < 0)
-                        return 0;
-                if (error == 0)
-                        acl = NULL;
+                error = posix_acl_update_mode(inode,
+                                &inode->i_mode, &acl);
+                if (error)
+                        return error;
        }

        inode->i_ctime = current_time(inode);
--
cgit v1.2.3

From dd545b52a3e1efd9f2c6352dbe95ccd0c53461cc Mon Sep 17 00:00:00 2001
From: Chandan Rajendra
Date: Tue, 10 Jan 2017 13:29:54 -0700
Subject: do_direct_IO: Use inode->i_blkbits to compute block count to be cleaned

The code currently uses sdio->blkbits to compute the number of blocks to
be cleaned. However sdio->blkbits is derived from the logical block size
of the underlying block device (refer to the definition of
do_blockdev_direct_IO()). Due to this, the generic/299 test would rarely
fail when executed on an ext4 filesystem with 64k as the block size and
when using a virtio based disk (having 512 byte as the logical block
size) inside a kvm guest.

This commit fixes the bug by using inode->i_blkbits to compute the number
of blocks to be cleaned.

Signed-off-by: Chandan Rajendra
Reviewed-by: Christoph Hellwig

Fixed up by Jeff Moyer to only use/evaluate inode->i_blkbits once, to
avoid issues with block size changes with IO in flight.
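As a concrete illustration of the two shifts (using the failing
generic/299 setup described above: a 64k-block ext4 on a disk with
512-byte logical sectors): blkbits is 9, blkfactor is 16 - 9 = 7, so
i_blkbits = 9 + 7 = 16. For a mapped buffer_head with b_size = 65536, the
block count handed to clean_bdev_aliases() must be in the same units as
map_bh->b_blocknr, i.e. filesystem blocks: 65536 >> 16 = 1, whereas the
old 65536 >> 9 = 128 overshot by the block factor.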
Signed-off-by: Jens Axboe --- fs/direct-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index aeae8c063451..c87bae4376b8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -906,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; + const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; while (sdio->block_in_file < sdio->final_block_in_request) { @@ -949,7 +950,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, clean_bdev_aliases( map_bh->b_bdev, map_bh->b_blocknr, - map_bh->b_size >> blkbits); + map_bh->b_size >> i_blkbits); } if (!sdio->blkfactor) -- cgit v1.2.3 From 0a417b8dc1f10b03e8f558b8a831f07ec4c23795 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Jan 2017 10:20:04 -0800 Subject: xfs: Timely free truncated dirty pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 99579ccec4e2 "xfs: skip dirty pages in ->releasepage()" started to skip dirty pages in xfs_vm_releasepage() which also has the effect that if a dirty page is truncated, it does not get freed by block_invalidatepage() and is lingering in LRU list waiting for reclaim. So a simple loop like: while true; do dd if=/dev/zero of=file bs=1M count=100 rm file done will keep using more and more memory until we hit low watermarks and start pagecache reclaim which will eventually reclaim also the truncate pages. Keeping these truncated (and thus never usable) pages in memory is just a waste of memory, is unnecessarily stressing page cache reclaim, and reportedly also leads to anonymous mmap(2) returning ENOMEM prematurely. So instead of just skipping dirty pages in xfs_vm_releasepage(), return to old behavior of skipping them only if they have delalloc or unwritten buffers and fix the spurious warnings by warning only if the page is clean. CC: stable@vger.kernel.org CC: Brian Foster CC: Vlastimil Babka Reported-by: Petr Tůma Fixes: 99579ccec4e271c3d4d4e7c946058766812afdab Signed-off-by: Jan Kara Reviewed-by: Brian Foster Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0f56fcd3a5d5..631e7c0e0a29 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1152,19 +1152,22 @@ xfs_vm_releasepage( * block_invalidatepage() can send pages that are still marked dirty * but otherwise have invalidated buffers. * - * We've historically freed buffers on the latter. Instead, quietly - * filter out all dirty pages to avoid spurious buffer state warnings. - * This can likely be removed once shrink_active_list() is fixed. + * We want to release the latter to avoid unnecessary buildup of the + * LRU, skip the former and warn if we've left any lingering + * delalloc/unwritten buffers on clean pages. Skip pages with delalloc + * or unwritten buffers and warn if the page is not dirty. Otherwise + * try to release the buffers. 
         */
-        if (PageDirty(page))
-                return 0;
-
        xfs_count_page_state(page, &delalloc, &unwritten);

-        if (WARN_ON_ONCE(delalloc))
+        if (delalloc) {
+                WARN_ON_ONCE(!PageDirty(page));
                return 0;
-        if (WARN_ON_ONCE(unwritten))
+        }
+        if (unwritten) {
+                WARN_ON_ONCE(!PageDirty(page));
                return 0;
+        }

        return try_to_free_buffers(page);
 }
--
cgit v1.2.3

From f99e86485cc32cd16e5cc97f9bb0474f28608d84 Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Thu, 12 Jan 2017 07:58:32 -0700
Subject: block: Rename blk_queue_zone_size and bdev_zone_size

All block device data fields and functions returning a number of 512B
sectors are by convention named xxx_sectors, while names in the form
xxx_size are generally used for a number of bytes. The
blk_queue_zone_size and bdev_zone_size functions were not following this
convention, so rename them.

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal

Collapsed the two patches, they were nonsensically split and broke
bisection.

Signed-off-by: Jens Axboe
---
 fs/f2fs/segment.c | 4 ++--
 fs/f2fs/super.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0738f48293cc..0d8802453758 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -713,8 +713,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
        }
        sector = SECTOR_FROM_BLOCK(blkstart);

-        if (sector & (bdev_zone_size(bdev) - 1) ||
-            nr_sects != bdev_zone_size(bdev)) {
+        if (sector & (bdev_zone_sectors(bdev) - 1) ||
+            nr_sects != bdev_zone_sectors(bdev)) {
                f2fs_msg(sbi->sb, KERN_INFO,
                        "(%d) %s: Unaligned discard attempted (block %x + %x)",
                        devi, sbi->s_ndevs ? FDEV(devi).path: "",
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 702638e21c76..46fd30d8af77 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1553,16 +1553,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
                return 0;

        if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
-                                SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
+                                SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
                return -EINVAL;
-        sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
+        sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
        if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
                                __ilog2_u32(sbi->blocks_per_blkz))
                return -EINVAL;
        sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
        FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
                                        sbi->log_blocks_per_blkz;
-        if (nr_sectors & (bdev_zone_size(bdev) - 1))
+        if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
                FDEV(devi).nr_blkz++;

        FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
--
cgit v1.2.3

From 4b09ec4b14a168bf2c687e1f598140c3c11e9222 Mon Sep 17 00:00:00 2001
From: Benjamin Coddington
Date: Thu, 5 Jan 2017 10:20:16 -0500
Subject: nfs: Don't take a reference on fl->fl_file for LOCK operation

I have reports of a crash that looks like __fput() was called twice for
an NFSv4.0 file. It seems possible that if, during close(), a signal
interrupts the wait for outstanding IO while locks are being removed,
the removal of a lock can be skipped; the state manager can then try to
reclaim that lock and take a reference on fl->fl_file at the same time
the file is being released.

Since 83bfff23e9ed ("nfs4: have do_vfs_lock take an inode pointer")
removed the need to traverse fl->fl_file->f_inode in nfs4_lock_done(),
taking that reference is no longer necessary.
Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6dcbc5defb7a..700ed1fc1075 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -6127,7 +6126,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); - get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -6240,7 +6238,6 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); - fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } -- cgit v1.2.3 From cc8e8342930129aa2c9b629e1653e4681f0896ea Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 4 Jan 2017 16:21:58 +0800 Subject: ceph: fix mds cluster availability check We should apply the check after getting the initial mdsmap. Fixes: e9e427f0a14f ("ceph: check availability of mds cluster on mount") Link: http://tracker.ceph.com/issues/18161 Signed-off-by: Yan, Zheng --- fs/ceph/mds_client.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4f49253387a0..ec6b35e9f966 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2106,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc, dout("do_request mdsmap err %d\n", err); goto finish; } + if (mdsc->mdsmap->m_epoch == 0) { + dout("do_request no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto finish; + } if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { -- cgit v1.2.3 From 84fcc2d2bd6cbf621e49e1d0f7eaef2e3c666b40 Mon Sep 17 00:00:00 2001 From: "Geng, Jichao" Date: Thu, 5 Jan 2017 16:50:39 +0800 Subject: ceph: fix get_oldest_context() For no snapshot case, we should use ci->truncate_{seq,size}. Fixes: 5f743e456606 ("ceph: record truncate size/seq for snap data writeback") Signed-off-by: Geng, Jichao Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9cd0c0ea7cdb..e4b066cd912a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -502,9 +502,9 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); if (truncate_size) - *truncate_size = capsnap->truncate_size; + *truncate_size = ci->i_truncate_size; if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; + *truncate_seq = ci->i_truncate_seq; } spin_unlock(&ci->i_ceph_lock); return snapc; -- cgit v1.2.3 From 2dfc61736482441993bfb7dfaa971113b53f107c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jan 2017 08:47:00 -0500 Subject: NFSv4: Call update_changeattr() from _nfs4_proc_open only if a file was created We don't want to invalidate the directory attribute and data cache unless we know that a file was created, or the change attribute differs from the one in our cache. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 700ed1fc1075..4010c33151ad 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2390,11 +2390,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); if (o_arg->open_flags & O_CREAT) { - update_changeattr(dir, &o_res->cinfo); if (o_arg->open_flags & O_EXCL) data->file_created = 1; else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = 1; + if (data->file_created || dir->i_version != o_res->cinfo.after) + update_changeattr(dir, &o_res->cinfo); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; -- cgit v1.2.3 From c733c49c32624f927f443be6dbabb387006bbe42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jan 2017 12:32:26 -0500 Subject: NFSv4: Don't apply change_info4 twice on rename within a directory If a file is renamed, but stays in the same directory, we will still receive 2 change_info4 structures describing the change to that directory, but we only want to apply it once. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4010c33151ad..1e797bf74aaf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4159,8 +4159,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(old_dir, &res->old_cinfo); - update_changeattr(new_dir, &res->new_cinfo); + if (task->tk_status == 0) { + update_changeattr(old_dir, &res->old_cinfo); + if (new_dir != old_dir) + update_changeattr(new_dir, &res->new_cinfo); + } return 1; } -- cgit v1.2.3 From c40d52fe1c2ba25dcb8cd207c8d26ef5f57f0476 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jan 2017 12:36:11 -0500 Subject: NFSv4: Don't call update_changeattr() unless the unlink is successful If the unlink wasn't successful, then the directory has presumably not changed. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1e797bf74aaf..6a35204affa4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4125,7 +4125,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(dir, &res->cinfo); + if (task->tk_status == 0) + update_changeattr(dir, &res->cinfo); return 1; } -- cgit v1.2.3 From d3129ef672cac81c4d0185336af377c8dc1091d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 11 Jan 2017 22:07:28 -0500 Subject: NFSv4: update_changeattr should update the attribute timestamp Otherwise, the attribute cache remains marked as being expired. 
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6a35204affa4..ecc151697fd4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1082,7 +1082,8 @@ int nfs4_call_sync(struct rpc_clnt *clnt, return nfs4_call_sync_sequence(clnt, server, msg, args, res); } -static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, + unsigned long timestamp) { struct nfs_inode *nfsi = NFS_I(dir); @@ -1098,6 +1099,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) NFS_INO_INVALID_ACL; } dir->i_version = cinfo->after; + nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); @@ -2395,7 +2397,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = 1; if (data->file_created || dir->i_version != o_res->cinfo.after) - update_changeattr(dir, &o_res->cinfo); + update_changeattr(dir, &o_res->cinfo, + o_res->f_attr->time_start); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; @@ -4073,11 +4076,12 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; + unsigned long timestamp = jiffies; int status; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, timestamp); return status; } @@ -4126,7 +4130,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) &data->timeout) == -EAGAIN) return 0; if (task->tk_status == 0) - update_changeattr(dir, &res->cinfo); + update_changeattr(dir, &res->cinfo, res->dir_attr->time_start); return 1; } @@ -4161,9 +4165,9 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 0; if (task->tk_status == 0) { - update_changeattr(old_dir, &res->old_cinfo); + update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start); if (new_dir != old_dir) - update_changeattr(new_dir, &res->new_cinfo); + update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start); } return 1; } @@ -4201,7 +4205,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, res.fattr->time_start); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) nfs_setsecurity(inode, res.fattr, res.label); @@ -4276,7 +4280,8 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { - update_changeattr(dir, &data->res.dir_cinfo); + update_changeattr(dir, &data->res.dir_cinfo, + data->res.fattr->time_start); status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } return status; -- cgit v1.2.3 From dcd208697707b12adeaa45643bab239c5e90ef9b Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Wed, 11 Jan 2017 20:34:50 -0500 Subject: nfsd: fix supported attributes for acl & labels Oops--in 916d2d844afd I moved some constants into an array for convenience, but here I'm accidentally writing to that array. The effect is that if you ever encounter a filesystem lacking support for ACLs or security labels, then all queries of supported attributes will report that attribute as unsupported from then on. Fixes: 916d2d844afd "nfsd: clean up supported attribute handling" Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7ecf16be4a44..8fae53ce21d1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2440,7 +2440,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 *supp = nfsd_suppattrs[minorversion]; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); if (!IS_POSIXACL(dentry->d_inode)) supp[0] &= ~FATTR4_WORD0_ACL; -- cgit v1.2.3 From c6180a6237174f481dc856ed6e890d8196b6f0fb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Jan 2017 13:31:32 -0500 Subject: NFSv4: Fix client recovery when server reboots multiple times If the server reboots multiple times, the client should rely on the server to tell it that it cannot reclaim state as per section 9.6.3.4 in RFC7530 and section 8.4.2.1 in RFC5661. Currently, the client is being to conservative, and is assuming that if the server reboots while state recovery is in progress, then it must ignore state that was not recovered before the reboot. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1d152f4470cd..90e6193ce6be 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1729,7 +1729,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) break; case -NFS4ERR_STALE_CLIENTID: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: -- cgit v1.2.3 From a12f1ae61c489076a9aeb90bddca7722bf330df3 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 13 Dec 2016 12:09:56 -0800 Subject: aio: fix lock dep warning lockdep reports a warnning. file_start_write/file_end_write only acquire/release the lock for regular files. So checking the files in aio side too. [ 453.532141] ------------[ cut here ]------------ [ 453.533011] WARNING: CPU: 1 PID: 1298 at ../kernel/locking/lockdep.c:3514 lock_release+0x434/0x670 [ 453.533011] DEBUG_LOCKS_WARN_ON(depth <= 0) [ 453.533011] Modules linked in: [ 453.533011] CPU: 1 PID: 1298 Comm: fio Not tainted 4.9.0+ #964 [ 453.533011] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.0-1.fc24 04/01/2014 [ 453.533011] ffff8803a24b7a70 ffffffff8196cffb ffff8803a24b7ae8 0000000000000000 [ 453.533011] ffff8803a24b7ab8 ffffffff81091ee1 ffff8803a5dba700 00000dba00000008 [ 453.533011] ffffed0074496f59 ffff8803a5dbaf54 ffff8803ae0f8488 fffffffffffffdef [ 453.533011] Call Trace: [ 453.533011] [] dump_stack+0x67/0x9c [ 453.533011] [] __warn+0x111/0x130 [ 453.533011] [] warn_slowpath_fmt+0x97/0xb0 [ 453.533011] [] ? __warn+0x130/0x130 [ 453.533011] [] ? blk_finish_plug+0x29/0x60 [ 453.533011] [] lock_release+0x434/0x670 [ 453.533011] [] ? import_single_range+0xd4/0x110 [ 453.533011] [] ? 
rw_verify_area+0x65/0x140 [ 453.533011] [] ? aio_write+0x1f6/0x280 [ 453.533011] [] aio_write+0x229/0x280 [ 453.533011] [] ? aio_complete+0x640/0x640 [ 453.533011] [] ? debug_check_no_locks_freed+0x1a0/0x1a0 [ 453.533011] [] ? debug_lockdep_rcu_enabled.part.2+0x1a/0x30 [ 453.533011] [] ? debug_lockdep_rcu_enabled+0x35/0x40 [ 453.533011] [] ? __might_fault+0x7e/0xf0 [ 453.533011] [] do_io_submit+0x94c/0xb10 [ 453.533011] [] ? do_io_submit+0x23e/0xb10 [ 453.533011] [] ? SyS_io_destroy+0x270/0x270 [ 453.533011] [] ? mark_held_locks+0x23/0xc0 [ 453.533011] [] ? trace_hardirqs_on_thunk+0x1a/0x1c [ 453.533011] [] SyS_io_submit+0x10/0x20 [ 453.533011] [] entry_SYSCALL_64_fastpath+0x18/0xad [ 453.533011] [] ? trace_hardirqs_off_caller+0xc0/0x110 [ 453.533011] ---[ end trace b2fbe664d1cc0082 ]--- Cc: Dmitry Monakhov Cc: Jan Kara Cc: Christoph Hellwig Cc: Al Viro Reviewed-by: Christoph Hellwig Signed-off-by: Shaohua Li Signed-off-by: Al Viro --- fs/aio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 4ab67e8cb776..873b4ca82ccb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * Tell lockdep we inherited freeze protection from submission * thread. */ - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); file_end_write(file); } @@ -1525,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, * by telling it the lock got released so that it doesn't * complain about held lock when we return to userspace. */ - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } kfree(iovec); return ret; -- cgit v1.2.3 From 4d22c75d4c7b5c5f4bd31054f09103ee490878fd Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 11 Jan 2017 13:25:00 -0600 Subject: coredump: Ensure proper size of sparse core files If the last section of a core file ends with an unmapped or zero page, the size of the file does not correspond with the last dump_skip() call. gdb complains that the file is truncated and can be confusing to users. After all of the vma sections are written, make sure that the file size is no smaller than the current file position. This problem can be demonstrated with gdb's bigcore testcase on the sparc architecture. Signed-off-by: Dave Kleikamp Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Al Viro --- fs/binfmt_elf.c | 1 + fs/coredump.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 29a02daf08a9..422370293cfd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2298,6 +2298,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } } + dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/coredump.c b/fs/coredump.c index e525b6017cdf..ae6b05629ca1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * postion. 
This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); -- cgit v1.2.3