From d353d7587d02116b9732d5c06615aed75a4d3a47 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 11:16:36 -0500
Subject: writeback: plug writeback at a high level

Doing writeback on lots of little files causes terrible IOPS storms
because of the per-mapping writeback plugging we do. This
essentially causes imeediate dispatch of IO for each mapping,
regardless of the context in which writeback is occurring.

IOWs, running a concurrent write-lots-of-small 4k files using fsmark
on XFS results in a huge number of IOPS being issued for data
writes.  Metadata writes are sorted and plugged at a high level by
XFS, so aggregate nicely into large IOs. However, data writeback IOs
are dispatched in individual 4k IOs, even when the blocks of two
consecutively written files are adjacent.

Test VM: 8p, 8GB RAM, 4xSSD in RAID0, 100TB sparse XFS filesystem,
metadata CRCs enabled.

Kernel: 3.10-rc5 + xfsdev + my 3.11 xfs queue (~70 patches)

Test:

$ ./fs_mark  -D  10000  -S0  -n  10000  -s  4096  -L  120  -d
/mnt/scratch/0  -d  /mnt/scratch/1  -d  /mnt/scratch/2  -d
/mnt/scratch/3  -d  /mnt/scratch/4  -d  /mnt/scratch/5  -d
/mnt/scratch/6  -d  /mnt/scratch/7

Result:

		wall	sys	create rate	Physical write IO
		time	CPU	(avg files/s)	 IOPS	Bandwidth
		-----	-----	------------	------	---------
unpatched	6m56s	15m47s	24,000+/-500	26,000	130MB/s
patched		5m06s	13m28s	32,800+/-600	 1,500	180MB/s
improvement	-26.44%	-14.68%	  +36.67%	-94.23%	+38.46%

If I use zero length files, this workload at about 500 IOPS, so
plugging drops the data IOs from roughly 25,500/s to 1000/s.
3 lines of code, 35% better throughput for 15% less CPU.

The benefits of plugging at this layer are likely to be higher for
spinning media as the IO patterns for this workload are going make a
much bigger difference on high IO latency devices.....

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Tested-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/fs-writeback.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/fs-writeback.c')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 518c6294bf6c..d98e37bbf417 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1439,7 +1439,9 @@ static long writeback_sb_inodes(struct super_block *sb,
 	unsigned long start_time = jiffies;
 	long write_chunk;
 	long wrote = 0;  /* count both pages and inodes */
+	struct blk_plug plug;
 
+	blk_start_plug(&plug);
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 
@@ -1537,6 +1539,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 				break;
 		}
 	}
+	blk_finish_plug(&plug);
 	return wrote;
 }
 
-- 
cgit v1.2.3


From 74278da9f70d84d715601fe794567a6d2bfdf078 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 12:37:22 -0500
Subject: inode: convert inode_sb_list_lock to per-sb

The process of reducing contention on per-superblock inode lists
starts with moving the locking to match the per-superblock inode
list. This takes the global lock out of the picture and reduces the
contention problems to within a single filesystem. This doesn't get
rid of contention as the locks still have global CPU scope, but it
does isolate operations on different superblocks form each other.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs/fs-writeback.c')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d98e37bbf417..f45bf876579f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2124,7 +2124,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_sb_list_lock);
+	spin_lock(&sb->s_inode_list_lock);
 
 	/*
 	 * Data integrity sync. Must wait for all pages under writeback,
@@ -2144,14 +2144,14 @@ static void wait_sb_inodes(struct super_block *sb)
 		}
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_sb_list_lock);
+		spin_unlock(&sb->s_inode_list_lock);
 
 		/*
 		 * We hold a reference to 'inode' so it couldn't have been
 		 * removed from s_inodes list while we dropped the
-		 * inode_sb_list_lock.  We cannot iput the inode now as we can
+		 * s_inode_list_lock.  We cannot iput the inode now as we can
 		 * be holding the last reference and we cannot iput it under
-		 * inode_sb_list_lock. So we keep the reference and iput it
+		 * s_inode_list_lock. So we keep the reference and iput it
 		 * later.
 		 */
 		iput(old_inode);
@@ -2161,9 +2161,9 @@ static void wait_sb_inodes(struct super_block *sb)
 
 		cond_resched();
 
-		spin_lock(&inode_sb_list_lock);
+		spin_lock(&sb->s_inode_list_lock);
 	}
-	spin_unlock(&inode_sb_list_lock);
+	spin_unlock(&sb->s_inode_list_lock);
 	iput(old_inode);
 }
 
-- 
cgit v1.2.3


From e97fedb9ef9868ff24d588be781906cf7c1b59ae Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 13:40:00 -0500
Subject: sync: serialise per-superblock sync operations

When competing sync(2) calls walk the same filesystem, they need to
walk the list of inodes on the superblock to find all the inodes
that we need to wait for IO completion on. However, when multiple
wait_sb_inodes() calls do this at the same time, they contend on the
the inode_sb_list_lock and the contention causes system wide
slowdowns. In effect, concurrent sync(2) calls can take longer and
burn more CPU than if they were serialised.

Stop the worst of the contention by adding a per-sb mutex to wrap
around wait_sb_inodes() so that we only execute one sync(2) IO
completion walk per superblock superblock at a time and hence avoid
contention being triggered by concurrent sync(2) calls.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs/fs-writeback.c')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f45bf876579f..3c974442bdf0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2114,6 +2114,15 @@ out_unlock_inode:
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
+/*
+ * The @s_sync_lock is used to serialise concurrent sync operations
+ * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
+ * Concurrent callers will block on the s_sync_lock rather than doing contending
+ * walks. The queueing maintains sync(2) required behaviour as all the IO that
+ * has been issued up to the time this function is enter is guaranteed to be
+ * completed by the time we have gained the lock and waited for all IO that is
+ * in progress regardless of the order callers are granted the lock.
+ */
 static void wait_sb_inodes(struct super_block *sb)
 {
 	struct inode *inode, *old_inode = NULL;
@@ -2124,6 +2133,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	 */
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	mutex_lock(&sb->s_sync_lock);
 	spin_lock(&sb->s_inode_list_lock);
 
 	/*
@@ -2165,6 +2175,7 @@ static void wait_sb_inodes(struct super_block *sb)
 	}
 	spin_unlock(&sb->s_inode_list_lock);
 	iput(old_inode);
+	mutex_unlock(&sb->s_sync_lock);
 }
 
 static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
-- 
cgit v1.2.3


From c7f5408493aeb01532927b2276316797a03ed6ee Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 4 Mar 2015 14:07:22 -0500
Subject: inode: rename i_wb_list to i_io_list

There's a small consistency problem between the inode and writeback
naming. Writeback calls the "for IO" inode queues b_io and
b_more_io, but the inode calls these the "writeback list" or
i_wb_list. This makes it hard to an new "under writeback" list to
the inode, or call it an "under IO" list on the bdi because either
way we'll have writeback on IO and IO on writeback and it'll just be
confusing. I'm getting confused just writing this!

So, rename the inode "for IO" list variable to i_io_list so we can
add a new "writeback list" in a subsequent patch.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

(limited to 'fs/fs-writeback.c')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3c974442bdf0..63e00f11022e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -88,7 +88,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60;
 
 static inline struct inode *wb_inode(struct list_head *head)
 {
-	return list_entry(head, struct inode, i_wb_list);
+	return list_entry(head, struct inode, i_io_list);
 }
 
 /*
@@ -125,22 +125,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 }
 
 /**
- * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
  * @inode: inode to be moved
  * @wb: target bdi_writeback
  * @head: one of @wb->b_{dirty|io|more_io}
  *
- * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io.
+ * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io.
  * Returns %true if @inode is the first occupant of the !dirty_time IO
  * lists; otherwise, %false.
  */
-static bool inode_wb_list_move_locked(struct inode *inode,
+static bool inode_io_list_move_locked(struct inode *inode,
 				      struct bdi_writeback *wb,
 				      struct list_head *head)
 {
 	assert_spin_locked(&wb->list_lock);
 
-	list_move(&inode->i_wb_list, head);
+	list_move(&inode->i_io_list, head);
 
 	/* dirty_time doesn't count as dirty_io until expiration */
 	if (head != &wb->b_dirty_time)
@@ -151,19 +151,19 @@ static bool inode_wb_list_move_locked(struct inode *inode,
 }
 
 /**
- * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list
  * @inode: inode to be removed
  * @wb: bdi_writeback @inode is being removed from
  *
  * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
  * clear %WB_has_dirty_io if all are empty afterwards.
  */
-static void inode_wb_list_del_locked(struct inode *inode,
+static void inode_io_list_del_locked(struct inode *inode,
 				     struct bdi_writeback *wb)
 {
 	assert_spin_locked(&wb->list_lock);
 
-	list_del_init(&inode->i_wb_list);
+	list_del_init(&inode->i_io_list);
 	wb_io_lists_depopulated(wb);
 }
 
@@ -351,7 +351,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 
 	/*
 	 * Once I_FREEING is visible under i_lock, the eviction path owns
-	 * the inode and we shouldn't modify ->i_wb_list.
+	 * the inode and we shouldn't modify ->i_io_list.
 	 */
 	if (unlikely(inode->i_state & I_FREEING))
 		goto skip_switch;
@@ -390,16 +390,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
 	 * is always correct including from ->b_dirty_time.  The transfer
 	 * preserves @inode->dirtied_when ordering.
 	 */
-	if (!list_empty(&inode->i_wb_list)) {
+	if (!list_empty(&inode->i_io_list)) {
 		struct inode *pos;
 
-		inode_wb_list_del_locked(inode, old_wb);
+		inode_io_list_del_locked(inode, old_wb);
 		inode->i_wb = new_wb;
-		list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+		list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
 			if (time_after_eq(inode->dirtied_when,
 					  pos->dirtied_when))
 				break;
-		inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+		inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev);
 	} else {
 		inode->i_wb = new_wb;
 	}
@@ -961,12 +961,12 @@ void wb_start_background_writeback(struct bdi_writeback *wb)
 /*
  * Remove the inode from the writeback list it is on.
  */
-void inode_wb_list_del(struct inode *inode)
+void inode_io_list_del(struct inode *inode)
 {
 	struct bdi_writeback *wb;
 
 	wb = inode_to_wb_and_lock_list(inode);
-	inode_wb_list_del_locked(inode, wb);
+	inode_io_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 }
 
@@ -988,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 		if (time_before(inode->dirtied_when, tail->dirtied_when))
 			inode->dirtied_when = jiffies;
 	}
-	inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
+	inode_io_list_move_locked(inode, wb, &wb->b_dirty);
 }
 
 /*
@@ -996,7 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  */
 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
+	inode_io_list_move_locked(inode, wb, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -1055,7 +1055,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
-		list_move(&inode->i_wb_list, &tmp);
+		list_move(&inode->i_io_list, &tmp);
 		moved++;
 		if (flags & EXPIRE_DIRTY_ATIME)
 			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
@@ -1078,7 +1078,7 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 		list_for_each_prev_safe(pos, node, &tmp) {
 			inode = wb_inode(pos);
 			if (inode->i_sb == sb)
-				list_move(&inode->i_wb_list, dispatch_queue);
+				list_move(&inode->i_io_list, dispatch_queue);
 		}
 	}
 out:
@@ -1232,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		redirty_tail(inode, wb);
 	} else if (inode->i_state & I_DIRTY_TIME) {
 		inode->dirtied_when = jiffies;
-		inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
+		inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
-		inode_wb_list_del_locked(inode, wb);
+		inode_io_list_del_locked(inode, wb);
 	}
 }
 
@@ -1378,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * touch it. See comment above for explanation.
 	 */
 	if (!(inode->i_state & I_DIRTY_ALL))
-		inode_wb_list_del_locked(inode, wb);
+		inode_io_list_del_locked(inode, wb);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
 out:
@@ -2091,7 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			else
 				dirty_list = &wb->b_dirty_time;
 
-			wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+			wakeup_bdi = inode_io_list_move_locked(inode, wb,
 							       dirty_list);
 
 			spin_unlock(&wb->list_lock);
-- 
cgit v1.2.3