From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 17 Apr 2015 16:15:18 -0600
Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains

Struct bio has an atomic ref count for chained bios, and we use this
to know when to end IO on the bio. However, most bios are not chained,
so we don't need to always introduce this atomic operation as part of
ending IO.

Add a helper to elevate the bi_remaining count, and flag the bio as
now actually needing the decrement at end_io time. Rename the field
to __bi_remaining to catch any current users of this doing the
incrementing manually.

For high IOPS workloads, this reduces the overhead of bio_endio()
substantially.

Tested-by: Robert Elliott <elliott@hp.com>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index f66a4eae16ee..117da319afb6 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -270,7 +270,7 @@ void bio_init(struct bio *bio)
 {
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
-	atomic_set(&bio->bi_remaining, 1);
+	atomic_set(&bio->__bi_remaining, 1);
 	atomic_set(&bio->bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
@@ -292,8 +292,8 @@ void bio_reset(struct bio *bio)
 	__bio_free(bio);
 
 	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
-	atomic_set(&bio->bi_remaining, 1);
+	bio->bi_flags = flags | (1 << BIO_UPTODATE);
+	atomic_set(&bio->__bi_remaining, 1);
 }
 EXPORT_SYMBOL(bio_reset);
 
@@ -320,7 +320,7 @@ void bio_chain(struct bio *bio, struct bio *parent)
 
 	bio->bi_private = parent;
 	bio->bi_end_io	= bio_chain_endio;
-	atomic_inc(&parent->bi_remaining);
+	bio_inc_remaining(parent);
 }
 EXPORT_SYMBOL(bio_chain);
 
@@ -1741,6 +1741,23 @@ void bio_flush_dcache_pages(struct bio *bi)
 EXPORT_SYMBOL(bio_flush_dcache_pages);
 #endif
 
+static inline bool bio_remaining_done(struct bio *bio)
+{
+	/*
+	 * If we're not chaining, then ->__bi_remaining is always 1 and
+	 * we always end io on the first invocation.
+	 */
+	if (!bio_flagged(bio, BIO_CHAIN))
+		return true;
+
+	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
+
+	if (atomic_dec_and_test(&bio->__bi_remaining))
+		return true;
+
+	return false;
+}
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1758,15 +1775,13 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
 void bio_endio(struct bio *bio, int error)
 {
 	while (bio) {
-		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
 		if (error)
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 			error = -EIO;
 
-		if (!atomic_dec_and_test(&bio->bi_remaining))
-			return;
+		if (unlikely(!bio_remaining_done(bio)))
+			break;
 
 		/*
 		 * Need to have a real endio function for chained bios,
@@ -1799,7 +1814,12 @@ EXPORT_SYMBOL(bio_endio);
  **/
 void bio_endio_nodec(struct bio *bio, int error)
 {
-	atomic_inc(&bio->bi_remaining);
+	/*
+	 * If it's not flagged as a chain, we are not going to dec the count
+	 */
+	if (bio_flagged(bio, BIO_CHAIN))
+		bio_inc_remaining(bio);
+
 	bio_endio(bio, error);
 }
 EXPORT_SYMBOL(bio_endio_nodec);
-- 
cgit v1.2.3


From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 17 Apr 2015 16:23:59 -0600
Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases

Struct bio has a reference count that controls when it can be freed.
The most common use case is allocating the bio, which then returns with a
single reference to it, doing IO, and then dropping that single
reference. We can remove this atomic_dec_and_test() in the completion
path, if nobody else is holding a reference to the bio.

If someone does call bio_get() on the bio, then we flag the bio as
now having valid count and that we must properly honor the reference
count when it's being put.

Tested-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index 117da319afb6..c2ff8a88aef1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -271,7 +271,7 @@ void bio_init(struct bio *bio)
 	memset(bio, 0, sizeof(*bio));
 	bio->bi_flags = 1 << BIO_UPTODATE;
 	atomic_set(&bio->__bi_remaining, 1);
-	atomic_set(&bio->bi_cnt, 1);
+	atomic_set(&bio->__bi_cnt, 1);
 }
 EXPORT_SYMBOL(bio_init);
 
@@ -524,13 +524,17 @@ EXPORT_SYMBOL(zero_fill_bio);
  **/
 void bio_put(struct bio *bio)
 {
-	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
-	/*
-	 * last put frees it
-	 */
-	if (atomic_dec_and_test(&bio->bi_cnt))
+	if (!bio_flagged(bio, BIO_REFFED))
 		bio_free(bio);
+	else {
+		BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
+
+		/*
+		 * last put frees it
+		 */
+		if (atomic_dec_and_test(&bio->__bi_cnt))
+			bio_free(bio);
+	}
 }
 EXPORT_SYMBOL(bio_put);
 
-- 
cgit v1.2.3


From a7928c1578c550bd6f4dec62d65132e6db226c57 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2015 22:37:20 +0200
Subject: block: move PM request support to IDE

This removes the request types and hacks from the block code and into the
old IDE driver.  There is a small amount of code duplication due to this,
but it's not too bad.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c |  1 +
 block/blk-exec.c | 10 ----------
 block/blk.h      |  2 --
 3 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b94447a..2e5020f37d55 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -285,6 +285,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q)
 	q->request_fn(q);
 	q->request_fn_active--;
 }
+EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
 
 /**
  * __blk_run_queue - run a single device queue
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 9924725fa50d..3fec8a29d0fa 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 			   rq_end_io_fn *done)
 {
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-	bool is_pm_resume;
 
 	WARN_ON(irqs_disabled());
 	WARN_ON(rq->cmd_type == REQ_TYPE_FS);
@@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 		return;
 	}
 
-	/*
-	 * need to check this before __blk_run_queue(), because rq can
-	 * be freed before that returns.
-	 */
-	is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
-
 	spin_lock_irq(q->queue_lock);
 
 	if (unlikely(blk_queue_dying(q))) {
@@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	__elv_add_request(q, rq, where);
 	__blk_run_queue(q);
-	/* the queue is stopped so it won't be run */
-	if (is_pm_resume)
-		__blk_run_queue_uncond(q);
 	spin_unlock_irq(q->queue_lock);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
diff --git a/block/blk.h b/block/blk.h
index 43b036185712..4b48d55e588e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -193,8 +193,6 @@ int blk_try_merge(struct request *rq, struct bio *bio);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
-void __blk_run_queue_uncond(struct request_queue *q);
-
 int blk_dev_init(void);
 
 
-- 
cgit v1.2.3


From dd6cf3e18decb4895503db1752bb5500c4dd588d Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 8 May 2015 10:51:28 -0700
Subject: blk: clean up plug

Current code looks like inner plug gets flushed with a
blk_finish_plug(). Actually it's a nop. All requests/callbacks are added
to current->plug, while only the outermost plug is assigned to current->plug.
So inner plug always has empty request/callback list, which makes
blk_flush_plug_list() a nop. This tries to make the code more clear.

Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 2e5020f37d55..9dcfb8ec554b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3032,21 +3032,20 @@ void blk_start_plug(struct blk_plug *plug)
 {
 	struct task_struct *tsk = current;
 
+	/*
+	 * If this is a nested plug, don't actually assign it.
+	 */
+	if (tsk->plug)
+		return;
+
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
-
 	/*
-	 * If this is a nested plug, don't actually assign it. It will be
-	 * flushed on its own.
+	 * Store ordering should not be needed here, since a potential
+	 * preempt will imply a full memory barrier
 	 */
-	if (!tsk->plug) {
-		/*
-		 * Store ordering should not be needed here, since a potential
-		 * preempt will imply a full memory barrier
-		 */
-		tsk->plug = plug;
-	}
+	tsk->plug = plug;
 }
 EXPORT_SYMBOL(blk_start_plug);
 
@@ -3193,10 +3192,11 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 void blk_finish_plug(struct blk_plug *plug)
 {
+	if (plug != current->plug)
+		return;
 	blk_flush_plug_list(plug, false);
 
-	if (plug == current->plug)
-		current->plug = NULL;
+	current->plug = NULL;
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
-- 
cgit v1.2.3


From e6c4438ba7cb615448492849970aaf0aaa1cc973 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Fri, 8 May 2015 10:51:30 -0700
Subject: blk-mq: fix plugging in blk_sq_make_request

The following appears in blk_sq_make_request:

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */

We clearly don't have multiple hardware queues, here!  This comment was
introduced with this commit 07068d5b8e (blk-mq: split make request
handler for multi and single queue):

    We want slightly different behavior from them:

    - On single queue devices, we currently use the per-process plug
      for deferred IO and for merging.

    - On multi queue devices, we don't use the per-process plug, but
      we want to go straight to hardware for SYNC IO.

The old code had this:

        use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

and that was converted to:

	use_plug = !is_flush_fua && !is_sync;

which is not equivalent.  For the single queue case, that second half of
the && expression is always true.  So, what I think was actually intended
follows (and this more closely matches what is done in blk_queue_bio).

V2: delete the 'likely', which should not be a big deal

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..a65acffde19a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1309,16 +1309,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = rw_is_sync(bio->bi_rw);
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
-	unsigned int use_plug, request_count = 0;
+	struct blk_plug *plug;
+	unsigned int request_count = 0;
 	struct blk_map_ctx data;
 	struct request *rq;
 
-	/*
-	 * If we have multiple hardware queues, just go directly to
-	 * one of those for sync IO.
-	 */
-	use_plug = !is_flush_fua && !is_sync;
-
 	blk_queue_bounce(q, &bio);
 
 	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,7 +1321,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
-	if (use_plug && !blk_queue_nomerges(q) &&
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
 	    blk_attempt_plug_merge(q, bio, &request_count))
 		return;
 
@@ -1345,21 +1340,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	 * utilize that to temporarily store requests until the task is
 	 * either done or scheduled away.
 	 */
-	if (use_plug) {
-		struct blk_plug *plug = current->plug;
-
-		if (plug) {
-			blk_mq_bio_to_request(rq, bio);
-			if (list_empty(&plug->mq_list))
-				trace_block_plug(q);
-			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-			list_add_tail(&rq->queuelist, &plug->mq_list);
-			blk_mq_put_ctx(data.ctx);
-			return;
+	plug = current->plug;
+	if (plug) {
+		blk_mq_bio_to_request(rq, bio);
+		if (list_empty(&plug->mq_list))
+			trace_block_plug(q);
+		else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+			blk_flush_plug_list(plug, false);
+			trace_block_plug(q);
 		}
+		list_add_tail(&rq->queuelist, &plug->mq_list);
+		blk_mq_put_ctx(data.ctx);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
-- 
cgit v1.2.3


From 239ad215f0d8388cbe6c09a0fab8ad8ff5dba420 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 8 May 2015 10:51:31 -0700
Subject: blk-mq: avoid re-initialize request which is failed in direct
 dispatch

If we directly issue a request and it fails, we use
blk_mq_merge_queue_io(). But we already assigned bio to a request in
blk_mq_bio_to_request. blk_mq_merge_queue_io shouldn't run
blk_mq_bio_to_request again.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a65acffde19a..f13d0de42f53 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1284,6 +1284,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 				blk_mq_end_request(rq, rq->errors);
 				goto done;
 			}
+			blk_mq_insert_request(rq, false, true, true);
+			return;
 		}
 	}
 
-- 
cgit v1.2.3


From f984df1f0f71ef96254411fc3576a10ae561be71 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 8 May 2015 10:51:32 -0700
Subject: blk-mq: do limited block plug for multiple queue case

plug is still helpful for workload with IO merge, but it can be harmful
otherwise especially with multiple hardware queues, as there is
(supposed) no lock contention in this case and plug can introduce
latency. For multiple queues, we do limited plug, eg plug only if there
is request merge. If a request doesn't have merge with following
request, the request will be dispatched immediately.

V2: check blk_queue_nomerges() as suggested by Jeff.

Cc: Jens Axboe <axboe@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 82 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 23 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f13d0de42f53..902c2eb9a0e7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1224,6 +1224,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
 	return rq;
 }
 
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+	int ret;
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+			rq->mq_ctx->cpu);
+	struct blk_mq_queue_data bd = {
+		.rq = rq,
+		.list = NULL,
+		.last = 1
+	};
+
+	/*
+	 * For OK queue, we are done. For error, kill it. Any other
+	 * error (busy), just add it to our list as we previously
+	 * would have done
+	 */
+	ret = q->mq_ops->queue_rq(hctx, &bd);
+	if (ret == BLK_MQ_RQ_QUEUE_OK)
+		return 0;
+	else {
+		__blk_mq_requeue_request(rq);
+
+		if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+			return 0;
+		}
+		return -1;
+	}
+}
+
 /*
  * Multiple hardware queue variant. This will not use per-process plugs,
  * but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1267,8 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
 	struct blk_map_ctx data;
 	struct request *rq;
+	unsigned int request_count = 0;
+	struct blk_plug *plug;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1243,6 +1277,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return;
 	}
 
+	if (!is_flush_fua && !blk_queue_nomerges(q) &&
+	    blk_attempt_plug_merge(q, bio, &request_count))
+		return;
+
 	rq = blk_mq_map_request(q, bio, &data);
 	if (unlikely(!rq))
 		return;
@@ -1253,40 +1291,39 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto run_queue;
 	}
 
+	plug = current->plug;
 	/*
 	 * If the driver supports defer issued based on 'last', then
 	 * queue it up like normal since we can potentially save some
 	 * CPU this way.
 	 */
-	if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
-		struct blk_mq_queue_data bd = {
-			.rq = rq,
-			.list = NULL,
-			.last = 1
-		};
-		int ret;
+	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+		struct request *old_rq = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
 
 		/*
-		 * For OK queue, we are done. For error, kill it. Any other
-		 * error (busy), just add it to our list as we previously
-		 * would have done
+		 * we do limited pluging. If bio can be merged, do merge.
+		 * Otherwise the existing request in the plug list will be
+		 * issued. So the plug list will have one request at most
 		 */
-		ret = q->mq_ops->queue_rq(data.hctx, &bd);
-		if (ret == BLK_MQ_RQ_QUEUE_OK)
-			goto done;
-		else {
-			__blk_mq_requeue_request(rq);
-
-			if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
-				rq->errors = -EIO;
-				blk_mq_end_request(rq, rq->errors);
-				goto done;
+		if (plug) {
+			if (!list_empty(&plug->mq_list)) {
+				old_rq = list_first_entry(&plug->mq_list,
+					struct request, queuelist);
+				list_del_init(&old_rq->queuelist);
 			}
-			blk_mq_insert_request(rq, false, true, true);
+			list_add_tail(&rq->queuelist, &plug->mq_list);
+		} else /* is_sync */
+			old_rq = rq;
+		blk_mq_put_ctx(data.ctx);
+		if (!old_rq)
 			return;
-		}
+		if (!blk_mq_direct_issue_request(old_rq))
+			return;
+		blk_mq_insert_request(old_rq, false, true, true);
+		return;
 	}
 
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1299,7 +1336,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 run_queue:
 		blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
 	}
-done:
 	blk_mq_put_ctx(data.ctx);
 }
 
-- 
cgit v1.2.3


From 5b3f341f098d60da2970758db6a05bd851eb6b39 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 8 May 2015 10:51:33 -0700
Subject: blk-mq: make plug work for multiple disks and queues

Last patch makes plug work for multiple queue case. However it only
works for single disk case, because it assumes only one request in the
plug list. If a task is accessing multiple disks, eg MD/DM, the
assumption is wrong. Let blk_attempt_plug_merge() record request from
the same queue.

V2: use NULL parameter in !mq case. Fix a bug. Add comments in
blk_attempt_plug_merge to make it less (hopefully) confusing.

Cc: Jens Axboe <axboe@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 15 ++++++++++++---
 block/blk-mq.c   | 14 +++++++++-----
 block/blk.h      |  3 ++-
 3 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 9dcfb8ec554b..f0be754c7781 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1522,7 +1522,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count)
+			    unsigned int *request_count,
+			    struct request **same_queue_rq)
 {
 	struct blk_plug *plug;
 	struct request *rq;
@@ -1542,8 +1543,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
 		int el_ret;
 
-		if (rq->q == q)
+		if (rq->q == q) {
 			(*request_count)++;
+			/*
+			 * Only blk-mq multiple hardware queues case checks the
+			 * rq in the same queue, there should be only one such
+			 * rq in a queue
+			 **/
+			if (same_queue_rq)
+				*same_queue_rq = rq;
+		}
 
 		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
 			continue;
@@ -1608,7 +1617,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
 	 * any locks.
 	 */
 	if (!blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	spin_lock_irq(q->queue_lock);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 902c2eb9a0e7..31df47443699 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1269,6 +1269,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	struct request *rq;
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
+	struct request *same_queue_rq = NULL;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1278,7 +1279,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
@@ -1309,9 +1310,12 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		 * issued. So the plug list will have one request at most
 		 */
 		if (plug) {
-			if (!list_empty(&plug->mq_list)) {
-				old_rq = list_first_entry(&plug->mq_list,
-					struct request, queuelist);
+			/*
+			 * The plug list might get flushed before this. If that
+			 * happens, same_queue_rq is invalid and plug list is empty
+			 **/
+			if (same_queue_rq && !list_empty(&plug->mq_list)) {
+				old_rq = same_queue_rq;
 				list_del_init(&old_rq->queuelist);
 			}
 			list_add_tail(&rq->queuelist, &plug->mq_list);
@@ -1360,7 +1364,7 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count))
+	    blk_attempt_plug_merge(q, bio, &request_count, NULL))
 		return;
 
 	rq = blk_mq_map_request(q, bio, &data);
diff --git a/block/blk.h b/block/blk.h
index 4b48d55e588e..026d9594142b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -78,7 +78,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 			    struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count);
+			    unsigned int *request_count,
+			    struct request **same_queue_rq);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
-- 
cgit v1.2.3


From 4ecd4fef3a074c8bb43c391a57742c422469ebbd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 May 2015 09:38:13 +0200
Subject: block: use an atomic_t for mq_freeze_depth

lockdep gets unhappy about not disabling irqs when using the queue_lock
around it.  Instead of trying to fix that up just switch to an atomic_t
and get rid of the lock.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 31df47443699..c382a34fe5ac 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
-				!q->mq_freeze_depth || blk_queue_dying(q));
+				!atomic_read(&q->mq_freeze_depth) ||
+				blk_queue_dying(q));
 		if (blk_queue_dying(q))
 			return -ENODEV;
 		if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
 
 void blk_mq_freeze_queue_start(struct request_queue *q)
 {
-	bool freeze;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	freeze = !q->mq_freeze_depth++;
-	spin_unlock_irq(q->queue_lock);
-
-	if (freeze) {
+	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_hw_queues(q, false);
 	}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-	bool wake;
+	int freeze_depth;
 
-	spin_lock_irq(q->queue_lock);
-	wake = !--q->mq_freeze_depth;
-	WARN_ON_ONCE(q->mq_freeze_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-	if (wake) {
+	freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+	WARN_ON_ONCE(freeze_depth < 0);
+	if (!freeze_depth) {
 		percpu_ref_reinit(&q->mq_usage_counter);
 		wake_up_all(&q->mq_freeze_wq);
 	}
@@ -2081,7 +2077,7 @@ void blk_mq_free_queue(struct request_queue *q)
 /* Basically redo blk_mq_init_queue with queue frozen */
 static void blk_mq_queue_reinit(struct request_queue *q)
 {
-	WARN_ON_ONCE(!q->mq_freeze_depth);
+	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
 
 	blk_mq_sysfs_unregister(q);
 
-- 
cgit v1.2.3


From b25de9d6da49b1a8760a89672283128aa8c78345 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Apr 2015 21:41:01 +0200
Subject: block: remove BIO_EOPNOTSUPP

Since the big barrier rewrite/removal in 2007 we never fail FLUSH or
FUA requests, which means we can remove the magic BIO_EOPNOTSUPP flag
to help propagating those to the buffer_head layer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bounce.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'block')

diff --git a/block/bounce.c b/block/bounce.c
index ab21ba203d5c..4bac72579c1f 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -128,9 +128,6 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
 	struct bio_vec *bvec, *org_vec;
 	int i;
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
-
 	/*
 	 * free up bounce indirect pages used
 	 */
-- 
cgit v1.2.3


From 97ca223c3b37ed12a5b67a5dc6247e5a4799d337 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Apr 2015 21:41:02 +0200
Subject: block: remove unused BIO_RW_BLOCK and BIO_EOF flags

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index f0be754c7781..de474b5dee2b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1725,8 +1725,6 @@ static void handle_bad_sector(struct bio *bio)
 			bio->bi_rw,
 			(unsigned long long)bio_end_sector(bio),
 			(long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
-
-	set_bit(BIO_EOF, &bio->bi_flags);
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-- 
cgit v1.2.3


From be32417796c2b8a83fe4cbece83bea96ab9e378f Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Wed, 6 May 2015 12:26:22 +0800
Subject: block: export blkdev_reread_part() and __blkdev_reread_part()

This patch exports blkdev_reread_part() for block drivers, also
introduce __blkdev_reread_part().

For some drivers, such as loop, reread of partitions can be run
from the release path, and bd_mutex may already be held prior to
calling ioctl_by_bdev(bdev, BLKRRPART, 0), so introduce
__blkdev_reread_part for use in such cases.

CC: Christoph Hellwig <hch@lst.de>
CC: Jens Axboe <axboe@kernel.dk>
CC: Tejun Heo <tj@kernel.org>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
CC: Markus Pargmann <mpa@pengutronix.de>
CC: Stefan Weinhuber <wein@de.ibm.com>
CC: Stefan Haberland <stefan.haberland@de.ibm.com>
CC: Sebastian Ott <sebott@linux.vnet.ibm.com>
CC: Fabian Frederick <fabf@skynet.be>
CC: Ming Lei <ming.lei@canonical.com>
CC: David Herrmann <dh.herrmann@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: nbd-general@lists.sourceforge.net
CC: linux-s390@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/ioctl.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befde2aca..203cb4aeea8b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -150,21 +150,43 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	}
 }
 
-static int blkdev_reread_part(struct block_device *bdev)
+/*
+ * This is an exported API for the block driver, and will not
+ * acquire bd_mutex. This API should be used in case that
+ * caller has held bd_mutex already.
+ */
+int __blkdev_reread_part(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_disk;
-	int res;
 
 	if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
 		return -EINVAL;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
+
+	lockdep_assert_held(&bdev->bd_mutex);
+
+	return rescan_partitions(disk, bdev);
+}
+EXPORT_SYMBOL(__blkdev_reread_part);
+
+/*
+ * This is an exported API for the block driver, and will
+ * try to acquire bd_mutex. If bd_mutex has been held already
+ * in current context, please call __blkdev_reread_part().
+ */
+int blkdev_reread_part(struct block_device *bdev)
+{
+	int res;
+
 	if (!mutex_trylock(&bdev->bd_mutex))
 		return -EBUSY;
-	res = rescan_partitions(disk, bdev);
+	res = __blkdev_reread_part(bdev);
 	mutex_unlock(&bdev->bd_mutex);
+
 	return res;
 }
+EXPORT_SYMBOL(blkdev_reread_part);
 
 static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 			     uint64_t len, int secure)
-- 
cgit v1.2.3


From b04a5636a665f5529fdf69ee7e5512156196f31c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Wed, 6 May 2015 12:26:27 +0800
Subject: block: replace trylock with mutex_lock in blkdev_reread_part()

The only possible problem of using mutex_lock() instead of trylock
is about deadlock.

If there aren't any locks held before calling blkdev_reread_part(),
deadlock can't be caused by this conversion.

If there are locks held before calling blkdev_reread_part(),
and if these locks aren't required in open, close handler and I/O
path, deadlock shouldn't be caused either.

Both user space's ioctl(BLKRRPART) and md_setup_drive() from
init/do_mounts_md.c belong to the 1st case, so the conversion is safe
for the two cases.

For loop, the previous patches in this patchset have fixed the ABBA lock
dependency, so the conversion is OK.

For nbd, tx_lock is held when calling the function:

	- both open and release won't hold the lock
	- when blkdev_reread_part() is run, I/O thread has been stopped
	already, so tx_lock won't be acquired in I/O path at that time.
	- so the conversion won't cause deadlock for nbd

For dasd, dasd_open(), dasd_release() and the request function don't
acquire any mutex/semaphore, so the conversion should be safe.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Jarod Wilson <jarod@redhat.com>
Acked-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/ioctl.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/ioctl.c b/block/ioctl.c
index 203cb4aeea8b..8061eba42887 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -174,13 +174,18 @@ EXPORT_SYMBOL(__blkdev_reread_part);
  * This is an exported API for the block driver, and will
  * try to acquire bd_mutex. If bd_mutex has been held already
  * in current context, please call __blkdev_reread_part().
+ *
+ * Make sure the held locks in current context aren't required
+ * in open()/close() handler and I/O path for avoiding ABBA deadlock:
+ * - bd_mutex is held before calling block driver's open/close
+ *   handler
+ * - reading partition table may submit I/O to the block device
  */
 int blkdev_reread_part(struct block_device *bdev)
 {
 	int res;
 
-	if (!mutex_trylock(&bdev->bd_mutex))
-		return -EBUSY;
+	mutex_lock(&bdev->bd_mutex);
 	res = __blkdev_reread_part(bdev);
 	mutex_unlock(&bdev->bd_mutex);
 
-- 
cgit v1.2.3


From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 22 May 2015 09:14:03 -0400
Subject: block: remove management of bi_remaining when restoring original
 bi_end_io

Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for
non-chains") regressed all existing callers that followed this pattern:
 1) saving a bio's original bi_end_io
 2) wiring up an intermediate bi_end_io
 3) restoring the original bi_end_io from intermediate bi_end_io
 4) calling bio_endio() to execute the restored original bi_end_io

The regression was due to BIO_CHAIN only ever getting set if
bio_inc_remaining() is called.  For the above pattern it isn't set until
step 3 above (step 2 would've needed to establish BIO_CHAIN).  As such
the first bio_endio(), in step 2 above, never decremented __bi_remaining
before calling the intermediate bi_end_io -- leaving __bi_remaining with
the value 1 instead of 0.  When bio_inc_remaining() occurred during step
3 it brought it to a value of 2.  When the second bio_endio() was
called, in step 4 above, it should've called the original bi_end_io but
it didn't because there was an extra reference that wasn't dropped (due
to atomic operations being optimized away since BIO_CHAIN wasn't set
upfront).

Fix this issue by removing the __bi_remaining management complexity for
all callers that use the above pattern -- bio_chain() is the only
interface that _needs_ to be concerned with __bi_remaining.  For the
above pattern callers just expect the bi_end_io they set to get called!
Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls
that aren't associated with the bio_chain() interface.

Also, the bio_inc_remaining() interface has been moved local to bio.c.

Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c |  4 ++--
 block/bio.c           | 35 ++++++++++++++---------------------
 2 files changed, 16 insertions(+), 23 deletions(-)

(limited to 'block')

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5cbd5d9ea61d..0436c21db7f2 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -361,7 +361,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -388,7 +388,7 @@ void bio_integrity_endio(struct bio *bio, int error)
 	 */
 	if (error) {
 		bio->bi_end_io = bip->bip_end_io;
-		bio_endio_nodec(bio, error);
+		bio_endio(bio, error);
 
 		return;
 	}
diff --git a/block/bio.c b/block/bio.c
index c2ff8a88aef1..259197d97de1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -303,6 +303,17 @@ static void bio_chain_endio(struct bio *bio, int error)
 	bio_put(bio);
 }
 
+/*
+ * Increment chain count for the bio. Make sure the CHAIN flag update
+ * is visible before the raised count.
+ */
+static inline void bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
+
 /**
  * bio_chain - chain bio completions
  * @bio: the target bio
@@ -1756,8 +1767,10 @@ static inline bool bio_remaining_done(struct bio *bio)
 
 	BUG_ON(atomic_read(&bio->__bi_remaining) <= 0);
 
-	if (atomic_dec_and_test(&bio->__bi_remaining))
+	if (atomic_dec_and_test(&bio->__bi_remaining)) {
+		clear_bit(BIO_CHAIN, &bio->bi_flags);
 		return true;
+	}
 
 	return false;
 }
@@ -1808,26 +1821,6 @@ void bio_endio(struct bio *bio, int error)
 }
 EXPORT_SYMBOL(bio_endio);
 
-/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio:	bio
- * @error:	error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
-	/*
-	 * If it's not flagged as a chain, we are not going to dec the count
-	 */
-	if (bio_flagged(bio, BIO_CHAIN))
-		bio_inc_remaining(bio);
-
-	bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
 /**
  * bio_split - split a bio
  * @bio:	bio to split
-- 
cgit v1.2.3


From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 09:14:04 -0400
Subject: block, dm: don't copy bios for request clones

Currently dm-multipath has to clone the bios for every request sent
to the lower devices, which wastes cpu cycles and ties down memory.

This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio
to not complete bios attached to a request, which we set on clone
requests similar to bios in a flush sequence.  With this change I/O
errors on a path failure only get propagated to dm-multipath, which
can then either resubmit the I/O or complete the bios on the original
request.

I've done some basic testing of this on a Linux target with ALUA support,
and it survives path failures during I/O nicely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 94 +++++++-------------------------------------------------
 1 file changed, 11 insertions(+), 83 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index de474b5dee2b..aa819a58ea24 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(blk_rq_init);
 static void req_bio_endio(struct request *rq, struct bio *bio,
 			  unsigned int nbytes, int error)
 {
-	if (error)
+	if (error && !(rq->cmd_flags & REQ_CLONE))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
@@ -128,7 +128,8 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 	bio_advance(bio, nbytes);
 
 	/* don't actually finish bio if it's part of flush sequence */
-	if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+	if (bio->bi_iter.bi_size == 0 &&
+	    !(rq->cmd_flags & (REQ_FLUSH_SEQ|REQ_CLONE)))
 		bio_endio(bio, error);
 }
 
@@ -2909,95 +2910,22 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- *     Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
-	struct bio *bio;
-
-	while ((bio = rq->bio) != NULL) {
-		rq->bio = bio->bi_next;
-
-		bio_put(bio);
-	}
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+void blk_rq_prep_clone(struct request *dst, struct request *src)
 {
 	dst->cpu = src->cpu;
-	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
+	dst->cmd_flags |= (src->cmd_flags & REQ_CLONE_MASK);
+	dst->cmd_flags |= REQ_NOMERGE | REQ_CLONE;
 	dst->cmd_type = src->cmd_type;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	dst->nr_phys_segments = src->nr_phys_segments;
 	dst->ioprio = src->ioprio;
 	dst->extra_len = src->extra_len;
-}
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- *           Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- *     The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- *     are not copied, and copying such parts is the caller's responsibility.
- *     Also, pages which the original bios are pointing to are not copied
- *     and the cloned bios just point same pages.
- *     So cloned bios must be completed before original bios, which means
- *     the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
-		      struct bio_set *bs, gfp_t gfp_mask,
-		      int (*bio_ctr)(struct bio *, struct bio *, void *),
-		      void *data)
-{
-	struct bio *bio, *bio_src;
-
-	if (!bs)
-		bs = fs_bio_set;
-
-	__rq_for_each_bio(bio_src, rq_src) {
-		bio = bio_clone_fast(bio_src, gfp_mask, bs);
-		if (!bio)
-			goto free_and_out;
-
-		if (bio_ctr && bio_ctr(bio, bio_src, data))
-			goto free_and_out;
-
-		if (rq->bio) {
-			rq->biotail->bi_next = bio;
-			rq->biotail = bio;
-		} else
-			rq->bio = rq->biotail = bio;
-	}
-
-	__blk_rq_prep_clone(rq, rq_src);
-
-	return 0;
-
-free_and_out:
-	if (bio)
-		bio_put(bio);
-	blk_rq_unprep_clone(rq);
-
-	return -ENOMEM;
+	dst->bio = src->bio;
+	dst->biotail = src->biotail;
+	dst->cmd = src->cmd;
+	dst->cmd_len = src->cmd_len;
+	dst->sense = src->sense;
 }
 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
 
-- 
cgit v1.2.3


From beefa6ba7bf304d3de3a02cb7366fb0a7d6b27ab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 29 May 2015 13:10:23 -0600
Subject: block: only honor SG gap prevention for merges that contain data

We can safely merge anything that won't generate an SG list entry,
so if the bio is data-less (discard), don't look at potential
SG gaps.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-merge.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-merge.c b/block/blk-merge.c
index fd3fee81c23c..30a0d9f89017 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -589,7 +589,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	    !blk_write_same_mergeable(rq->bio, bio))
 		return false;
 
-	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
+	/* Only check gaps if the bio carries data */
+	if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && bio_has_data(bio)) {
 		struct bio_vec *bprev;
 
 		bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
-- 
cgit v1.2.3


From f26cdc8536ad50fb802a0445f836b4f94ca09ae7 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 1 Jun 2015 09:29:53 -0600
Subject: blk-mq: Shared tag enhancements

Storage controllers may expose multiple block devices that share hardware
resources managed by blk-mq. This patch enhances the shared tags so a
low-level driver can access the shared resources not tied to the unshared
h/w contexts. This way the LLD can dynamically add and delete disks and
request queues without having to track all the request_queue hctx's to
iterate outstanding tags.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 38 ++++++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.h |  1 +
 block/blk-mq.c     | 12 ++++++++++--
 3 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290cc0644..9b6e28830b82 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b370308..75893a34237d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
+	cpumask_var_t cpumask;
 };
 
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c382a34fe5ac..ef100fd2cb86 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1525,7 +1525,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1821,6 +1820,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2187,6 +2187,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2248,8 +2254,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);
-- 
cgit v1.2.3


From 41c0126b3f22ef36b97b3c38b8f29569848a5ce2 Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Tue, 19 May 2015 13:55:21 -0700
Subject: block: Make CFQ default to IOPS mode on SSDs

CFQ idling causes reduced IOPS throughput on non-rotational disks.
Since disk head seeking is not applicable to SSDs, it doesn't really
help performance by anticipating future near-by IO requests.

By turning off idling (and switching to IOPS mode), we allow other
processes to dispatch IO requests down to the driver and so increase IO
throughput.

The following FIO benchmark results were taken on a cloud SSD offering
with idling on and off:

Idling     iops    avg-lat(ms)    stddev            bw
------------------------------------------------------
    On     7054    90.107         38.697     28217KB/s
   Off    29255    21.836         11.730    117022KB/s

fio --name=temp --size=100G --time_based --ioengine=libaio \
    --randrepeat=0 --direct=1 --invalidate=1 --verify=0 \
    --verify_fatal=0 --rw=randread --blocksize=4k --group_reporting=1 \
    --filename=/dev/sdb --runtime=10 --iodepth=64 --numjobs=10

And the following is from a local SSD run:

Idling     iops    avg-lat(ms)    stddev            bw
------------------------------------------------------
    On    19320    33.043         14.068     77281KB/s
   Off    21626    29.465         12.662     86507KB/s

fio --name=temp --size=5G --time_based --ioengine=libaio \
    --randrepeat=0 --direct=1 --invalidate=1 --verify=0 \
    --verify_fatal=0 --rw=randread --blocksize=4k --group_reporting=1 \
    --filename=/fio_data --runtime=10 --iodepth=64 --numjobs=10

Reviewed-by: Nauman Rafique <nauman@google.com>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5da8e6e9ab4b..402be0139122 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4460,7 +4460,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_target_latency = cfq_target_latency;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
-	cfqd->cfq_slice_idle = cfq_slice_idle;
+	cfqd->cfq_slice_idle = blk_queue_nonrot(q) ? 0 : cfq_slice_idle;
 	cfqd->cfq_group_idle = cfq_group_idle;
 	cfqd->cfq_latency = 1;
 	cfqd->hw_tag = -1;
-- 
cgit v1.2.3


From e48453c386f39ca9ea29e9df6efef78f56746af0 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini.arianna@gmail.com>
Date: Fri, 5 Jun 2015 23:38:42 +0200
Subject: block, cgroup: implement policy-specific per-blkcg data

The block IO (blkio) controller enables the block layer to provide service
guarantees in a hierarchical fashion. Specifically, service guarantees
are provided by registered request-accounting policies. As of now, a
proportional-share and a throttling policy are available. They are
implemented, respectively, by the CFQ I/O scheduler and the blk-throttle
subsystem. Unfortunately, when it comes to adding new policies, the current
implementation of the block IO controller is only halfway ready to allow
new policies to be plugged in. This commit provides a solution to make
the block IO controller fully ready to handle new policies.
In what follows, we first describe briefly the current state, and then
list the changes made by this commit.

The throttling policy does not need any per-cgroup information to perform
its task. In contrast, the proportional share policy uses, for each cgroup,
both the weight assigned by the user to the cgroup, and a set of dynamically-
computed weights, one for each device.

The first, user-defined weight is stored in the blkcg data structure: the
block IO controller allocates a private blkcg data structure for each
cgroup in the blkio cgroups hierarchy (regardless of which policy is active).
In other words, the block IO controller internally mirrors the blkio cgroups
with private blkcg data structures.

On the other hand, for each cgroup and device, the corresponding dynamically-
computed weight is maintained in the following, different way. For each device,
the block IO controller keeps a private blkcg_gq structure for each cgroup in
blkio. In other words, block IO also keeps one private mirror copy of the blkio
cgroups hierarchy for each device, made of blkcg_gq structures.
Each blkcg_gq structure keeps per-policy information in a generic array of
dynamically-allocated 'dedicated' data structures, one for each registered
policy (so currently the array contains two elements). To be inserted into the
generic array, each dedicated data structure embeds a generic blkg_policy_data
structure. Consider now the array contained in the blkcg_gq structure
corresponding to a given pair of cgroup and device: one of the elements
of the array contains the dedicated data structure for the proportional-share
policy, and this dedicated data structure contains the dynamically-computed
weight for that pair of cgroup and device.

The generic strategy adopted for storing per-policy data in blkcg_gq structures
is already capable of handling new policies, whereas the one adopted with blkcg
structures is not, because per-policy data are hard-coded in the blkcg
structures themselves (currently only data related to the proportional-
share policy).

This commit addresses the above issues through the following changes:
. It generalizes blkcg structures so that per-policy data are stored in the same
  way as in blkcg_gq structures.
  Specifically, it also lets the blkcg structure store per-policy data in a
  generic array of dynamically-allocated dedicated data structures. We will
  refer to these data structures as blkcg dedicated data structures, to
  distinguish them from the dedicated data structures inserted in the generic
  arrays kept by blkcg_gq structures.
  To allow blkcg dedicated data structures to be inserted in the generic array
  inside a blkcg structure, this commit also introduces a new blkcg_policy_data
  structure, which is the equivalent of blkg_policy_data for blkcg dedicated
  data structures.
. It adds to the blkcg_policy structure, i.e., to the descriptor of a policy, a
  cpd_size field and a cpd_init field, to be initialized by the policy with,
  respectively, the size of the blkcg dedicated data structures, and the
  address of a constructor function for blkcg dedicated data structures.
. It moves the CFQ-specific fields embedded in the blkcg data structure (i.e.,
  the fields related to the proportional-share policy), into a new blkcg
  dedicated data structure called cfq_group_data.

Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c  | 92 ++++++++++++++++++++++++++++++++++++++++++++++-------
 block/blk-cgroup.h  | 40 ++++++++++++++++++-----
 block/cfq-iosched.c | 70 ++++++++++++++++++++++++++++++++++------
 3 files changed, 173 insertions(+), 29 deletions(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ac817b750db..6e43fa355e71 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -9,6 +9,10 @@
  *
  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  * 	              Nauman Rafique <nauman@google.com>
+ *
+ * For policy-specific per-blkcg data:
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *                    Arianna Avanzini <avanzini.arianna@gmail.com>
  */
 #include <linux/ioprio.h>
 #include <linux/kdev_t.h>
@@ -26,8 +30,7 @@
 
 static DEFINE_MUTEX(blkcg_pol_mutex);
 
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
-			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
+struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
@@ -823,6 +826,8 @@ static struct cgroup_subsys_state *
 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct blkcg *blkcg;
+	struct cgroup_subsys_state *ret;
+	int i;
 
 	if (!parent_css) {
 		blkcg = &blkcg_root;
@@ -830,17 +835,49 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	}
 
 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-	if (!blkcg)
-		return ERR_PTR(-ENOMEM);
+	if (!blkcg) {
+		ret = ERR_PTR(-ENOMEM);
+		goto free_blkcg;
+	}
+
+	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
+		struct blkcg_policy *pol = blkcg_policy[i];
+		struct blkcg_policy_data *cpd;
+
+		/*
+		 * If the policy hasn't been attached yet, wait for it
+		 * to be attached before doing anything else. Otherwise,
+		 * check if the policy requires any specific per-cgroup
+		 * data: if it does, allocate and initialize it.
+		 */
+		if (!pol || !pol->cpd_size)
+			continue;
+
+		BUG_ON(blkcg->pd[i]);
+		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+		if (!cpd) {
+			ret = ERR_PTR(-ENOMEM);
+			goto free_pd_blkcg;
+		}
+		blkcg->pd[i] = cpd;
+		cpd->plid = i;
+		pol->cpd_init_fn(blkcg);
+	}
 
-	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
-	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
 	INIT_HLIST_HEAD(&blkcg->blkg_list);
 
 	return &blkcg->css;
+
+free_pd_blkcg:
+	for (i--; i >= 0; i--)
+		kfree(blkcg->pd[i]);
+
+free_blkcg:
+	kfree(blkcg);
+	return ret;
 }
 
 /**
@@ -958,8 +995,10 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
+	LIST_HEAD(cpds);
 	struct blkcg_gq *blkg, *new_blkg;
-	struct blkg_policy_data *pd, *n;
+	struct blkg_policy_data *pd, *nd;
+	struct blkcg_policy_data *cpd, *cnd;
 	int cnt = 0, ret;
 	bool preloaded;
 
@@ -1003,7 +1042,10 @@ int blkcg_activate_policy(struct request_queue *q,
 
 	spin_unlock_irq(q->queue_lock);
 
-	/* allocate policy_data for all existing blkgs */
+	/*
+	 * Allocate per-blkg and per-blkcg policy data
+	 * for all existing blkgs.
+	 */
 	while (cnt--) {
 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
@@ -1011,26 +1053,50 @@ int blkcg_activate_policy(struct request_queue *q,
 			goto out_free;
 		}
 		list_add_tail(&pd->alloc_node, &pds);
+
+		if (!pol->cpd_size)
+			continue;
+		cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
+		if (!cpd) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+		list_add_tail(&cpd->alloc_node, &cpds);
 	}
 
 	/*
-	 * Install the allocated pds.  With @q bypassing, no new blkg
+	 * Install the allocated pds and cpds. With @q bypassing, no new blkg
 	 * should have been created while the queue lock was dropped.
 	 */
 	spin_lock_irq(q->queue_lock);
 
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		if (WARN_ON(list_empty(&pds))) {
+		if (WARN_ON(list_empty(&pds)) ||
+		    WARN_ON(pol->cpd_size && list_empty(&cpds))) {
 			/* umm... this shouldn't happen, just abort */
 			ret = -ENOMEM;
 			goto out_unlock;
 		}
+		cpd = list_first_entry(&cpds, struct blkcg_policy_data,
+				       alloc_node);
+		list_del_init(&cpd->alloc_node);
 		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
 		list_del_init(&pd->alloc_node);
 
 		/* grab blkcg lock too while installing @pd on @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
+		if (!pol->cpd_size)
+			goto no_cpd;
+		if (!blkg->blkcg->pd[pol->plid]) {
+			/* Per-policy per-blkcg data */
+			blkg->blkcg->pd[pol->plid] = cpd;
+			cpd->plid = pol->plid;
+			pol->cpd_init_fn(blkg->blkcg);
+		} else { /* must free it as it has already been extracted */
+			kfree(cpd);
+		}
+no_cpd:
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
@@ -1045,8 +1111,10 @@ out_unlock:
 	spin_unlock_irq(q->queue_lock);
 out_free:
 	blk_queue_bypass_end(q);
-	list_for_each_entry_safe(pd, n, &pds, alloc_node)
+	list_for_each_entry_safe(pd, nd, &pds, alloc_node)
 		kfree(pd);
+	list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
+		kfree(cpd);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1087,6 +1155,8 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 		kfree(blkg->pd[pol->plid]);
 		blkg->pd[pol->plid] = NULL;
+		kfree(blkg->blkcg->pd[pol->plid]);
+		blkg->blkcg->pd[pol->plid] = NULL;
 
 		spin_unlock(&blkg->blkcg->lock);
 	}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index c567865b5f1d..74296a78bba1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -23,11 +23,6 @@
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
-/* CFQ specific, out here for blkcg->cfq_weight */
-#define CFQ_WEIGHT_MIN		10
-#define CFQ_WEIGHT_MAX		1000
-#define CFQ_WEIGHT_DEFAULT	500
-
 #ifdef CONFIG_BLK_CGROUP
 
 enum blkg_rwstat_type {
@@ -50,9 +45,7 @@ struct blkcg {
 	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;
 
-	/* TODO: per-policy storage in blkcg */
-	unsigned int			cfq_weight;	/* belongs to cfq */
-	unsigned int			cfq_leaf_weight;
+	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
 };
 
 struct blkg_stat {
@@ -87,6 +80,24 @@ struct blkg_policy_data {
 	struct list_head		alloc_node;
 };
 
+/*
+ * Policies that need to keep per-blkcg data which is independent
+ * from any request_queue associated to it must specify its size
+ * with the cpd_size field of the blkcg_policy structure and
+ * embed a blkcg_policy_data in it. blkcg core allocates
+ * policy-specific per-blkcg structures lazily the first time
+ * they are actually needed, so it handles them together with
+ * blkgs. cpd_init() is invoked to let each policy handle
+ * per-blkcg data.
+ */
+struct blkcg_policy_data {
+	/* the policy id this per-policy data belongs to */
+	int				plid;
+
+	/* used during policy activation */
+	struct list_head		alloc_node;
+};
+
 /* association between a blk cgroup and a request queue */
 struct blkcg_gq {
 	/* Pointer to the associated request_queue */
@@ -112,6 +123,7 @@ struct blkcg_gq {
 	struct rcu_head			rcu_head;
 };
 
+typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
 typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
 typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
@@ -122,10 +134,13 @@ struct blkcg_policy {
 	int				plid;
 	/* policy specific private data size */
 	size_t				pd_size;
+	/* policy specific per-blkcg data size */
+	size_t				cpd_size;
 	/* cgroup files for the policy */
 	struct cftype			*cftypes;
 
 	/* operations */
+	blkcg_pol_init_cpd_fn		*cpd_init_fn;
 	blkcg_pol_init_pd_fn		*pd_init_fn;
 	blkcg_pol_online_pd_fn		*pd_online_fn;
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
@@ -218,6 +233,12 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 	return blkg ? blkg->pd[pol->plid] : NULL;
 }
 
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+						     struct blkcg_policy *pol)
+{
+	return blkcg ? blkcg->pd[pol->plid] : NULL;
+}
+
 /**
  * pdata_to_blkg - get blkg associated with policy private data
  * @pd: policy private data of interest
@@ -564,6 +585,9 @@ struct blkcg;
 struct blkg_policy_data {
 };
 
+struct blkcg_policy_data {
+};
+
 struct blkcg_gq {
 };
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 402be0139122..c808ad87652d 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool;
 #define sample_valid(samples)	((samples) > 80)
 #define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
 
+/* blkio-related constants */
+#define CFQ_WEIGHT_MIN          10
+#define CFQ_WEIGHT_MAX          1000
+#define CFQ_WEIGHT_DEFAULT      500
+
 struct cfq_ttime {
 	unsigned long last_end_request;
 
@@ -212,6 +217,15 @@ struct cfqg_stats {
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 };
 
+/* Per-cgroup data */
+struct cfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned int weight;
+	unsigned int leaf_weight;
+};
+
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
 	/* must be the first member */
@@ -451,6 +465,12 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 }
 
+static struct cfq_group_data
+*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+}
+
 static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 {
 	return pd_to_blkg(&cfqg->pd);
@@ -607,6 +627,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
 	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
 }
 
+static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
+}
+
 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 {
 	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@ -1544,13 +1569,28 @@ static void cfqg_stats_init(struct cfqg_stats *stats)
 #endif
 }
 
+static void cfq_cpd_init(const struct blkcg *blkcg)
+{
+	struct cfq_group_data *cgd =
+		cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
+
+	if (blkcg == &blkcg_root) {
+		cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
+		cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
+	} else {
+		cgd->weight = CFQ_WEIGHT_DEFAULT;
+		cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
+	}
+}
+
 static void cfq_pd_init(struct blkcg_gq *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
 
 	cfq_init_cfqg_base(cfqg);
-	cfqg->weight = blkg->blkcg->cfq_weight;
-	cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+	cfqg->weight = cgd->weight;
+	cfqg->leaf_weight = cgd->leaf_weight;
 	cfqg_stats_init(&cfqg->stats);
 	cfqg_stats_init(&cfqg->dead_stats);
 }
@@ -1673,13 +1713,17 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
 
 static int cfq_print_weight(struct seq_file *sf, void *v)
 {
-	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+	seq_printf(sf, "%u\n", blkcg_to_cfqgd(blkcg)->weight);
 	return 0;
 }
 
 static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 {
-	seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+	seq_printf(sf, "%u\n", blkcg_to_cfqgd(blkcg)->leaf_weight);
 	return 0;
 }
 
@@ -1690,6 +1734,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
+	struct cfq_group_data *cfqgd;
 	int ret;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
@@ -1698,13 +1743,14 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 
 	ret = -EINVAL;
 	cfqg = blkg_to_cfqg(ctx.blkg);
+	cfqgd = blkcg_to_cfqgd(blkcg);
 	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
 		if (!is_leaf_weight) {
 			cfqg->dev_weight = ctx.v;
-			cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
+			cfqg->new_weight = ctx.v ?: cfqgd->weight;
 		} else {
 			cfqg->dev_leaf_weight = ctx.v;
-			cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+			cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
 		}
 		ret = 0;
 	}
@@ -1730,16 +1776,18 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
+	struct cfq_group_data *cfqgd;
 
 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
 		return -EINVAL;
 
 	spin_lock_irq(&blkcg->lock);
+	cfqgd = blkcg_to_cfqgd(blkcg);
 
 	if (!is_leaf_weight)
-		blkcg->cfq_weight = val;
+		cfqgd->weight = val;
 	else
-		blkcg->cfq_leaf_weight = val;
+		cfqgd->leaf_weight = val;
 
 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
 		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
@@ -1749,10 +1797,10 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 
 		if (!is_leaf_weight) {
 			if (!cfqg->dev_weight)
-				cfqg->new_weight = blkcg->cfq_weight;
+				cfqg->new_weight = cfqgd->weight;
 		} else {
 			if (!cfqg->dev_leaf_weight)
-				cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
+				cfqg->new_leaf_weight = cfqgd->leaf_weight;
 		}
 	}
 
@@ -4603,8 +4651,10 @@ static struct elevator_type iosched_cfq = {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkcg_policy blkcg_policy_cfq = {
 	.pd_size		= sizeof(struct cfq_group),
+	.cpd_size		= sizeof(struct cfq_group_data),
 	.cftypes		= cfq_blkcg_files,
 
+	.cpd_init_fn		= cfq_cpd_init,
 	.pd_init_fn		= cfq_pd_init,
 	.pd_offline_fn		= cfq_pd_offline,
 	.pd_reset_stats_fn	= cfq_pd_reset_stats,
-- 
cgit v1.2.3


From 0bb979472a7401022109e81dd89d777adea58710 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 10 Jun 2015 08:01:20 -0600
Subject: cfq-iosched: fix the setting of IOPS mode on SSDs

A previous commit wanted to make CFQ default to IOPS mode on
non-rotational storage. However, it did so when the queue was
initialized, and the non-rotational flag is only set later on
in the probe.

Add an elevator hook that gets called off the add_disk() path.
At that point we know that feature probing has finished, and
we can reliably check for the various flags that drivers can
set.

Fixes: 41c0126b ("block: Make CFQ default to IOPS mode on SSDs")
Tested-by: Romain Francoise <romain@orebokech.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 15 ++++++++++++++-
 block/elevator.c    |  2 ++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c808ad87652d..d1d0cb235cd2 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4508,7 +4508,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_target_latency = cfq_target_latency;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
-	cfqd->cfq_slice_idle = blk_queue_nonrot(q) ? 0 : cfq_slice_idle;
+	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_group_idle = cfq_group_idle;
 	cfqd->cfq_latency = 1;
 	cfqd->hw_tag = -1;
@@ -4525,6 +4525,18 @@ out_free:
 	return ret;
 }
 
+static void cfq_registered_queue(struct request_queue *q)
+{
+	struct elevator_queue *e = q->elevator;
+	struct cfq_data *cfqd = e->elevator_data;
+
+	/*
+	 * Default to IOPS mode with no idling for SSDs
+	 */
+	if (blk_queue_nonrot(q))
+		cfqd->cfq_slice_idle = 0;
+}
+
 /*
  * sysfs parts below -->
  */
@@ -4640,6 +4652,7 @@ static struct elevator_type iosched_cfq = {
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
+		.elevator_registered_fn =	cfq_registered_queue,
 	},
 	.icq_size	=	sizeof(struct cfq_io_cq),
 	.icq_align	=	__alignof__(struct cfq_io_cq),
diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e3..5f0452734a40 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -810,6 +810,8 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
+		if (e->type->ops.elevator_registered_fn)
+			e->type->ops.elevator_registered_fn(q);
 	}
 	return error;
 }
-- 
cgit v1.2.3


From 4ceab71b9d84e55b59a76b54b2999dc377aae6e6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 19 Jun 2015 10:13:01 -0600
Subject: cfq-iosched: move group scheduling functions under ifdef

If CFQ_GROUP_IOSCHED is not set, the compiler produces the
following warning:

  CC      block/cfq-iosched.o
  linux/block/cfq-iosched.c:469:2:
    warning: 'cpd_to_cfqgd' defined but not used [-Wunused-function]
    *cpd_to_cfqgd(struct blkcg_policy_data *cpd)
     ^

In reality, two other lookup functions aren't used either if
CFQ_GROUP_IOSCHED isn't set. Move all three under one of the
CFQ_GROUP_IOSCHED sections in the code.

Reported-by: Vladimir Zapolskiy <vz@mleia.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d1d0cb235cd2..dbd0207928fb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -460,22 +460,6 @@ CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 
-static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
-}
-
-static struct cfq_group_data
-*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
-{
-	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
-}
-
-static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
-{
-	return pd_to_blkg(&cfqg->pd);
-}
-
 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 
 /* cfqg stats flags */
@@ -620,6 +604,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 
+static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
+}
+
+static struct cfq_group_data
+*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
+}
+
+static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
+{
+	return pd_to_blkg(&cfqg->pd);
+}
+
 static struct blkcg_policy blkcg_policy_cfq;
 
 static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
-- 
cgit v1.2.3


From 9470e4a693db84bee7becbba8de01af02bb23c9f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 19 Jun 2015 10:19:36 -0600
Subject: cfq-iosched: fix sysfs oops when attempting to read unconfigured
 weights

If none of the devices in the system are using CFQ, then attempting to
read:

/sys/fs/cgroup/blkio/blkio.leaf_weight

will result in a NULL dereference. Check for a valid cfq_group_data
struct before attempting to dereference it.

Reported-by: Andrey Wagin <avagin@gmail.com>
Fixes: e48453c3 ("block, cgroup: implement policy-specific per-blkcg data")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dbd0207928fb..ed86fb242cd4 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1714,16 +1714,26 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
 static int cfq_print_weight(struct seq_file *sf, void *v)
 {
 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+	unsigned int val = 0;
 
-	seq_printf(sf, "%u\n", blkcg_to_cfqgd(blkcg)->weight);
+	if (cgd)
+		val = cgd->weight;
+
+	seq_printf(sf, "%u\n", val);
 	return 0;
 }
 
 static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
 {
 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+	unsigned int val = 0;
+
+	if (cgd)
+		val = cgd->leaf_weight;
 
-	seq_printf(sf, "%u\n", blkcg_to_cfqgd(blkcg)->leaf_weight);
+	seq_printf(sf, "%u\n", val);
 	return 0;
 }
 
-- 
cgit v1.2.3


From ae994ea972473c0ace9d55f718b60f0727af1381 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Sat, 20 Jun 2015 10:26:50 -0600
Subject: cfq-iosched: fix other locations where blkcg_to_cfqgd() can return
 NULL

Commit 9470e4a693db only covered the initial bug report. There are
other spots in CFQ where we need to check that we actually have
a valid cfq_group_data structure.

Fixes: e48453c3 ("block, cgroup: implement policy-specific per-blkcg data")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/cfq-iosched.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ed86fb242cd4..d8ad45ccd8fa 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1754,6 +1754,9 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 	ret = -EINVAL;
 	cfqg = blkg_to_cfqg(ctx.blkg);
 	cfqgd = blkcg_to_cfqgd(blkcg);
+	if (!cfqg || !cfqgd)
+		goto err;
+
 	if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
 		if (!is_leaf_weight) {
 			cfqg->dev_weight = ctx.v;
@@ -1765,6 +1768,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
 		ret = 0;
 	}
 
+err:
 	blkg_conf_finish(&ctx);
 	return ret ?: nbytes;
 }
@@ -1787,12 +1791,17 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct blkcg_gq *blkg;
 	struct cfq_group_data *cfqgd;
+	int ret = 0;
 
 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
 		return -EINVAL;
 
 	spin_lock_irq(&blkcg->lock);
 	cfqgd = blkcg_to_cfqgd(blkcg);
+	if (!cfqgd) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	if (!is_leaf_weight)
 		cfqgd->weight = val;
@@ -1814,8 +1823,9 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
 		}
 	}
 
+out:
 	spin_unlock_irq(&blkcg->lock);
-	return 0;
+	return ret;
 }
 
 static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-- 
cgit v1.2.3