summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorMax Krummenacher <max.krummenacher@toradex.com>2020-07-07 14:39:39 +0200
committerMax Krummenacher <max.krummenacher@toradex.com>2020-07-07 18:42:28 +0200
commitbb02de12d4adbd463837a57556fe6a4a29535e29 (patch)
tree83674f672af49651fe39c7d7eee52ae513ea4107 /block
parentbabac008e5cf168abca1a85bda2e8071ca27a5c0 (diff)
parentfd8cd8ac940c8b45b75474415291a3b941c865ab (diff)
Merge tag 'v5.4.47' into 5.4-2.1.x-imx
This is the 5.4.47 stable release All conflicts resolved in favour of HEAD, i.e. 5.4-2.1.x-imx Conflicts: arch/arm/boot/dts/imx6qdl.dtsi arch/arm/mach-imx/Kconfig arch/arm/mach-imx/common.h arch/arm/mach-imx/suspend-imx6.S arch/arm64/boot/dts/freescale/imx8qxp-mek.dts arch/powerpc/include/asm/cacheflush.h drivers/cpufreq/imx6q-cpufreq.c drivers/dma/imx-sdma.c drivers/edac/synopsys_edac.c drivers/firmware/imx/imx-scu.c drivers/net/ethernet/freescale/fec.h drivers/net/ethernet/freescale/fec_main.c drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c drivers/net/phy/phy_device.c drivers/perf/fsl_imx8_ddr_perf.c drivers/usb/cdns3/gadget.c drivers/usb/dwc3/gadget.c include/uapi/linux/dma-buf.h Signed-off-by: Max Krummenacher <max.krummenacher@toradex.com>
Diffstat (limited to 'block')
-rw-r--r--block/bfq-cgroup.c96
-rw-r--r--block/bfq-iosched.c28
-rw-r--r--block/bfq-iosched.h3
-rw-r--r--block/bfq-wf2q.c12
-rw-r--r--block/blk-core.c11
-rw-r--r--block/blk-flush.c2
-rw-r--r--block/blk-ioc.c7
-rw-r--r--block/blk-iocost.c123
-rw-r--r--block/blk-mq-sched.c44
-rw-r--r--block/blk-mq.c30
-rw-r--r--block/blk-mq.h3
-rw-r--r--block/blk-settings.c3
12 files changed, 243 insertions, 119 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 86a607cf19a1..12b707a4e52f 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -332,7 +332,7 @@ static void bfqg_put(struct bfq_group *bfqg)
kfree(bfqg);
}
-static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+void bfqg_and_blkg_get(struct bfq_group *bfqg)
{
/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
bfqg_get(bfqg);
@@ -593,12 +593,13 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
*/
entity = &bfqg->entity;
for_each_entity(entity) {
- bfqg = container_of(entity, struct bfq_group, entity);
- if (bfqg != bfqd->root_group) {
- parent = bfqg_parent(bfqg);
+ struct bfq_group *curr_bfqg = container_of(entity,
+ struct bfq_group, entity);
+ if (curr_bfqg != bfqd->root_group) {
+ parent = bfqg_parent(curr_bfqg);
if (!parent)
parent = bfqd->root_group;
- bfq_group_set_parent(bfqg, parent);
+ bfq_group_set_parent(curr_bfqg, parent);
}
}
@@ -624,6 +625,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_entity *entity = &bfqq->entity;
+ /*
+ * Get extra reference to prevent bfqq from being freed in
+ * next possible expire or deactivate.
+ */
+ bfqq->ref++;
+
/* If bfqq is empty, then bfq_bfqq_expire also invokes
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
* from data structures related to current group. Otherwise we
@@ -653,6 +660,8 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
bfq_schedule_dispatch(bfqd);
+ /* release extra ref taken above, bfqq may happen to be freed now */
+ bfq_put_queue(bfqq);
}
/**
@@ -688,10 +697,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
if (entity->sched_data != &bfqg->sched_data) {
bic_set_bfqq(bic, NULL, 0);
- bfq_log_bfqq(bfqd, async_bfqq,
- "bic_change_group: %p %d",
- async_bfqq, async_bfqq->ref);
- bfq_put_queue(async_bfqq);
+ bfq_release_process_ref(bfqd, async_bfqq);
}
}
@@ -792,39 +798,53 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
/**
* bfq_reparent_leaf_entity - move leaf entity to the root_group.
* @bfqd: the device data structure with the root group.
- * @entity: the entity to move.
+ * @entity: the entity to move, if entity is a leaf; or the parent entity
+ * of an active leaf entity to move, if entity is not a leaf.
*/
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
- struct bfq_entity *entity)
+ struct bfq_entity *entity,
+ int ioprio_class)
{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct bfq_queue *bfqq;
+ struct bfq_entity *child_entity = entity;
+
+ while (child_entity->my_sched_data) { /* leaf not reached yet */
+ struct bfq_sched_data *child_sd = child_entity->my_sched_data;
+ struct bfq_service_tree *child_st = child_sd->service_tree +
+ ioprio_class;
+ struct rb_root *child_active = &child_st->active;
+ child_entity = bfq_entity_of(rb_first(child_active));
+
+ if (!child_entity)
+ child_entity = child_sd->in_service_entity;
+ }
+
+ bfqq = bfq_entity_to_bfqq(child_entity);
bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}
/**
- * bfq_reparent_active_entities - move to the root group all active
- * entities.
+ * bfq_reparent_active_queues - move to the root group all active queues.
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
- * @st: the service tree with the entities.
+ * @st: the service tree to start the search from.
*/
-static void bfq_reparent_active_entities(struct bfq_data *bfqd,
- struct bfq_group *bfqg,
- struct bfq_service_tree *st)
+static void bfq_reparent_active_queues(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ struct bfq_service_tree *st,
+ int ioprio_class)
{
struct rb_root *active = &st->active;
- struct bfq_entity *entity = NULL;
-
- if (!RB_EMPTY_ROOT(&st->active))
- entity = bfq_entity_of(rb_first(active));
+ struct bfq_entity *entity;
- for (; entity ; entity = bfq_entity_of(rb_first(active)))
- bfq_reparent_leaf_entity(bfqd, entity);
+ while ((entity = bfq_entity_of(rb_first(active))))
+ bfq_reparent_leaf_entity(bfqd, entity, ioprio_class);
if (bfqg->sched_data.in_service_entity)
bfq_reparent_leaf_entity(bfqd,
- bfqg->sched_data.in_service_entity);
+ bfqg->sched_data.in_service_entity,
+ ioprio_class);
}
/**
@@ -857,13 +877,6 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
st = bfqg->sched_data.service_tree + i;
/*
- * The idle tree may still contain bfq_queues belonging
- * to exited task because they never migrated to a different
- * cgroup from the one being destroyed now.
- */
- bfq_flush_idle_tree(st);
-
- /*
* It may happen that some queues are still active
* (busy) upon group destruction (if the corresponding
* processes have been forced to terminate). We move
@@ -875,7 +888,20 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
* There is no need to put the sync queues, as the
* scheduler has taken no reference.
*/
- bfq_reparent_active_entities(bfqd, bfqg, st);
+ bfq_reparent_active_queues(bfqd, bfqg, st, i);
+
+ /*
+ * The idle tree may still contain bfq_queues
+ * belonging to exited task because they never
+ * migrated to a different cgroup from the one being
+ * destroyed now. In addition, even
+ * bfq_reparent_active_queues() may happen to add some
+ * entities to the idle tree. It happens if, in some
+ * of the calls to bfq_bfqq_move() performed by
+ * bfq_reparent_active_queues(), the queue to move is
+ * empty and gets expired.
+ */
+ bfq_flush_idle_tree(st);
}
__bfq_deactivate_entity(entity, false);
@@ -1380,6 +1406,10 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
return bfqq->bfqd->root_group;
}
+void bfqg_and_blkg_get(struct bfq_group *bfqg) {}
+
+void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
+
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
struct bfq_group *bfqg;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 5498d05b873d..88497bff1135 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -614,6 +614,10 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->pos_root = NULL;
}
+ /* oom_bfqq does not participate in queue merging */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+
/*
* bfqq cannot be merged any longer (see comments in
* bfq_setup_cooperator): no point in adding bfqq into the
@@ -2713,8 +2717,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
}
-
-static
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/*
@@ -4822,9 +4824,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
{
struct bfq_queue *item;
struct hlist_node *n;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqg = bfqq_group(bfqq);
-#endif
if (bfqq->bfqd)
bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
@@ -4897,9 +4897,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
bfqq->bfqd->last_completed_rq_bfqq = NULL;
kmem_cache_free(bfq_pool, bfqq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
bfqg_and_blkg_put(bfqg);
-#endif
}
static void bfq_put_cooperator(struct bfq_queue *bfqq)
@@ -6210,20 +6208,28 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
return bfqq;
}
-static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
+static void
+bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
- struct bfq_data *bfqd = bfqq->bfqd;
enum bfqq_expiration reason;
unsigned long flags;
spin_lock_irqsave(&bfqd->lock, flags);
- bfq_clear_bfqq_wait_request(bfqq);
+ /*
+ * Considering that bfqq may be in race, we should firstly check
+ * whether bfqq is in service before doing something on it. If
+ * the bfqq in race is not in service, it has already been expired
+ * through __bfq_bfqq_expire func and its wait_request flags has
+ * been cleared in __bfq_bfqd_reset_in_service func.
+ */
if (bfqq != bfqd->in_service_queue) {
spin_unlock_irqrestore(&bfqd->lock, flags);
return;
}
+ bfq_clear_bfqq_wait_request(bfqq);
+
if (bfq_bfqq_budget_timeout(bfqq))
/*
* Also here the queue can be safely expired
@@ -6268,7 +6274,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
* early.
*/
if (bfqq)
- bfq_idle_slice_timer_body(bfqq);
+ bfq_idle_slice_timer_body(bfqd, bfqq);
return HRTIMER_NORESTART;
}
@@ -6383,10 +6389,10 @@ static void bfq_exit_queue(struct elevator_queue *e)
hrtimer_cancel(&bfqd->idle_slice_timer);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* release oom-queue reference to root group */
bfqg_and_blkg_put(bfqd->root_group);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
spin_lock_irq(&bfqd->lock);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5d1a519640f6..c0232975075d 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -916,6 +916,7 @@ struct bfq_group {
#else
struct bfq_group {
+ struct bfq_entity entity;
struct bfq_sched_data sched_data;
struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
@@ -949,6 +950,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
@@ -978,6 +980,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
+void bfqg_and_blkg_get(struct bfq_group *bfqg);
void bfqg_and_blkg_put(struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 05f0bf4a1144..44079147e396 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -536,7 +536,9 @@ static void bfq_get_entity(struct bfq_entity *entity)
bfqq->ref++;
bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
bfqq, bfqq->ref);
- }
+ } else
+ bfqg_and_blkg_get(container_of(entity, struct bfq_group,
+ entity));
}
/**
@@ -650,8 +652,14 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
entity->on_st = false;
st->wsum -= entity->weight;
- if (bfqq && !is_in_service)
+ if (is_in_service)
+ return;
+
+ if (bfqq)
bfq_put_queue(bfqq);
+ else
+ bfqg_and_blkg_put(container_of(entity, struct bfq_group,
+ entity));
}
/**
diff --git a/block/blk-core.c b/block/blk-core.c
index 1075aaff606d..d5e668ec751b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -886,14 +886,11 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * Non-mq queues do not honor REQ_NOWAIT, so complete a bio
- * with BLK_STS_AGAIN status in order to catch -EAGAIN and
- * to give a chance to the caller to repeat request gracefully.
+ * For a REQ_NOWAIT based request, return -EOPNOTSUPP
+ * if queue is not a request based queue.
*/
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) {
- status = BLK_STS_AGAIN;
- goto end_io;
- }
+ if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
+ goto not_supported;
if (should_fail_bio(bio))
goto end_io;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index b1f0a1ac505c..5aa6fada2259 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -399,7 +399,7 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- blk_mq_request_bypass_insert(rq, false);
+ blk_mq_request_bypass_insert(rq, false, false);
return;
}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 5ed59ac6ae58..9df50fb507ca 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -84,6 +84,7 @@ static void ioc_destroy_icq(struct io_cq *icq)
* making it impossible to determine icq_cache. Record it in @icq.
*/
icq->__rcu_icq_cache = et->icq_cache;
+ icq->flags |= ICQ_DESTROYED;
call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
}
@@ -212,15 +213,21 @@ static void __ioc_clear_queue(struct list_head *icq_list)
{
unsigned long flags;
+ rcu_read_lock();
while (!list_empty(icq_list)) {
struct io_cq *icq = list_entry(icq_list->next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock_irqsave(&ioc->lock, flags);
+ if (icq->flags & ICQ_DESTROYED) {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ continue;
+ }
ioc_destroy_icq(icq);
spin_unlock_irqrestore(&ioc->lock, flags);
}
+ rcu_read_unlock();
}
/**
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 27ca68621137..d083f7704082 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -469,7 +469,7 @@ struct ioc_gq {
*/
atomic64_t vtime;
atomic64_t done_vtime;
- atomic64_t abs_vdebt;
+ u64 abs_vdebt;
u64 last_vtime;
/*
@@ -1145,7 +1145,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
struct iocg_wake_ctx ctx = { .iocg = iocg };
u64 margin_ns = (u64)(ioc->period_us *
WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
- u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
+ u64 vdebt, vshortage, expires, oexpires;
s64 vbudget;
u32 hw_inuse;
@@ -1155,18 +1155,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
vbudget = now->vnow - atomic64_read(&iocg->vtime);
/* pay off debt */
- abs_vdebt = atomic64_read(&iocg->abs_vdebt);
- vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
+ vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
if (vdebt && vbudget > 0) {
u64 delta = min_t(u64, vbudget, vdebt);
u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
- abs_vdebt);
+ iocg->abs_vdebt);
atomic64_add(delta, &iocg->vtime);
atomic64_add(delta, &iocg->done_vtime);
- atomic64_sub(abs_delta, &iocg->abs_vdebt);
- if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
- atomic64_set(&iocg->abs_vdebt, 0);
+ iocg->abs_vdebt -= abs_delta;
}
/*
@@ -1222,12 +1219,18 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
u64 expires, oexpires;
u32 hw_inuse;
+ lockdep_assert_held(&iocg->waitq.lock);
+
/* debt-adjust vtime */
current_hweight(iocg, NULL, &hw_inuse);
- vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
+ vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
- /* clear or maintain depending on the overage */
- if (time_before_eq64(vtime, now->vnow)) {
+ /*
+ * Clear or maintain depending on the overage. Non-zero vdebt is what
+ * guarantees that @iocg is online and future iocg_kick_delay() will
+ * clear use_delay. Don't leave it on when there's no vdebt.
+ */
+ if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
blkcg_clear_delay(blkg);
return false;
}
@@ -1261,9 +1264,12 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
{
struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
struct ioc_now now;
+ unsigned long flags;
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
ioc_now(iocg->ioc, &now);
iocg_kick_delay(iocg, &now, 0);
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
return HRTIMER_NORESTART;
}
@@ -1318,7 +1324,7 @@ static bool iocg_is_idle(struct ioc_gq *iocg)
return false;
/* is something in flight? */
- if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime))
+ if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
return false;
return true;
@@ -1371,14 +1377,13 @@ static void ioc_timer_fn(struct timer_list *timer)
* should have woken up in the last period and expire idle iocgs.
*/
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
- if (!waitqueue_active(&iocg->waitq) &&
- !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
+ if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
+ !iocg_is_idle(iocg))
continue;
spin_lock(&iocg->waitq.lock);
- if (waitqueue_active(&iocg->waitq) ||
- atomic64_read(&iocg->abs_vdebt)) {
+ if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
/* might be oversleeping vtime / hweight changes, kick */
iocg_kick_waitq(iocg, &now);
iocg_kick_delay(iocg, &now, 0);
@@ -1594,7 +1599,7 @@ skip_surplus_transfers:
vrate_min, vrate_max);
}
- trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
+ trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
nr_lagging, nr_shortages,
nr_surpluses);
@@ -1603,7 +1608,7 @@ skip_surplus_transfers:
ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
- &missed_ppm, rq_wait_pct, nr_lagging,
+ missed_ppm, rq_wait_pct, nr_lagging,
nr_shortages, nr_surpluses);
}
@@ -1721,28 +1726,49 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* tests are racy but the races aren't systemic - we only miss once
* in a while which is fine.
*/
- if (!waitqueue_active(&iocg->waitq) &&
- !atomic64_read(&iocg->abs_vdebt) &&
+ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
time_before_eq64(vtime + cost, now.vnow)) {
iocg_commit_bio(iocg, bio, cost);
return;
}
/*
- * We're over budget. If @bio has to be issued regardless,
- * remember the abs_cost instead of advancing vtime.
- * iocg_kick_waitq() will pay off the debt before waking more IOs.
+ * We activated above but w/o any synchronization. Deactivation is
+ * synchronized with waitq.lock and we won't get deactivated as long
+ * as we're waiting or has debt, so we're good if we're activated
+ * here. In the unlikely case that we aren't, just issue the IO.
+ */
+ spin_lock_irq(&iocg->waitq.lock);
+
+ if (unlikely(list_empty(&iocg->active_list))) {
+ spin_unlock_irq(&iocg->waitq.lock);
+ iocg_commit_bio(iocg, bio, cost);
+ return;
+ }
+
+ /*
+ * We're over budget. If @bio has to be issued regardless, remember
+ * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
+ * off the debt before waking more IOs.
+ *
* This way, the debt is continuously paid off each period with the
- * actual budget available to the cgroup. If we just wound vtime,
- * we would incorrectly use the current hw_inuse for the entire
- * amount which, for example, can lead to the cgroup staying
- * blocked for a long time even with substantially raised hw_inuse.
+ * actual budget available to the cgroup. If we just wound vtime, we
+ * would incorrectly use the current hw_inuse for the entire amount
+ * which, for example, can lead to the cgroup staying blocked for a
+ * long time even with substantially raised hw_inuse.
+ *
+ * An iocg with vdebt should stay online so that the timer can keep
+ * deducting its vdebt and [de]activate use_delay mechanism
+ * accordingly. We don't want to race against the timer trying to
+ * clear them and leave @iocg inactive w/ dangling use_delay heavily
+ * penalizing the cgroup and its descendants.
*/
if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
- atomic64_add(abs_cost, &iocg->abs_vdebt);
+ iocg->abs_vdebt += abs_cost;
if (iocg_kick_delay(iocg, &now, cost))
blkcg_schedule_throttle(rqos->q,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+ spin_unlock_irq(&iocg->waitq.lock);
return;
}
@@ -1759,20 +1785,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* All waiters are on iocg->waitq and the wait states are
* synchronized using waitq.lock.
*/
- spin_lock_irq(&iocg->waitq.lock);
-
- /*
- * We activated above but w/o any synchronization. Deactivation is
- * synchronized with waitq.lock and we won't get deactivated as
- * long as we're waiting, so we're good if we're activated here.
- * In the unlikely case that we are deactivated, just issue the IO.
- */
- if (unlikely(list_empty(&iocg->active_list))) {
- spin_unlock_irq(&iocg->waitq.lock);
- iocg_commit_bio(iocg, bio, cost);
- return;
- }
-
init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
wait.wait.private = current;
wait.bio = bio;
@@ -1804,6 +1816,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
struct ioc_now now;
u32 hw_inuse;
u64 abs_cost, cost;
+ unsigned long flags;
/* bypass if disabled or for root cgroup */
if (!ioc->enabled || !iocg->level)
@@ -1823,15 +1836,28 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
iocg->cursor = bio_end;
/*
- * Charge if there's enough vtime budget and the existing request
- * has cost assigned. Otherwise, account it as debt. See debt
- * handling in ioc_rqos_throttle() for details.
+ * Charge if there's enough vtime budget and the existing request has
+ * cost assigned.
*/
if (rq->bio && rq->bio->bi_iocost_cost &&
- time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
+ time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
iocg_commit_bio(iocg, bio, cost);
- else
- atomic64_add(abs_cost, &iocg->abs_vdebt);
+ return;
+ }
+
+ /*
+ * Otherwise, account it as debt if @iocg is online, which it should
+ * be for the vast majority of cases. See debt handling in
+ * ioc_rqos_throttle() for details.
+ */
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ if (likely(!list_empty(&iocg->active_list))) {
+ iocg->abs_vdebt += abs_cost;
+ iocg_kick_delay(iocg, &now, cost);
+ } else {
+ iocg_commit_bio(iocg, bio, cost);
+ }
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
}
static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
@@ -2001,7 +2027,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
iocg->ioc = ioc;
atomic64_set(&iocg->vtime, now.vnow);
atomic64_set(&iocg->done_vtime, now.vnow);
- atomic64_set(&iocg->abs_vdebt, 0);
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
INIT_LIST_HEAD(&iocg->active_list);
iocg->hweight_active = HWEIGHT_WHOLE;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ca22afd47b3d..74cedea56034 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -361,13 +361,19 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
bool has_sched,
struct request *rq)
{
- /* dispatch flush rq directly */
- if (rq->rq_flags & RQF_FLUSH_SEQ) {
- spin_lock(&hctx->lock);
- list_add(&rq->queuelist, &hctx->dispatch);
- spin_unlock(&hctx->lock);
+ /*
+ * dispatch flush and passthrough rq directly
+ *
+ * passthrough request has to be added to hctx->dispatch directly.
+ * For some reason, device may be in one situation which can't
+ * handle FS request, so STS_RESOURCE is always returned and the
+ * FS request will be added to hctx->dispatch. However passthrough
+ * request may be required at that time for fixing the problem. If
+ * passthrough request is added to scheduler queue, there isn't any
+ * chance to dispatch it given we prioritize requests in hctx->dispatch.
+ */
+ if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
return true;
- }
if (has_sched)
rq->rq_flags |= RQF_SORTED;
@@ -391,8 +397,32 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
WARN_ON(e && (rq->tag != -1));
- if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
+ if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
+ /*
+ * Firstly normal IO request is inserted to scheduler queue or
+ * sw queue, meantime we add flush request to dispatch queue(
+ * hctx->dispatch) directly and there is at most one in-flight
+ * flush request for each hw queue, so it doesn't matter to add
+ * flush request to tail or front of the dispatch queue.
+ *
+ * Secondly in case of NCQ, flush request belongs to non-NCQ
+ * command, and queueing it will fail when there is any
+ * in-flight normal IO request(NCQ command). When adding flush
+ * rq to the front of hctx->dispatch, it is easier to introduce
+ * extra time to flush rq's latency because of S_SCHED_RESTART
+ * compared with adding to the tail of dispatch queue, then
+ * chance of flush merge is increased, and less flush requests
+ * will be issued to controller. It is observed that ~10% time
+ * is saved in blktests block/004 on disk attached to AHCI/NCQ
+ * drive when adding flush rq to the front of hctx->dispatch.
+ *
+ * Simply queue flush rq to the front of hctx->dispatch so that
+ * intensive flush workloads can benefit in case of NCQ HW.
+ */
+ at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
+ blk_mq_request_bypass_insert(rq, at_head, false);
goto run;
+ }
if (e && e->type->ops.insert_requests) {
LIST_HEAD(list);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec791156e9cc..757c0fd9f0cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -761,7 +761,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
* merge.
*/
if (rq->rq_flags & RQF_DONTPREP)
- blk_mq_request_bypass_insert(rq, false);
+ blk_mq_request_bypass_insert(rq, false, false);
else
blk_mq_sched_insert_request(rq, true, false, false);
}
@@ -1232,8 +1232,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
hctx = rq->mq_hctx;
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+ if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
+ blk_mq_put_driver_tag(rq);
break;
+ }
if (!blk_mq_get_driver_tag(rq)) {
/*
@@ -1313,7 +1315,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
q->mq_ops->commit_rqs(hctx);
spin_lock(&hctx->lock);
- list_splice_init(list, &hctx->dispatch);
+ list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
/*
@@ -1668,12 +1670,16 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
+ bool run_queue)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
spin_lock(&hctx->lock);
- list_add_tail(&rq->queuelist, &hctx->dispatch);
+ if (at_head)
+ list_add(&rq->queuelist, &hctx->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
if (run_queue)
@@ -1863,7 +1869,7 @@ insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
- blk_mq_request_bypass_insert(rq, run_queue);
+ blk_mq_request_bypass_insert(rq, false, run_queue);
return BLK_STS_OK;
}
@@ -1879,7 +1885,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- blk_mq_request_bypass_insert(rq, true);
+ blk_mq_request_bypass_insert(rq, false, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
@@ -1913,7 +1919,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_request_bypass_insert(rq,
+ blk_mq_request_bypass_insert(rq, false,
list_empty(list));
break;
}
@@ -3003,6 +3009,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
+ /*
+ * blk_mq_map_queues() and multiple .map_queues() implementations
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
+ * number of hardware queues.
+ */
+ if (set->nr_maps == 1)
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
+
if (set->ops->map_queues && !is_kdump_kernel()) {
int i;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 32c62c64e6c2..f2075978db50 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -66,7 +66,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
*/
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head);
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
+ bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c8eda2e7b91e..be1dca0103a4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -664,6 +664,9 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
top, bottom);
}
+
+ t->backing_dev_info->io_pages =
+ t->limits.max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(disk_stack_limits);