From 5d01bbd111d6ff9ea9d9847774f66dff39633776 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Jul 2012 10:42:35 +0000 Subject: rcu: Yield simpler The rcu_yield() code is amazing. It's there to avoid starvation of the system when lots of (boosting) work is to be done. Looking at the code, its functionality is: make the thread SCHED_OTHER and very nice (i.e., get it out of the way), arm a timer with 2 ticks, and schedule(). Now if the system goes idle the rcu task returns, regains SCHED_FIFO and plugs on. If the system stays busy the timer fires and wakes a per-node kthread, which in turn makes the per-CPU thread SCHED_FIFO and brings it back onto the CPU. For the boosting thread the "make it FIFO" bit is missing and it just runs some magic boost checks. Now this is a lot of code with extra threads and complexity. It's way simpler to let the tasks schedule away for 2 ticks when they detect overload, and to defer the normal wakeup as long as they are in the yielded state and the CPU is not idle. That solves the same problem, and the only difference is that when the CPU goes idle the thread is not guaranteed to return right away; but it won't be gone for longer than two ticks, so no harm is done. If that's an issue, then it is far simpler just to wake the task from idle, as RCU has callbacks there anyway. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Srivatsa S. Bhat Cc: Rusty Russell Cc: Namhyung Kim Reviewed-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20120716103948.131256723@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/rcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..f08ee3bc5741 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -139,7 +139,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); #endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -1469,7 +1469,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -2594,11 +2594,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: - rcu_node_kthread_setaffinity(rnp, -1); + rcu_boost_kthread_setaffinity(rnp, -1); rcu_cpu_kthread_setrt(cpu, 1); break; case CPU_DOWN_PREPARE: - rcu_node_kthread_setaffinity(rnp, cpu); + rcu_boost_kthread_setaffinity(rnp, cpu); rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: -- cgit v1.2.3 From 62ab7072476ae1600e877cc62b43758e485f4f1e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Jul 2012 10:42:38 +0000 Subject: rcu: Use smp_hotplug_thread facility for RCU's per-CPU kthread Bring RCU into the new-age CPU-hotplug fold by modifying RCU's per-CPU kthread code to use the new smp_hotplug_thread facility. [ tglx: Adapted it to use callbacks and to the simplified rcu yield ] Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Srivatsa S.
Bhat Cc: Rusty Russell Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20120716103948.673354828@linutronix.de Signed-off-by: Thomas Gleixner --- kernel/rcutree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f08ee3bc5741..11a4fdca1df7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -133,7 +133,6 @@ static int rcu_scheduler_fully_active __read_mostly; */ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); DEFINE_PER_CPU(char, rcu_cpu_has_work); @@ -1468,7 +1467,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* Adjust any no-longer-needed kthreads. */ - rcu_stop_cpu_kthread(cpu); rcu_boost_kthread_setaffinity(rnp, -1); /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ @@ -2595,11 +2593,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_ONLINE: case CPU_DOWN_FAILED: rcu_boost_kthread_setaffinity(rnp, -1); - rcu_cpu_kthread_setrt(cpu, 1); break; case CPU_DOWN_PREPARE: rcu_boost_kthread_setaffinity(rnp, cpu); - rcu_cpu_kthread_setrt(cpu, 0); break; case CPU_DYING: case CPU_DYING_FROZEN: -- cgit v1.2.3 From a10d206ef1a83121ab7430cb196e0376a7145b22 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Sep 2012 13:55:30 -0700 Subject: rcu: Fix day-one dyntick-idle stall-warning bug Each grace period is supposed to have at least one callback waiting for that grace period to complete. However, if CONFIG_NO_HZ=n, an extra callback-free grace period is no big problem -- it will chew up a tiny bit of CPU time, but it will complete normally. In contrast, CONFIG_NO_HZ=y kernels have the potential for all the CPUs to go to sleep indefinitely, in turn indefinitely delaying completion of the callback-free grace period. Given that nothing is waiting on this grace period, this is also not a problem. That is, unless RCU CPU stall warnings are also enabled, as they are in recent kernels. In this case, if a CPU wakes up after at least one minute of inactivity, an RCU CPU stall warning will result. The reason that no one noticed until quite recently is that most systems have enough OS noise that they will never remain absolutely idle for a full minute. But there are some embedded systems with cut-down userspace configurations that consistently get into this situation. All this begs the question of exactly how a callback-free grace period gets started in the first place. This can happen due to the fact that CPUs do not necessarily agree on which grace period is in progress. If a CPU still believes that the grace period that just completed is still ongoing, it will believe that it has callbacks that need to wait for another grace period, never mind the fact that the grace period that they were waiting for just completed. This CPU can therefore erroneously decide to start a new grace period. Note that this can happen in TREE_RCU and TREE_PREEMPT_RCU even on a single-CPU system: Deadlock considerations mean that the CPU that detected the end of the grace period is not necessarily officially informed of this fact for some time. Once this CPU notices that the earlier grace period completed, it will invoke its callbacks. It then won't have any callbacks left. 
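For readers without the source at hand, a minimal sketch of the segmented per-CPU callback list that the new check (hunk below) indexes; this is illustrative only, not part of the patch, with names taken from this era's kernel/rcutree.h and the field layout abridged:

/*
 * Sketch, not patch code: TREE_RCU keeps each CPU's callbacks on a
 * single linked list partitioned into four segments by an array of
 * tail pointers.
 */
#define RCU_DONE_TAIL		0	/* GP ended; ready to invoke. */
#define RCU_WAIT_TAIL		1	/* Waiting on the current GP. */
#define RCU_NEXT_READY_TAIL	2	/* Will wait on the next GP. */
#define RCU_NEXT_TAIL		3	/* Not yet assigned to a GP. */
#define RCU_NEXT_SIZE		4

struct rcu_head;			/* Kernel's callback node type. */

struct rcu_data_sketch {
	unsigned long	completed;	/* Last GP this CPU saw complete. */
	struct rcu_head	*nxtlist;	/* Head of the callback list. */
	struct rcu_head	**nxttail[RCU_NEXT_SIZE]; /* Segment tails. */
};

The fixed check in the hunk below indexes this array by whether rsp->completed != rdp->completed (0 or 1, i.e. RCU_DONE_TAIL or RCU_WAIT_TAIL), so a CPU that has not yet caught up with the most recently completed grace period looks past the RCU_WAIT_TAIL segment instead of the RCU_DONE_TAIL segment.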
If no other CPU has any callbacks, we now have a callback-free grace period. This commit therefore makes CPUs check more carefully before starting a new grace period. This new check relies on an array of tail pointers into each CPU's list of callbacks. If the CPU is up to date on which grace periods have completed, it checks to see if any callbacks follow the RCU_DONE_TAIL segment, otherwise it checks to see if any callbacks follow the RCU_WAIT_TAIL segment. The reason that this works is that the RCU_WAIT_TAIL segment will be promoted to the RCU_DONE_TAIL segment as soon as the CPU is officially notified that the old grace period has ended. This change is to cpu_needs_another_gp(), which is called in a number of places. The only one that really matters is in rcu_start_gp(), where the root rcu_node structure's ->lock is held, which prevents any other CPU from starting or completing a grace period, so that the comparison that determines whether the CPU is missing the completion of a grace period is stable. Reported-by: Becky Bruce Reported-by: Subodh Nijsure Reported-by: Paul Walmsley Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Paul Walmsley # OMAP3730, OMAP4430 Cc: stable@vger.kernel.org --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..f7bcd9e6c054 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -305,7 +305,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); + return *rdp->nxttail[RCU_DONE_TAIL + + ACCESS_ONCE(rsp->completed) != rdp->completed] && + !rcu_gp_in_progress(rsp); } /* -- cgit v1.2.3 From b3dbec76e5334fbb063987dea14e7b255602d7e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Jun 2012 18:36:08 -0700 Subject: rcu: Move RCU grace-period initialization into a kthread As the first step towards allowing grace-period initialization to be preemptible, this commit moves the RCU grace-period initialization into its own kthread. This is needed to keep large-system scheduling latency at reasonable levels. Also change raw_spin_lock_irqsave() to raw_spin_lock_irq() as suggested by Peter Zijlstra in review comments. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 190 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 126 insertions(+), 64 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f7bcd9e6c054..4792f1642bf2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1041,6 +1041,102 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat __note_new_gpnum(rsp, rnp, rdp); } +/* + * Body of kthread that handles grace periods. + */ +static int rcu_gp_kthread(void *arg) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = arg; + + for (;;) { + + /* Handle grace-period start. */ + rnp = rcu_get_root(rsp); + for (;;) { + wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); + if (rsp->gp_flags) + break; + flush_signals(current); + } + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags = 0; + rdp = this_cpu_ptr(rsp->rda); + + if (rcu_gp_in_progress(rsp)) { + /* + * A grace period is already in progress, so + * don't start another one. 
+ */ + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + if (rsp->fqs_active) { + /* + * We need a grace period, but force_quiescent_state() + * is running. Tell it to start one on our behalf. + */ + rsp->fqs_need_gp = 1; + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + + /* Exclude any concurrent CPU-hotplug operations. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. + * This operation relies on the layout of the hierarchy + * within the rsp->node[] array. Note that other CPUs will + * access only the leaves of the hierarchy, which still + * indicate that no grace period is in progress, at least + * until the corresponding leaf node has been initialized. + * In addition, we have excluded CPU-hotplug operations. + * + * Note that the grace period cannot complete until + * we finish the initialization process, as there will + * be at least one qsmask bit set in the root node until + * that time, namely the one corresponding to this CPU, + * due to the fact that we have irqs disabled. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + /* force_quiescent_state() now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rsp->onofflock); + } + return 0; +} + /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold @@ -1058,77 +1154,20 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_scheduler_fully_active || + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* - * Either the scheduler hasn't yet spawned the first - * non-idle task or this CPU does not need another - * grace period. Either way, don't start a new grace - * period. + * Either we have not yet spawned the grace-period + * task or this CPU does not need another grace period. + * Either way, don't start a new grace period. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - if (rsp->fqs_active) { - /* - * This CPU needs a grace period, but force_quiescent_state() - * is running. Tell it to start one on this CPU's behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Advance to a new grace period and initialize state. 
*/ - rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - record_gp_stall_check_time(rsp); - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ - - /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - - /* - * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. This - * operation relies on the layout of the hierarchy within the - * rsp->node[] array. Note that other CPUs will access only - * the leaves of the hierarchy, which still indicate that no - * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. - * - * Note that the grace period cannot complete until we finish - * the initialization process, as there will be at least one - * qsmask bit set in the root node until that time, namely the - * one corresponding to this CPU, due to the fact that we have - * irqs disabled. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + rsp->gp_flags = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up(&rsp->gp_wq); } /* @@ -2628,6 +2667,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } +/* + * Spawn the kthread that handles this RCU flavor's grace periods. + */ +static int __init rcu_spawn_gp_kthread(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp; + struct task_struct *t; + + for_each_rcu_flavor(rsp) { + t = kthread_run(rcu_gp_kthread, rsp, rsp->name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + rsp->gp_kthread = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + return 0; +} +early_initcall(rcu_spawn_gp_kthread); + /* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain @@ -2729,6 +2790,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rsp->rda = rda; + init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) -- cgit v1.2.3 From 79bce6724366b3827c5c673fb07d7063082873cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Sep 2012 14:32:58 -0700 Subject: rcu: Prevent initialization-time quiescent-state race The next step in reducing RCU's grace-period initialization latency on large systems will make this initialization preemptible. 
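Before the race description, it helps to see the code in which the race arises. The following is a condensed, illustrative fragment (a sketch assembled from the hunks above, not literal patch code) of the initialization walk as it stands after the preceding patches; the cond_resched() between nodes is exactly where the preemption window opens:

/*
 * Sketch condensed from the hunks above: each rcu_node is now
 * initialized under its own lock with interrupts enabled, and
 * cond_resched() between nodes permits preemption mid-walk.
 * Per-node bookkeeping (blocked-task checks, boosting, tracing)
 * is omitted here.
 */
rcu_for_each_node_breadth_first(rsp, rnp) {
	raw_spin_lock_irq(&rnp->lock);
	rnp->qsmask = rnp->qsmaskinit;	/* CPUs that must report a QS. */
	rnp->gpnum = rsp->gpnum;	/* Leaf now advertises the new GP. */
	rnp->completed = rsp->completed;
	raw_spin_unlock_irq(&rnp->lock);
	cond_resched();			/* Step 1 below: preempted here. */
}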
Unfortunately, making the grace-period initialization subject to interrupts (let alone preemption) exposes the following race on systems whose rcu_node tree contains more than one node:
1. CPU 31 starts initializing the grace period, including the first leaf rcu_node structures, and is then preempted.
2. CPU 0 refers to the first leaf rcu_node structure, and notes that a new grace period has started. It passes through a quiescent state shortly thereafter, and informs the RCU core of this rite of passage.
3. CPU 0 enters an RCU read-side critical section, acquiring a pointer to an RCU-protected data item.
4. CPU 31 takes an interrupt whose handler removes the data item referenced by CPU 0 from the data structure, and registers an RCU callback in order to free it.
5. CPU 31 resumes initializing the grace period, including its own rcu_node structure. It invokes rcu_start_gp_per_cpu(), which advances all callbacks, including the one registered in #4 above, to be handled by the current grace period.
6. The remaining CPUs pass through quiescent states and inform the RCU core, but CPU 0 remains in its RCU read-side critical section, still referencing the now-removed data item.
7. The grace period completes and all the callbacks are invoked, including the one that frees the data item that CPU 0 is still referencing. Oops!!!
One way to avoid this race is to remove grace-period acceleration from rcu_start_gp_per_cpu(). Now, the only reason for this acceleration was to allow CPUs bringing RCU out of idle state to have their callbacks invoked after only one grace period, rather than the two grace periods that would otherwise be required. But this acceleration does not work when RCU grace-period initialization is moved to a kthread because the CPU posting the callback is no longer necessarily the CPU that is initializing the resulting grace period. This commit therefore removes this now-pointless (and soon to be dangerous) grace-period acceleration, thus avoiding the above race. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4792f1642bf2..e7a534498aa0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1023,20 +1023,6 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Prior grace period ended, so advance callbacks for current CPU. */ __rcu_process_gp_end(rsp, rnp, rdp); - /* - * Because this CPU just now started the new grace period, we know - * that all of its callbacks will be covered by this upcoming grace - * period, even the ones that were registered arbitrarily recently. - * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. - * - * Other CPUs cannot be sure exactly when the grace period started. - * Therefore, their recently registered callbacks must pass through - * an additional RCU_NEXT_READY stage, so that they will be handled - * by the next RCU grace period. - */ - rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - /* Set state so that this CPU will detect the next quiescent state. */ __note_new_gpnum(rsp, rnp, rdp); } -- cgit v1.2.3 From 755609a9087fa983f567dc5452b2fa7b089b591f Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Tue, 19 Jun 2012 17:18:20 -0700 Subject: rcu: Allow RCU grace-period initialization to be preempted RCU grace-period initialization is currently carried out with interrupts disabled, which can result in 200-microsecond latency spikes on systems on which RCU has been configured for 4096 CPUs. This patch therefore makes the RCU grace-period initialization be preemptible, which should eliminate those latency spikes. Similar spikes from grace-period cleanup and the forcing of quiescent states will be dealt with similarly by later patches. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e7a534498aa0..781e5f0b7b17 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1030,7 +1030,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* * Body of kthread that handles grace periods. */ -static int rcu_gp_kthread(void *arg) +static int __noreturn rcu_gp_kthread(void *arg) { struct rcu_data *rdp; struct rcu_node *rnp; @@ -1056,6 +1056,7 @@ static int rcu_gp_kthread(void *arg) * don't start another one. */ raw_spin_unlock_irq(&rnp->lock); + cond_resched(); continue; } @@ -1066,6 +1067,7 @@ static int rcu_gp_kthread(void *arg) */ rsp->fqs_need_gp = 1; raw_spin_unlock_irq(&rnp->lock); + cond_resched(); continue; } @@ -1076,10 +1078,10 @@ static int rcu_gp_kthread(void *arg) rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + get_online_cpus(); /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1091,15 +1093,9 @@ static int rcu_gp_kthread(void *arg) * indicate that no grace period is in progress, at least * until the corresponding leaf node has been initialized. * In addition, we have excluded CPU-hotplug operations. - * - * Note that the grace period cannot complete until - * we finish the initialization process, as there will - * be at least one qsmask bit set in the root node until - * that time, namely the one corresponding to this CPU, - * due to the fact that we have irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; @@ -1110,17 +1106,17 @@ static int rcu_gp_kthread(void *arg) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); /* force_quiescent_state() now OK. */ rsp->fqs_state = RCU_SIGNAL_INIT; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_unlock_irq(&rsp->onofflock); + raw_spin_unlock_irq(&rnp->lock); + put_online_cpus(); } - return 0; } /* -- cgit v1.2.3 From cabc49c1ff51baaf1958d501a7a616ce91245c93 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 20 Jun 2012 17:07:14 -0700 Subject: rcu: Move RCU grace-period cleanup into kthread As a first step towards allowing grace-period cleanup to be preemptible, this commit moves the RCU grace-period cleanup into the same kthread that is now used to initialize grace periods. This is needed to keep scheduling latency down to a dull roar. [ paulmck: Get rid of stray spin_lock_irqsave() calls. ] Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 112 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 62 insertions(+), 50 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 781e5f0b7b17..52c3102dc5f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1032,6 +1032,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat */ static int __noreturn rcu_gp_kthread(void *arg) { + unsigned long gp_duration; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp = arg; @@ -1116,6 +1117,65 @@ static int __noreturn rcu_gp_kthread(void *arg) rsp->fqs_state = RCU_SIGNAL_INIT; raw_spin_unlock_irq(&rnp->lock); put_online_cpus(); + + /* Handle grace-period end. */ + rnp = rcu_get_root(rsp); + for (;;) { + wait_event_interruptible(rsp->gp_wq, + !ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)); + if (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + flush_signals(current); + } + + raw_spin_lock_irq(&rnp->lock); + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; + + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. + */ + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node + * structures so that other CPUs don't have to + * wait until the start of the next grace period + * to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); + rnp->completed = rsp->gpnum; + /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); + } + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + } + + rsp->completed = rsp->gpnum; /* Declare grace period done. 
*/ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + rsp->fqs_state = RCU_GP_IDLE; + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; + raw_spin_unlock_irq(&rnp->lock); } } @@ -1162,57 +1222,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - unsigned long gp_duration; - struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - - /* - * Ensure that all grace-period and pre-grace-period activity - * is seen before the assignment to rsp->completed. - */ - smp_mb(); /* See above block comment. */ - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; - - /* - * We know the grace period is complete, but to everyone else - * it appears to still be ongoing. But it is also the case - * that to everyone else it looks like there is nothing that - * they can do to advance the grace period. It is therefore - * safe for us to drop the lock in order to mark the grace - * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. - */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - - /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->gpnum; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - } - - rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; - rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* -- cgit v1.2.3 From c856bafae7f5b3f59ac1d99279a9b99b3b36ad12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 08:19:05 -0700 Subject: rcu: Allow RCU grace-period cleanup to be preempted RCU grace-period cleanup is currently carried out with interrupts disabled, which can result in excessive latency spikes on large systems (many hundreds or thousands of CPUs). This patch therefore makes the RCU grace-period cleanup be preemptible, including voluntary preemption points, which should eliminate those latency spikes. Similar spikes from forcing of quiescent states will be dealt with similarly by later patches. Updated to replace uses of spin_lock_irqsave() with spin_lock_irq(), as suggested by Peter Zijlstra. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 52c3102dc5f7..ddc6acc85d26 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1151,7 +1151,7 @@ static int __noreturn rcu_gp_kthread(void *arg) * completed. */ if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irq(&rnp->lock); /* * Propagate new ->completed value to rcu_node @@ -1160,14 +1160,13 @@ static int __noreturn rcu_gp_kthread(void *arg) * to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { - /* irqs already disabled. */ - raw_spin_lock(&rnp->lock); + raw_spin_lock_irq(&rnp->lock); rnp->completed = rsp->gpnum; - /* irqs remain disabled. */ - raw_spin_unlock(&rnp->lock); + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } rnp = rcu_get_root(rsp); - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irq(&rnp->lock); } rsp->completed = rsp->gpnum; /* Declare grace period done. */ -- cgit v1.2.3 From 7fdefc10e1d730d4608cc59d386bc446f5b9ee99 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Jun 2012 11:08:41 -0700 Subject: rcu: Break up rcu_gp_kthread() into subfunctions The rcu_gp_kthread() function is too large and furthermore needs to have the force_quiescent_state() code pulled in. This commit therefore breaks up rcu_gp_kthread() into rcu_gp_init() and rcu_gp_cleanup(). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 250 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 135 insertions(+), 115 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ddc6acc85d26..f0c0c1b4b6d4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1028,95 +1028,159 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat } /* - * Body of kthread that handles grace periods. + * Initialize a new grace period. */ -static int __noreturn rcu_gp_kthread(void *arg) +static int rcu_gp_init(struct rcu_state *rsp) { - unsigned long gp_duration; struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); - for (;;) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags = 0; - /* Handle grace-period start. */ - rnp = rcu_get_root(rsp); - for (;;) { - wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); - if (rsp->gp_flags) - break; - flush_signals(current); - } + if (rcu_gp_in_progress(rsp)) { + /* Grace period already in progress, don't start another. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; + } + + if (rsp->fqs_active) { + /* + * We need a grace period, but force_quiescent_state() + * is running. Tell it to start one on our behalf. + */ + rsp->fqs_need_gp = 1; + raw_spin_unlock_irq(&rnp->lock); + return 0; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); + rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + raw_spin_unlock_irq(&rnp->lock); + + /* Exclude any concurrent CPU-hotplug operations.
*/ + get_online_cpus(); + + /* + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first order, + * starting from the root rcu_node structure, relying on the layout + * of the tree within the rsp->node[] array. Note that other CPUs + * will access only the leaves of the hierarchy, thus seeing that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. + * + * The grace period cannot complete until the initialization + * process finishes, because this kthread handles both. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags = 0; rdp = this_cpu_ptr(rsp->rda); + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) + rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); + } - if (rcu_gp_in_progress(rsp)) { - /* - * A grace period is already in progress, so - * don't start another one. - */ - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - continue; - } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); + /* force_quiescent_state() now OK. */ + rsp->fqs_state = RCU_SIGNAL_INIT; + raw_spin_unlock_irq(&rnp->lock); + put_online_cpus(); + return 1; +} - if (rsp->fqs_active) { - /* - * We need a grace period, but force_quiescent_state() - * is running. Tell it to start one on our behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - continue; - } +/* + * Clean up after the old grace period. + */ +static int rcu_gp_cleanup(struct rcu_state *rsp) +{ + unsigned long gp_duration; + struct rcu_data *rdp; + struct rcu_node *rnp = rcu_get_root(rsp); - /* Advance to a new grace period and initialize state. */ - rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - record_gp_stall_check_time(rsp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_lock_irq(&rnp->lock); + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; - /* Exclude any concurrent CPU-hotplug operations. */ - get_online_cpus(); + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. 
+ */ + rdp = this_cpu_ptr(rsp->rda); + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock_irq(&rnp->lock); /* - * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first - * order, starting from the root rcu_node structure. - * This operation relies on the layout of the hierarchy - * within the rsp->node[] array. Note that other CPUs will - * access only the leaves of the hierarchy, which still - * indicate that no grace period is in progress, at least - * until the corresponding leaf node has been initialized. - * In addition, we have excluded CPU-hotplug operations. + * Propagate new ->completed value to rcu_node + * structures so that other CPUs don't have to + * wait until the start of the next grace period + * to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - if (rnp == rdp->mynode) - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); + rnp->completed = rsp->gpnum; raw_spin_unlock_irq(&rnp->lock); cond_resched(); } - rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - /* force_quiescent_state() now OK. */ - rsp->fqs_state = RCU_SIGNAL_INIT; - raw_spin_unlock_irq(&rnp->lock); - put_online_cpus(); + } + + rsp->completed = rsp->gpnum; /* Declare grace period done. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + rsp->fqs_state = RCU_GP_IDLE; + if (cpu_needs_another_gp(rsp, rdp)) + rsp->gp_flags = 1; + raw_spin_unlock_irq(&rnp->lock); + return 1; +} + +/* + * Body of kthread that handles grace periods. + */ +static int __noreturn rcu_gp_kthread(void *arg) +{ + struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); + + for (;;) { + + /* Handle grace-period start. */ + for (;;) { + wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); + if (rsp->gp_flags && rcu_gp_init(rsp)) + break; + cond_resched(); + flush_signals(current); + } /* Handle grace-period end. */ rnp = rcu_get_root(rsp); @@ -1125,56 +1189,12 @@ static int __noreturn rcu_gp_kthread(void *arg) !ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)); if (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)) + !rcu_preempt_blocked_readers_cgp(rnp) && + rcu_gp_cleanup(rsp)) break; + cond_resched(); flush_signals(current); } - - raw_spin_lock_irq(&rnp->lock); - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; - - /* - * We know the grace period is complete, but to everyone else - * it appears to still be ongoing. But it is also the case - * that to everyone else it looks like there is nothing that - * they can do to advance the grace period. It is therefore - * safe for us to drop the lock in order to mark the grace - * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. 
- */ - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock_irq(&rnp->lock); - - /* - * Propagate new ->completed value to rcu_node - * structures so that other CPUs don't have to - * wait until the start of the next grace period - * to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - } - - rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); - rsp->fqs_state = RCU_GP_IDLE; - if (cpu_needs_another_gp(rsp, rdp)) - rsp->gp_flags = 1; - raw_spin_unlock_irq(&rnp->lock); } } -- cgit v1.2.3 From bfa00b4c4028f39357d16279ff0fddf550241593 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 09:54:10 -0700 Subject: rcu: Prevent offline CPUs from executing RCU core code Earlier versions of RCU invoked the RCU core from the CPU_DYING notifier in order to note a quiescent state for the outgoing CPU. Because the CPU is marked "offline" during the execution of the CPU_DYING notifiers, the RCU core had to tolerate being invoked from an offline CPU. However, commit b1420f1c (Make rcu_barrier() less disruptive) left only tracing code in the CPU_DYING notifier, so the RCU core need no longer execute on offline CPUs. This commit therefore enforces this restriction. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f0c0c1b4b6d4..340a5f54b6af 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1892,6 +1892,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) { struct rcu_state *rsp; + if (cpu_is_offline(smp_processor_id())) + return; trace_rcu_utilization("Start RCU core"); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); -- cgit v1.2.3 From 4cdfc175c25c89eedc08460b5e6239c2ec67fcb6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Jun 2012 17:06:26 -0700 Subject: rcu: Move quiescent-state forcing into kthread As the first step towards allowing quiescent-state forcing to be preemptible, this commit moves RCU quiescent-state forcing into the same kthread that is now used to initialize and clean up after grace periods. This is yet another step towards keeping scheduling latency down to a dull roar. Updated to change from raw_spin_lock_irqsave() to raw_spin_lock_irq() and to remove the now-unused rcu_state structure fields as suggested by Peter Zijlstra. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. 
McKenney --- kernel/rcutree.c | 199 +++++++++++++++++++++---------------------------------- 1 file changed, 74 insertions(+), 125 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 340a5f54b6af..6182686de4a6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -72,7 +72,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ .name = #sname, \ } @@ -226,7 +225,8 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); -static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); /* @@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); */ void rcu_bh_force_quiescent_state(void) { - force_quiescent_state(&rcu_bh_state, 0); + force_quiescent_state(&rcu_bh_state); } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); @@ -286,7 +286,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); */ void rcu_sched_force_quiescent_state(void) { - force_quiescent_state(&rcu_sched_state, 0); + force_quiescent_state(&rcu_sched_state); } EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); @@ -784,11 +784,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) else if (!trigger_all_cpu_backtrace()) dump_stack(); - /* If so configured, complain about tasks blocking the grace period. */ + /* Complain about tasks blocking the grace period. */ rcu_print_detail_task_stall(rsp); - force_quiescent_state(rsp, 0); /* Kick them all. */ + force_quiescent_state(rsp); /* Kick them all. */ } static void print_cpu_stall(struct rcu_state *rsp) @@ -1036,7 +1036,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags = 0; + rsp->gp_flags = 0; /* Clear all flags: New grace period. */ if (rcu_gp_in_progress(rsp)) { /* Grace period already in progress, don't start another. */ @@ -1044,22 +1044,9 @@ static int rcu_gp_init(struct rcu_state *rsp) return 0; } - if (rsp->fqs_active) { - /* - * We need a grace period, but force_quiescent_state() - * is running. Tell it to start one on our behalf. - */ - rsp->fqs_need_gp = 1; - raw_spin_unlock_irq(&rnp->lock); - return 0; - } - /* Advance to a new grace period and initialize state. */ rsp->gpnum++; trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); - WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); - rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); @@ -1096,19 +1083,40 @@ static int rcu_gp_init(struct rcu_state *rsp) cond_resched(); } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - /* force_quiescent_state() now OK. */ - rsp->fqs_state = RCU_SIGNAL_INIT; - raw_spin_unlock_irq(&rnp->lock); put_online_cpus(); return 1; } +/* + * Do one round of quiescent-state forcing. 
+ */ +int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +{ + int fqs_state = fqs_state_in; + struct rcu_node *rnp = rcu_get_root(rsp); + + rsp->n_force_qs++; + if (fqs_state == RCU_SAVE_DYNTICK) { + /* Collect dyntick-idle snapshots. */ + force_qs_rnp(rsp, dyntick_save_progress_counter); + fqs_state = RCU_FORCE_QS; + } else { + /* Handle dyntick-idle and offline CPUs. */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + } + /* Clear flag to prevent immediate re-entry. */ + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + raw_spin_unlock_irq(&rnp->lock); + } + return fqs_state; +} + /* * Clean up after the old grace period. */ -static int rcu_gp_cleanup(struct rcu_state *rsp) +static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; struct rcu_data *rdp; @@ -1160,7 +1168,6 @@ static int rcu_gp_cleanup(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); - return 1; } /* @@ -1168,6 +1175,8 @@ static int rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { + int fqs_state; + int ret; struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1175,26 +1184,43 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle grace-period start. */ for (;;) { - wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); - if (rsp->gp_flags && rcu_gp_init(rsp)) + wait_event_interruptible(rsp->gp_wq, + rsp->gp_flags & + RCU_GP_FLAG_INIT); + if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && + rcu_gp_init(rsp)) break; cond_resched(); flush_signals(current); } - /* Handle grace-period end. */ - rnp = rcu_get_root(rsp); + /* Handle quiescent-state forcing. */ + fqs_state = RCU_SAVE_DYNTICK; for (;;) { - wait_event_interruptible(rsp->gp_wq, - !ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)); + rsp->jiffies_force_qs = jiffies + + RCU_JIFFIES_TILL_FORCE_QS; + ret = wait_event_interruptible_timeout(rsp->gp_wq, + (rsp->gp_flags & RCU_GP_FLAG_FQS) || + (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)), + RCU_JIFFIES_TILL_FORCE_QS); + /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp) && - rcu_gp_cleanup(rsp)) + !rcu_preempt_blocked_readers_cgp(rnp)) break; - cond_resched(); - flush_signals(current); + /* If time for quiescent-state forcing, do it. */ + if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { + fqs_state = rcu_gp_fqs(rsp, fqs_state); + cond_resched(); + } else { + /* Deal with stray signal. */ + cond_resched(); + flush_signals(current); + } } + + /* Handle grace-period end. */ + rcu_gp_cleanup(rsp); } } @@ -1226,7 +1252,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) return; } - rsp->gp_flags = 1; + rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock_irqrestore(&rnp->lock, flags); wake_up(&rsp->gp_wq); } @@ -1777,72 +1803,20 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +static void force_quiescent_state(struct rcu_state *rsp) { unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - trace_rcu_utilization("Start fqs"); - if (!rcu_gp_in_progress(rsp)) { - trace_rcu_utilization("End fqs"); - return; /* No grace period in progress, nothing to force. 
*/ - } - if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) + return; /* Someone beat us to it. */ + if (!raw_spin_trylock_irqsave(&rnp->lock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - trace_rcu_utilization("End fqs"); - return; /* Someone else is already on the job. */ - } - if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) - goto unlock_fqs_ret; /* no emergency and done recently. */ - rsp->n_force_qs++; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; - if(!rcu_gp_in_progress(rsp)) { - rsp->n_force_qs_ngp++; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - goto unlock_fqs_ret; /* no GP in progress, time updated. */ - } - rsp->fqs_active = 1; - switch (rsp->fqs_state) { - case RCU_GP_IDLE: - case RCU_GP_INIT: - - break; /* grace period idle or initializing, ignore. */ - - case RCU_SAVE_DYNTICK: - - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - - /* Record dyntick-idle state. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - if (rcu_gp_in_progress(rsp)) - rsp->fqs_state = RCU_FORCE_QS; - break; - - case RCU_FORCE_QS: - - /* Check dyntick-idle state, send IPI to laggarts. */ - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); - - /* Leave state in case more forcing is required. */ - - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - break; - } - rsp->fqs_active = 0; - if (rsp->fqs_need_gp) { - raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ - rsp->fqs_need_gp = 0; - rcu_start_gp(rsp, flags); /* releases rnp->lock */ - trace_rcu_utilization("End fqs"); return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ -unlock_fqs_ret: - raw_spin_unlock_irqrestore(&rsp->fqslock, flags); - trace_rcu_utilization("End fqs"); + rsp->gp_flags |= RCU_GP_FLAG_FQS; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } /* @@ -1858,13 +1832,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) WARN_ON_ONCE(rdp->beenonline == 0); - /* - * If an RCU GP has gone long enough, go check for dyntick - * idle CPUs and, if needed, send resched IPIs. - */ - if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); - /* * Advance callbacks in response to end of earlier grace * period that some other CPU ended. @@ -1965,12 +1932,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); + force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; } - } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) - force_quiescent_state(rsp, 1); + } } static void @@ -2251,17 +2217,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->qs_pending && !rdp->passed_quiesce) { - - /* - * If force_quiescent_state() coming soon and this CPU - * needs a quiescent state, and this is either RCU-sched - * or RCU-bh, force a local reschedule. 
- */ rdp->n_rp_qs_pending++; - if (!rdp->preemptible && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, - jiffies)) - set_need_resched(); } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; @@ -2291,13 +2247,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) return 1; } - /* Has an RCU GP gone long enough to send resched IPIs &c? */ - if (rcu_gp_in_progress(rsp) && - ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { - rdp->n_rp_need_fqs++; - return 1; - } - /* nothing to do */ rdp->n_rp_need_nothing++; return 0; -- cgit v1.2.3 From b4be093fee0200789df59b6c90e2d099a20f55b3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Jun 2012 08:41:11 -0700 Subject: rcu: Allow RCU quiescent-state forcing to be preempted RCU quiescent-state forcing is currently carried out without preemption points, which can result in excessive latency spikes on large systems (many hundreds or thousands of CPUs). This patch therefore inserts a voluntary preemption point into force_qs_rnp(), which should greatly reduce the magnitude of these spikes. Reported-by: Mike Galbraith Reported-by: Dimitri Sivanich Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6182686de4a6..723e2e723074 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1767,6 +1767,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { + cond_resched(); mask = 0; raw_spin_lock_irqsave(&rnp->lock, flags); if (!rcu_gp_in_progress(rsp)) { -- cgit v1.2.3 From 394f2769aa0dbcf027bae6fb52835e25e05d332e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Jun 2012 17:00:35 -0700 Subject: rcu: Prevent force_quiescent_state() memory contention Large systems running RCU_FAST_NO_HZ kernels see extreme memory contention on the rcu_state structure's ->fqslock field. This can be avoided by disabling RCU_FAST_NO_HZ, either at compile time or at boot time (via the nohz kernel boot parameter), but large systems will no doubt become sensitive to energy consumption. This commit therefore uses a combining-tree approach to spread the memory contention across new cache lines in the leaf rcu_node structures. This can be thought of as a tournament lock that has only a try-lock acquisition primitive. The effect on small systems is minimal, because such systems have an rcu_node "tree" consisting of a single node. In addition, this functionality is not used on fastpaths. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 723e2e723074..43d57a17fcc5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -61,6 +61,7 @@ /* Data structures. 
*/ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; #define RCU_STATE_INITIALIZER(sname, cr) { \ .level = { &sname##_state.node[0] }, \ @@ -1807,16 +1808,35 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) static void force_quiescent_state(struct rcu_state *rsp) { unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); + bool ret; + struct rcu_node *rnp; + struct rcu_node *rnp_old = NULL; + + /* Funnel through hierarchy to reduce memory contention. */ + rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp != NULL; rnp = rnp->parent) { + ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + !raw_spin_trylock(&rnp->fqslock); + if (rnp_old != NULL) + raw_spin_unlock(&rnp_old->fqslock); + if (ret) { + rsp->n_force_qs_lh++; + return; + } + rnp_old = rnp; + } + /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) + /* Reached the root of the rcu_node tree, acquire lock. */ + raw_spin_lock_irqsave(&rnp_old->lock, flags); + raw_spin_unlock(&rnp_old->fqslock); + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + rsp->n_force_qs_lh++; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ - if (!raw_spin_trylock_irqsave(&rnp->lock, flags)) { - rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ - return; } rsp->gp_flags |= RCU_GP_FLAG_FQS; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ } @@ -2704,10 +2724,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static char *buf[] = { "rcu_node_level_0", - "rcu_node_level_1", - "rcu_node_level_2", - "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ + static char *buf[] = { "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static char *fqs[] = { "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; @@ -2732,6 +2756,9 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); + raw_spin_lock_init(&rnp->fqslock); + lockdep_set_class_and_name(&rnp->fqslock, + &rcu_fqs_class[i], fqs[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; -- cgit v1.2.3 From d40011f601b450396104de42c631981502946cf0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Jun 2012 20:45:57 -0700 Subject: rcu: Control grace-period duration from sysfs Although almost everyone is well-served by the defaults, some uses of RCU benefit from shorter grace periods, while others benefit more from the greater efficiency provided by longer grace periods. Situations requiring a large number of grace periods to elapse (and wireshark startup has been called out as an example of this) are helped by lower-latency grace periods. Furthermore, in some embedded applications, people are willing to accept a small degradation in update efficiency (due to there being more of the shorter grace-period operations) in order to gain the lower latency. In contrast, those few systems with thousands of CPUs need longer grace periods because the CPU overhead of a grace period rises roughly linearly with the number of CPUs. 
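(As a rough illustration of that scaling, and purely as a sketch that is not part of this patch, a boot-time heuristic built on the preliminary one-jiffy-per-200-CPUs estimate mentioned below might look like the following; the helper name is hypothetical, while nr_cpu_ids and RCU_JIFFIES_TILL_FORCE_QS are the existing kernel symbols:

	/* Hypothetical sketch: scale the forcing delay with the CPU count. */
	static ulong rcu_scaled_fqs_delay(void)
	{
		/* Preliminary estimate: one extra jiffy per 200 CPUs. */
		return RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / 200;
	}

No such helper is added here, because there is not yet enough data to justify one.)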
Such systems normally do not make much use of facilities that require large numbers of grace periods to elapse, so this is a good tradeoff. Therefore, this commit allows the durations to be controlled from sysfs. There are two sysfs parameters, one named "jiffies_till_first_fqs" that specifies the delay in jiffies from the end of grace-period initialization until the first attempt to force quiescent states, and the other named "jiffies_till_next_fqs" that specifies the delay (again in jiffies) between subsequent attempts to force quiescent states. They both default to three jiffies, which is compatible with the old hard-coded behavior. At some future time, it may be possible to automatically increase the grace-period length with the number of CPUs, but we do not yet have sufficient data to do a good job. Preliminary data indicates that we should add an additional jiffy to each of the delays for every 200 CPUs in the system, but more experimentation is needed. For now, the number of systems with more than 1,000 CPUs is small enough that this can be relegated to boot-time hand tuning. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 43d57a17fcc5..c0d3d56f0404 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -226,6 +226,12 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_suppress, int, 0644); module_param(rcu_cpu_stall_timeout, int, 0644); +static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; +static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; + +module_param(jiffies_till_first_fqs, ulong, 0644); +module_param(jiffies_till_next_fqs, ulong, 0644); + static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -1177,6 +1183,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) static int __noreturn rcu_gp_kthread(void *arg) { int fqs_state; + unsigned long j; int ret; struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1197,14 +1204,18 @@ static int __noreturn rcu_gp_kthread(void *arg) /* Handle quiescent-state forcing. */ fqs_state = RCU_SAVE_DYNTICK; + j = jiffies_till_first_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_first_fqs = HZ; + } for (;;) { - rsp->jiffies_force_qs = jiffies + - RCU_JIFFIES_TILL_FORCE_QS; + rsp->jiffies_force_qs = jiffies + j; ret = wait_event_interruptible_timeout(rsp->gp_wq, (rsp->gp_flags & RCU_GP_FLAG_FQS) || (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)), - RCU_JIFFIES_TILL_FORCE_QS); + j); /* If grace period done, leave loop. */ if (!ACCESS_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) @@ -1218,6 +1229,14 @@ static int __noreturn rcu_gp_kthread(void *arg) cond_resched(); flush_signals(current); } + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } } /* Handle grace-period end. */ -- cgit v1.2.3 From 7e5c2dfb4de15e21f62c956ec32cda9372ca993b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 1 Jul 2012 15:42:33 -0700 Subject: rcu: Make rcutree module parameters visible in sysfs The module parameters blimit, qhimark, and qlomark (and more recently, rcu_fanout_leaf) have permission masks of zero, so that their values are not visible from sysfs. This is unnecessary and inconvenient to administrators who might like an easy way to see what these values are on a running system. This commit therefore sets their permission masks to 0444, allowing them to be read but not written. Reported-by: Rusty Russell Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c0d3d56f0404..f91a20c652b5 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -88,7 +88,7 @@ LIST_HEAD(rcu_struct_flavors); /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; -module_param(rcu_fanout_leaf, int, 0); +module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ NUM_RCU_LVL_0, @@ -216,9 +216,9 @@ static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0); +module_param(blimit, int, 0444); +module_param(qhimark, int, 0444); +module_param(qlowmark, int, 0444); int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; -- cgit v1.2.3 From 5d4b86594984d8746b01487c768d8548463c173f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jul 2012 07:56:57 -0700 Subject: rcu: Fix day-zero grace-period initialization/cleanup race The current approach to grace-period initialization is vulnerable to extremely low-probability races. These races stem from the fact that the old grace period is marked completed on the same traversal through the rcu_node structure that is marking the start of the new grace period. This means that some rcu_node structures will believe that the old grace period is still in effect at the same time that other rcu_node structures believe that the new grace period has already started. These sorts of disagreements can result in too-short grace periods, as shown in the following scenario: 1. CPU 0 completes a grace period, but needs an additional grace period, so starts initializing one, initializing all the non-leaf rcu_node structures and the first leaf rcu_node structure. Because CPU 0 is both completing the old grace period and starting a new one, it marks the completion of the old grace period and the start of the new grace period in a single traversal of the rcu_node structures. Therefore, CPUs corresponding to the first rcu_node structure can become aware that the prior grace period has completed, but CPUs corresponding to the other rcu_node structures will see this same prior grace period as still being in progress. 2. CPU 1 passes through a quiescent state, and therefore informs the RCU core. Because its leaf rcu_node structure has already been initialized, this CPU's quiescent state is applied to the new (and only partially initialized) grace period. 3. 
CPU 1 enters an RCU read-side critical section and acquires a reference to data item A. Note that this CPU believes that its critical section started after the beginning of the new grace period, and therefore will not block this new grace period. 4. CPU 16 exits dyntick-idle mode. Because it was in dyntick-idle mode, other CPUs informed the RCU core of its extended quiescent state for the past several grace periods. This means that CPU 16 is not yet aware that these past grace periods have ended. Assume that CPU 16 corresponds to the second leaf rcu_node structure -- which has not yet been made aware of the new grace period. 5. CPU 16 removes data item A from its enclosing data structure and passes it to call_rcu(), which queues a callback in the RCU_NEXT_TAIL segment of the callback queue. 6. CPU 16 enters the RCU core, possibly because it has taken a scheduling-clock interrupt, or alternatively because it has more than 10,000 callbacks queued. It notes that the second most recent grace period has completed (recall that because it corresponds to the second as-yet-uninitialized rcu_node structure, it cannot yet become aware that the most recent grace period has completed), and therefore advances its callbacks. The callback for data item A is therefore in the RCU_NEXT_READY_TAIL segment of the callback queue. 7. CPU 0 completes initialization of the remaining leaf rcu_node structures for the new grace period, including the structure corresponding to CPU 16. 8. CPU 16 again enters the RCU core, again possibly because it has taken a scheduling-clock interrupt, or alternatively because it now has more than 10,000 callbacks queued. It notes that the most recent grace period has ended, and therefore advances its callbacks. The callback for data item A is therefore in the RCU_DONE_TAIL segment of the callback queue. 9. All CPUs other than CPU 1 pass through quiescent states. Because CPU 1 already passed through its quiescent state, the new grace period completes. Note that CPU 1 is still in its RCU read-side critical section, still referencing data item A. 10. Suppose that CPU 2 was the last CPU to pass through a quiescent state for the new grace period, and suppose further that CPU 2 did not have any callbacks queued, therefore not needing an additional grace period. CPU 2 therefore traverses all of the rcu_node structures, marking the new grace period as completed, but does not initialize a new grace period. 11. CPU 16 yet again enters the RCU core, yet again possibly because it has taken a scheduling-clock interrupt, or alternatively because it now has more than 10,000 callbacks queued. It notes that the new grace period has ended, and therefore advances its callbacks. The callback for data item A is therefore in the RCU_DONE_TAIL segment of the callback queue. This means that this callback is now considered ready to be invoked. 12. CPU 16 invokes the callback, freeing data item A while CPU 1 is still referencing it. This scenario represents a day-zero bug for TREE_RCU. This commit therefore ensures that the old grace period is marked completed in all leaf rcu_node structures before a new grace period is marked started in any of them. That said, it would have been insanely difficult to force this race to happen before the grace-period initialization process was preemptible. Therefore, this commit is not a candidate for -stable. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Conflicts: kernel/rcutree.c --- kernel/rcutree.c | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f91a20c652b5..145f27fe3a1f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1141,37 +1141,31 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * they can do to advance the grace period. It is therefore * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. - * - * But if this CPU needs another grace period, it will take - * care of this while initializing the next grace period. - * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL - * because the callbacks have not yet been advanced: Those - * callbacks are waiting on the grace period that just now - * completed. */ - rdp = this_cpu_ptr(rsp->rda); - if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq(&rnp->lock); - /* - * Propagate new ->completed value to rcu_node - * structures so that other CPUs don't have to - * wait until the start of the next grace period - * to process their callbacks. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - } - rnp = rcu_get_root(rsp); + /* + * Propagate new ->completed value to rcu_node structures so + * that other CPUs don't have to wait until the start of the next + * grace period to process their callbacks. This also avoids + * some nasty RCU grace-period initialization races by forcing + * the end of the current grace period to be completely recorded in + * all of the rcu_node structures before the beginning of the next + * grace period is recorded in any of the rcu_node structures. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); + rnp->completed = rsp->gpnum; + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; + rdp = this_cpu_ptr(rsp->rda); if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); -- cgit v1.2.3 From 661a85dc0d2ec0404e3b80909e413a9d5e42a239 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jul 2012 05:57:03 -0700 Subject: rcu: Add random PROVE_RCU_DELAY to grace-period initialization Preemption greatly raised the probability of certain types of race conditions, so this commit adds an anti-heisenbug to greatly increase the collision cross section, also known as the probability of occurrence. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 145f27fe3a1f..f0f3a18c0a20 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "rcutree.h" #include @@ -1087,6 +1088,10 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); +#ifdef CONFIG_PROVE_RCU_DELAY + if ((random32() % (rcu_num_nodes * 8)) == 0) + schedule_timeout_uninterruptible(2); +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); } -- cgit v1.2.3 From 25d30cf4250f74e5ceb35f8f39739782408db633 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jul 2012 05:23:18 -0700 Subject: rcu: Adjust for unconditional ->completed assignment Now that the rcu_node structures' ->completed fields are unconditionally assigned at grace-period cleanup time, they should already have the correct value for the new grace period at grace-period initialization time. This commit therefore inserts a WARN_ON_ONCE() to verify this invariant. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f0f3a18c0a20..a2eadd04fb29 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1080,6 +1080,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; + WARN_ON_ONCE(rnp->completed != rsp->completed); rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); @@ -2777,7 +2778,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = 0; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; rnp->qsmask = 0; rnp->qsmaskinit = 0; rnp->grplo = j * cpustride; -- cgit v1.2.3 From d7d6a11e8609f0319d4a2d8ede348f8b3374b652 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Aug 2012 15:00:05 -0700 Subject: rcu: Simplify quiescent-state detection The current quiescent-state detection algorithm is needlessly complex. It records the grace-period number corresponding to the quiescent state at the time of the quiescent state, which works, but it seems better to simply erase any record of previous quiescent states at the time that the CPU notices the new grace period. This has the further advantage of removing another piece of RCU for which lockless reasoning is required. Therefore, this commit makes this change. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a2eadd04fb29..6194402ec853 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -176,8 +176,6 @@ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -187,8 +185,6 @@ void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesce_gpnum = rdp->gpnum; - barrier(); if (rdp->passed_quiesce == 0) trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); rdp->passed_quiesce = 1; @@ -899,12 +895,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); - if (rnp->qsmask & rdp->grpmask) { - rdp->qs_pending = 1; - rdp->passed_quiesce = 0; - } else { - rdp->qs_pending = 0; - } + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); } } @@ -984,10 +976,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * our behalf. Catch up with this state to avoid noting * spurious new grace periods. If another grace period * has started, then rnp->gpnum will have advanced, so - * we will detect this later on. + * we will detect this later on. Of course, any quiescent + * states we found for the old GP are now invalid. */ - if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { rdp->gpnum = rdp->completed; + rdp->passed_quiesce = 0; + } /* * If RCU does not need a quiescent state from this CPU, @@ -1358,7 +1353,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! */ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -1366,7 +1361,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { + if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum) { /* * The grace period in which this quiescent state was @@ -1425,7 +1421,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } #ifdef CONFIG_HOTPLUG_CPU @@ -2600,7 +2596,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - rdp->passed_quiesce_gpnum = rnp->gpnum - 1; trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ -- cgit v1.2.3 From 4dbd6bb38dd1cbfa5cb21e56e51dffc74aa20038 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 5 Sep 2012 21:43:57 -0700 Subject: rcu: Handle unbalanced rcu_node configurations with few CPUs If CONFIG_RCU_FANOUT_EXACT=y, if there are not enough CPUs (according to nr_cpu_ids) to require more than a single rcu_node structure, but if NR_CPUS is larger than would fit into a single rcu_node structure, then the current rcu_init_levelspread() code is subject to integer overflow in the eight-bit ->levelspread[] array in the rcu_state structure. In this case, the solution is -not- to increase the size of the elements in this array because the values in that array should be constrained to the number of bits in an unsigned long. Instead, this commit replaces NR_CPUS with nr_cpu_ids in the rcu_init_levelspread() function's initialization of the cprv local variable. This results in all of the arithmetic being consistently based off of the nr_cpu_ids value, thus avoiding the overflow, which was caused by the mixing of nr_cpu_ids and NR_CPUS. Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6194402ec853..8b9496fee235 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2723,7 +2723,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) int cprv; int i; - cprv = NR_CPUS; + cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { ccur = rsp->levelcnt[i]; rsp->levelspread[i] = (cprv + ccur - 1) / ccur; -- cgit v1.2.3 From b17c7035f37f47c7f7cb08a5555ab2aebfa31f91 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Sep 2012 15:38:02 -0700 Subject: rcu: Shrink RCU based on number of CPUs Currently, rcu_init_geometry() only reshapes RCU's combining trees if the leaf fanout is changed at boot time. This means that by default, kernels compiled with (say) NR_CPUS=4096 will keep oversized data structures, even when running on systems with (say) four CPUs. This commit therefore checks to see if the maximum number of CPUs on the actual running system (nr_cpu_ids) differs from NR_CPUS, and if so reshapes the combining trees accordingly. Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8b9496fee235..b703989148e4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2821,7 +2821,8 @@ static void __init rcu_init_geometry(void) int rcu_capacity[MAX_RCU_LVLS + 1]; /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + nr_cpu_ids == NR_CPUS) return; /* -- cgit v1.2.3 From a82dcc76021e22c174ba85d90b7a8c750b7362d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Aug 2012 14:29:20 -0700 Subject: rcu: Make offline-CPU checking allow for indefinite delays The rcu_implicit_offline_qs() function implicitly assumed that execution would progress predictably when interrupts are disabled, which is of course not guaranteed when running on a hypervisor. Furthermore, this function is short, and is called from one place only in a short function. This commit therefore ensures that the timing is checked before checking the condition, which guarantees correct behavior even given indefinite delays. It also inlines rcu_implicit_offline_qs() into rcu_implicit_dynticks_qs(). Signed-off-by: Paul E. 
McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 53 +++++++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 32 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f7bcd9e6c054..2c4ee4cdbc0e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -318,35 +318,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -/* - * If the specified CPU is offline, tell the caller that it is in - * a quiescent state. Otherwise, whack it with a reschedule IPI. - * Grace periods can end up waiting on an offline CPU when that - * CPU is in the process of coming online -- it will be added to the - * rcu_node bitmasks before it actually makes it online. The same thing - * can happen while a CPU is in the process of coming online. Because this - * race is quite rare, we check for it after detecting that the grace - * period has been delayed rather than checking each and every CPU - * each and every time we start a new grace period. - */ -static int rcu_implicit_offline_qs(struct rcu_data *rdp) -{ - /* - * If the CPU is offline for more than a jiffy, it is in a quiescent - * state. We can trust its state not to change because interrupts - * are disabled. The reason for the jiffy's worth of slack is to - * handle CPUs initializing on the way up and finding their way - * to the idle loop on the way down. - */ - if (cpu_is_offline(rdp->cpu) && - ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); - rdp->offline_fqs++; - return 1; - } - return 0; -} - /* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * @@ -675,7 +646,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU. + * for this same CPU, or by virtue of having been offline. */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { @@ -699,8 +670,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* Go check for the CPU being offline. */ - return rcu_implicit_offline_qs(rdp); + /* + * Check for the CPU being offline, but only if the grace period + * is old enough. We don't need to worry about the CPU changing + * state: If we see it offline even once, it has been through a + * quiescent state. + * + * The reason for insisting that the grace period be at least + * one jiffy old is that CPUs that are not quite online and that + * have just gone offline can still execute RCU read-side critical + * sections. + */ + if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) + return 0; /* Grace period is not old enough. */ + barrier(); + if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + rdp->offline_fqs++; + return 1; + } + return 0; } static int jiffies_till_stall_check(void) -- cgit v1.2.3 From c8020a67e625c714c4dbedc8ae2944b461e204ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Aug 2012 16:55:59 -0700 Subject: rcu: Protect rcu_node accesses during CPU stall warnings The print_other_cpu_stall() function accesses a number of rcu_node fields without protection from the ->lock. In theory, this is not a problem because the fields accessed are all integers, but in practice the compiler can get nasty. 
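(For example, and only as an illustration rather than code from this patch, a lockless sequence such as

	if (rnp->qsmask != 0)			/* first fetch of ->qsmask */
		check_cpus(rnp->qsmask);	/* compiler is free to refetch */

may legitimately reload rnp->qsmask between the test and the use, so the two reads can disagree; check_cpus() is a hypothetical stand-in. Holding ->lock across both accesses, or marking a deliberately racy read with ACCESS_ONCE(), rules such miscompilation out.)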
Therefore, the commit extends the existing critical section to cover the entire loop body. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2c4ee4cdbc0e..2cf8eb3e2d43 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -746,14 +746,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) { + print_cpu_stall_info(rsp, + rnp->grplo + cpu); + ndetected++; + } + } raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rnp->qsmask == 0) - continue; - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, rnp->grplo + cpu); - ndetected++; - } } /* -- cgit v1.2.3 From c96ea7cfdd88d0a67c970502bc5313fede34b86b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Aug 2012 11:17:06 -0700 Subject: rcu: Avoid spurious RCU CPU stall warnings If a given CPU avoids the idle loop but also avoids starting a new RCU grace period for a full minute, RCU can issue spurious RCU CPU stall warnings. This commit fixes this issue by adding a check for ongoing grace period to avoid these spurious stall warnings. Reported-by: Becky Bruce Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2cf8eb3e2d43..98f275296c6d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -819,7 +819,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) j = ACCESS_ONCE(jiffies); js = ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + if (rcu_gp_in_progress(rsp) && + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); -- cgit v1.2.3 From fdab649b1aa732cd6e79654349088465cdff49af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Aug 2012 16:34:12 -0700 Subject: rcu: Remove redundant memory barrier from __call_rcu() The first memory barrier in __call_rcu() is supposed to order any updates done beforehand by the caller against the actual queuing of the callback. However, the second memory barrier (which is intended to order incrementing the queue lengths before queuing the callback) is also between the caller's updates and the queuing of the callback. The second memory barrier can therefore serve both purposes. This commit therefore removes the first memory barrier. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 98f275296c6d..ba4f4b4c2382 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1922,8 +1922,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), head->func = func; head->next = NULL; - smp_mb(); /* Ensure RCU update seen before callback registry. */ - /* * Opportunistically note grace-period endings and beginnings. 
* Note that we might see a beginning right after we see an -- cgit v1.2.3 From 1331e7a1bbe1f11b19c4327ba0853bee2a606543 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Aug 2012 17:43:50 -0700 Subject: rcu: Remove _rcu_barrier() dependency on __stop_machine() Currently, _rcu_barrier() relies on preempt_disable() to prevent any CPU from going offline, which in turn depends on CPU hotplug's use of __stop_machine(). This patch therefore makes _rcu_barrier() use get_online_cpus() to block CPU-hotplug operations. This has the added benefit of removing the need for _rcu_barrier() to adopt callbacks: Because CPU-hotplug operations are excluded, there can be no callbacks to adopt. This commit simplifies the code accordingly. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 83 ++++++++------------------------------------------------ 1 file changed, 11 insertions(+), 72 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f7bcd9e6c054..c45d3f745302 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1392,17 +1392,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) int i; struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - /* - * If there is an rcu_barrier() operation in progress, then - * only the task doing that operation is permitted to adopt - * callbacks. To do otherwise breaks rcu_barrier() and friends - * by causing them to fail to wait for the callbacks in the - * orphanage. - */ - if (rsp->rcu_barrier_in_progress && - rsp->rcu_barrier_in_progress != current) - return; - /* Do the accounting first. */ rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; @@ -1457,9 +1446,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them, if there is no _rcu_barrier() instance running. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. + * adopting them. There can only be one CPU hotplug operation at a time, + * so no other CPU can be attempting to update rcu_cpu_kthread_task. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { @@ -1521,10 +1509,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ -} - static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } @@ -2328,13 +2312,10 @@ static void rcu_barrier_func(void *type) static void _rcu_barrier(struct rcu_state *rsp) { int cpu; - unsigned long flags; struct rcu_data *rdp; - struct rcu_data rd; unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); unsigned long snap_done; - init_rcu_head_on_stack(&rd.barrier_head); _rcu_barrier_trace(rsp, "Begin", -1, snap); /* Take mutex to serialize concurrent rcu_barrier() requests. */ @@ -2374,70 +2355,30 @@ static void _rcu_barrier(struct rcu_state *rsp) /* * Initialize the count to one rather than to zero in order to * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Also flag this task as doing - * an rcu_barrier(). This will prevent anyone else from adopting - * orphaned callbacks, which could cause otherwise failure if a - * CPU went offline and quickly came back online. 
To see this, - * consider the following sequence of events: - * - * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. - * 2. CPU 1 goes offline, orphaning its callbacks. - * 3. CPU 0 adopts CPU 1's orphaned callbacks. - * 4. CPU 1 comes back online. - * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. - * 6. Both rcu_barrier_callback() callbacks are invoked, awakening - * us -- but before CPU 1's orphaned callbacks are invoked!!! + * (or preemption of this task). Exclude CPU-hotplug operations + * to ensure that no offline CPU has callbacks queued. */ init_completion(&rsp->barrier_completion); atomic_set(&rsp->barrier_cpu_count, 1); - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rsp->rcu_barrier_in_progress = current; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + get_online_cpus(); /* - * Force every CPU with callbacks to register a new callback - * that will tell us when all the preceding callbacks have - * been invoked. If an offline CPU has callbacks, wait for - * it to either come back online or to finish orphaning those - * callbacks. + * Force each CPU with callbacks to register a new callback. + * When that callback is invoked, we will know that all of the + * corresponding CPU's preceding callbacks have been invoked. */ - for_each_possible_cpu(cpu) { - preempt_disable(); + for_each_online_cpu(cpu) { rdp = per_cpu_ptr(rsp->rda, cpu); - if (cpu_is_offline(cpu)) { - _rcu_barrier_trace(rsp, "Offline", cpu, - rsp->n_barrier_done); - preempt_enable(); - while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) - schedule_timeout_interruptible(1); - } else if (ACCESS_ONCE(rdp->qlen)) { + if (ACCESS_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->n_barrier_done); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); - preempt_enable(); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, rsp->n_barrier_done); - preempt_enable(); } } - - /* - * Now that all online CPUs have rcu_barrier_callback() callbacks - * posted, we can adopt all of the orphaned callbacks and place - * an rcu_barrier_callback() callback after them. When that is done, - * we are guaranteed to have an rcu_barrier_callback() callback - * following every callback that could possibly have been - * registered before _rcu_barrier() was called. - */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rcu_adopt_orphan_cbs(rsp); - rsp->rcu_barrier_in_progress = NULL; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - atomic_inc(&rsp->barrier_cpu_count); - smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ - rd.rsp = rsp; - rsp->call(&rd.barrier_head, rcu_barrier_callback); + put_online_cpus(); /* * Now that we have an rcu_barrier_callback() callback on each @@ -2458,8 +2399,6 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); - - destroy_rcu_head_on_stack(&rd.barrier_head); } /** -- cgit v1.2.3 From 0d8ee37e2fcb7b77b9c5dee784beca5a215cad4c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 Aug 2012 13:16:15 -0700 Subject: rcu: Disallow callback registry on offline CPUs Posting a callback after the CPU_DEAD notifier effectively leaks that callback unless/until that CPU comes back online. Silence is unhelpful when attempting to track down such leaks, so this commit emits a WARN_ON_ONCE() and unconditionally leaks the callback when an offline CPU attempts to register a callback. 
The rdp->nxttail[RCU_NEXT_TAIL] is set to NULL in the CPU_DEAD notifier and restored in the CPU_UP_PREPARE notifier, allowing _call_rcu() to determine exactly when posting callbacks is illegal. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c45d3f745302..be76c80a14d1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1505,6 +1505,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); + init_callback_list(rdp); + /* Disallow further callbacks on this CPU. */ + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -1927,6 +1930,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { + /* _call_rcu() is illegal on offline CPU; leak the callback. */ + WARN_ON_ONCE(1); + local_irq_restore(flags); + return; + } ACCESS_ONCE(rdp->qlen)++; if (lazy) rdp->qlen_lazy++; @@ -2464,6 +2473,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); -- cgit v1.2.3 From adf5091e6ccaa02905e7a28f9ff44f46c7f4c230 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jun 2012 11:20:21 -0700 Subject: rcu: New rcu_user_enter() and rcu_user_exit() APIs RCU currently insists that only idle tasks can enter RCU idle mode, which prohibits an adaptive tickless kernel (AKA nohz cpusets), which in turn would mean that usermode execution would always take scheduling-clock interrupts, even when there is only one task runnable on the CPU in question. This commit therefore adds rcu_user_enter() and rcu_user_exit(), which allow non-idle tasks to enter RCU idle mode. These are quite similar to rcu_idle_enter() and rcu_idle_exit(), respectively, except that they omit the idle-task checks. [ Updated to use "user" flag rather than separate check functions. ] [ paulmck: Updated to drop exports of new functions based on Josh's patch getting rid of the need for them. ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Reviewed-by: Josh Triplett --- kernel/rcutree.c | 135 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 101 insertions(+), 34 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7387e46009d9..af0dc3472a4b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -322,16 +322,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * * If the new value of the ->dynticks_nesting counter now is zero, * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, + bool user) { trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current)) { + if (!is_idle_task(current) && !user) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); @@ -348,7 +349,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* - * The idle task is not permitted to enter the idle loop while + * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), @@ -359,19 +360,11 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) "Illegal idle entry in RCU-sched read-side critical section."); } -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. */ -void rcu_idle_enter(void) +static void rcu_eqs_enter(bool user) { unsigned long flags; long long oldval; @@ -385,11 +378,53 @@ void rcu_idle_enter(void) rdtp->dynticks_nesting = 0; else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(rdtp, oldval); + rcu_eqs_enter_common(rdtp, oldval, user); local_irq_restore(flags); } + +/** + * rcu_idle_enter - inform RCU that current CPU is entering idle + * + * Enter idle mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) 
+ * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. + */ +void rcu_idle_enter(void) +{ + rcu_eqs_enter(0); +} EXPORT_SYMBOL_GPL(rcu_idle_enter); +/** + * rcu_user_enter - inform RCU that we are resuming userspace. + * + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_user_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ +void rcu_user_enter(void) +{ + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + rcu_eqs_enter(1); +} + + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -420,18 +455,19 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); else - rcu_idle_enter_common(rdtp, oldval); + rcu_eqs_enter_common(rdtp, oldval, 1); local_irq_restore(flags); } /* - * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * rcu_eqs_exit_common - current CPU moving away from extended quiescent state * * If the new value of the ->dynticks_nesting counter was previously zero, * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, + int user) { smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ atomic_inc(&rdtp->dynticks); @@ -440,7 +476,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current)) { + if (!is_idle_task(current) && !user) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", @@ -452,18 +488,11 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) } } -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. 
*/ -void rcu_idle_exit(void) +static void rcu_eqs_exit(bool user) { unsigned long flags; struct rcu_dynticks *rdtp; @@ -477,11 +506,49 @@ void rcu_idle_exit(void) rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; else rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(rdtp, oldval); + rcu_eqs_exit_common(rdtp, oldval, user); local_irq_restore(flags); } + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to + * allow for the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + rcu_eqs_exit(0); +} EXPORT_SYMBOL_GPL(rcu_idle_exit); +/** + * rcu_user_exit - inform RCU that we are exiting userspace. + * + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. + */ +void rcu_user_exit(void) +{ + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + rcu_eqs_exit(1); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -515,7 +582,7 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); else - rcu_idle_exit_common(rdtp, oldval); + rcu_eqs_exit_common(rdtp, oldval, 1); local_irq_restore(flags); } -- cgit v1.2.3 From 19dd1591fc379f1d89f39cd99cbbe97433baa3c3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 4 Jun 2012 16:42:35 -0700 Subject: rcu: New rcu_user_enter_after_irq() and rcu_user_exit_after_irq() APIs In some cases, it is necessary to enter or exit userspace-RCU-idle mode from an interrupt handler, for example, if some other CPU sends this CPU a resched IPI. In this case, the current CPU would enter the IPI handler in userspace-RCU-idle mode, but would need to exit the IPI handler after having exited that mode. To allow this to work, this commit adds two new APIs to TREE_RCU: - rcu_user_enter_after_irq(). This must be called from an interrupt between rcu_irq_enter() and rcu_irq_exit(). After the irq calls rcu_irq_exit(), the irq handler will return into an RCU extended quiescent state. In theory, this interrupt is never a nested interrupt, but in practice it might interrupt softirq, which looks to RCU like a nested interrupt. - rcu_user_exit_after_irq(). This must be called from a non-nesting interrupt, interrupting an RCU extended quiescent state, also between rcu_irq_enter() and rcu_irq_exit(). After the irq calls rcu_irq_exit(), the irq handler will return in an RCU non-quiescent state. [ Combined with "Allow calls to rcu_exit_user_irq from nesting irqs." ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index af0dc3472a4b..4138f59fa2f4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -425,6 +425,27 @@ void rcu_user_enter(void) } +/** + * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace + * after the current irq returns. + * + * This is similar to rcu_user_enter() but in the context of a non-nesting + * irq. After this call, RCU enters into idle mode when the interrupt + * returns. + */ +void rcu_user_enter_after_irq(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + /* Ensure this irq is interrupting a non-idle RCU state. */ + WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); + rdtp->dynticks_nesting = 1; + local_irq_restore(flags); +} + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -549,6 +570,28 @@ void rcu_user_exit(void) rcu_eqs_exit(1); } +/** + * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace + * idle mode after the current non-nesting irq returns. + * + * This is similar to rcu_user_exit() but in the context of an irq. + * This is called when the irq has interrupted a userspace RCU idle mode + * context. When the current non-nesting interrupt returns after this call, + * the CPU won't restore the RCU idle mode. + */ +void rcu_user_exit_after_irq(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + /* Ensure we are interrupting an RCU idle mode. */ + WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); + rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; + local_irq_restore(flags); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * -- cgit v1.2.3 From 2b1d5024e17be459aa6385763ca3faa8f01c52d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:30 +0200 Subject: rcu: Settle config for userspace extended quiescent state Create a new config option under the RCU menu that put CPUs under RCU extended quiescent state (as in dynticks idle mode) when they run in userspace. This require some contribution from architectures to hook into kernel and userspace boundaries. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4138f59fa2f4..79fa2db1595b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -400,6 +400,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); +#ifdef CONFIG_RCU_USER_QS /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -424,7 +425,6 @@ void rcu_user_enter(void) rcu_eqs_enter(1); } - /** * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace * after the current irq returns. 
@@ -445,6 +445,7 @@ void rcu_user_enter_after_irq(void) rdtp->dynticks_nesting = 1; local_irq_restore(flags); } +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -548,6 +549,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); +#ifdef CONFIG_RCU_USER_QS /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -591,6 +593,7 @@ void rcu_user_exit_after_irq(void) rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; local_irq_restore(flags); } +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle -- cgit v1.2.3 From c5d900bf676b1e2a61c44483932c8088651bbb4e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:31 +0200 Subject: rcu: Allow rcu_user_enter()/exit() to nest Allow calls to rcu_user_enter() even if we are already in userspace (as seen by RCU) and allow calls to rcu_user_exit() even if we are already in the kernel. This makes the APIs more flexible to be called from architectures. Exception entries for example won't need to know if they come from userspace before calling rcu_user_exit(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 79fa2db1595b..d62c04482228 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -366,11 +366,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, */ static void rcu_eqs_enter(bool user) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); @@ -379,7 +377,6 @@ static void rcu_eqs_enter(bool user) else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_eqs_enter_common(rdtp, oldval, user); - local_irq_restore(flags); } /** @@ -396,7 +393,11 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { + unsigned long flags; + + local_irq_save(flags); rcu_eqs_enter(0); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -411,6 +412,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { + unsigned long flags; + struct rcu_dynticks *rdtp; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -422,7 +426,15 @@ void rcu_user_enter(void) if (in_interrupt()) return; - rcu_eqs_enter(1); + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (!rdtp->in_user) { + rdtp->in_user = true; + rcu_eqs_enter(1); + } + local_irq_restore(flags); } /** @@ -516,11 +528,9 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, */ static void rcu_eqs_exit(bool user) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); @@ -529,7 +539,6 @@ 
From c5d900bf676b1e2a61c44483932c8088651bbb4e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 11 Jul 2012 20:26:31 +0200
Subject: rcu: Allow rcu_user_enter()/exit() to nest

Allow calls to rcu_user_enter() even if we are already in userspace
(as seen by RCU) and allow calls to rcu_user_exit() even if we are
already in the kernel. This makes the APIs more flexible for
architectures to call: exception entries, for example, won't need to
know whether they come from userspace before calling rcu_user_exit().

Signed-off-by: Frederic Weisbecker
Cc: Alessio Igor Bogani
Cc: Andrew Morton
Cc: Avi Kivity
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Geoff Levand
Cc: Gilad Ben Yossef
Cc: Hakan Akkan
Cc: H. Peter Anvin
Cc: Ingo Molnar
Cc: Josh Triplett
Cc: Kevin Hilman
Cc: Max Krasnyansky
Cc: Peter Zijlstra
Cc: Stephen Hemminger
Cc: Steven Rostedt
Cc: Sven-Thorsten Dietrich
Cc: Thomas Gleixner
Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
---
 kernel/rcutree.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'kernel/rcutree.c')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 79fa2db1595b..d62c04482228 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -366,11 +366,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
  */
 static void rcu_eqs_enter(bool user)
 {
-	unsigned long flags;
 	long long oldval;
 	struct rcu_dynticks *rdtp;
 
-	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
@@ -379,7 +377,6 @@ static void rcu_eqs_enter(bool user)
 	else
 		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
 	rcu_eqs_enter_common(rdtp, oldval, user);
-	local_irq_restore(flags);
 }
 
 /**
@@ -396,7 +393,11 @@ static void rcu_eqs_enter(bool user)
  */
 void rcu_idle_enter(void)
 {
+	unsigned long flags;
+
+	local_irq_save(flags);
 	rcu_eqs_enter(0);
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
 
@@ -411,6 +412,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
  */
 void rcu_user_enter(void)
 {
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
 	/*
 	 * Some contexts may involve an exception occuring in an irq,
 	 * leading to that nesting:
@@ -422,7 +426,15 @@ void rcu_user_enter(void)
 	if (in_interrupt())
 		return;
 
-	rcu_eqs_enter(1);
+	WARN_ON_ONCE(!current->mm);
+
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	if (!rdtp->in_user) {
+		rdtp->in_user = true;
+		rcu_eqs_enter(1);
+	}
+	local_irq_restore(flags);
 }
 
 /**
@@ -516,11 +528,9 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
  */
 static void rcu_eqs_exit(bool user)
 {
-	unsigned long flags;
 	struct rcu_dynticks *rdtp;
 	long long oldval;
 
-	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE(oldval < 0);
@@ -529,7 +539,6 @@ static void rcu_eqs_exit(bool user)
 	else
 		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 	rcu_eqs_exit_common(rdtp, oldval, user);
-	local_irq_restore(flags);
 }
 
 /**
@@ -545,7 +554,11 @@ static void rcu_eqs_exit(bool user)
  */
 void rcu_idle_exit(void)
 {
+	unsigned long flags;
+
+	local_irq_save(flags);
 	rcu_eqs_exit(0);
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
 
@@ -558,6 +571,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit);
  */
 void rcu_user_exit(void)
 {
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
 	/*
 	 * Some contexts may involve an exception occuring in an irq,
 	 * leading to that nesting:
@@ -569,7 +585,13 @@ void rcu_user_exit(void)
 	if (in_interrupt())
 		return;
 
-	rcu_eqs_exit(1);
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	if (rdtp->in_user) {
+		rdtp->in_user = false;
+		rcu_eqs_exit(1);
+	}
+	local_irq_restore(flags);
 }
 
 /**
@@ -2586,6 +2608,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
 	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+#ifdef CONFIG_RCU_USER_QS
+	WARN_ON_ONCE(rdp->dynticks->in_user);
+#endif
 	rdp->cpu = cpu;
 	rdp->rsp = rsp;
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
--
cgit v1.2.3

From 1e1a689f10a27a4fe1ab9b4c6db04fa7232746a5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 11 Jul 2012 20:26:32 +0200
Subject: rcu: Ignore userspace extended quiescent state by default

By default we don't want to enter the RCU extended quiescent state
while in userspace, because doing so produces some overhead (e.g., use
of the syscall slow path). Leave it off by default, ready to be
enabled when a feature like the adaptive tickless mode needs it.

Signed-off-by: Frederic Weisbecker
Cc: Alessio Igor Bogani
Cc: Andrew Morton
Cc: Avi Kivity
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Geoff Levand
Cc: Gilad Ben Yossef
Cc: Hakan Akkan
Cc: H. Peter Anvin
Cc: Ingo Molnar
Cc: Josh Triplett
Cc: Kevin Hilman
Cc: Max Krasnyansky
Cc: Peter Zijlstra
Cc: Stephen Hemminger
Cc: Steven Rostedt
Cc: Sven-Thorsten Dietrich
Cc: Thomas Gleixner
Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
---
 kernel/rcutree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel/rcutree.c')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d62c04482228..6b82a9565149 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -206,6 +206,9 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
 	.dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_RCU_USER_QS
+	.ignore_user_qs = true,
+#endif
 };
 
 static int blimit = 10;		/* Maximum callbacks per rcu_do_batch. */
@@ -430,7 +433,7 @@ void rcu_user_enter(void)
 
 	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
-	if (!rdtp->in_user) {
+	if (!rdtp->ignore_user_qs && !rdtp->in_user) {
 		rdtp->in_user = true;
 		rcu_eqs_enter(1);
 	}
--
cgit v1.2.3
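Taken together, the two patches above leave rcu_user_enter() guarded by a pair of per-CPU flags: ignore_user_qs makes the feature opt-in, and in_user makes repeated calls harmless. A standalone restatement of that guard logic for readability, using an invented stand-in type rather than the real struct rcu_dynticks:

#include <stdbool.h>

/* Stand-ins for the relevant rcu_dynticks fields (illustrative only). */
struct user_qs_state {
	bool ignore_user_qs;	/* defaults to true: user QS is opt-in */
	bool in_user;		/* set while the CPU is already in user QS */
};

/* Returns true when rcu_eqs_enter() would really be invoked. */
static bool should_enter_user_qs(struct user_qs_state *s)
{
	if (s->ignore_user_qs)	/* feature disabled: do nothing */
		return false;
	if (s->in_user)		/* already in user QS: nested call is a no-op */
		return false;
	s->in_user = true;
	return true;
}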
From 04e7e951532b390b16feb070be9972b8fad2fc57 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 16 Jul 2012 15:06:40 -0700
Subject: rcu: Switch task's syscall hooks on context switch

Clear a task's syscalls hook when it is scheduled out, so that if the
task migrates it doesn't run the syscall slow path on a CPU that might
not need it. Also set the syscalls hook on the next task if needed.

Signed-off-by: Frederic Weisbecker
Cc: Alessio Igor Bogani
Cc: Andrew Morton
Cc: Avi Kivity
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Geoff Levand
Cc: Gilad Ben Yossef
Cc: Hakan Akkan
Cc: H. Peter Anvin
Cc: Ingo Molnar
Cc: Josh Triplett
Cc: Kevin Hilman
Cc: Max Krasnyansky
Cc: Peter Zijlstra
Cc: Stephen Hemminger
Cc: Steven Rostedt
Cc: Sven-Thorsten Dietrich
Cc: Thomas Gleixner
Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
---
 kernel/rcutree.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel/rcutree.c')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b82a9565149..d2e74c8d4b0e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -717,6 +717,21 @@ int rcu_is_cpu_idle(void)
 }
 EXPORT_SYMBOL(rcu_is_cpu_idle);
 
+#ifdef CONFIG_RCU_USER_QS
+void rcu_user_hooks_switch(struct task_struct *prev,
+			   struct task_struct *next)
+{
+	struct rcu_dynticks *rdtp;
+
+	/* Interrupts are disabled in context switch */
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	if (!rdtp->ignore_user_qs) {
+		clear_tsk_thread_flag(prev, TIF_NOHZ);
+		set_tsk_thread_flag(next, TIF_NOHZ);
+	}
+}
+#endif /* #ifdef CONFIG_RCU_USER_QS */
+
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 
 /*
--
cgit v1.2.3

From 1fd2b4425a5702c112b441e20b250ac8833a9608 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 11 Jul 2012 20:26:40 +0200
Subject: rcu: Userspace RCU extended QS selftest

Provide a config option that enables the userspace RCU extended
quiescent state on all CPUs by default. This is for testing purposes.

Signed-off-by: Frederic Weisbecker
Cc: Alessio Igor Bogani
Cc: Andrew Morton
Cc: Avi Kivity
Cc: Chris Metcalf
Cc: Christoph Lameter
Cc: Geoff Levand
Cc: Gilad Ben Yossef
Cc: Hakan Akkan
Cc: H. Peter Anvin
Cc: Ingo Molnar
Cc: Josh Triplett
Cc: Kevin Hilman
Cc: Max Krasnyansky
Cc: Peter Zijlstra
Cc: Stephen Hemminger
Cc: Steven Rostedt
Cc: Sven-Thorsten Dietrich
Cc: Thomas Gleixner
Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
---
 kernel/rcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/rcutree.c')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2e74c8d4b0e..812d04b6b395 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -206,7 +206,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
 	.dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_RCU_USER_QS
+#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
 	.ignore_user_qs = true,
 #endif
 };
--
cgit v1.2.3
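The TIF_NOHZ flag that rcu_user_hooks_switch() migrates between tasks is what steers each task into or away from the syscall slow path. A hedged sketch of how an architecture's entry code might consume it; the function name below is hypothetical, and TIF_NOHZ itself must be provided by the architecture:

/*
 * Hypothetical arch syscall-entry work: only tasks that were flagged
 * by rcu_user_hooks_switch() pay for the RCU user-QS bookkeeping.
 */
#include <linux/thread_info.h>
#include <linux/rcupdate.h>

void syscall_entry_slowpath(void)
{
	if (test_thread_flag(TIF_NOHZ))
		rcu_user_exit();	/* leaving userspace: RCU must watch us */
}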
From cb349ca95407cbc11424d5e9fc7c8e700709041b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 4 Sep 2012 17:35:31 -0700
Subject: rcu: Apply micro-optimization and int/bool fixes to RCU's idle handling

Checking "user" before "is_idle_task()" allows better optimizations in
cases where inlining is possible. Also, "bool" parameters should be
passed "true" or "false" rather than "1" or "0". This commit therefore
makes these changes, as noted in Josh's review.

Reported-by: Josh Triplett
Signed-off-by: Paul E. McKenney
Reviewed-by: Josh Triplett
---
 kernel/rcutree.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel/rcutree.c')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 812d04b6b395..4fb2376ddf06 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -335,7 +335,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
 				  bool user)
 {
 	trace_rcu_dyntick("Start", oldval, 0);
-	if (!is_idle_task(current) && !user) {
+	if (!user && !is_idle_task(current)) {
 		struct task_struct *idle = idle_task(smp_processor_id());
 
 		trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
@@ -399,7 +399,7 @@ void rcu_idle_enter(void)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	rcu_eqs_enter(0);
+	rcu_eqs_enter(false);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -435,7 +435,7 @@ void rcu_user_enter(void)
 	rdtp = &__get_cpu_var(rcu_dynticks);
 	if (!rdtp->ignore_user_qs && !rdtp->in_user) {
 		rdtp->in_user = true;
-		rcu_eqs_enter(1);
+		rcu_eqs_enter(true);
 	}
 	local_irq_restore(flags);
 }
@@ -492,7 +492,7 @@ void rcu_irq_exit(void)
 	if (rdtp->dynticks_nesting)
 		trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
 	else
-		rcu_eqs_enter_common(rdtp, oldval, 1);
+		rcu_eqs_enter_common(rdtp, oldval, true);
 	local_irq_restore(flags);
 }
@@ -513,7 +513,7 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
 	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
 	rcu_cleanup_after_idle(smp_processor_id());
 	trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
-	if (!is_idle_task(current) && !user) {
+	if (!user && !is_idle_task(current)) {
 		struct task_struct *idle = idle_task(smp_processor_id());
 
 		trace_rcu_dyntick("Error on exit: not idle task",
@@ -560,7 +560,7 @@ void rcu_idle_exit(void)
 	unsigned long flags;
 
 	local_irq_save(flags);
-	rcu_eqs_exit(0);
+	rcu_eqs_exit(false);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -592,7 +592,7 @@ void rcu_user_exit(void)
 	rdtp = &__get_cpu_var(rcu_dynticks);
 	if (rdtp->in_user) {
 		rdtp->in_user = false;
-		rcu_eqs_exit(1);
+		rcu_eqs_exit(true);
 	}
 	local_irq_restore(flags);
 }
@@ -653,7 +653,7 @@ void rcu_irq_enter(void)
 	if (oldval)
 		trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
 	else
-		rcu_eqs_exit_common(rdtp, oldval, 1);
+		rcu_eqs_exit_common(rdtp, oldval, true);
 	local_irq_restore(flags);
 }
--
cgit v1.2.3
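The ordering change in this last patch is a small inlining micro-optimization: once rcu_eqs_enter(false) or rcu_eqs_enter(true) is inlined, testing "user" first lets the compiler prove the whole branch dead and drop the is_idle_task() call. A standalone userspace illustration of the same effect, with stub names standing in for the kernel helpers:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for is_idle_task(current); always false here. */
static bool is_idle_task_stub(void)
{
	return false;
}

static inline void eqs_enter(bool user)
{
	/*
	 * With 'user' tested first, eqs_enter(true) lets the compiler
	 * fold the condition to false at compile time, so the call to
	 * is_idle_task_stub() disappears entirely after inlining.
	 */
	if (!user && !is_idle_task_stub())
		printf("entering EQS from a non-idle, non-user context\n");
}

int main(void)
{
	eqs_enter(true);	/* branch folds away when inlined */
	eqs_enter(false);	/* stub is actually evaluated */
	return 0;
}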