From ed2d372c0738386b8a184a6a6bea9c16df6ffb68 Mon Sep 17 00:00:00 2001 From: Christian Dietrich Date: Mon, 6 Sep 2010 16:37:05 +0200 Subject: sched: Remove unnecessary #ifdef CONFIG_SMP The CONFIG_SMP ifdef isn't necessary at this point, because it is checked in an outer ifdef level already and has no effect here. Cleanup only, no functional effect. Signed-off-by: Christian Dietrich Cc: vamos-dev@i4.informatik.uni-erlangen.de Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Tejun Heo LKML-Reference: <7a3a39ef3f765a4473cb026b1f204059568a7098.1283782701.git.qy03fugy@stud.informatik.uni-erlangen.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 09b574e7f4df..8eef8e5512d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -426,9 +426,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -437,7 +435,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. -- cgit v1.2.3 From f269893c575167447cc9f6d1867e639fb5b6f0c5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Aug 2010 10:28:15 +0200 Subject: sched: Merge cpu_to_core_group functions Merge and simplify the two cpu_to_core_group variants so that the resulting function follows the same pattern like cpu_to_phys_group. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <20100831082843.953617555@de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8eef8e5512d4..1a0c084b1cf9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6552,31 +6552,23 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct static_sched_domain, core_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int cpu_to_core_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; - +#ifdef CONFIG_SCHED_SMT cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); +#else + group = cpu; +#endif if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } -#elif defined(CONFIG_SCHED_MC) -static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) -{ - if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; -} -#endif +#endif /* CONFIG_SCHED_MC */ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); -- cgit v1.2.3 From 01a08546af311c065f34727787dd0cc8dc0c216f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 31 Aug 2010 10:28:16 +0200 Subject: sched: Add book scheduling domain On top of the SMT and MC scheduling domains this adds the BOOK scheduling domain. This is useful for NUMA like machines which do not have an interface which tells which piece of memory is attached to which node or where the hardware performs striping. Signed-off-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <20100831082844.253053798@de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 1a0c084b1cf9..26f83e2f1534 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6506,6 +6506,7 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t this_sibling_map; cpumask_var_t this_core_map; + cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6517,6 +6518,7 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, + sa_this_book_map, sa_this_core_map, sa_this_sibling_map, sa_nodemask, @@ -6570,6 +6572,31 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map, } #endif /* CONFIG_SCHED_MC */ +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); + +static int +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) +{ + int group = cpu; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); +#endif + if (sg) + *sg = &per_cpu(sched_group_book, group).sg; + return group; +} +#endif /* CONFIG_SCHED_BOOK */ + static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -6578,7 +6605,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK + cpumask_and(mask, cpu_book_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC) cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6839,6 +6869,9 @@ SD_INIT_FUNC(CPU) #ifdef CONFIG_SCHED_MC SD_INIT_FUNC(MC) #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif static int default_relax_domain_level = -1; @@ -6888,6 +6921,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_book_map: + free_cpumask_var(d->this_book_map); /* fall through */ case sa_this_core_map: free_cpumask_var(d->this_core_map); /* fall through */ case sa_this_sibling_map: @@ -6934,8 +6969,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) return sa_this_core_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_book_map; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -6993,6 +7030,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_book_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK + sd = &per_cpu(book_domains, i).sd; + SD_INIT(sd, BOOK); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + static struct sched_domain *__build_mc_sched_domain(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *parent, int i) @@ -7049,6 +7103,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_core_group, d->send_covered, d->tmpmask); break; +#endif +#ifdef CONFIG_SCHED_BOOK + case SD_LV_BOOK: /* set up book groups */ + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); + if (cpu == cpumask_first(d->this_book_map)) + init_sched_build_groups(d->this_book_map, cpu_map, + &cpu_to_book_group, + d->send_covered, d->tmpmask); + break; #endif case SD_LV_CPU: /* set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); @@ -7097,12 +7160,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); } @@ -7133,6 +7198,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, init_sched_groups_power(i, sd); } #endif +#ifdef CONFIG_SCHED_BOOK + for_each_cpu(i, cpu_map) { + sd = &per_cpu(book_domains, i).sd; + init_sched_groups_power(i, sd); + } +#endif for_each_cpu(i, cpu_map) { sd = &per_cpu(phys_domains, i).sd; @@ -7158,6 +7229,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) + sd = &per_cpu(book_domains, i).sd; #else sd = &per_cpu(phys_domains, i).sd; #endif -- cgit v1.2.3 From 7740191cd909b75d75685fb08a5d1f54b8a9d28b Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 13 Sep 2010 17:47:00 -0400 Subject: sched: Fix string comparison in /proc/sched_features Fix incorrect handling of the following case: INTERACTIVE INTERACTIVE_SOMETHING_ELSE The comparison only checks up to each element's length. Changelog since v1: - Embellish using some Rostedtisms. [ mingo: ^^ == smaller and cleaner ] Signed-off-by: Mathieu Desnoyers Reviewed-by: Steven Rostedt Cc: Cc: Peter Zijlstra Cc: Tony Lindgren LKML-Reference: <20100913214700.GB16118@Krystal> Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 26f83e2f1534..b40b82e33590 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -721,7 +721,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -732,6 +732,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -739,9 +740,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else -- cgit v1.2.3 From 31915ab4cbf507aadab40847cf9989da5e88b090 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Sep 2010 14:42:25 +0200 Subject: sched: Remove branch hints within context_switch() With 710390d9 "sched: Optimize branch hint in context_switch()" the branch hint logic within context_switch() got inversed. In fact the hints "if (likely(!mm))" and "if (likely(!prev->mm))" mean that it is likely that the previous and next task are kernel threads. That assumption is certainly counter intuitive, but Tim has shown that at least with his workload this is true. Nevertheless the truth is: it depends on the current workload. So just remove the annotations which also improves readability. Reported-by: Tim Blechmann Signed-off-by: Heiko Carstens Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20100916124225.GA2209@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b40b82e33590..16a1129f51ec 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2845,14 +2845,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (likely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (likely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } -- cgit v1.2.3 From a8027073eb127cd207070891374b5c54c2ce3d23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Sep 2010 15:13:34 -0400 Subject: tracing/sched: Add sched_pi_setprio tracepoint Add a tracepoint that shows the priority of a task being boosted via priority inheritance. Cc: Gregory Haskins Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9ca8ad05950b..4ad473814350 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4355,6 +4355,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; -- cgit v1.2.3 From 6506cf6ce68d78a5470a8360c965dafe8e4b78e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Sep 2010 17:50:31 +0200 Subject: sched: fix RCU lockdep splat from task_group() This addresses the following RCU lockdep splat: [0.051203] CPU0: AMD QEMU Virtual CPU version 0.12.4 stepping 03 [0.052999] lockdep: fixing up alternatives. [0.054105] [0.054106] =================================================== [0.054999] [ INFO: suspicious rcu_dereference_check() usage. ] [0.054999] --------------------------------------------------- [0.054999] kernel/sched.c:616 invoked rcu_dereference_check() without protection! [0.054999] [0.054999] other info that might help us debug this: [0.054999] [0.054999] [0.054999] rcu_scheduler_active = 1, debug_locks = 1 [0.054999] 3 locks held by swapper/1: [0.054999] #0: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_up+0x42/0x6a [0.054999] #1: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2a/0x51 [0.054999] #2: (&rq->lock){-.-...}, at: [] init_idle+0x2f/0x113 [0.054999] [0.054999] stack backtrace: [0.054999] Pid: 1, comm: swapper Not tainted 2.6.35 #1 [0.054999] Call Trace: [0.054999] [] lockdep_rcu_dereference+0x9b/0xa3 [0.054999] [] task_group+0x7b/0x8a [0.054999] [] set_task_rq+0x13/0x40 [0.054999] [] init_idle+0xd2/0x113 [0.054999] [] fork_idle+0xb8/0xc7 [0.054999] [] ? mark_held_locks+0x4d/0x6b [0.054999] [] do_fork_idle+0x17/0x2b [0.054999] [] native_cpu_up+0x1c1/0x724 [0.054999] [] ? do_fork_idle+0x0/0x2b [0.054999] [] _cpu_up+0xac/0x127 [0.054999] [] cpu_up+0x55/0x6a [0.054999] [] kernel_init+0xe1/0x1ff [0.054999] [] kernel_thread_helper+0x4/0x10 [0.054999] [] ? restore_args+0x0/0x30 [0.054999] [] ? kernel_init+0x0/0x1ff [0.054999] [] ? kernel_thread_helper+0x0/0x10 [0.056074] Booting Node 0, Processors #1lockdep: fixing up alternatives. [0.130045] #2lockdep: fixing up alternatives. [0.203089] #3 Ok. [0.275286] Brought up 4 CPUs [0.276005] Total of 4 processors activated (16017.17 BogoMIPS). The cgroup_subsys_state structures referenced by idle tasks are never freed, because the idle tasks should be part of the root cgroup, which is not removable. The problem is that while we do in-fact hold rq->lock, the newly spawned idle thread's cpu is not yet set to the correct cpu so the lockdep check in task_group(): lockdep_is_held(&task_rq(p)->lock) will fail. But this is a chicken and egg problem. Setting the CPU's runqueue requires that the CPU's runqueue already be set. ;-) So insert an RCU read-side critical section to avoid the complaint. Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/sched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..ae8f75a5ceb4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5337,7 +5337,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) -- cgit v1.2.3 From 4924627423d5e286136ad2520f5be536345ae590 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Oct 2010 21:46:10 +0200 Subject: sched: Unindent labels Labels should be on column 0. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2111491f6424..7f522832250c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4891,7 +4891,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again: retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { @@ -8141,9 +8141,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(cfs_rq); - err: +err: return 0; } @@ -8231,9 +8231,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(rt_rq); - err: +err: return 0; } @@ -8591,7 +8591,7 @@ static int tg_set_bandwidth(struct task_group *tg, raw_spin_unlock(&rt_rq->rt_runtime_lock); } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); -- cgit v1.2.3 From 34f971f6f7988be4d014eec3e3526bee6d007ffa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Sep 2010 13:53:15 +0200 Subject: sched: Create special class for stop/migrate work In order to separate the stop/migrate work thread from the SCHED_FIFO implementation, create a special class for it that is of higher priority than SCHED_FIFO itself. This currently solves a problem where cpu-hotplug consumes so much cpu-time that the SCHED_FIFO class gets throttled, but has the bandwidth replenishment timer pending on the now dead cpu. It is also required for when we add the planned deadline scheduling class above SCHED_FIFO, as the stop/migrate thread still needs to transcent those tasks. Tested-by: Heiko Carstens Signed-off-by: Peter Zijlstra LKML-Reference: <1285165776.2275.1022.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 54 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7f522832250c..5f64fed56a44 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -486,7 +486,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; @@ -1837,7 +1837,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1917,10 +1917,41 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -3720,17 +3751,13 @@ pick_next_task(struct rq *rq) return p; } - class = sched_class_highest; - for ( ; ; ) { + for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; } + + BUG(); /* the idle class will always have a runnable task */ } /* @@ -4659,6 +4686,15 @@ recheck: */ rq = __task_rq_lock(p); + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EINVAL; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* -- cgit v1.2.3 From 17bdcf949d03306b308c5fb694849cd35f119807 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 11 Oct 2010 16:36:51 +0200 Subject: sched: Drop all load weight manipulation for RT tasks Load weights are for the CFS, they do not belong in the RT task. This makes all RT scheduling classes leave the CFS weights alone. This fixes a real bug as well: I noticed the following phonomena: a process elevated to SCHED_RR forks with SCHED_RESET_ON_FORK set, and the child is indeed SCHED_OTHER, and the niceval is indeed reset to 0. However the weight inserted by set_load_weight() remains at 0, giving the task insignificat priority. With this fix, the weight is reset to what the task had before being elevated to SCHED_RR/SCHED_FIFO. Cc: Lennart Poettering Cc: stable@kernel.org Signed-off-by: Linus Walleij Signed-off-by: Peter Zijlstra LKML-Reference: <1286807811-10568-1-git-send-email-linus.walleij@stericsson.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5f64fed56a44..728081a7ef1c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1855,12 +1855,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ -- cgit v1.2.3 From ef8002f6848236de5adc613063ebeabddea8a6fb Mon Sep 17 00:00:00 2001 From: Nikhil Rao Date: Wed, 13 Oct 2010 12:09:35 -0700 Subject: sched: Do not consider SCHED_IDLE tasks to be cache hot This patch adds a check in task_hot to return if the task has SCHED_IDLE policy. SCHED_IDLE tasks have very low weight, and when run with regular workloads, are typically scheduled many milliseconds apart. There is no need to consider these tasks hot for load balancing. Signed-off-by: Nikhil Rao Signed-off-by: Peter Zijlstra LKML-Reference: <1287173550-30365-2-git-send-email-ncrao@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 728081a7ef1c..771b518e5f1f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2025,6 +2025,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + /* * Buddy candidates are cache hot: */ -- cgit v1.2.3 From 75e1056f5c57050415b64cb761a3acc35d91f013 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:16 -0700 Subject: sched: Fix softirq time accounting Peter Zijlstra found a bug in the way softirq time is accounted in VIRT_CPU_ACCOUNTING on this thread: http://lkml.indiana.edu/hypermail//linux/kernel/1009.2/01366.html The problem is, softirq processing uses local_bh_disable internally. There is no way, later in the flow, to differentiate between whether softirq is being processed or is it just that bh has been disabled. So, a hardirq when bh is disabled results in time being wrongly accounted as softirq. Looking at the code a bit more, the problem exists in !VIRT_CPU_ACCOUNTING as well. As account_system_time() in normal tick based accouting also uses softirq_count, which will be set even when not in softirq with bh disabled. Peter also suggested solution of using 2*SOFTIRQ_OFFSET as irq count for local_bh_{disable,enable} and using just SOFTIRQ_OFFSET while softirq processing. The patch below does that and adds API in_serving_softirq() which returns whether we are currently processing softirq or not. Also changes one of the usages of softirq_count in net/sched/cls_cgroup.c to in_serving_softirq. Looks like many usages of in_softirq really want in_serving_softirq. Those changes can be made individually on a case by case basis. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-2-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 771b518e5f1f..089be8adb074 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3422,7 +3422,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (in_serving_softirq()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); -- cgit v1.2.3 From b52bfee445d315549d41eacf2fa7c156e7d153d5 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:19 -0700 Subject: sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time s390/powerpc/ia64 have support for CONFIG_VIRT_CPU_ACCOUNTING which does the fine granularity accounting of user, system, hardirq, softirq times. Adding that option on archs like x86 will be challenging however, given the state of TSC reliability on various platforms and also the overhead it will add in syscall entry exit. Instead, add a lighter variant that only does finer accounting of hardirq and softirq times, providing precise irq times (instead of timer tick based samples). This accounting is added with a new config option CONFIG_IRQ_TIME_ACCOUNTING so that there won't be any overhead for users not interested in paying the perf penalty. This accounting is based on sched_clock, with the code being generic. So, other archs may find it useful as well. This patch just adds the core logic and does not enable this logic yet. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-5-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 089be8adb074..9b302e355791 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1908,6 +1908,55 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + now = sched_clock(); + cpu = smp_processor_id(); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} + +#endif + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" -- cgit v1.2.3 From 305e6835e05513406fa12820e40e4a8ecb63743c Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:21 -0700 Subject: sched: Do not account irq time to current task Scheduler accounts both softirq and interrupt processing times to the currently running task. This means, if the interrupt processing was for some other task in the system, then the current task ends up being penalized as it gets shorter runtime than otherwise. Change sched task accounting to acoount only actual task time from currently running task. Now update_curr(), modifies the delta_exec to depend on rq->clock_task. Note that this change only handles CONFIG_IRQ_TIME_ACCOUNTING case. We can extend this to CONFIG_VIRT_CPU_ACCOUNTING with minimal effort. But, thats for later. This change will impact scheduling behavior in interrupt heavy conditions. Tested on a 4-way system with eth0 handled by CPU 2 and a network heavy task (nc) running on CPU 3 (and no RSS/RFS). With that I have CPU 2 spending 75%+ of its time in irq processing. CPU 3 spending around 35% time running nc task. Now, if I run another CPU intensive task on CPU 2, without this change /proc//schedstat shows 100% of time accounted to this task. With this change, it rightly shows less than 25% accounted to this task as remaining time is actually spent on irq processing. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-7-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9b302e355791..9e01b7100ef6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -491,6 +491,7 @@ struct rq { struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -641,10 +642,19 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ +static u64 irq_time_cpu(int cpu); + inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) { + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; + } } /* @@ -1910,6 +1920,18 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. + */ static DEFINE_PER_CPU(u64, cpu_hardirq_time); static DEFINE_PER_CPU(u64, cpu_softirq_time); @@ -1926,6 +1948,14 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + void account_system_vtime(struct task_struct *curr) { unsigned long flags; @@ -1955,6 +1985,13 @@ void account_system_vtime(struct task_struct *curr) local_irq_restore(flags); } +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} + #endif #include "sched_idletask.c" @@ -3322,7 +3359,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock - p->se.exec_start; + ns = rq->clock_task - p->se.exec_start; if ((s64)ns < 0) ns = 0; } -- cgit v1.2.3 From aa483808516ca5cacfa0e5849691f64fec25828e Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:22 -0700 Subject: sched: Remove irq time from available CPU power The idea was suggested by Peter Zijlstra here: http://marc.info/?l=linux-kernel&m=127476934517534&w=2 irq time is technically not available to the tasks running on the CPU. This patch removes irq time from CPU power piggybacking on sched_rt_avg_update(). Tested this by keeping CPU X busy with a network intensive task having 75% oa a single CPU irq processing (hard+soft) on a 4-way system. And start seven cycle soakers on the system. Without this change, there will be two tasks on each CPU. With this change, there is a single task on irq busy CPU X and remaining 7 tasks are spread around among other 3 CPUs. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-8-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9e01b7100ef6..bff9ef537df0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -519,6 +519,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -643,6 +647,7 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ static u64 irq_time_cpu(int cpu); +static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); inline void update_rq_clock(struct rq *rq) { @@ -654,6 +659,8 @@ inline void update_rq_clock(struct rq *rq) irq_time = irq_time_cpu(cpu); if (rq->clock - irq_time > rq->clock_task) rq->clock_task = rq->clock - irq_time; + + sched_irq_time_avg_update(rq, irq_time); } } @@ -1985,6 +1992,15 @@ void account_system_vtime(struct task_struct *curr) local_irq_restore(flags); } +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +{ + if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { + u64 delta_irq = curr_irq_time - rq->prev_irq_time; + rq->prev_irq_time = curr_irq_time; + sched_rt_avg_update(rq, delta_irq); + } +} + #else static u64 irq_time_cpu(int cpu) @@ -1992,6 +2008,8 @@ static u64 irq_time_cpu(int cpu) return 0; } +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } + #endif #include "sched_idletask.c" -- cgit v1.2.3 From d267f87fb8179c6dba03d08b91952e81bc3723c7 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 4 Oct 2010 17:03:23 -0700 Subject: sched: Call tick_check_idle before __irq_enter When CPU is idle and on first interrupt, irq_enter calls tick_check_idle() to notify interruption from idle. But, there is a problem if this call is done after __irq_enter, as all routines in __irq_enter may find stale time due to yet to be done tick_check_idle. Specifically, trace calls in __irq_enter when they use global clock and also account_system_vtime change in this patch as it wants to use sched_clock_cpu() to do proper irq timing. But, tick_check_idle was moved after __irq_enter intentionally to prevent problem of unneeded ksoftirqd wakeups by the commit ee5f80a: irq: call __irq_enter() before calling the tick_idle_check Impact: avoid spurious ksoftirqd wakeups Moving tick_check_idle() before __irq_enter and wrapping it with local_bh_enable/disable would solve both the problems. Fixed-by: Yong Zhang Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1286237003-12406-9-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index bff9ef537df0..567f5cb9808c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1974,8 +1974,8 @@ void account_system_vtime(struct task_struct *curr) local_irq_save(flags); - now = sched_clock(); cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); delta = now - per_cpu(irq_start_time, cpu); per_cpu(irq_start_time, cpu) = now; /* -- cgit v1.2.3 From b7dadc38797584f6203386da1947ed5edf516646 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Oct 2010 20:00:37 +0200 Subject: sched: Export account_system_vtime() KVM uses it for example: ERROR: "account_system_vtime" [arch/x86/kvm/kvm.ko] undefined! Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <1286237003-12406-3-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 567f5cb9808c..5998222f901c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1991,6 +1991,7 @@ void account_system_vtime(struct task_struct *curr) local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(account_system_vtime); static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { -- cgit v1.2.3 From b0ae19811375031ae3b3fecc65b702a9c6e5cc28 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Oct 2010 04:21:18 +0900 Subject: security: remove unused parameter from security_task_setscheduler() All security modules shouldn't change sched_param parameter of security_task_setscheduler(). This is not only meaningless, but also make a harmful result if caller pass a static variable. This patch remove policy and sched_param parameter from security_task_setscheduler() becuase none of security module is using it. Cc: James Morris Signed-off-by: KOSAKI Motohiro Signed-off-by: James Morris --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..df6579d9b4df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4645,7 +4645,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4887,7 +4887,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; -- cgit v1.2.3