summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile15
-rw-r--r--kernel/audit.c20
-rw-r--r--kernel/audit_watch.c4
-rw-r--r--kernel/auditfilter.c6
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c168
-rw-r--r--kernel/bpf/core.c308
-rw-r--r--kernel/bpf/hashtab.c458
-rw-r--r--kernel/bpf/helpers.c17
-rw-r--r--kernel/bpf/inode.c7
-rw-r--r--kernel/bpf/percpu_freelist.c100
-rw-r--r--kernel/bpf/percpu_freelist.h31
-rw-r--r--kernel/bpf/stackmap.c290
-rw-r--r--kernel/bpf/syscall.c122
-rw-r--r--kernel/bpf/verifier.c831
-rw-r--r--kernel/cgroup.c1467
-rw-r--r--kernel/cpu.c65
-rw-r--r--kernel/cpuset.c14
-rw-r--r--kernel/events/callchain.c67
-rw-r--r--kernel/events/core.c1140
-rw-r--r--kernel/events/internal.h12
-rw-r--r--kernel/events/ring_buffer.c134
-rw-r--r--kernel/events/uprobes.c18
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/futex.c27
-rw-r--r--kernel/hung_task.c21
-rw-r--r--kernel/irq/ipi.c46
-rw-r--r--kernel/irq/irqdesc.c26
-rw-r--r--kernel/irq/irqdomain.c19
-rw-r--r--kernel/irq/manage.c6
-rw-r--r--kernel/kcov.c274
-rw-r--r--kernel/kexec_core.c7
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lockdep.c191
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/locktorture.c25
-rw-r--r--kernel/locking/qspinlock_stat.h32
-rw-r--r--kernel/locking/rwsem-spinlock.c19
-rw-r--r--kernel/locking/rwsem-xadd.c38
-rw-r--r--kernel/locking/rwsem.c19
-rw-r--r--kernel/memremap.c27
-rw-r--r--kernel/nsproxy.c19
-rw-r--r--kernel/panic.c61
-rw-r--r--kernel/power/hibernate.c1
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/power/swap.c18
-rw-r--r--kernel/printk/printk.c140
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c5
-rw-r--r--kernel/rcu/Makefile5
-rw-r--r--kernel/rcu/rcuperf.c655
-rw-r--r--kernel/rcu/rcutorture.c29
-rw-r--r--kernel/rcu/tree.c302
-rw-r--r--kernel/rcu/tree.h20
-rw-r--r--kernel/rcu/tree_plugin.h37
-rw-r--r--kernel/rcu/tree_trace.c13
-rw-r--r--kernel/rcu/update.c4
-rw-r--r--kernel/resource.c13
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/clock.c48
-rw-r--r--kernel/sched/core.c838
-rw-r--r--kernel/sched/cpuacct.c174
-rw-r--r--kernel/sched/cpuacct.h4
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq.c48
-rw-r--r--kernel/sched/cpufreq_schedutil.c530
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/deadline.c56
-rw-r--r--kernel/sched/debug.c10
-rw-r--r--kernel/sched/fair.c541
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/loadavg.c11
-rw-r--r--kernel/sched/rt.c39
-rw-r--r--kernel/sched/sched.h161
-rw-r--r--kernel/sched/stop_task.c2
-rw-r--r--kernel/seccomp.c4
-rw-r--r--kernel/signal.c37
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/sysctl.c22
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/taskstats.c37
-rw-r--r--kernel/time/hrtimer.c18
-rw-r--r--kernel/time/tick-sched.c88
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/time.c8
-rw-r--r--kernel/time/timer.c15
-rw-r--r--kernel/torture.c4
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--kernel/trace/bpf_trace.c133
-rw-r--r--kernel/trace/ftrace.c48
-rw-r--r--kernel/trace/power-traces.c1
-rw-r--r--kernel/trace/ring_buffer.c35
-rw-r--r--kernel/trace/trace.c76
-rw-r--r--kernel/trace/trace.h113
-rw-r--r--kernel/trace/trace_event_perf.c43
-rw-r--r--kernel/trace/trace_events.c27
-rw-r--r--kernel/trace/trace_events_filter.c12
-rw-r--r--kernel/trace/trace_events_trigger.c88
-rw-r--r--kernel/trace/trace_functions.c6
-rw-r--r--kernel/trace/trace_functions_graph.c7
-rw-r--r--kernel/trace/trace_irqsoff.c9
-rw-r--r--kernel/trace/trace_kprobe.c37
-rw-r--r--kernel/trace/trace_mmiotrace.c2
-rw-r--r--kernel/trace/trace_nop.c4
-rw-r--r--kernel/trace/trace_output.c10
-rw-r--r--kernel/trace/trace_printk.c3
-rw-r--r--kernel/trace/trace_probe.c4
-rw-r--r--kernel/trace/trace_stat.c3
-rw-r--r--kernel/trace/trace_syscalls.c24
-rw-r--r--kernel/trace/trace_uprobe.c7
-rw-r--r--kernel/tracepoint.c2
-rw-r--r--kernel/watchdog.c9
-rw-r--r--kernel/workqueue.c49
-rw-r--r--kernel/workqueue_internal.h2
117 files changed, 8426 insertions, 2490 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf008ecb3..f0c40bf49d9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -14,11 +14,21 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
-# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+# Do not trace internal ftrace files
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -69,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 2651e423b2dc..678c3f000191 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -809,6 +809,16 @@ static int audit_set_feature(struct sk_buff *skb)
return 0;
}
+static int audit_replace(pid_t pid)
+{
+ struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0,
+ &pid, sizeof(pid));
+
+ if (!skb)
+ return -ENOMEM;
+ return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
+}
+
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
@@ -870,9 +880,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
if (s.mask & AUDIT_STATUS_PID) {
int new_pid = s.pid;
+ pid_t requesting_pid = task_tgid_vnr(current);
- if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+ if ((!new_pid) && (requesting_pid != audit_pid)) {
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
return -EACCES;
+ }
+ if (audit_pid && new_pid &&
+ audit_replace(requesting_pid) != -ECONNREFUSED) {
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
+ return -EEXIST;
+ }
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_pid = new_pid;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9f194aad0adc..d6709eb70970 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -185,7 +185,7 @@ static struct audit_watch *audit_init_watch(char *path)
return watch;
}
-/* Translate a watch string to kernel respresentation. */
+/* Translate a watch string to kernel representation. */
int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
{
struct audit_watch *watch;
@@ -367,7 +367,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
inode_unlock(d_backing_inode(parent->dentry));
if (d_is_positive(d)) {
/* update watch filter fields */
- watch->dev = d_backing_inode(d)->i_sb->s_dev;
+ watch->dev = d->d_sb->s_dev;
watch->ino = d_backing_inode(d)->i_ino;
}
dput(d);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index b8ff9e193753..94ca7b1e5e7e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -158,7 +158,7 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
return str;
}
-/* Translate an inode field to kernel respresentation. */
+/* Translate an inode field to kernel representation. */
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
@@ -415,7 +415,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
return 0;
}
-/* Translate struct audit_rule_data to kernel's rule respresentation. */
+/* Translate struct audit_rule_data to kernel's rule representation. */
static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t datasz)
{
@@ -593,7 +593,7 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule_data. */
+/* Translate kernel rule representation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
struct audit_rule_data *data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 195ffaee50b9..7d0e3cf8abe1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
- signr, syscall_get_arch(), syscall, is_compat_task(),
- KSTK_EIP(current), code);
+ signr, syscall_get_arch(), syscall,
+ in_compat_syscall(), KSTK_EIP(current), code);
audit_log_end(ab);
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 13272582eee0..eed911d091da 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,4 +1,7 @@
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 89ebbc4d1164..76d5a794e426 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -17,15 +17,43 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ free_percpu(array->pptrs[i]);
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+ void __percpu *ptr;
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++) {
+ ptr = __alloc_percpu_gfp(array->elem_size, 8,
+ GFP_USER | __GFP_NOWARN);
+ if (!ptr) {
+ bpf_array_free_percpu(array);
+ return -ENOMEM;
+ }
+ array->pptrs[i] = ptr;
+ }
+
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
struct bpf_array *array;
- u32 elem_size, array_size;
+ u64 array_size;
+ u32 elem_size;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0)
+ attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL);
if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
@@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
- /* check round_up into zero and u32 overflow */
- if (elem_size == 0 ||
- attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+ array_size = sizeof(*array);
+ if (percpu)
+ array_size += (u64) attr->max_entries * sizeof(void *);
+ else
+ array_size += (u64) attr->max_entries * elem_size;
+
+ /* make sure there is no u32 overflow later in round_up() */
+ if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
- array_size = sizeof(*array) + attr->max_entries * elem_size;
/* allocate all map elements and zero-initialize them */
array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
/* copy mandatory map attributes */
+ array->map.map_type = attr->map_type;
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
- array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
array->elem_size = elem_size;
+ if (!percpu)
+ goto out;
+
+ array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
+
+ if (array_size >= U32_MAX - PAGE_SIZE ||
+ elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+ kvfree(array);
+ return ERR_PTR(-ENOMEM);
+ }
+out:
+ array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+
return &array->map;
}
@@ -67,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
return NULL;
return array->value + array->elem_size * index;
}
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return this_cpu_ptr(array->pptrs[index]);
+}
+
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -ENOENT;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ pptr = array->pptrs[index];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -99,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (map_flags > BPF_EXIST)
+ if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (map_flags == BPF_NOEXIST)
+ if (unlikely(map_flags == BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- memcpy(array->value + array->elem_size * index, value, map->value_size);
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ memcpy(this_cpu_ptr(array->pptrs[index]),
+ value, map->value_size);
+ else
+ memcpy(array->value + array->elem_size * index,
+ value, map->value_size);
+ return 0;
+}
+
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ if (unlikely(index >= array->map.max_entries))
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (unlikely(map_flags == BPF_NOEXIST))
+ /* all elements already exist */
+ return -EEXIST;
+
+ /* the user space will provide round_up(value_size, 8) bytes that
+ * will be copied into per-cpu area. bpf programs can only access
+ * value_size of it. During lookup the same extra bytes will be
+ * returned or zeros which were zero-filled by percpu_alloc,
+ * so no kernel data leaks possible
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ pptr = array->pptrs[index];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ off += size;
+ }
+ rcu_read_unlock();
return 0;
}
@@ -133,6 +259,9 @@ static void array_map_free(struct bpf_map *map)
*/
synchronize_rcu();
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ bpf_array_free_percpu(array);
+
kvfree(array);
}
@@ -150,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
.type = BPF_MAP_TYPE_ARRAY,
};
+static const struct bpf_map_ops percpu_array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list percpu_array_type __read_mostly = {
+ .ops = &percpu_array_ops,
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+};
+
static int __init register_array_map(void)
{
bpf_register_map_type(&array_type);
+ bpf_register_map_type(&percpu_array_type);
return 0;
}
late_initcall(register_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 972d9a8e4ac4..f1e8a0def99b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -27,6 +27,7 @@
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <linux/bpf.h>
+#include <linux/frame.h>
#include <asm/unaligned.h>
@@ -128,14 +129,83 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
return fp;
}
-EXPORT_SYMBOL_GPL(bpf_prog_realloc);
void __bpf_prog_free(struct bpf_prog *fp)
{
kfree(fp->aux);
vfree(fp);
}
-EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_JMP &&
+ /* Call and Exit are both special jumps with no
+ * target inside the BPF instruction image.
+ */
+ BPF_OP(insn->code) != BPF_CALL &&
+ BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 i, insn_cnt = prog->len;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_is_jmp_and_has_target(insn))
+ continue;
+
+ /* Adjust offset of jmps if we cross boundaries. */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+ insn->off -= delta;
+ }
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ struct bpf_prog *prog_adj;
+
+ /* Since our patchlet doesn't expand the image, we're done. */
+ if (insn_delta == 0) {
+ memcpy(prog->insnsi + off, patch, sizeof(*patch));
+ return prog;
+ }
+
+ insn_adj_cnt = prog->len + insn_delta;
+
+ /* Several new instructions need to be inserted. Make room
+ * for them. Likely, there's no need for a new allocation as
+ * last page could have large enough tailroom.
+ */
+ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+ GFP_USER);
+ if (!prog_adj)
+ return NULL;
+
+ prog_adj->len = insn_adj_cnt;
+
+ /* Patching happens in 3 steps:
+ *
+ * 1) Move over tail of insnsi from next instruction onwards,
+ * so we can patch the single target insn with one or more
+ * new ones (patching is always from 1 to n insns, n > 0).
+ * 2) Inject new instructions at the target location.
+ * 3) Adjust branch offsets if necessary.
+ */
+ insn_rest = insn_adj_cnt - off - len;
+
+ memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+ sizeof(*patch) * insn_rest);
+ memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+ bpf_adj_branches(prog_adj, off, insn_delta);
+
+ return prog_adj;
+}
#ifdef CONFIG_BPF_JIT
struct bpf_binary_header *
@@ -173,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
module_memfree(hdr);
}
+
+int bpf_jit_harden __read_mostly;
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+ const struct bpf_insn *aux,
+ struct bpf_insn *to_buff)
+{
+ struct bpf_insn *to = to_buff;
+ u32 imm_rnd = prandom_u32();
+ s16 off;
+
+ BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
+ BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+ if (from->imm == 0 &&
+ (from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
+ from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+ *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+ goto out;
+ }
+
+ switch (from->code) {
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+ break;
+
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+ break;
+
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
+ *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+ break;
+
+ case BPF_LD | BPF_IMM | BPF_DW:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+ break;
+ case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_ST | BPF_MEM | BPF_DW:
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+ break;
+ }
+out:
+ return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ kmemcheck_annotate_bitfield(fp, meta);
+
+ /* aux->prog still points to the fp_other one, so
+ * when promoting the clone to the real program,
+ * this still needs to be adapted.
+ */
+ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+ }
+
+ return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+ /* aux was stolen by the other clone, so we cannot free
+ * it from this path! It will be freed eventually by the
+ * other program on release.
+ *
+ * At this point, we don't need a deferred release since
+ * clone is guaranteed to not be locked.
+ */
+ fp->aux = NULL;
+ __bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+ /* We have to repoint aux->prog to self, as we don't
+ * know whether fp here is the clone or the original.
+ */
+ fp->aux->prog = fp;
+ bpf_prog_clone_free(fp_other);
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+ struct bpf_insn insn_buff[16], aux[2];
+ struct bpf_prog *clone, *tmp;
+ int insn_delta, insn_cnt;
+ struct bpf_insn *insn;
+ int i, rewritten;
+
+ if (!bpf_jit_blinding_enabled())
+ return prog;
+
+ clone = bpf_prog_clone_create(prog, GFP_USER);
+ if (!clone)
+ return ERR_PTR(-ENOMEM);
+
+ insn_cnt = clone->len;
+ insn = clone->insnsi;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ /* We temporarily need to hold the original ld64 insn
+ * so that we can still access the first part in the
+ * second blinding run.
+ */
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn[1].code == 0)
+ memcpy(aux, insn, sizeof(aux));
+
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+ if (!rewritten)
+ continue;
+
+ tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+ if (!tmp) {
+ /* Patching may have repointed aux->prog during
+ * realloc from the original one, so we need to
+ * fix it up here on error.
+ */
+ bpf_jit_prog_release_other(prog, clone);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ clone = tmp;
+ insn_delta = rewritten - 1;
+
+ /* Walk new program and skip insns we just inserted. */
+ insn = clone->insnsi + i + insn_delta;
+ insn_cnt += insn_delta;
+ i += insn_delta;
+ }
+
+ return clone;
+}
#endif /* CONFIG_BPF_JIT */
/* Base function for offset calculation. Needs to go into .text section,
@@ -649,6 +922,7 @@ load_byte:
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
+STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
@@ -690,15 +964,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
+ * @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
* The BPF program will be executed via BPF_PROG_RUN() macro.
*/
-int bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
fp->bpf_func = (void *) __bpf_prog_run;
- bpf_int_jit_compile(fp);
+ /* eBPF JITs can rewrite the program in case constant
+ * blinding is active. However, in case of error during
+ * blinding, bpf_int_jit_compile() must always return a
+ * valid program, which in this case would simply not
+ * be JITed, but falls back to the interpreter.
+ */
+ fp = bpf_int_jit_compile(fp);
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
@@ -706,7 +987,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
* with JITed or non JITed program concatenations and not
* all eBPF JITs might immediately support all features.
*/
- return bpf_check_tail_call(fp);
+ *err = bpf_check_tail_call(fp);
+
+ return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
@@ -762,14 +1045,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+{
+ return NULL;
+}
+
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
.func = NULL,
@@ -781,8 +1071,14 @@ const struct bpf_func_proto bpf_tail_call_proto = {
};
/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ return prog;
+}
+
+bool __weak bpf_helper_changes_skb_data(void *func)
{
+ return false;
}
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c5b30fd8a315..fff3650d52fc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -13,6 +14,7 @@
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>
+#include "percpu_freelist.h"
struct bucket {
struct hlist_head head;
@@ -22,6 +24,8 @@ struct bucket {
struct bpf_htab {
struct bpf_map map;
struct bucket *buckets;
+ void *elems;
+ struct pcpu_freelist freelist;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
@@ -29,26 +33,108 @@ struct bpf_htab {
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
- struct hlist_node hash_node;
+ union {
+ struct hlist_node hash_node;
+ struct bpf_htab *htab;
+ struct pcpu_freelist_node fnode;
+ };
struct rcu_head rcu;
u32 hash;
char key[0] __aligned(8);
};
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+ void __percpu *pptr)
+{
+ *(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+ return *(void __percpu **)(l->key + key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+ return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+ int i;
+
+ if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ goto free_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+ htab->map.key_size);
+ free_percpu(pptr);
+ }
+free_elems:
+ vfree(htab->elems);
+}
+
+static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+{
+ int err = -ENOMEM, i;
+
+ htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+ if (!htab->elems)
+ return -ENOMEM;
+
+ if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ goto skip_percpu_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ u32 size = round_up(htab->map.value_size, 8);
+ void __percpu *pptr;
+
+ pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ goto free_elems;
+ htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+ pptr);
+ }
+
+skip_percpu_elems:
+ err = pcpu_freelist_init(&htab->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
+ htab->map.max_entries);
+ return 0;
+
+free_elems:
+ htab_free_elems(htab);
+ return err;
+}
+
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
struct bpf_htab *htab;
int err, i;
+ u64 cost;
+
+ if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+ /* reserved bits should not be used */
+ return ERR_PTR(-EINVAL);
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
/* mandatory map attributes */
+ htab->map.map_type = attr->map_type;
htab->map.key_size = attr->key_size;
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
+ htab->map.map_flags = attr->map_flags;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -77,24 +163,39 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
+ if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ /* make sure the size for pcpu_alloc() is reasonable */
+ goto free_htab;
+
htab->elem_size = sizeof(struct htab_elem) +
- round_up(htab->map.key_size, 8) +
- htab->map.value_size;
+ round_up(htab->map.key_size, 8);
+ if (percpu)
+ htab->elem_size += sizeof(void *);
+ else
+ htab->elem_size += round_up(htab->map.value_size, 8);
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- if ((u64) htab->n_buckets * sizeof(struct bucket) +
- (u64) htab->elem_size * htab->map.max_entries >=
- U32_MAX - PAGE_SIZE)
+ cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+
+ if (percpu)
+ cost += (u64) round_up(htab->map.value_size, 8) *
+ num_possible_cpus() * htab->map.max_entries;
+
+ if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
- htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
- htab->elem_size * htab->map.max_entries,
- PAGE_SIZE) >> PAGE_SHIFT;
+ htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(htab->map.pages);
+ if (err)
+ goto free_htab;
err = -ENOMEM;
htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
@@ -111,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
- atomic_set(&htab->count, 0);
+ if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+ err = prealloc_elems_and_freelist(htab);
+ if (err)
+ goto free_buckets;
+ }
return &htab->map;
+free_buckets:
+ kvfree(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -148,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
}
/* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
@@ -166,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
l = lookup_elem_raw(head, hash, key, key_size);
+ return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
if (l)
return l->key + round_up(map->key_size, 8);
@@ -226,86 +340,248 @@ find_first_elem:
}
}
- /* itereated over all buckets and all elements */
+ /* iterated over all buckets and all elements */
return -ENOENT;
}
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
+{
+ if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+ free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ kfree(l);
+
+}
+
+static void htab_elem_free_rcu(struct rcu_head *head)
+{
+ struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+ struct bpf_htab *htab = l->htab;
+
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+ * we're calling kfree, otherwise deadlock is possible if kprobes
+ * are placed somewhere inside of slub
+ */
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ htab_elem_free(htab, l);
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
+}
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
+ } else {
+ atomic_dec(&htab->count);
+ l->htab = htab;
+ call_rcu(&l->rcu, htab_elem_free_rcu);
+ }
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+ void *value, u32 key_size, u32 hash,
+ bool percpu, bool onallcpus)
+{
+ u32 size = htab->map.value_size;
+ bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+ struct htab_elem *l_new;
+ void __percpu *pptr;
+
+ if (prealloc) {
+ l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
+ if (!l_new)
+ return ERR_PTR(-E2BIG);
+ } else {
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ atomic_dec(&htab->count);
+ return ERR_PTR(-E2BIG);
+ }
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(l_new->key, key, key_size);
+ if (percpu) {
+ /* round up value_size to 8 bytes */
+ size = round_up(size, 8);
+
+ if (prealloc) {
+ pptr = htab_elem_get_ptr(l_new, key_size);
+ } else {
+ /* alloc_percpu zero-fills */
+ pptr = __alloc_percpu_gfp(size, 8,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!pptr) {
+ kfree(l_new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ } else {
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+ if (!prealloc)
+ htab_elem_set_ptr(l_new, key_size, pptr);
+ } else {
+ memcpy(l_new->key + round_up(key_size, 8), value, size);
+ }
+
+ l_new->hash = hash;
+ return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+ u64 map_flags)
+{
+ if (l_old && map_flags == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!l_old && map_flags == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new, *l_old;
+ struct htab_elem *l_new = NULL, *l_old;
struct hlist_head *head;
- struct bucket *b;
unsigned long flags;
- u32 key_size;
+ struct bucket *b;
+ u32 key_size, hash;
int ret;
- if (map_flags > BPF_EXIST)
+ if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
- /* allocate new element outside of lock */
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return -ENOMEM;
-
key_size = map->key_size;
- memcpy(l_new->key, key, key_size);
- memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+ hash = htab_map_hash(key, key_size);
- l_new->hash = htab_map_hash(l_new->key, key_size);
- b = __select_bucket(htab, l_new->hash);
+ b = __select_bucket(htab, hash);
head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
- l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
- if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
- /* if elem with this 'key' doesn't exist and we've reached
- * max_entries limit, fail insertion of new elem
- */
- ret = -E2BIG;
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
goto err;
- }
- if (l_old && map_flags == BPF_NOEXIST) {
- /* elem already exists */
- ret = -EEXIST;
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+ if (IS_ERR(l_new)) {
+ /* all pre-allocated elements are in use or memory exhausted */
+ ret = PTR_ERR(l_new);
goto err;
}
- if (!l_old && map_flags == BPF_EXIST) {
- /* elem doesn't exist, cannot update it */
- ret = -ENOENT;
- goto err;
- }
-
- /* add new element to the head of the list, so that concurrent
- * search will find it before old elem
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
*/
hlist_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_del_rcu(&l_old->hash_node);
- kfree_rcu(l_old, rcu);
- } else {
- atomic_inc(&htab->count);
+ free_htab_elem(htab, l_old);
}
+ ret = 0;
+err:
raw_spin_unlock_irqrestore(&b->lock, flags);
+ return ret;
+}
- return 0;
+static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
+
+ if (l_old) {
+ void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
+ u32 size = htab->map.value_size;
+
+ /* per-cpu hash map can update value in-place */
+ if (!onallcpus) {
+ memcpy(this_cpu_ptr(pptr), value, size);
+ } else {
+ int off = 0, cpu;
+
+ size = round_up(size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+ } else {
+ l_new = alloc_htab_elem(htab, key, value, key_size,
+ hash, true, onallcpus);
+ if (IS_ERR(l_new)) {
+ ret = PTR_ERR(l_new);
+ goto err;
+ }
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ }
+ ret = 0;
err:
raw_spin_unlock_irqrestore(&b->lock, flags);
- kfree(l_new);
return ret;
}
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
@@ -331,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
if (l) {
hlist_del_rcu(&l->hash_node);
- atomic_dec(&htab->count);
- kfree_rcu(l, rcu);
+ free_htab_elem(htab, l);
ret = 0;
}
@@ -351,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
- atomic_dec(&htab->count);
- kfree(l);
+ htab_elem_free(htab, l);
}
}
}
-
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -369,10 +642,16 @@ static void htab_map_free(struct bpf_map *map)
*/
synchronize_rcu();
- /* some of kfree_rcu() callbacks for elements of this map may not have
- * executed. It's ok. Proceed to free residual elements and map itself
+ /* some of the RCU callbacks queued by free_htab_elem() for elements
+ * of this map may not have executed. Wait for them.
*/
- delete_all_elements(htab);
+ rcu_barrier();
+ if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+ delete_all_elements(htab);
+ } else {
+ htab_free_elems(htab);
+ pcpu_freelist_destroy(&htab->freelist);
+ }
kvfree(htab->buckets);
kfree(htab);
}
@@ -391,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l)
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ else
+ return NULL;
+}
+
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int ret = -ENOENT;
+ int cpu, off = 0;
+ u32 size;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ l = __htab_map_lookup_elem(map, key);
+ if (!l)
+ goto out;
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_update_elem = htab_percpu_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+ .ops = &htab_percpu_ops,
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
+ bpf_register_map_type(&htab_percpu_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4504ca66118d..ad7a0573f71b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
struct task_struct *task = current;
char *buf = (char *) (long) r1;
- if (!task)
- return -EINVAL;
+ if (unlikely(!task))
+ goto err_clear;
- memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ strncpy(buf, task->comm, size);
+
+ /* Verifier guarantees that size > 0. For task->comm exceeding
+ * size, guarantee that buf is %NUL-terminated. Unconditionally
+ * done here to save the size test.
+ */
+ buf[size - 1] = 0;
return 0;
+err_clear:
+ memset(buf, 0, size);
+ return -EINVAL;
}
const struct bpf_func_proto bpf_get_current_comm_proto = {
.func = bpf_get_current_comm,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
+ .arg1_type = ARG_PTR_TO_RAW_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..8f94ca1860cf 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
{
switch (type) {
case BPF_TYPE_PROG:
- atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ raw = bpf_prog_inc(raw);
break;
case BPF_TYPE_MAP:
- bpf_map_inc(raw, true);
+ raw = bpf_map_inc(raw, true);
break;
default:
WARN_ON_ONCE(1);
@@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
goto out;
raw = bpf_any_get(inode->i_private, *type);
- touch_atime(&path);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
path_put(&path);
return raw;
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
new file mode 100644
index 000000000000..5c51d1985b51
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.c
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "percpu_freelist.h"
+
+int pcpu_freelist_init(struct pcpu_freelist *s)
+{
+ int cpu;
+
+ s->freelist = alloc_percpu(struct pcpu_freelist_head);
+ if (!s->freelist)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
+
+ raw_spin_lock_init(&head->lock);
+ head->first = NULL;
+ }
+ return 0;
+}
+
+void pcpu_freelist_destroy(struct pcpu_freelist *s)
+{
+ free_percpu(s->freelist);
+}
+
+static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
+{
+ raw_spin_lock(&head->lock);
+ node->next = head->first;
+ head->first = node;
+ raw_spin_unlock(&head->lock);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
+
+ __pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems)
+{
+ struct pcpu_freelist_head *head;
+ unsigned long flags;
+ int i, cpu, pcpu_entries;
+
+ pcpu_entries = nr_elems / num_possible_cpus() + 1;
+ i = 0;
+
+ /* disable irq to work around a lockdep false positive;
+ * in bpf usage pcpu_freelist_populate() will never race
+ * with pcpu_freelist_push()
+ */
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+again:
+ head = per_cpu_ptr(s->freelist, cpu);
+ __pcpu_freelist_push(head, buf);
+ i++;
+ buf += elem_size;
+ if (i == nr_elems)
+ break;
+ if (i % pcpu_entries)
+ goto again;
+ }
+ local_irq_restore(flags);
+}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_head *head;
+ struct pcpu_freelist_node *node;
+ int orig_cpu, cpu;
+
+ orig_cpu = cpu = raw_smp_processor_id();
+ while (1) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ raw_spin_lock(&head->lock);
+ node = head->first;
+ if (node) {
+ head->first = node->next;
+ raw_spin_unlock(&head->lock);
+ return node;
+ }
+ raw_spin_unlock(&head->lock);
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = 0;
+ if (cpu == orig_cpu)
+ return NULL;
+ }
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
new file mode 100644
index 000000000000..3049aae8ea1e
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __PERCPU_FREELIST_H__
+#define __PERCPU_FREELIST_H__
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+struct pcpu_freelist_head {
+ struct pcpu_freelist_node *first;
+ raw_spinlock_t lock;
+};
+
+struct pcpu_freelist {
+ struct pcpu_freelist_head __percpu *freelist;
+};
+
+struct pcpu_freelist_node {
+ struct pcpu_freelist_node *next;
+};
+
+void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems);
+int pcpu_freelist_init(struct pcpu_freelist *);
+void pcpu_freelist_destroy(struct pcpu_freelist *s);
+#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..c8ee35287bfe
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,290 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+#include "percpu_freelist.h"
+
+struct stack_map_bucket {
+ struct pcpu_freelist_node fnode;
+ u32 hash;
+ u32 nr;
+ u64 ip[];
+};
+
+struct bpf_stack_map {
+ struct bpf_map map;
+ void *elems;
+ struct pcpu_freelist freelist;
+ u32 n_buckets;
+ struct stack_map_bucket *buckets[];
+};
+
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+ u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ int err;
+
+ smap->elems = vzalloc(elem_size * smap->map.max_entries);
+ if (!smap->elems)
+ return -ENOMEM;
+
+ err = pcpu_freelist_init(&smap->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+ smap->map.max_entries);
+ return 0;
+
+free_elems:
+ vfree(smap->elems);
+ return err;
+}
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+ u32 value_size = attr->value_size;
+ struct bpf_stack_map *smap;
+ u64 cost, n_buckets;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->map_flags)
+ return ERR_PTR(-EINVAL);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ value_size < 8 || value_size % 8 ||
+ value_size / 8 > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+
+ /* hash table size must be power of 2 */
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+
+ cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
+ if (!smap) {
+ smap = vzalloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = -E2BIG;
+ cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_smap;
+
+ smap->map.map_type = attr->map_type;
+ smap->map.key_size = attr->key_size;
+ smap->map.value_size = value_size;
+ smap->map.max_entries = attr->max_entries;
+ smap->n_buckets = n_buckets;
+ smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ err = bpf_map_precharge_memlock(smap->map.pages);
+ if (err)
+ goto free_smap;
+
+ err = get_callchain_buffers();
+ if (err)
+ goto free_smap;
+
+ err = prealloc_elems_and_freelist(smap);
+ if (err)
+ goto put_buffers;
+
+ return &smap->map;
+
+put_buffers:
+ put_callchain_buffers();
+free_smap:
+ kvfree(smap);
+ return ERR_PTR(err);
+}
+
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+ struct pt_regs *regs = (struct pt_regs *) (long) r1;
+ struct bpf_map *map = (struct bpf_map *) (long) r2;
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct perf_callchain_entry *trace;
+ struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+ u32 max_depth = map->value_size / 8;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 hash, id, trace_nr, trace_len;
+ bool user = flags & BPF_F_USER_STACK;
+ bool kernel = !user;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+
+ if (unlikely(!trace))
+ /* couldn't fetch the stack trace */
+ return -EFAULT;
+
+ /* get_perf_callchain() guarantees that trace->nr >= init_nr
+ * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
+ */
+ trace_nr = trace->nr - init_nr;
+
+ if (trace_nr <= skip)
+ /* skipping more than usable stack trace */
+ return -EFAULT;
+
+ trace_nr -= skip;
+ trace_len = trace_nr * sizeof(u64);
+ ips = trace->ip + skip + init_nr;
+ hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+ id = hash & (smap->n_buckets - 1);
+ bucket = READ_ONCE(smap->buckets[id]);
+
+ if (bucket && bucket->hash == hash) {
+ if (flags & BPF_F_FAST_STACK_CMP)
+ return id;
+ if (bucket->nr == trace_nr &&
+ memcmp(bucket->ip, ips, trace_len) == 0)
+ return id;
+ }
+
+ /* this call stack is not in the map, try to add it */
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+
+ memcpy(new_bucket->ip, ips, trace_len);
+ new_bucket->hash = hash;
+ new_bucket->nr = trace_nr;
+
+ old_bucket = xchg(&smap->buckets[id], new_bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return id;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+ .func = bpf_get_stackid,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* Called from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *bucket, *old_bucket;
+ u32 id = *(u32 *)key, trace_len;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -ENOENT;
+
+ bucket = xchg(&smap->buckets[id], NULL);
+ if (!bucket)
+ return -ENOENT;
+
+ trace_len = bucket->nr * sizeof(u64);
+ memcpy(value, bucket->ip, trace_len);
+ memset(value + trace_len, 0, map->value_size - trace_len);
+
+ old_bucket = xchg(&smap->buckets[id], bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EINVAL;
+}
+
+static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *old_bucket;
+ u32 id = *(u32 *)key;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -E2BIG;
+
+ old_bucket = xchg(&smap->buckets[id], NULL);
+ if (old_bucket) {
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+
+ /* wait for bpf programs to complete before freeing stack map */
+ synchronize_rcu();
+
+ vfree(smap->elems);
+ pcpu_freelist_destroy(&smap->freelist);
+ kvfree(smap);
+ put_callchain_buffers();
+}
+
+static const struct bpf_map_ops stack_map_ops = {
+ .map_alloc = stack_map_alloc,
+ .map_free = stack_map_free,
+ .map_get_next_key = stack_map_get_next_key,
+ .map_lookup_elem = stack_map_lookup_elem,
+ .map_update_elem = stack_map_update_elem,
+ .map_delete_elem = stack_map_delete_elem,
+};
+
+static struct bpf_map_type_list stack_map_type __read_mostly = {
+ .ops = &stack_map_ops,
+ .type = BPF_MAP_TYPE_STACK_TRACE,
+};
+
+static int __init register_stack_map(void)
+{
+ bpf_register_map_type(&stack_map_type);
+ return 0;
+}
+late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 637397059f76..46ecce4b79ed 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
#include <linux/filter.h>
#include <linux/version.h>
+DEFINE_PER_CPU(int, bpf_prog_active);
+
int sysctl_unprivileged_bpf_disabled __read_mostly;
static LIST_HEAD(bpf_map_types);
@@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+int bpf_map_precharge_memlock(u32 pages)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit, cur;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ cur = atomic_long_read(&user->locked_vm);
+ free_uid(user);
+ if (cur + pages > memlock_limit)
+ return -EPERM;
+ return 0;
+}
+
static int bpf_map_charge_memlock(struct bpf_map *map)
{
struct user_struct *user = get_current_user();
@@ -122,11 +137,13 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"map_type:\t%u\n"
"key_size:\t%u\n"
"value_size:\t%u\n"
- "max_entries:\t%u\n",
+ "max_entries:\t%u\n"
+ "map_flags:\t%#x\n",
map->map_type,
map->key_size,
map->value_size,
- map->max_entries);
+ map->max_entries,
+ map->map_flags);
}
#endif
@@ -151,7 +168,7 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+#define BPF_MAP_CREATE_LAST_FIELD map_flags
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -201,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
return f.file->private_data;
}
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
{
- atomic_inc(&map->refcnt);
+ if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&map->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
if (uref)
atomic_inc(&map->usercnt);
+ return map;
}
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -217,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
if (IS_ERR(map))
return map;
- bpf_map_inc(map, true);
+ map = bpf_map_inc(map, true);
fdput(f);
return map;
@@ -229,6 +253,11 @@ static void __user *u64_to_ptr(__u64 val)
return (void __user *) (unsigned long) val;
}
+int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
+ return -ENOTSUPP;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
@@ -239,6 +268,7 @@ static int map_lookup_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value, *ptr;
+ u32 value_size;
struct fd f;
int err;
@@ -259,23 +289,37 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else
+ value_size = map->value_size;
+
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
- if (ptr)
- memcpy(value, ptr, map->value_size);
- rcu_read_unlock();
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else {
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, value_size);
+ rcu_read_unlock();
+ err = ptr ? 0 : -ENOENT;
+ }
- err = -ENOENT;
- if (!ptr)
+ if (err)
goto free_value;
err = -EFAULT;
- if (copy_to_user(uvalue, value, map->value_size) != 0)
+ if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
err = 0;
@@ -298,6 +342,7 @@ static int map_update_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
+ u32 value_size;
struct fd f;
int err;
@@ -318,21 +363,37 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else
+ value_size = map->value_size;
+
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -EFAULT;
- if (copy_from_user(value, uvalue, map->value_size) != 0)
+ if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* eBPF program that use maps are running under rcu_read_lock(),
- * therefore all map accessors rely on this fact, so do the same here
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete, otherwise deadlocks are possible
*/
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
free_value:
kfree(value);
@@ -371,9 +432,13 @@ static int map_delete_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
free_key:
kfree(key);
@@ -600,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&prog->aux->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ return prog;
+}
+
/* called by sockets/tracing/seccomp before attaching program to an event
* pairs with bpf_prog_put()
*/
@@ -612,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
if (IS_ERR(prog))
return prog;
- atomic_inc(&prog->aux->refcnt);
+ prog = bpf_prog_inc(prog);
fdput(f);
return prog;
@@ -688,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr)
fixup_bpf_calls(prog);
/* eBPF program is ready to be JITed */
- err = bpf_prog_select_runtime(prog);
+ prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e7f7ab739e4..a08d66215245 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -136,13 +137,32 @@ enum bpf_reg_type {
FRAME_PTR, /* reg == frame_pointer */
PTR_TO_STACK, /* reg == frame_pointer + imm */
CONST_IMM, /* constant integer value */
+
+ /* PTR_TO_PACKET represents:
+ * skb->data
+ * skb->data + imm
+ * skb->data + (u16) var
+ * skb->data + (u16) var + imm
+ * if (range > 0) then [ptr, ptr + range - off) is safe to access
+ * if (id > 0) means that some 'var' was added
+ * if (off > 0) means that 'imm' was added
+ */
+ PTR_TO_PACKET,
+ PTR_TO_PACKET_END, /* skb->data + headlen */
};
struct reg_state {
enum bpf_reg_type type;
union {
- /* valid when type == CONST_IMM | PTR_TO_STACK */
- int imm;
+ /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
+ s64 imm;
+
+ /* valid when type == PTR_TO_PACKET* */
+ struct {
+ u32 id;
+ u16 off;
+ u16 range;
+ };
/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
@@ -202,6 +222,16 @@ struct verifier_env {
bool allow_ptr_leaks;
};
+#define BPF_COMPLEXITY_LIMIT_INSNS 65536
+#define BPF_COMPLEXITY_LIMIT_STACK 1024
+
+struct bpf_call_arg_meta {
+ struct bpf_map *map_ptr;
+ bool raw_mode;
+ int regno;
+ int access_size;
+};
+
/* verbose verifier prints what it's seeing
* bpf_check() is called under lock, so no race to access these global vars
*/
@@ -237,39 +267,39 @@ static const char * const reg_type_str[] = {
[FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
[CONST_IMM] = "imm",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_END] = "pkt_end",
};
-static const struct {
- int map_type;
- int func_id;
-} func_limit[] = {
- {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
- {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
-};
-
-static void print_verifier_state(struct verifier_env *env)
+static void print_verifier_state(struct verifier_state *state)
{
+ struct reg_state *reg;
enum bpf_reg_type t;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- t = env->cur_state.regs[i].type;
+ reg = &state->regs[i];
+ t = reg->type;
if (t == NOT_INIT)
continue;
verbose(" R%d=%s", i, reg_type_str[t]);
if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%d", env->cur_state.regs[i].imm);
+ verbose("%lld", reg->imm);
+ else if (t == PTR_TO_PACKET)
+ verbose("(id=%d,off=%d,r=%d)",
+ reg->id, reg->off, reg->range);
+ else if (t == UNKNOWN_VALUE && reg->imm)
+ verbose("%lld", reg->imm);
else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL)
verbose("(ks=%d,vs=%d)",
- env->cur_state.regs[i].map_ptr->key_size,
- env->cur_state.regs[i].map_ptr->value_size);
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+ if (state->stack_slot_type[i] == STACK_SPILL)
verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+ reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
}
verbose("\n");
}
@@ -453,7 +483,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
elem->next = env->head;
env->head = elem;
env->stack_size++;
- if (env->stack_size > 1024) {
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
verbose("BPF program is too complex\n");
goto err;
}
@@ -476,7 +506,6 @@ static void init_reg_state(struct reg_state *regs)
for (i = 0; i < MAX_BPF_REG; i++) {
regs[i].type = NOT_INIT;
regs[i].imm = 0;
- regs[i].map_ptr = NULL;
}
/* frame pointer */
@@ -491,7 +520,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
regs[regno].imm = 0;
- regs[regno].map_ptr = NULL;
}
enum reg_arg_type {
@@ -547,6 +575,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_END:
case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
@@ -646,13 +676,38 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
return 0;
}
+#define MAX_PACKET_OFF 0xffff
+
+static int check_packet_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct reg_state *reg = &regs[regno];
+ int linear_size = (int) reg->range - (int) reg->off;
+
+ if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
+ verbose("verifier bug\n");
+ return -EFAULT;
+ }
+ if (off < 0 || off + size > linear_size) {
+ verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
+ off, size, linear_size);
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check access to 'struct bpf_context' fields */
static int check_ctx_access(struct verifier_env *env, int off, int size,
enum bpf_access_type t)
{
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t))
+ env->prog->aux->ops->is_valid_access(off, size, t)) {
+ /* remember the offset of last byte accessed in ctx */
+ if (env->prog->aux->max_ctx_offset < off + size)
+ env->prog->aux->max_ctx_offset = off + size;
return 0;
+ }
verbose("invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
@@ -672,6 +727,45 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
+static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
+ int off, int size)
+{
+ if (reg->type != PTR_TO_PACKET) {
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ } else {
+ return 0;
+ }
+ }
+
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ break;
+ default:
+ verbose("verifier is misconfigured\n");
+ return -EACCES;
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ /* misaligned access to packet is ok on x86,arm,arm64 */
+ return 0;
+
+ if (reg->id && size != 1) {
+ verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+ return -EACCES;
+ }
+
+ /* skb->data is NET_IP_ALIGN-ed */
+ if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
+ verbose("misaligned packet access off %d+%d+%d size %d\n",
+ NET_IP_ALIGN, reg->off, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -683,21 +777,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
int value_regno)
{
struct verifier_state *state = &env->cur_state;
+ struct reg_state *reg = &state->regs[regno];
int size, err = 0;
- if (state->regs[regno].type == PTR_TO_STACK)
- off += state->regs[regno].imm;
+ if (reg->type == PTR_TO_STACK)
+ off += reg->imm;
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n", off, size);
- return -EACCES;
- }
+ err = check_ptr_alignment(env, reg, off, size);
+ if (err)
+ return err;
- if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_VALUE) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
@@ -707,18 +801,25 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
- } else if (state->regs[regno].type == PTR_TO_CTX) {
+ } else if (reg->type == PTR_TO_CTX) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
err = check_ctx_access(env, off, size, t);
- if (!err && t == BPF_READ && value_regno >= 0)
+ if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value(state->regs, value_regno);
+ if (off == offsetof(struct __sk_buff, data) &&
+ env->allow_ptr_leaks)
+ /* note that reg.[id|off|range] == 0 */
+ state->regs[value_regno].type = PTR_TO_PACKET;
+ else if (off == offsetof(struct __sk_buff, data_end) &&
+ env->allow_ptr_leaks)
+ state->regs[value_regno].type = PTR_TO_PACKET_END;
+ }
- } else if (state->regs[regno].type == FRAME_PTR ||
- state->regs[regno].type == PTR_TO_STACK) {
+ } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -734,11 +835,28 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
} else {
err = check_stack_read(state, off, size, value_regno);
}
+ } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ if (t == BPF_WRITE) {
+ verbose("cannot write into packet\n");
+ return -EACCES;
+ }
+ err = check_packet_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[state->regs[regno].type]);
+ regno, reg_type_str[reg->type]);
return -EACCES;
}
+
+ if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
+ state->regs[value_regno].type == UNKNOWN_VALUE) {
+ /* 1 or 2 byte load zero-extends, determine the number of
+ * zero upper bits. Not doing it for 4 byte load, since
+ * such values cannot be added to ptr_to_packet anyway.
+ */
+ state->regs[value_regno].imm = 64 - size * 8;
+ }
return err;
}
@@ -778,15 +896,25 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized
*/
-static int check_stack_boundary(struct verifier_env *env,
- int regno, int access_size)
+static int check_stack_boundary(struct verifier_env *env, int regno,
+ int access_size, bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
{
struct verifier_state *state = &env->cur_state;
struct reg_state *regs = state->regs;
int off, i;
- if (regs[regno].type != PTR_TO_STACK)
+ if (regs[regno].type != PTR_TO_STACK) {
+ if (zero_size_allowed && access_size == 0 &&
+ regs[regno].type == CONST_IMM &&
+ regs[regno].imm == 0)
+ return 0;
+
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[regs[regno].type],
+ reg_type_str[PTR_TO_STACK]);
return -EACCES;
+ }
off = regs[regno].imm;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -796,6 +924,12 @@ static int check_stack_boundary(struct verifier_env *env,
return -EACCES;
}
+ if (meta && meta->raw_mode) {
+ meta->access_size = access_size;
+ meta->regno = regno;
+ return 0;
+ }
+
for (i = 0; i < access_size; i++) {
if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
verbose("invalid indirect read from stack off %d+%d size %d\n",
@@ -807,7 +941,8 @@ static int check_stack_boundary(struct verifier_env *env,
}
static int check_func_arg(struct verifier_env *env, u32 regno,
- enum bpf_arg_type arg_type, struct bpf_map **mapp)
+ enum bpf_arg_type arg_type,
+ struct bpf_call_arg_meta *meta)
{
struct reg_state *reg = env->cur_state.regs + regno;
enum bpf_reg_type expected_type;
@@ -829,15 +964,26 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return 0;
}
- if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ } else if (arg_type == ARG_CONST_STACK_SIZE ||
+ arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
} else if (arg_type == ARG_PTR_TO_CTX) {
expected_type = PTR_TO_CTX;
+ } else if (arg_type == ARG_PTR_TO_STACK ||
+ arg_type == ARG_PTR_TO_RAW_STACK) {
+ expected_type = PTR_TO_STACK;
+ /* One exception here. In case function allows for NULL to be
+ * passed in as argument, it's a CONST_IMM type. Final test
+ * happens during stack boundary checking.
+ */
+ if (reg->type == CONST_IMM && reg->imm == 0)
+ expected_type = CONST_IMM;
+ meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -851,14 +997,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
- *mapp = reg->map_ptr;
-
+ meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
*/
- if (!*mapp) {
+ if (!meta->map_ptr) {
/* in function declaration map_ptr must come before
* map_key, so that it's verified and known before
* we have to check map_key here. Otherwise it means
@@ -867,20 +1012,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->key_size);
-
+ err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
+ false, NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
- if (!*mapp) {
+ if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->value_size);
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->value_size,
+ false, NULL);
+ } else if (arg_type == ARG_CONST_STACK_SIZE ||
+ arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+ bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
/* bpf_xxx(..., buf, len) call will access 'len' bytes
* from stack pointer 'buf'. Check it
* note: regno == len, regno - 1 == buf
@@ -890,7 +1039,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno - 1, reg->imm);
+ err = check_stack_boundary(env, regno - 1, reg->imm,
+ zero_size_allowed, meta);
}
return err;
@@ -898,24 +1048,93 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
{
- bool bool_map, bool_func;
- int i;
-
if (!map)
return 0;
- for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
- bool_map = (map->map_type == func_limit[i].map_type);
- bool_func = (func_id == func_limit[i].func_id);
- /* only when map & func pair match it can continue.
- * don't allow any other map type to be passed into
- * the special func;
- */
- if (bool_func && bool_map != bool_func)
- return -EINVAL;
+ /* We need a two way check, first is from map perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_STACK_TRACE:
+ if (func_id != BPF_FUNC_get_stackid)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_get_stackid:
+ if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
+ goto error;
+ break;
+ default:
+ break;
}
return 0;
+error:
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
+ return -EINVAL;
+}
+
+static int check_raw_mode(const struct bpf_func_proto *fn)
+{
+ int count = 0;
+
+ if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+
+ return count > 1 ? -EINVAL : 0;
+}
+
+static void clear_all_pkt_pointers(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs, *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (regs[i].type == PTR_TO_PACKET ||
+ regs[i].type == PTR_TO_PACKET_END)
+ mark_reg_unknown_value(regs, i);
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ reg = &state->spilled_regs[i / BPF_REG_SIZE];
+ if (reg->type != PTR_TO_PACKET &&
+ reg->type != PTR_TO_PACKET_END)
+ continue;
+ reg->type = UNKNOWN_VALUE;
+ reg->imm = 0;
+ }
}
static int check_call(struct verifier_env *env, int func_id)
@@ -923,8 +1142,9 @@ static int check_call(struct verifier_env *env, int func_id)
struct verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
struct reg_state *regs = state->regs;
- struct bpf_map *map = NULL;
struct reg_state *reg;
+ struct bpf_call_arg_meta meta;
+ bool changes_data;
int i, err;
/* find function prototype */
@@ -947,23 +1167,45 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
+ changes_data = bpf_helper_changes_skb_data(fn->func);
+
+ memset(&meta, 0, sizeof(meta));
+
+ /* We only support one arg being in raw mode at the moment, which
+ * is sufficient for the helper functions we have right now.
+ */
+ err = check_raw_mode(fn);
+ if (err) {
+ verbose("kernel subsystem misconfigured func %d\n", func_id);
+ return err;
+ }
+
/* check args */
- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
if (err)
return err;
+ /* Mark slots with STACK_MISC in case of raw mode, stack offset
+ * is inferred from register state.
+ */
+ for (i = 0; i < meta.access_size; i++) {
+ err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+ if (err)
+ return err;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
reg = regs + caller_saved[i];
@@ -982,28 +1224,211 @@ static int check_call(struct verifier_env *env, int func_id)
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
- if (map == NULL) {
+ if (meta.map_ptr == NULL) {
verbose("kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
- regs[BPF_REG_0].map_ptr = map;
+ regs[BPF_REG_0].map_ptr = meta.map_ptr;
} else {
verbose("unknown return type %d of func %d\n",
fn->ret_type, func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(map, func_id);
+ err = check_map_func_compatibility(meta.map_ptr, func_id);
if (err)
return err;
+ if (changes_data)
+ clear_all_pkt_pointers(env);
+ return 0;
+}
+
+static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct reg_state *dst_reg = &regs[insn->dst_reg];
+ struct reg_state *src_reg = &regs[insn->src_reg];
+ s32 imm;
+
+ if (BPF_SRC(insn->code) == BPF_K) {
+ /* pkt_ptr += imm */
+ imm = insn->imm;
+
+add_imm:
+ if (imm <= 0) {
+ verbose("addition of negative constant to packet pointer is not allowed\n");
+ return -EACCES;
+ }
+ if (imm >= MAX_PACKET_OFF ||
+ imm + dst_reg->off >= MAX_PACKET_OFF) {
+ verbose("constant %d is too large to add to packet pointer\n",
+ imm);
+ return -EACCES;
+ }
+ /* a constant was added to pkt_ptr.
+ * Remember it while keeping the same 'id'
+ */
+ dst_reg->off += imm;
+ } else {
+ if (src_reg->type == CONST_IMM) {
+ /* pkt_ptr += reg where reg is known constant */
+ imm = src_reg->imm;
+ goto add_imm;
+ }
+ /* disallow pkt_ptr += reg
+ * if reg is not unknown_value with guaranteed zero upper bits
+ * otherwise pkt_ptr may overflow and addition will become
+ * subtraction which is not allowed
+ */
+ if (src_reg->type != UNKNOWN_VALUE) {
+ verbose("cannot add '%s' to ptr_to_packet\n",
+ reg_type_str[src_reg->type]);
+ return -EACCES;
+ }
+ if (src_reg->imm < 48) {
+ verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
+ src_reg->imm);
+ return -EACCES;
+ }
+ /* dst_reg stays as pkt_ptr type and since some positive
+ * integer value was added to the pointer, increment its 'id'
+ */
+ dst_reg->id++;
+
+ /* something was added to pkt_ptr, set range and off to zero */
+ dst_reg->off = 0;
+ dst_reg->range = 0;
+ }
+ return 0;
+}
+
+static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct reg_state *dst_reg = &regs[insn->dst_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2;
+
+ /* for type == UNKNOWN_VALUE:
+ * imm > 0 -> number of zero upper bits
+ * imm == 0 -> don't track which is the same as all bits can be non-zero
+ */
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ struct reg_state *src_reg = &regs[insn->src_reg];
+
+ if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
+ dst_reg->imm && opcode == BPF_ADD) {
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result in making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(dst_reg->imm, src_reg->imm);
+ dst_reg->imm--;
+ return 0;
+ }
+ if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
+ dst_reg->imm && opcode == BPF_ADD) {
+ /* dreg += sreg
+ * where dreg has zero upper bits and sreg is const.
+ * Adding them can only result in making one more bit
+ * non-zero in the larger value.
+ */
+ imm_log2 = __ilog2_u64((long long)src_reg->imm);
+ dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ return 0;
+ }
+ /* all other cases not supported yet, just mark dst_reg */
+ dst_reg->imm = 0;
+ return 0;
+ }
+
+ /* sign extend 32-bit imm into 64-bit to make sure that
+ * negative values occupy bit 63. Note ilog2() would have
+ * been incorrect, since sizeof(insn->imm) == 4
+ */
+ imm_log2 = __ilog2_u64((long long)insn->imm);
+
+ if (dst_reg->imm && opcode == BPF_LSH) {
+ /* reg <<= imm
+ * if reg was a result of 2 byte load, then its imm == 48
+ * which means that upper 48 bits are zero and shifting this reg
+ * left by 4 would mean that upper 44 bits are still zero
+ */
+ dst_reg->imm -= insn->imm;
+ } else if (dst_reg->imm && opcode == BPF_MUL) {
+ /* reg *= imm
+ * if multiplying by 14 subtract 4
+ * This is conservative calculation of upper zero bits.
+ * It's not trying to special case insn->imm == 1 or 0 cases
+ */
+ dst_reg->imm -= imm_log2 + 1;
+ } else if (opcode == BPF_AND) {
+ /* reg &= imm */
+ dst_reg->imm = 63 - imm_log2;
+ } else if (dst_reg->imm && opcode == BPF_ADD) {
+ /* reg += imm */
+ dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ } else if (opcode == BPF_RSH) {
+ /* reg >>= imm
+ * which means that after right shift, upper bits will be zero
+ * note that verifier already checked that
+ * 0 <= imm < 64 for shift insn
+ */
+ dst_reg->imm += insn->imm;
+ if (unlikely(dst_reg->imm > 64))
+ /* some dumb code did:
+ * r2 = *(u32 *)mem;
+ * r2 >>= 32;
+ * and all bits are zero now */
+ dst_reg->imm = 64;
+ } else {
+ /* all other alu ops, means that we don't know what will
+ * happen to the value, mark it with unknown number of zero bits
+ */
+ dst_reg->imm = 0;
+ }
+
+ if (dst_reg->imm < 0) {
+ /* all 64 bits of the register can contain non-zero bits
+ * and such value cannot be added to ptr_to_packet, since it
+ * may overflow, mark it as unknown to avoid further eval
+ */
+ dst_reg->imm = 0;
+ }
+ return 0;
+}
+
+static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct reg_state *dst_reg = &regs[insn->dst_reg];
+ struct reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+
+ /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
+ * Don't care about overflow or negative values, just add them
+ */
+ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
+ dst_reg->imm += insn->imm;
+ else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM)
+ dst_reg->imm += src_reg->imm;
+ else
+ mark_reg_unknown_value(regs, insn->dst_reg);
return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct reg_state *regs = env->cur_state.regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1092,8 +1517,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
- bool stack_relative = false;
-
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
verbose("BPF_ALU uses reserved fields\n");
@@ -1131,11 +1554,34 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
}
}
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- regs[insn->dst_reg].type == FRAME_PTR &&
- BPF_SRC(insn->code) == BPF_K) {
- stack_relative = true;
+ dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
+ dst_reg->type = PTR_TO_STACK;
+ dst_reg->imm = insn->imm;
+ return 0;
+ } else if (opcode == BPF_ADD &&
+ BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == PTR_TO_PACKET) {
+ /* ptr_to_packet += K|X */
+ return check_packet_ptr_add(env, insn);
+ } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == UNKNOWN_VALUE &&
+ env->allow_ptr_leaks) {
+ /* unknown += K|X */
+ return evaluate_reg_alu(env, insn);
+ } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == CONST_IMM &&
+ env->allow_ptr_leaks) {
+ /* reg_imm += K|X */
+ return evaluate_reg_imm_alu(env, insn);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer arithmetic prohibited\n",
insn->dst_reg);
@@ -1147,24 +1593,45 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
- /* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
- if (err)
- return err;
-
- if (stack_relative) {
- regs[insn->dst_reg].type = PTR_TO_STACK;
- regs[insn->dst_reg].imm = insn->imm;
- }
+ /* mark dest operand */
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
return 0;
}
+static void find_good_pkt_pointers(struct verifier_env *env,
+ struct reg_state *dst_reg)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs, *reg;
+ int i;
+ /* r2 = r3;
+ * r2 += 8
+ * if (r2 > pkt_end) goto somewhere
+ * r2 == dst_reg, pkt_end == src_reg,
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+ * so that range of bytes [r3, r3 + 8) is safe to access
+ */
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ regs[i].range = dst_reg->off;
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ reg = &state->spilled_regs[i / BPF_REG_SIZE];
+ if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+ reg->range = dst_reg->off;
+ }
+}
+
static int check_cond_jmp_op(struct verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct reg_state *regs = env->cur_state.regs, *dst_reg;
struct verifier_state *other_branch;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1202,11 +1669,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
if (err)
return err;
+ dst_reg = &regs[insn->dst_reg];
+
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- regs[insn->dst_reg].type == CONST_IMM &&
- regs[insn->dst_reg].imm == insn->imm) {
+ dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -1228,44 +1696,30 @@ static int check_cond_jmp_op(struct verifier_env *env,
/* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
if (BPF_SRC(insn->code) == BPF_K &&
- insn->imm == 0 && (opcode == BPF_JEQ ||
- opcode == BPF_JNE) &&
- regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
if (opcode == BPF_JEQ) {
/* next fallthrough insn can access memory via
* this register
*/
regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
/* branch targer cannot access it, since reg == 0 */
- other_branch->regs[insn->dst_reg].type = CONST_IMM;
- other_branch->regs[insn->dst_reg].imm = 0;
+ mark_reg_unknown_value(other_branch->regs,
+ insn->dst_reg);
} else {
other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = 0;
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+ dst_reg->type == PTR_TO_PACKET &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+ find_good_pkt_pointers(env, dst_reg);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_K &&
- (opcode == BPF_JEQ || opcode == BPF_JNE)) {
-
- if (opcode == BPF_JEQ) {
- /* detect if (R == imm) goto
- * and in the target state recognize that R = imm
- */
- other_branch->regs[insn->dst_reg].type = CONST_IMM;
- other_branch->regs[insn->dst_reg].imm = insn->imm;
- } else {
- /* detect if (R != imm) goto
- * and in the fall-through state recognize that R = imm
- */
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- }
}
if (log_level)
- print_verifier_state(env);
+ print_verifier_state(&env->cur_state);
return 0;
}
@@ -1343,13 +1797,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+ verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_ABS uses reserved fields\n");
+ verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -1513,6 +1968,8 @@ peek_stack:
goto peek_stack;
else if (ret < 0)
goto err_free;
+ if (t + 1 < insn_cnt)
+ env->explored_states[t + 1] = STATE_LIST_MARK;
} else if (opcode == BPF_JA) {
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
@@ -1580,6 +2037,58 @@ err_free:
return ret;
}
+/* the following conditions reduce the number of explored insns
+ * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+ */
+static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
+{
+ if (old->id != cur->id)
+ return false;
+
+ /* old ptr_to_packet is more conservative, since it allows smaller
+ * range. Ex:
+ * old(off=0,r=10) is equal to cur(off=0,r=20), because
+ * old(off=0,r=10) means that with range=10 the verifier proceeded
+ * further and found no issues with the program. Now we're in the same
+ * spot with cur(off=0,r=20), so we're safe too, since anything further
+ * will only be looking at most 10 bytes after this pointer.
+ */
+ if (old->off == cur->off && old->range < cur->range)
+ return true;
+
+ * old(off=20,r=10) is equal to cur(off=22,r=22 or 5 or 0)
+ * since both cannot be used for packet access and safe(old)
+ * pointer has smaller off that could be used for further
+ * 'if (ptr > data_end)' check
+ * Ex:
+ * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
+ * that we cannot access the packet.
+ * The safe range is:
+ * [ptr, ptr + range - off)
+ * so whenever off >= range, it means no safe bytes from this pointer.
+ * When comparing old->off <= cur->off, it means that older code
+ * went with smaller offset and that offset was later
+ * used to figure out the safe range after 'if (ptr > data_end)' check
+ * Say, 'old' state was explored like:
+ * ... R3(off=0, r=0)
+ * R4 = R3 + 20
+ * ... now R4(off=20,r=0) <-- here
+ * if (R4 > data_end)
+ * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
+ * ... the code further went all the way to bpf_exit.
+ * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
+ * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
+ * goes further, such cur_R4 will give larger safe packet range after
+ * 'if (R4 > data_end)' and all further insn were already good with r=20,
+ * so they will be good with r=30 and we can prune the search.
+ */
+ if (old->off <= cur->off &&
+ old->off >= old->range && cur->off >= cur->range)
+ return true;
+
+ return false;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -1608,17 +2117,25 @@ err_free:
*/
static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
{
+ struct reg_state *rold, *rcur;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- if (memcmp(&old->regs[i], &cur->regs[i],
- sizeof(old->regs[0])) != 0) {
- if (old->regs[i].type == NOT_INIT ||
- (old->regs[i].type == UNKNOWN_VALUE &&
- cur->regs[i].type != NOT_INIT))
- continue;
- return false;
- }
+ rold = &old->regs[i];
+ rcur = &cur->regs[i];
+
+ if (memcmp(rold, rcur, sizeof(*rold)) == 0)
+ continue;
+
+ if (rold->type == NOT_INIT ||
+ (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
+ continue;
+
+ if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
+ compare_ptrs_to_packet(rold, rcur))
+ continue;
+
+ return false;
}
for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -1717,7 +2234,7 @@ static int do_check(struct verifier_env *env)
insn = &insns[insn_idx];
class = BPF_CLASS(insn->code);
- if (++insn_processed > 32768) {
+ if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose("BPF program is too large. Proccessed %d insn\n",
insn_processed);
return -E2BIG;
@@ -1740,7 +2257,7 @@ static int do_check(struct verifier_env *env)
if (log_level && do_print_state) {
verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
- print_verifier_state(env);
+ print_verifier_state(&env->cur_state);
do_print_state = false;
}
@@ -1952,6 +2469,7 @@ process_bpf_exit:
insn_idx++;
}
+ verbose("processed %d insns\n", insn_processed);
return 0;
}
@@ -2003,7 +2521,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
- fdput(f);
return PTR_ERR(map);
}
@@ -2023,15 +2540,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return -E2BIG;
}
- /* remember this map */
- env->used_maps[env->used_map_cnt++] = map;
-
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_bpf_prog_info()
*/
- bpf_map_inc(map, false);
+ map = bpf_map_inc(map, false);
+ if (IS_ERR(map)) {
+ fdput(f);
+ return PTR_ERR(map);
+ }
+ env->used_maps[env->used_map_cnt++] = map;
+
fdput(f);
next_insn:
insn++;
@@ -2067,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
insn->src_reg = 0;
}
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
-{
- struct bpf_insn *insn = prog->insnsi;
- int insn_cnt = prog->len;
- int i;
-
- for (i = 0; i < insn_cnt; i++, insn++) {
- if (BPF_CLASS(insn->code) != BPF_JMP ||
- BPF_OP(insn->code) == BPF_CALL ||
- BPF_OP(insn->code) == BPF_EXIT)
- continue;
-
- /* adjust offset of jmps if necessary */
- if (i < pos && i + insn->off + 1 > pos)
- insn->off += delta;
- else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
- insn->off -= delta;
- }
-}
-
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
@@ -2096,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env)
int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
- u32 cnt;
- int i;
enum bpf_access_type type;
+ int i;
if (!env->prog->aux->ops->convert_ctx_access)
return 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ u32 insn_delta, cnt;
+
if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
type = BPF_READ;
else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
@@ -2125,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env)
return -EINVAL;
}
- if (cnt == 1) {
- memcpy(insn, insn_buf, sizeof(*insn));
- continue;
- }
-
- /* several new insns need to be inserted. Make room for them */
- insn_cnt += cnt - 1;
- new_prog = bpf_prog_realloc(env->prog,
- bpf_prog_size(insn_cnt),
- GFP_USER);
+ new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
- new_prog->len = insn_cnt;
-
- memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
- sizeof(*insn) * (insn_cnt - i - cnt));
-
- /* copy substitute insns in place of load instruction */
- memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
-
- /* adjust branches in the whole program */
- adjust_branches(new_prog, i, cnt - 1);
+ insn_delta = cnt - 1;
/* keep walking new program and skip insns we just inserted */
env->prog = new_prog;
- insn = new_prog->insnsi + i + cnt - 1;
- i += cnt - 1;
+ insn = new_prog->insnsi + i + insn_delta;
+
+ insn_cnt += insn_delta;
+ i += insn_delta;
}
return 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d27904c193da..86cb5c6e8932 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -59,6 +59,9 @@
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
+#include <linux/proc_ns.h>
+#include <linux/nsproxy.h>
+#include <linux/proc_ns.h>
#include <net/sock.h>
/*
@@ -178,10 +181,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_root_visible;
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
/* some controllers are not supported in the default hierarchy */
-static unsigned long cgrp_dfl_root_inhibit_ss_mask;
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
/* The list of hierarchy roots */
@@ -205,23 +214,34 @@ static u64 css_serial_nr_next = 1;
* fork/exit handlers to call. This avoids us having to do extra work in the
* fork/exit path to check which subsystems have fork/exit callbacks.
*/
-static unsigned long have_fork_callback __read_mostly;
-static unsigned long have_exit_callback __read_mostly;
-static unsigned long have_free_callback __read_mostly;
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
+
+/* cgroup namespace for init task */
+struct cgroup_namespace init_cgroup_ns = {
+ .count = { .counter = 2, },
+ .user_ns = &init_user_ns,
+ .ns.ops = &cgroupns_operations,
+ .ns.inum = PROC_CGROUP_INIT_INO,
+ .root_cset = &init_css_set,
+};
/* Ditto for the can_fork callback. */
-static unsigned long have_canfork_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask);
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
@@ -238,9 +258,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
static bool cgroup_ssid_enabled(int ssid)
{
+ if (CGROUP_SUBSYS_COUNT == 0)
+ return false;
+
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
+static bool cgroup_ssid_no_v1(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
@@ -339,6 +367,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp)
return NULL;
}
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ u16 root_ss_mask = cgrp->root->subsys_mask;
+
+ if (parent)
+ return parent->subtree_control;
+
+ if (cgroup_on_dfl(cgrp))
+ root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+ cgrp_dfl_implicit_ss_mask);
+ return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (parent)
+ return parent->subtree_ss_mask;
+
+ return cgrp->root->subsys_mask;
+}
+
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
@@ -378,16 +432,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!ss)
return &cgrp->self;
- if (!(cgrp->root->subsys_mask & (1 << ss->id)))
- return NULL;
-
/*
* This function is used while updating css associations and thus
- * can't test the csses directly. Use ->child_subsys_mask.
+ * can't test the csses directly. Test ss_mask.
*/
- while (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp);
+ if (!cgrp)
+ return NULL;
+ }
return cgroup_css(cgrp, ss);
}
@@ -506,22 +559,28 @@ static int notify_on_release(const struct cgroup *cgrp)
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/**
- * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- * @ss_maskp: a pointer to the bitmask
+ * @ss_mask: the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
- * mask is set to 1.
+ * @ss_mask is set.
*/
-#define for_each_subsys_which(ss, ssid, ss_maskp) \
- if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
+#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
+ unsigned long __ss_mask = (ss_mask); \
+ if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
(ssid) = 0; \
- else \
- for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
- if (((ss) = cgroup_subsys[ssid]) && false) \
- break; \
- else
+ break; \
+ } \
+ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
+ (ss) = cgroup_subsys[ssid]; \
+ {
+
+#define while_each_subsys_mask() \
+ } \
+ } \
+} while (false)
/* iterate across the hierarchies */
#define for_each_root(root) \
@@ -535,6 +594,24 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
+ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
+ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
static void cgroup_release_agent(struct work_struct *work);
static void check_for_release(struct cgroup *cgrp);
@@ -665,6 +742,9 @@ static void css_set_move_task(struct task_struct *task,
{
lockdep_assert_held(&css_set_lock);
+ if (to_cset && !css_set_populated(to_cset))
+ css_set_update_populated(to_cset, true);
+
if (from_cset) {
struct css_task_iter *it, *pos;
@@ -698,8 +778,6 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
- if (!css_set_populated(to_cset))
- css_set_update_populated(to_cset, true);
rcu_assign_pointer(task->cgroups, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
@@ -1102,13 +1180,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
/* Rebind all subsystems back to the default hierarchy */
- rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
@@ -1137,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_free_root(root);
}
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+ struct cgroup *res = NULL;
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ rcu_read_lock();
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ if (cset == &init_css_set) {
+ res = &root->cgrp;
+ } else {
+ struct cgrp_cset_link *link;
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ if (c->root == root) {
+ res = c;
+ break;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ BUG_ON(!res);
+ return res;
+}
+
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
@@ -1248,46 +1361,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
}
/**
- * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- * @cgrp: the target cgroup
+ * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
* @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
* This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp. The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- unsigned long subtree_control)
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
- struct cgroup *parent = cgroup_parent(cgrp);
- unsigned long cur_ss_mask = subtree_control;
+ u16 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
- if (!cgroup_on_dfl(cgrp))
- return cur_ss_mask;
+ cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- unsigned long new_ss_mask = cur_ss_mask;
+ u16 new_ss_mask = cur_ss_mask;
- for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
+ } while_each_subsys_mask();
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
- if (parent)
- new_ss_mask &= parent->child_subsys_mask;
- else
- new_ss_mask &= cgrp->root->subsys_mask;
+ new_ss_mask &= this_ss_mask;
if (new_ss_mask == cur_ss_mask)
break;
@@ -1298,19 +1405,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
}
/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
- * @cgrp: the target cgroup
- *
- * Update @cgrp->child_subsys_mask according to the current
- * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
- */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
-{
- cgrp->child_subsys_mask =
- cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
-}
-
-/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1338,19 +1432,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
+ * @drain_offline: perform offline draining on the cgroup
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
- * matching cgroup_kn_unlock() invocation.
+ * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
+ * cgroup is drained of offlining csses before return.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+ bool drain_offline)
{
struct cgroup *cgrp;
@@ -1369,7 +1466,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
return NULL;
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ if (drain_offline)
+ cgroup_lock_and_drain_offline(cgrp);
+ else
+ mutex_lock(&cgroup_mutex);
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -1399,14 +1499,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
* @css: taget css
- * @cgrp_override: specify if target cgroup is different from css->cgroup
*/
-static void css_clear_dir(struct cgroup_subsys_state *css,
- struct cgroup *cgrp_override)
+static void css_clear_dir(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cgroup *cgrp = css->cgroup;
struct cftype *cfts;
+ if (!(css->flags & CSS_VISIBLE))
+ return;
+
+ css->flags &= ~CSS_VISIBLE;
+
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
}
@@ -1414,17 +1517,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css,
/**
* css_populate_dir - create subsys files in a cgroup directory
* @css: target css
- * @cgrp_overried: specify if target cgroup is different from css->cgroup
*
* On failure, no file is added.
*/
-static int css_populate_dir(struct cgroup_subsys_state *css,
- struct cgroup *cgrp_override)
+static int css_populate_dir(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cgroup *cgrp = css->cgroup;
struct cftype *cfts, *failed_cfts;
int ret;
+ if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ return 0;
+
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_dfl_base_files;
@@ -1441,6 +1545,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css,
goto err;
}
}
+
+ css->flags |= CSS_VISIBLE;
+
return 0;
err:
list_for_each_entry(cfts, &css->ss->cfts, node) {
@@ -1451,67 +1558,30 @@ err:
return ret;
}
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- unsigned long tmp_ss_mask;
int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys_which(ss, ssid, &ss_mask) {
- /* if @ss has non-root csses attached to it, can't move */
- if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ do_each_subsys_mask(ss, ssid, ss_mask) {
+ /*
+ * If @ss has non-root csses attached to it, can't move.
+ * If @ss is an implicit controller, it is exempt from this
+ * rule and can be stolen.
+ */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+ !ss->implicit_on_dfl)
return -EBUSY;
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
- }
-
- /* skip creating root files on dfl_root for inhibited subsystems */
- tmp_ss_mask = ss_mask;
- if (dst_root == &cgrp_dfl_root)
- tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
-
- for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
- struct cgroup *scgrp = &ss->root->cgrp;
- int tssid;
-
- ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
- if (!ret)
- continue;
-
- /*
- * Rebinding back to the default root is not allowed to
- * fail. Using both default and non-default roots should
- * be rare. Moving subsystems back and forth even more so.
- * Just warn about it and continue.
- */
- if (dst_root == &cgrp_dfl_root) {
- if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
- ret, ss_mask);
- pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
- }
- continue;
- }
-
- for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
- if (tssid == ssid)
- break;
- css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
- }
- return ret;
- }
+ } while_each_subsys_mask();
- /*
- * Nothing can fail from this point on. Remove files for the
- * removed subsystems and rebind each subsystem.
- */
- for_each_subsys_which(ss, ssid, &ss_mask) {
+ do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
@@ -1519,8 +1589,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
WARN_ON(!css || cgroup_css(dcgrp, ss));
- css_clear_dir(css, NULL);
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ /* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
@@ -1532,28 +1606,55 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
&dcgrp->e_csets[ss->id]);
spin_unlock_bh(&css_set_lock);
- src_root->subsys_mask &= ~(1 << ssid);
- scgrp->subtree_control &= ~(1 << ssid);
- cgroup_refresh_child_subsys_mask(scgrp);
-
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
- cgroup_refresh_child_subsys_mask(dcgrp);
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
+ ret = cgroup_apply_control(dcgrp);
+ if (ret)
+ pr_warn("partial failure to rebind %s controller (err=%d)\n",
+ ss->name, ret);
+
if (ss->bind)
ss->bind(css);
- }
+ } while_each_subsys_mask();
kernfs_activate(dcgrp->kn);
return 0;
}
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+ struct kernfs_root *kf_root)
+{
+ int len = 0;
+ char *buf = NULL;
+ struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+ struct cgroup *ns_cgroup;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ spin_lock_bh(&css_set_lock);
+ ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+ len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+ spin_unlock_bh(&css_set_lock);
+
+ if (len >= PATH_MAX)
+ len = -ERANGE;
+ else if (len > 0) {
+ seq_escape(sf, buf, " \t\n\\");
+ len = 0;
+ }
+ kfree(buf);
+ return len;
+}
+
static int cgroup_show_options(struct seq_file *seq,
struct kernfs_root *kf_root)
{
@@ -1584,7 +1685,7 @@ static int cgroup_show_options(struct seq_file *seq,
}
struct cgroup_sb_opts {
- unsigned long subsys_mask;
+ u16 subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
@@ -1597,13 +1698,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = -1UL;
+ u16 mask = U16_MAX;
struct cgroup_subsys *ss;
int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~(1U << cpuset_cgrp_id);
+ mask = ~((u16)1 << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1678,6 +1779,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
if (!cgroup_ssid_enabled(i))
continue;
+ if (cgroup_ssid_no_v1(i))
+ continue;
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
@@ -1698,7 +1801,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i))
+ if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
opts->subsys_mask |= (1 << i);
/*
@@ -1728,14 +1831,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned long added_mask, removed_mask;
+ u16 added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
return -EINVAL;
}
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1768,7 +1871,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
if (ret)
goto out_unlock;
- rebind_subsystems(&cgrp_dfl_root, removed_mask);
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
if (opts.release_agent) {
spin_lock(&release_agent_path_lock);
@@ -1876,7 +1979,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1899,10 +2002,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
/*
* We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
- * cgroup_lock, and that's us. The worst that can happen is that we
- * have some link structures left over
+ * cgroup_lock, and that's us. Later rebinding may disable
+ * controllers on the default hierarchy and thus create new csets,
+ * which can't be more than the existing ones. Allocate 2x.
*/
- ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret)
goto cancel_ref;
@@ -1919,7 +2023,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = css_populate_dir(&root_cgrp->self, NULL);
+ ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -1972,6 +2076,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
{
bool is_v2 = fs_type == &cgroup2_fs_type;
struct super_block *pinned_sb = NULL;
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
@@ -1980,6 +2085,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int i;
bool new_sb;
+ get_cgroup_ns(ns);
+
+ /* Check if the caller has permission to mount. */
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
+ put_cgroup_ns(ns);
+ return ERR_PTR(-EPERM);
+ }
+
/*
* The first time anyone tries to mount a cgroup, enable the list
* linking each css_set to its tasks and fix up all existing tasks.
@@ -1990,15 +2103,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
if (is_v2) {
if (data) {
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ put_cgroup_ns(ns);
return ERR_PTR(-EINVAL);
}
- cgrp_dfl_root_visible = true;
+ cgrp_dfl_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
goto out_mount;
}
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
@@ -2095,6 +2209,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
+ /*
+ * We know this subsystem has not yet been bound. Users in a non-init
+ * user namespace may only mount hierarchies with no bound subsystems,
+ * i.e. 'none,name=user1'
+ */
+ if (!opts.none && !capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
ret = -ENOMEM;
@@ -2113,12 +2237,37 @@ out_free:
kfree(opts.release_agent);
kfree(opts.name);
- if (ret)
+ if (ret) {
+ put_cgroup_ns(ns);
return ERR_PTR(ret);
+ }
out_mount:
dentry = kernfs_mount(fs_type, flags, root->kf_root,
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
&new_sb);
+
+ /*
+ * In a non-init cgroup namespace, instead of the root cgroup's
+ * dentry, we return the dentry of the cgroup that
+ * cgroupns->root_cset maps to on this hierarchy.
+ */
+ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
+ struct dentry *nsdentry;
+ struct cgroup *cgrp;
+
+ mutex_lock(&cgroup_mutex);
+ spin_lock_bh(&css_set_lock);
+
+ cgrp = cset_cgroup_from_root(ns->root_cset, root);
+
+ spin_unlock_bh(&css_set_lock);
+ mutex_unlock(&cgroup_mutex);
+
+ nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+ dput(dentry);
+ dentry = nsdentry;
+ }
+
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
@@ -2131,6 +2280,7 @@ out_mount:
deactivate_super(pinned_sb);
}
+ put_cgroup_ns(ns);
return dentry;
}
@@ -2159,14 +2309,45 @@ static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
static struct file_system_type cgroup2_fs_type = {
.name = "cgroup2",
.mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
+ .fs_flags = FS_USERNS_MOUNT,
};
+static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
+{ /* fill @buf with @cgrp's path relative to @ns; takes no locks itself, see cgroup_path_ns() */
+ struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); /* cgroup of @ns's root cset on @cgrp's hierarchy */
+ int ret;
+
+ ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
+ if (ret < 0 || ret >= buflen) /* negative return, or a value that does not fit in @buflen (presumably truncation) */
+ return NULL;
+ return buf; /* on success the caller gets @buf back */
+}
+
+char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
+ struct cgroup_namespace *ns)
+{ /* locking wrapper around cgroup_path_ns_locked(); returns @buf or NULL */
+ char *ret;
+
+ mutex_lock(&cgroup_mutex);
+ spin_lock_bh(&css_set_lock); /* both locks are held across the path lookup */
+
+ ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
+
+ spin_unlock_bh(&css_set_lock); /* released in reverse order of acquisition */
+ mutex_unlock(&cgroup_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(cgroup_path_ns);
+
/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
@@ -2194,7 +2375,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
if (root) {
cgrp = task_cgroup_from_root(task, root);
- path = cgroup_path(cgrp, buf, buflen);
+ path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
if (strlcpy(buf, "/", buflen) < buflen)
@@ -2338,38 +2519,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * cgroup_taskset_migrate - migrate a taskset
* @tset: target taskset
- * @dst_cgrp: destination cgroup
+ * @root: cgroup root the migration is taking place on
*
- * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
- * ->can_attach callbacks fails and guarantees that either all or none of
- * the tasks in @tset are migrated. @tset is consumed regardless of
- * success.
+ * Migrate tasks in @tset as setup by migration preparation functions.
+ * This function fails iff one of the ->can_attach callbacks fails and
+ * guarantees that either all or none of the tasks in @tset are migrated.
+ * @tset is consumed regardless of success.
*/
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- struct cgroup *dst_cgrp)
+ struct cgroup_root *root)
{
- struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
- int i, ret;
+ int ssid, failed_ssid, ret;
/* methods shouldn't be called if no task is actually migrating */
if (list_empty(&tset->src_csets))
return 0;
/* check that we can legitimately attach to the cgroup */
- for_each_e_css(css, i, dst_cgrp) {
- if (css->ss->can_attach) {
- tset->ssid = i;
- ret = css->ss->can_attach(tset);
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
if (ret) {
- failed_css = css;
+ failed_ssid = ssid;
goto out_cancel_attach;
}
}
- }
+ } while_each_subsys_mask();
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2396,25 +2577,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;
- for_each_e_css(css, i, dst_cgrp) {
- if (css->ss->attach) {
- tset->ssid = i;
- css->ss->attach(tset);
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
}
- }
+ } while_each_subsys_mask();
ret = 0;
goto out_release_tset;
out_cancel_attach:
- for_each_e_css(css, i, dst_cgrp) {
- if (css == failed_css)
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ssid == failed_ssid)
break;
- if (css->ss->cancel_attach) {
- tset->ssid = i;
- css->ss->cancel_attach(tset);
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
}
- }
+ } while_each_subsys_mask();
out_release_tset:
spin_lock_bh(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2427,6 +2608,20 @@ out_release_tset:
}
/**
+ * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * @dst_cgrp: destination cgroup to test
+ *
+ * On the default hierarchy, except for the root, subtree_control must be
+ * zero for migration destination cgroups with tasks so that child cgroups
+ * don't compete against tasks.
+ *
+ * Returns %true if @dst_cgrp is not on the default hierarchy, has no
+ * parent, or has an empty ->subtree_control; %false otherwise. Callers in
+ * this file turn a %false result into -EBUSY.
+ */
+static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+{
+ return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+ !dst_cgrp->subtree_control;
+}
+
+/**
* cgroup_migrate_finish - cleanup after attach
* @preloaded_csets: list of preloaded css_sets
*
@@ -2442,6 +2637,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
@@ -2474,58 +2670,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
+ /*
+ * If ->dead, @src_cset is associated with one or more dead cgroups
+ * and doesn't contain any migratable tasks. Ignore it early so
+ * that the rest of migration path doesn't get confused by it.
+ */
+ if (src_cset->dead)
+ return;
+
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
return;
WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
WARN_ON(!list_empty(&src_cset->mg_node));
src_cset->mg_src_cgrp = src_cgrp;
+ src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
list_add(&src_cset->mg_preload_node, preloaded_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup (may be %NULL)
* @preloaded_csets: list of preloaded source css_sets
*
- * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- * have been preloaded to @preloaded_csets. This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
- * source css_set is assumed to be its cgroup on the default hierarchy.
+ * Tasks are about to be moved and all the source css_sets have been
+ * preloaded to @preloaded_csets. This function looks up and pins all
+ * destination css_sets, links each to its source, and append them to
+ * @preloaded_csets.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
* @preloaded_csets.
*/
-static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
- struct list_head *preloaded_csets)
+static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets)
{
LIST_HEAD(csets);
struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
- /*
- * Except for the root, child_subsys_mask must be zero for a cgroup
- * with tasks so that child cgroups don't compete against tasks.
- */
- if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
- dst_cgrp->child_subsys_mask)
- return -EBUSY;
-
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
struct css_set *dst_cset;
- dst_cset = find_css_set(src_cset,
- dst_cgrp ?: src_cset->dfl_cgrp);
+ dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
goto err;
@@ -2538,6 +2732,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
*/
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
+ src_cset->mg_dst_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
@@ -2563,11 +2758,11 @@ err:
* cgroup_migrate - migrate a process or task to a cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
- * @cgrp: the destination cgroup
+ * @root: cgroup root migration is taking place on
*
- * Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding cgroup_threadgroup_rwsem. The
- * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * Migrate a process or task denoted by @leader. If migrating a process,
+ * the caller must be holding cgroup_threadgroup_rwsem. The caller is also
+ * responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
*
@@ -2578,7 +2773,7 @@ err:
* actually starting migrating.
*/
static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
- struct cgroup *cgrp)
+ struct cgroup_root *root)
{
struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct task_struct *task;
@@ -2599,7 +2794,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
rcu_read_unlock();
spin_unlock_bh(&css_set_lock);
- return cgroup_taskset_migrate(&tset, cgrp);
+ return cgroup_taskset_migrate(&tset, root);
}
/**
@@ -2617,6 +2812,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *task;
int ret;
+ if (!cgroup_may_migrate_to(dst_cgrp))
+ return -EBUSY;
+
/* look up all src csets */
spin_lock_bh(&css_set_lock);
rcu_read_lock();
@@ -2631,9 +2829,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
spin_unlock_bh(&css_set_lock);
/* prepare dst csets and commit */
- ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (!ret)
- ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
+ ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root);
cgroup_migrate_finish(&preloaded_csets);
return ret;
@@ -2689,14 +2887,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off, bool threadgroup)
{
struct task_struct *tsk;
+ struct cgroup_subsys *ss;
struct cgroup *cgrp;
pid_t pid;
- int ret;
+ int ssid, ret;
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
return -EINVAL;
- cgrp = cgroup_kn_lock_live(of->kn);
+ cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -2739,8 +2938,10 @@ out_unlock_rcu:
rcu_read_unlock();
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
+ for_each_subsys(ss, ssid)
+ if (ss->post_attach)
+ ss->post_attach();
cgroup_kn_unlock(of->kn);
- cpuset_post_attach_flush();
return ret ?: nbytes;
}
@@ -2794,7 +2995,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
- cgrp = cgroup_kn_lock_live(of->kn);
+ cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
spin_lock(&release_agent_path_lock);
@@ -2822,38 +3023,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
{
struct cgroup_subsys *ss;
bool printed = false;
int ssid;
- for_each_subsys_which(ss, ssid, &ss_mask) {
+ do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
seq_printf(seq, "%s", ss->name);
printed = true;
- }
+ } while_each_subsys_mask();
if (printed)
seq_putc(seq, '\n');
}
-/* show controllers which are currently attached to the default hierarchy */
-static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
-{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
- ~cgrp_dfl_root_inhibit_ss_mask);
- return 0;
-}
-
/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
+ cgroup_print_ss_mask(seq, cgroup_control(cgrp));
return 0;
}
@@ -2870,16 +3061,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
* cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
* @cgrp: root of the subtree to update csses for
*
- * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
- * css associations need to be updated accordingly. This function looks up
- * all css_sets which are attached to the subtree, creates the matching
- * updated css_sets and migrates the tasks to the new ones.
+ * @cgrp's control masks have changed and its subtree's css associations
+ * need to be updated accordingly. This function looks up all css_sets
+ * which are attached to the subtree, creates the matching updated css_sets
+ * and migrates the tasks to the new ones.
*/
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
LIST_HEAD(preloaded_csets);
struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
- struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup *dsct;
struct css_set *src_cset;
int ret;
@@ -2889,21 +3081,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_bh(&css_set_lock);
- css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
- /* self is not affected by child_subsys_mask change */
- if (css->cgroup == cgrp)
- continue;
-
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
- cgroup_migrate_add_src(link->cset, cgrp,
+ list_for_each_entry(link, &dsct->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, dsct,
&preloaded_csets);
}
spin_unlock_bh(&css_set_lock);
/* each source cset's destination is the ->mg_dst_cgrp recorded by cgroup_migrate_add_src() */
- ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
goto out_finish;
@@ -2921,20 +3109,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
}
spin_unlock_bh(&css_set_lock);
- ret = cgroup_taskset_migrate(&tset, cgrp);
+ ret = cgroup_taskset_migrate(&tset, cgrp->root);
out_finish:
cgroup_migrate_finish(&preloaded_csets);
percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
+/**
+ * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
+ * @cgrp: root of the target subtree
+ *
+ * Because css offlining is asynchronous, userland may try to re-enable a
+ * controller while the previous css is still around. This function grabs
+ * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
+ *
+ * The csses of every cgroup visited by
+ * cgroup_for_each_live_descendant_post() are scanned. When one whose
+ * refcnt is dying is found, cgroup_mutex is dropped, the caller sleeps
+ * uninterruptibly on that cgroup's ->offline_waitq (holding a cgroup
+ * reference across the sleep) and the whole scan restarts from scratch.
+ * The function returns, with cgroup_mutex held, once a complete pass finds
+ * no dying css.
+ */
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
+ __acquires(&cgroup_mutex)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+restart:
+ mutex_lock(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+ DEFINE_WAIT(wait);
+
+ if (!css || !percpu_ref_is_dying(&css->refcnt))
+ continue;
+
+ cgroup_get(dsct); /* keep @dsct around while cgroup_mutex is not held */
+ prepare_to_wait(&dsct->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ mutex_unlock(&cgroup_mutex);
+ schedule();
+ finish_wait(&dsct->offline_waitq, &wait);
+
+ cgroup_put(dsct);
+ goto restart; /* the mutex was dropped, so rescan the whole subtree */
+ }
+ }
+}
+
+/**
+ * cgroup_save_control - save control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Save ->subtree_control and ->subtree_ss_mask to the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ *
+ * The saved values are the ones cgroup_restore_control() copies back.
+ */
+static void cgroup_save_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->old_subtree_control = dsct->subtree_control;
+ dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
+ }
+}
+
+/**
+ * cgroup_propagate_control - refresh control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
+ * ->subtree_control and propagate controller availability through the
+ * subtree so that descendants don't have unavailable controllers enabled.
+ *
+ * For each visited cgroup, ->subtree_control is first masked with
+ * cgroup_control() and ->subtree_ss_mask is then recomputed from the
+ * result and cgroup_ss_mask() via cgroup_calc_subtree_ss_mask().
+ */
+static void cgroup_propagate_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ dsct->subtree_control &= cgroup_control(dsct);
+ dsct->subtree_ss_mask =
+ cgroup_calc_subtree_ss_mask(dsct->subtree_control,
+ cgroup_ss_mask(dsct));
+ }
+}
+
+/**
+ * cgroup_restore_control - restore control masks of a subtree
+ * @cgrp: root of the target subtree
+ *
+ * Restore ->subtree_control and ->subtree_ss_mask from the respective old_
+ * prefixed fields for @cgrp's subtree including @cgrp itself.
+ *
+ * This is the counterpart of cgroup_save_control(), which fills in the
+ * old_ prefixed fields read here.
+ */
+static void cgroup_restore_control(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ dsct->subtree_control = dsct->old_subtree_control;
+ dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
+ }
+}
+
+static bool css_visible(struct cgroup_subsys_state *css)
+{ /* callers in this file populate @css's files when this is true and clear them when it is false */
+ struct cgroup_subsys *ss = css->ss;
+ struct cgroup *cgrp = css->cgroup;
+
+ if (cgroup_control(cgrp) & (1 << ss->id)) /* @ss's bit is set in cgroup_control() */
+ return true;
+ if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) /* @ss's bit is not even in cgroup_ss_mask() */
+ return false;
+ return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; /* in ss_mask only: visible just for implicit_on_dfl subsystems on the default hierarchy */
+}
+
+/**
+ * cgroup_apply_control_enable - enable or show csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and create new csses or make the existing ones
+ * visible. A css is created invisible if it's being implicitly enabled
+ * through dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
+ *
+ * Returns 0 on success, -errno on failure. On failure, csses which have
+ * been processed already aren't cleaned up. The caller is responsible for
+ * cleaning up with cgroup_apply_control_disable().
+ */
+static int cgroup_apply_control_enable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid, ret;
+
+ cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+ if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) /* @ss is not enabled for @dsct, nothing to create or show */
+ continue;
+
+ if (!css) {
+ css = css_create(dsct, ss);
+ if (IS_ERR(css))
+ return PTR_ERR(css);
+ }
+
+ if (css_visible(css)) { /* only visible csses get their files populated */
+ ret = css_populate_dir(css);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * cgroup_apply_control_disable - kill or hide csses according to control
+ * @cgrp: root of the target subtree
+ *
+ * Walk @cgrp's subtree and kill and hide csses so that they match
+ * cgroup_ss_mask() and css_visible().
+ *
+ * A css is hidden when the userland requests it to be disabled while other
+ * subsystems are still depending on it. The css must not actively control
+ * resources and be in the vanilla state if it's made visible again later.
+ * Controllers which may be depended upon should provide ->css_reset() for
+ * this purpose.
+ *
+ * A css which has a parent and whose subsystem bit is missing from
+ * cgroup_ss_mask() is killed. Any other existing css that is not
+ * css_visible() has its files cleared and ->css_reset() invoked if the
+ * subsystem provides one.
+ */
+static void cgroup_apply_control_disable(struct cgroup *cgrp)
+{
+ struct cgroup *dsct;
+ struct cgroup_subsys_state *d_css;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
+ for_each_subsys(ss, ssid) {
+ struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
+
+ WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt));
+
+ if (!css)
+ continue;
+
+ if (css->parent &&
+ !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
+ kill_css(css);
+ } else if (!css_visible(css)) {
+ css_clear_dir(css);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
+ }
+}
+
+/**
+ * cgroup_apply_control - apply control mask updates to the subtree
+ * @cgrp: root of the target subtree
+ *
+ * subsystems can be enabled and disabled in a subtree using the following
+ * steps.
+ *
+ * 1. Call cgroup_save_control() to stash the current state.
+ * 2. Update ->subtree_control masks in the subtree as desired.
+ * 3. Call cgroup_apply_control() to apply the changes.
+ * 4. Optionally perform other related operations.
+ * 5. Call cgroup_finalize_control() to finish up.
+ *
+ * This function implements step 3 and propagates the mask changes
+ * throughout @cgrp's subtree, updates csses accordingly and perform
+ * process migrations.
+ *
+ * Returns 0 on success, or the -errno from cgroup_apply_control_enable()
+ * or cgroup_update_dfl_csses() on failure. Nothing is rolled back here on
+ * failure; that is left to step 5.
+ */
+static int cgroup_apply_control(struct cgroup *cgrp)
+{
+ int ret;
+
+ cgroup_propagate_control(cgrp);
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ return ret;
+
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * cgroup_finalize_control - finalize control mask update
+ * @cgrp: root of the target subtree
+ * @ret: the result of the update
+ *
+ * Finalize control mask update. See cgroup_apply_control() for more info.
+ *
+ * If @ret is non-zero, the control masks saved by cgroup_save_control()
+ * are restored and re-propagated first. cgroup_apply_control_disable() is
+ * then run in both the success and the failure case.
+ *
+ * This function returns nothing, so reporting @ret is up to the caller.
+ * NOTE(review): cgroup_subtree_control_write() assigns ret = 0 right after
+ * calling this, so a cgroup_apply_control() failure appears to be reported
+ * to userland as success - confirm whether that is intended.
+ */
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
+{
+ if (ret) {
+ cgroup_restore_control(cgrp);
+ cgroup_propagate_control(cgrp);
+ }
+
+ cgroup_apply_control_disable(cgrp);
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- unsigned long enable = 0, disable = 0;
- unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+ u16 enable = 0, disable = 0;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2946,11 +3386,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
*/
buf = strstrip(buf);
while ((tok = strsep(&buf, " "))) {
- unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
-
if (tok[0] == '\0')
continue;
- for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
if (!cgroup_ssid_enabled(ssid) ||
strcmp(tok + 1, ss->name))
continue;
@@ -2965,12 +3403,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
return -EINVAL;
}
break;
- }
+ } while_each_subsys_mask();
if (ssid == CGROUP_SUBSYS_COUNT)
return -EINVAL;
}
- cgrp = cgroup_kn_lock_live(of->kn);
+ cgrp = cgroup_kn_lock_live(of->kn, true);
if (!cgrp)
return -ENODEV;
@@ -2981,10 +3419,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
continue;
}
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ if (!(cgroup_control(cgrp) & (1 << ssid))) {
ret = -ENOENT;
goto out_unlock;
}
@@ -3018,150 +3453,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
goto out_unlock;
}
- /*
- * Update subsys masks and calculate what needs to be done. More
- * subsystems than specified may need to be enabled or disabled
- * depending on subsystem dependencies.
- */
- old_sc = cgrp->subtree_control;
- old_ss = cgrp->child_subsys_mask;
- new_sc = (old_sc | enable) & ~disable;
- new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
-
- css_enable = ~old_ss & new_ss;
- css_disable = old_ss & ~new_ss;
- enable |= css_enable;
- disable |= css_disable;
-
- /*
- * Because css offlining is asynchronous, userland might try to
- * re-enable the same controller while the previous instance is
- * still around. In such cases, wait till it's gone using
- * offline_waitq.
- */
- for_each_subsys_which(ss, ssid, &css_enable) {
- cgroup_for_each_live_child(child, cgrp) {
- DEFINE_WAIT(wait);
+ /* save and update control masks and prepare csses */
+ cgroup_save_control(cgrp);
- if (!cgroup_css(child, ss))
- continue;
-
- cgroup_get(child);
- prepare_to_wait(&child->offline_waitq, &wait,
- TASK_UNINTERRUPTIBLE);
- cgroup_kn_unlock(of->kn);
- schedule();
- finish_wait(&child->offline_waitq, &wait);
- cgroup_put(child);
-
- return restart_syscall();
- }
- }
-
- cgrp->subtree_control = new_sc;
- cgrp->child_subsys_mask = new_ss;
-
- /*
- * Create new csses or make the existing ones visible. A css is
- * created invisible if it's being implicitly enabled through
- * dependency. An invisible css is made visible when the userland
- * explicitly enables it.
- */
- for_each_subsys(ss, ssid) {
- if (!(enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- if (css_enable & (1 << ssid))
- ret = create_css(child, ss,
- cgrp->subtree_control & (1 << ssid));
- else
- ret = css_populate_dir(cgroup_css(child, ss),
- NULL);
- if (ret)
- goto err_undo_css;
- }
- }
-
- /*
- * At this point, cgroup_e_css() results reflect the new csses
- * making the following cgroup_update_dfl_csses() properly update
- * css associations of all tasks in the subtree.
- */
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- goto err_undo_css;
-
- /*
- * All tasks are migrated out of disabled csses. Kill or hide
- * them. A css is hidden when the userland requests it to be
- * disabled while other subsystems are still depending on it. The
- * css must not actively control resources and be in the vanilla
- * state if it's made visible again later. Controllers which may
- * be depended upon should provide ->css_reset() for this purpose.
- */
- for_each_subsys(ss, ssid) {
- if (!(disable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
- if (css_disable & (1 << ssid)) {
- kill_css(css);
- } else {
- css_clear_dir(css, NULL);
- if (ss->css_reset)
- ss->css_reset(css);
- }
- }
- }
-
- /*
- * The effective csses of all the descendants (excluding @cgrp) may
- * have changed. Subsystems can optionally subscribe to this event
- * by implementing ->css_e_css_changed() which is invoked if any of
- * the effective csses seen from the css's cgroup may have changed.
- */
- for_each_subsys(ss, ssid) {
- struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
- struct cgroup_subsys_state *css;
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
- if (!ss->css_e_css_changed || !this_css)
- continue;
+ ret = cgroup_apply_control(cgrp);
- css_for_each_descendant_pre(css, this_css)
- if (css != this_css)
- ss->css_e_css_changed(css);
- }
+ cgroup_finalize_control(cgrp, ret);
kernfs_activate(cgrp->kn);
ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
-
-err_undo_css:
- cgrp->subtree_control = old_sc;
- cgrp->child_subsys_mask = old_ss;
-
- for_each_subsys(ss, ssid) {
- if (!(enable & (1 << ssid)))
- continue;
-
- cgroup_for_each_live_child(child, cgrp) {
- struct cgroup_subsys_state *css = cgroup_css(child, ss);
-
- if (!css)
- continue;
-
- if (css_enable & (1 << ssid))
- kill_css(css);
- else
- css_clear_dir(css, NULL);
- }
- }
- goto out_unlock;
}
static int cgroup_events_show(struct seq_file *seq, void *v)
@@ -3359,7 +3665,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
bool is_add)
{
struct cftype *cft, *cft_end = NULL;
- int ret;
+ int ret = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -3388,7 +3694,7 @@ restart:
cgroup_rm_file(cgrp, cft);
}
}
- return 0;
+ return ret;
}
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
@@ -3405,7 +3711,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
- if (cgroup_is_dead(cgrp))
+ if (!(css->flags & CSS_VISIBLE))
continue;
ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
@@ -4026,6 +4332,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
struct task_struct *task;
int ret;
+ if (!cgroup_may_migrate_to(to))
+ return -EBUSY;
+
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
@@ -4034,7 +4343,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
spin_unlock_bh(&css_set_lock);
- ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
+ ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
goto out_err;
@@ -4050,7 +4359,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
css_task_iter_end(&it);
if (task) {
- ret = cgroup_migrate(task, false, to);
+ ret = cgroup_migrate(task, false, to->root);
put_task_struct(task);
}
} while (task && !ret);
@@ -4557,12 +4866,6 @@ static struct cftype cgroup_dfl_base_files[] = {
},
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_root_controllers_show,
- },
- {
- .name = "cgroup.controllers",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_controllers_show,
},
{
@@ -4731,7 +5034,9 @@ static void css_release_work_fn(struct work_struct *work)
* Those are supported by RCU protecting clearing of
* cgrp->kn->priv backpointer.
*/
- RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
+ if (cgrp->kn)
+ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
+ NULL);
}
mutex_unlock(&cgroup_mutex);
@@ -4802,6 +5107,9 @@ static void offline_css(struct cgroup_subsys_state *css)
if (!(css->flags & CSS_ONLINE))
return;
+ if (ss->css_reset)
+ ss->css_reset(css);
+
if (ss->css_offline)
ss->css_offline(css);
@@ -4812,17 +5120,16 @@ static void offline_css(struct cgroup_subsys_state *css)
}
/**
- * create_css - create a cgroup_subsys_state
+ * css_create - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
- * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created if
- * @visible. Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp. This function doesn't create the
+ * interface files. Returns the new css on success, ERR_PTR(-errno) on failure.
*/
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible)
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4833,7 +5140,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
css = ss->css_alloc(parent_css);
if (IS_ERR(css))
- return PTR_ERR(css);
+ return css;
init_and_link_css(css, ss, cgrp);
@@ -4846,12 +5153,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
goto err_free_percpu_ref;
css->id = err;
- if (visible) {
- err = css_populate_dir(css, NULL);
- if (err)
- goto err_free_id;
- }
-
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
cgroup_idr_replace(&ss->css_idr, css, css->id);
@@ -4869,47 +5170,30 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
ss->warned_broken_hierarchy = true;
}
- return 0;
+ return css;
err_list_del:
list_del_rcu(&css->sibling);
- css_clear_dir(css, NULL);
-err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
- return err;
+ return ERR_PTR(err);
}
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
- umode_t mode)
+static struct cgroup *cgroup_create(struct cgroup *parent)
{
- struct cgroup *parent, *cgrp, *tcgrp;
- struct cgroup_root *root;
- struct cgroup_subsys *ss;
- struct kernfs_node *kn;
- int level, ssid, ret;
-
- /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
- */
- if (strchr(name, '\n'))
- return -EINVAL;
-
- parent = cgroup_kn_lock_live(parent_kn);
- if (!parent)
- return -ENODEV;
- root = parent->root;
- level = parent->level + 1;
+ struct cgroup_root *root = parent->root;
+ struct cgroup *cgrp, *tcgrp;
+ int level = parent->level + 1;
+ int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp) +
sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
- if (!cgrp) {
- ret = -ENOMEM;
- goto out_unlock;
- }
+ if (!cgrp)
+ return ERR_PTR(-ENOMEM);
ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
@@ -4940,20 +5224,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
- /* create the directory */
- kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
- if (IS_ERR(kn)) {
- ret = PTR_ERR(kn);
- goto out_free_id;
- }
- cgrp->kn = kn;
-
- /*
- * This extra ref will be put in cgroup_free_fn() and guarantees
- * that @cgrp->kn is always accessible.
- */
- kernfs_get(kn);
-
cgrp->self.serial_nr = css_serial_nr_next++;
/* allocation complete, commit to creation */
@@ -4967,51 +5237,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
*/
cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
- ret = cgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
+ /*
+ * On the default hierarchy, a child doesn't automatically inherit
+ * subtree_control from the parent. Each is configured manually.
+ */
+ if (!cgroup_on_dfl(cgrp))
+ cgrp->subtree_control = cgroup_control(cgrp);
- ret = css_populate_dir(&cgrp->self, NULL);
+ cgroup_propagate_control(cgrp);
+
+ /* @cgrp doesn't have dir yet so the following will only create csses */
+ ret = cgroup_apply_control_enable(cgrp);
if (ret)
goto out_destroy;
- /* let's create and online css's */
- for_each_subsys(ss, ssid) {
- if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss,
- parent->subtree_control & (1 << ssid));
- if (ret)
- goto out_destroy;
- }
+ return cgrp;
+
+out_cancel_ref:
+ percpu_ref_exit(&cgrp->self.refcnt);
+out_free_cgrp:
+ kfree(cgrp);
+ return ERR_PTR(ret);
+out_destroy:
+ cgroup_destroy_locked(cgrp);
+ return ERR_PTR(ret);
+}
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct cgroup *parent, *cgrp;
+ struct kernfs_node *kn;
+ int ret;
+
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = cgroup_kn_lock_live(parent_kn, false);
+ if (!parent)
+ return -ENODEV;
+
+ cgrp = cgroup_create(parent);
+ if (IS_ERR(cgrp)) {
+ ret = PTR_ERR(cgrp);
+ goto out_unlock;
+ }
+
+ /* create the directory */
+ kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_destroy;
}
+ cgrp->kn = kn;
/*
- * On the default hierarchy, a child doesn't automatically inherit
- * subtree_control from the parent. Each is configured manually.
+ * This extra ref will be put in cgroup_free_fn() and guarantees
+ * that @cgrp->kn is always accessible.
*/
- if (!cgroup_on_dfl(cgrp)) {
- cgrp->subtree_control = parent->subtree_control;
- cgroup_refresh_child_subsys_mask(cgrp);
- }
+ kernfs_get(kn);
+ ret = cgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ ret = css_populate_dir(&cgrp->self);
+ if (ret)
+ goto out_destroy;
+
+ ret = cgroup_apply_control_enable(cgrp);
+ if (ret)
+ goto out_destroy;
+
+ /* let's create and online css's */
kernfs_activate(kn);
ret = 0;
goto out_unlock;
-out_free_id:
- cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
-out_cancel_ref:
- percpu_ref_exit(&cgrp->self.refcnt);
-out_free_cgrp:
- kfree(cgrp);
+out_destroy:
+ cgroup_destroy_locked(cgrp);
out_unlock:
cgroup_kn_unlock(parent_kn);
return ret;
-
-out_destroy:
- cgroup_destroy_locked(cgrp);
- goto out_unlock;
}
/*
@@ -5065,7 +5374,7 @@ static void kill_css(struct cgroup_subsys_state *css)
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
*/
- css_clear_dir(css, NULL);
+ css_clear_dir(css);
/*
* Killing would put the base ref, but we need to keep it alive
@@ -5114,6 +5423,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup_subsys_state *css;
+ struct cgrp_cset_link *link;
int ssid;
lockdep_assert_held(&cgroup_mutex);
@@ -5134,11 +5444,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
return -EBUSY;
/*
- * Mark @cgrp dead. This prevents further task migration and child
- * creation by disabling cgroup_lock_live_group().
+ * Mark @cgrp and the associated csets dead. The former prevents
+ * further task migration and child creation by disabling
+ * cgroup_lock_live_group(). The latter makes the csets ignored by
+ * the migration path.
*/
cgrp->self.flags &= ~CSS_ONLINE;
+ spin_lock_bh(&css_set_lock);
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ link->cset->dead = true;
+ spin_unlock_bh(&css_set_lock);
+
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
kill_css(css);
@@ -5162,7 +5479,7 @@ static int cgroup_rmdir(struct kernfs_node *kn)
struct cgroup *cgrp;
int ret = 0;
- cgrp = cgroup_kn_lock_live(kn);
+ cgrp = cgroup_kn_lock_live(kn, false);
if (!cgrp)
return 0;
@@ -5178,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
+ .show_path = cgroup_show_path,
};
static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
@@ -5252,7 +5570,7 @@ int __init cgroup_init_early(void)
for_each_subsys(ss, i) {
WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
- "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
+ "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
ss->id, ss->name);
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
@@ -5269,7 +5587,7 @@ int __init cgroup_init_early(void)
return 0;
}
-static unsigned long cgroup_disable_mask __initdata;
+static u16 cgroup_disable_mask __initdata;
/**
* cgroup_init - cgroup initialization
@@ -5280,18 +5598,23 @@ static unsigned long cgroup_disable_mask __initdata;
int __init cgroup_init(void)
{
struct cgroup_subsys *ss;
- unsigned long key;
int ssid;
+ BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ get_user_ns(init_cgroup_ns.user_ns);
+
mutex_lock(&cgroup_mutex);
- /* Add init_css_set to the hash table */
- key = css_set_hash(init_css_set.subsys);
- hash_add(css_set_table, &init_css_set.hlist, key);
+ /*
+ * Add init_css_set to the hash table so that dfl_root can link to
+ * it during init.
+ */
+ hash_add(css_set_table, &init_css_set.hlist,
+ css_set_hash(init_css_set.subsys));
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
@@ -5324,10 +5647,16 @@ int __init cgroup_init(void)
continue;
}
+ if (cgroup_ssid_no_v1(ssid))
+ printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
+ ss->name);
+
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- if (!ss->dfl_cftypes)
- cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+ if (ss->implicit_on_dfl)
+ cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
+ else if (!ss->dfl_cftypes)
+ cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
@@ -5340,6 +5669,11 @@ int __init cgroup_init(void)
ss->bind(init_css_set.subsys[ssid]);
}
+ /* init_css_set.subsys[] has been updated, re-hash */
+ hash_del(&init_css_set.hlist);
+ hash_add(css_set_table, &init_css_set.hlist,
+ css_set_hash(init_css_set.subsys));
+
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
@@ -5398,7 +5732,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
+ if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -5424,7 +5758,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
- path = cgroup_path(cgrp, buf, PATH_MAX);
+ path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
if (!path) {
retval = -ENAMETOOLONG;
goto out_unlock;
@@ -5513,11 +5848,11 @@ int cgroup_can_fork(struct task_struct *child)
struct cgroup_subsys *ss;
int i, j, ret;
- for_each_subsys_which(ss, i, &have_canfork_callback) {
+ do_each_subsys_mask(ss, i, have_canfork_callback) {
ret = ss->can_fork(child);
if (ret)
goto out_revert;
- }
+ } while_each_subsys_mask();
return 0;
@@ -5602,8 +5937,9 @@ void cgroup_post_fork(struct task_struct *child)
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
- for_each_subsys_which(ss, i, &have_fork_callback)
+ do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
+ } while_each_subsys_mask();
}
/**
@@ -5646,8 +5982,9 @@ void cgroup_exit(struct task_struct *tsk)
}
/* see cgroup_post_fork() for details */
- for_each_subsys_which(ss, i, &have_exit_callback)
+ do_each_subsys_mask(ss, i, have_exit_callback) {
ss->exit(tsk);
+ } while_each_subsys_mask();
}
void cgroup_free(struct task_struct *task)
@@ -5656,8 +5993,9 @@ void cgroup_free(struct task_struct *task)
struct cgroup_subsys *ss;
int ssid;
- for_each_subsys_which(ss, ssid, &have_free_callback)
+ do_each_subsys_mask(ss, ssid, have_free_callback) {
ss->free(task);
+ } while_each_subsys_mask();
put_css_set(cset);
}
@@ -5706,7 +6044,9 @@ static void cgroup_release_agent(struct work_struct *work)
if (!pathbuf || !agentbuf)
goto out;
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ spin_lock_bh(&css_set_lock);
+ path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
+ spin_unlock_bh(&css_set_lock);
if (!path)
goto out;
@@ -5750,6 +6090,33 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+static int __init cgroup_no_v1(char *str)
+{
+ struct cgroup_subsys *ss;
+ char *token;
+ int i;
+
+ while ((token = strsep(&str, ",")) != NULL) {
+ if (!*token)
+ continue;
+
+ if (!strcmp(token, "all")) {
+ cgroup_no_v1_mask = U16_MAX;
+ break;
+ }
+
+ for_each_subsys(ss, i) {
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+
+ cgroup_no_v1_mask |= 1 << i;
+ }
+ }
+ return 1;
+}
+__setup("cgroup_no_v1=", cgroup_no_v1);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5763,12 +6130,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
+ struct file_system_type *s_type = dentry->d_sb->s_type;
struct cgroup_subsys_state *css = NULL;
struct cgroup *cgrp;
/* is @dentry a cgroup dir? */
- if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
- kernfs_type(kn) != KERNFS_DIR)
+ if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
+ !kn || kernfs_type(kn) != KERNFS_DIR)
return ERR_PTR(-EBADF);
rcu_read_lock();
@@ -5890,6 +6258,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
+/* cgroup namespaces */
+
+static struct cgroup_namespace *alloc_cgroup_ns(void)
+{
+ struct cgroup_namespace *new_ns;
+ int ret;
+
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ if (!new_ns)
+ return ERR_PTR(-ENOMEM);
+ ret = ns_alloc_inum(&new_ns->ns);
+ if (ret) {
+ kfree(new_ns);
+ return ERR_PTR(ret);
+ }
+ atomic_set(&new_ns->count, 1);
+ new_ns->ns.ops = &cgroupns_operations;
+ return new_ns;
+}
+
+void free_cgroup_ns(struct cgroup_namespace *ns)
+{
+ put_css_set(ns->root_cset);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+EXPORT_SYMBOL(free_cgroup_ns);
+
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct cgroup_namespace *old_ns)
+{
+ struct cgroup_namespace *new_ns;
+ struct css_set *cset;
+
+ BUG_ON(!old_ns);
+
+ if (!(flags & CLONE_NEWCGROUP)) {
+ get_cgroup_ns(old_ns);
+ return old_ns;
+ }
+
+ /* Allow only sysadmin to create cgroup namespace. */
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ mutex_lock(&cgroup_mutex);
+ spin_lock_bh(&css_set_lock);
+
+ cset = task_css_set(current);
+ get_css_set(cset);
+
+ spin_unlock_bh(&css_set_lock);
+ mutex_unlock(&cgroup_mutex);
+
+ new_ns = alloc_cgroup_ns();
+ if (IS_ERR(new_ns)) {
+ put_css_set(cset);
+ return new_ns;
+ }
+
+ new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->root_cset = cset;
+
+ return new_ns;
+}
+
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct cgroup_namespace, ns);
+}
+
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+{
+ struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
+
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Don't need to do anything if we are attaching to our own cgroupns. */
+ if (cgroup_ns == nsproxy->cgroup_ns)
+ return 0;
+
+ get_cgroup_ns(cgroup_ns);
+ put_cgroup_ns(nsproxy->cgroup_ns);
+ nsproxy->cgroup_ns = cgroup_ns;
+
+ return 0;
+}
+
+static struct ns_common *cgroupns_get(struct task_struct *task)
+{
+ struct cgroup_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->cgroup_ns;
+ get_cgroup_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void cgroupns_put(struct ns_common *ns)
+{
+ put_cgroup_ns(to_cg_ns(ns));
+}
+
+const struct proc_ns_operations cgroupns_operations = {
+ .name = "cgroup",
+ .type = CLONE_NEWCGROUP,
+ .get = cgroupns_get,
+ .put = cgroupns_put,
+ .install = cgroupns_install,
+};
+
+static __init int cgroup_namespaces_init(void)
+{
+ return 0;
+}
+subsys_initcall(cgroup_namespaces_init);
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ea42e8da861..d948e44c471e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -36,6 +36,7 @@
* @target: The target state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
+ * @rollback: Perform a rollback
* @cb_stat: The state for a single callback (install/uninstall)
* @cb: Single callback function (install/uninstall)
* @result: Result of the operation
@@ -47,6 +48,7 @@ struct cpuhp_cpu_state {
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
+ bool rollback;
enum cpuhp_state cb_state;
int (*cb)(unsigned int cpu);
int result;
@@ -301,6 +303,11 @@ static int cpu_notify(unsigned long val, unsigned int cpu)
return __cpu_notify(val, cpu, -1, NULL);
}
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
+{
+ BUG_ON(cpu_notify(val, cpu));
+}
+
/* Notifier wrappers for transitioning to state machine */
static int notify_prepare(unsigned int cpu)
{
@@ -477,6 +484,16 @@ static void cpuhp_thread_fun(unsigned int cpu)
} else {
ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
}
+ } else if (st->rollback) {
+ BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+
+ undo_cpu_down(cpu, st, cpuhp_ap_states);
+ /*
+ * This is a momentary workaround to keep the notifier users
+ * happy. Will go away once we got rid of the notifiers.
+ */
+ cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+ st->rollback = false;
} else {
/* Cannot happen .... */
BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
@@ -636,11 +653,6 @@ static inline void check_for_tasks(int dead_cpu)
read_unlock(&tasklist_lock);
}
-static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
-{
- BUG_ON(cpu_notify(val, cpu));
-}
-
static int notify_down_prepare(unsigned int cpu)
{
int err, nr_calls = 0;
@@ -691,21 +703,6 @@ static int takedown_cpu(unsigned int cpu)
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int err;
- /*
- * By now we've cleared cpu_active_mask, wait for all preempt-disabled
- * and RCU users of this state to go away such that all new such users
- * will observe it.
- *
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so wait for both.
- *
- * Do sync before park smpboot threads to take care the rcu boost case.
- */
- if (IS_ENABLED(CONFIG_PREEMPT))
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
- else
- synchronize_rcu();
-
/* Park the smpboot threads */
kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
smpboot_park_threads(cpu);
@@ -721,9 +718,10 @@ static int takedown_cpu(unsigned int cpu)
*/
err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
if (err) {
- /* CPU didn't die: tell everyone. Can't complain. */
- cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+ /* CPU refused to die */
irq_unlock_sparse();
+ /* Unpark the hotplug thread so we can rollback there */
+ kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread);
return err;
}
BUG_ON(cpu_online(cpu));
@@ -832,6 +830,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
* to do the further cleanups.
*/
ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+ if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+ st->target = prev_state;
+ st->rollback = true;
+ cpuhp_kick_ap_work(cpu);
+ }
hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
out:
@@ -905,8 +908,6 @@ void cpuhp_online_idle(enum cpuhp_state state)
st->state = CPUHP_AP_ONLINE_IDLE;
- /* The cpu is marked online, set it active now */
- set_cpu_active(cpu, true);
/* Unpark the stopper thread and the hotplug thread of this cpu */
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
@@ -1218,6 +1219,12 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.name = "ap:offline",
.cant_stop = true,
},
+ /* First state is scheduler control. Interrupts are disabled */
+ [CPUHP_AP_SCHED_STARTING] = {
+ .name = "sched:starting",
+ .startup = sched_cpu_starting,
+ .teardown = sched_cpu_dying,
+ },
/*
* Low level startup/teardown notifiers. Run with interrupts
* disabled. Will be removed once the notifiers are converted to
@@ -1249,12 +1256,22 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.name = "notify:online",
.startup = notify_online,
.teardown = notify_down_prepare,
+ .skip_onerr = true,
},
#endif
/*
* The dynamically registered state space is here
*/
+#ifdef CONFIG_SMP
+ /* Last state is scheduler control setting the cpu active */
+ [CPUHP_AP_ACTIVE] = {
+ .name = "sched:active",
+ .startup = sched_cpu_activate,
+ .teardown = sched_cpu_deactivate,
+ },
+#endif
+
/* CPU is fully up and running. */
[CPUHP_ONLINE] = {
.name = "online",
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 41989ab4db57..1902956baba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -58,7 +58,6 @@
#include <asm/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
-#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
@@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
}
}
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
{
flush_workqueue(cpuset_migrate_mm_wq);
}
@@ -2087,9 +2086,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
+ .post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.legacy_cftypes = files,
- .early_init = 1,
+ .early_init = true,
};
/**
@@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
goto out;
retval = -ENAMETOOLONG;
- rcu_read_lock();
- css = task_css(tsk, cpuset_cgrp_id);
- p = cgroup_path(css->cgroup, buf, PATH_MAX);
- rcu_read_unlock();
+ css = task_get_css(tsk, cpuset_cgrp_id);
+ p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ css_put(css);
if (!p)
goto out_free;
seq_puts(m, p);
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 9c418002b8c1..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -159,15 +168,24 @@ put_callchain_entry(int rctx)
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
- int rctx;
- struct perf_callchain_entry *entry;
-
- int kernel = !event->attr.exclude_callchain_kernel;
- int user = !event->attr.exclude_callchain_user;
+ bool kernel = !event->attr.exclude_callchain_kernel;
+ bool user = !event->attr.exclude_callchain_user;
+ /* Disallow cross-task user callchains. */
+ bool crosstask = event->ctx->task && event->ctx->task != current;
if (!kernel && !user)
return NULL;
+ return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+}
+
+struct perf_callchain_entry *
+get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+ bool crosstask, bool add_mark)
+{
+ struct perf_callchain_entry *entry;
+ int rctx;
+
entry = get_callchain_entry(&rctx);
if (rctx == -1)
return NULL;
@@ -175,10 +193,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!entry)
goto exit_put;
- entry->nr = 0;
+ entry->nr = init_nr;
if (kernel && !user_mode(regs)) {
- perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ if (add_mark)
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
}
@@ -191,13 +210,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
}
if (regs) {
- /*
- * Disallow cross-task user callchains.
- */
- if (event->ctx->task && event->ctx->task != current)
+ if (crosstask)
goto exit_put;
- perf_callchain_store(entry, PERF_CONTEXT_USER);
+ if (add_mark)
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
perf_callchain_user(entry, regs);
}
}
@@ -207,3 +224,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 712570dddacd..274450efea90 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -44,6 +44,8 @@
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -351,7 +353,7 @@ static struct srcu_struct pmus_srcu;
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
*/
-int sysctl_perf_event_paranoid __read_mostly = 1;
+int sysctl_perf_event_paranoid __read_mostly = 2;
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -376,8 +378,11 @@ static void update_perf_cpu_limits(void)
u64 tmp = perf_sample_period_ns;
tmp *= sysctl_perf_cpu_time_max_percent;
- do_div(tmp, 100);
- ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
+ tmp = div_u64(tmp, 100);
+ if (!tmp)
+ tmp = 1;
+
+ WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -409,7 +414,14 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
- update_perf_cpu_limits();
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0) {
+ printk(KERN_WARNING
+ "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
+ WRITE_ONCE(perf_sample_allowed_ns, 0);
+ } else {
+ update_perf_cpu_limits();
+ }
return 0;
}
@@ -423,62 +435,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
+static u64 __report_avg;
+static u64 __report_allowed;
+
static void perf_duration_warn(struct irq_work *w)
{
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- u64 avg_local_sample_len;
- u64 local_samples_len;
-
- local_samples_len = __this_cpu_read(running_sample_length);
- avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
printk_ratelimited(KERN_WARNING
- "perf interrupt took too long (%lld > %lld), lowering "
- "kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns >> 1,
- sysctl_perf_event_sample_rate);
+ "perf: interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ __report_avg, __report_allowed,
+ sysctl_perf_event_sample_rate);
}
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
void perf_sample_event_took(u64 sample_len_ns)
{
- u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
- u64 avg_local_sample_len;
- u64 local_samples_len;
+ u64 max_len = READ_ONCE(perf_sample_allowed_ns);
+ u64 running_len;
+ u64 avg_len;
+ u32 max;
- if (allowed_ns == 0)
+ if (max_len == 0)
return;
- /* decay the counter by 1 average sample */
- local_samples_len = __this_cpu_read(running_sample_length);
- local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
- local_samples_len += sample_len_ns;
- __this_cpu_write(running_sample_length, local_samples_len);
+ /* Decay the counter by 1 average sample. */
+ running_len = __this_cpu_read(running_sample_length);
+ running_len -= running_len/NR_ACCUMULATED_SAMPLES;
+ running_len += sample_len_ns;
+ __this_cpu_write(running_sample_length, running_len);
/*
- * note: this will be biased artifically low until we have
- * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+ * Note: this will be biased artifically low until we have
+ * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
* from having to maintain a count.
*/
- avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
-
- if (avg_local_sample_len <= allowed_ns)
+ avg_len = running_len/NR_ACCUMULATED_SAMPLES;
+ if (avg_len <= max_len)
return;
- if (max_samples_per_tick <= 1)
- return;
+ __report_avg = avg_len;
+ __report_allowed = max_len;
- max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
- sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
- perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+ /*
+ * Compute a throttle threshold 25% below the current duration.
+ */
+ avg_len += avg_len / 4;
+ max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
+ if (avg_len < max)
+ max /= (u32)avg_len;
+ else
+ max = 1;
- update_perf_cpu_limits();
+ WRITE_ONCE(perf_sample_allowed_ns, avg_len);
+ WRITE_ONCE(max_samples_per_tick, max);
+
+ sysctl_perf_event_sample_rate = max * HZ;
+ perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
if (!irq_work_queue(&perf_duration_work)) {
- early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ early_printk("perf: interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns >> 1,
+ __report_avg, __report_allowed,
sysctl_perf_event_sample_rate);
}
}
@@ -1090,6 +1108,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
+ * cred_guard_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -1910,8 +1929,13 @@ event_sched_in(struct perf_event *event,
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
+ WRITE_ONCE(event->oncpu, smp_processor_id());
+ /*
+ * Order event::oncpu write to happen before the ACTIVE state
+ * is visible.
+ */
+ smp_wmb();
+ WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2343,6 +2367,112 @@ void perf_event_enable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_enable);
+struct stop_event_data {
+ struct perf_event *event;
+ unsigned int restart;
+};
+
+static int __perf_event_stop(void *info)
+{
+ struct stop_event_data *sd = info;
+ struct perf_event *event = sd->event;
+
+ /* if it's already INACTIVE, do nothing */
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another CPU's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ /*
+ * May race with the actual stop (through perf_pmu_output_stop()),
+ * but it is only used for events with AUX ring buffer, and such
+ * events will refuse to restart because of rb::aux_mmap_count==0,
+ * see comments in perf_aux_output_begin().
+ *
+ * Since this is happening on a event-local CPU, no trace is lost
+ * while restarting.
+ */
+ if (sd->restart)
+ event->pmu->start(event, PERF_EF_START);
+
+ return 0;
+}
+
+static int perf_event_restart(struct perf_event *event)
+{
+ struct stop_event_data sd = {
+ .event = event,
+ .restart = 1,
+ };
+ int ret = 0;
+
+ do {
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * We only want to restart ACTIVE events, so if the event goes
+ * inactive here (event->oncpu==-1), there's nothing more to do;
+ * fall through with ret==-ENXIO.
+ */
+ ret = cpu_function_call(READ_ONCE(event->oncpu),
+ __perf_event_stop, &sd);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
+/*
+ * In order to contain the amount of racy and tricky in the address filter
+ * configuration management, it is a two part process:
+ *
+ * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
+ * we update the addresses of corresponding vmas in
+ * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * (p2) when an event is scheduled in (pmu::add), it calls
+ * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
+ * if the generation has changed since the previous call.
+ *
+ * If (p1) happens while the event is active, we restart it to force (p2).
+ *
+ * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
+ * pre-existing mappings, called once when new filters arrive via SET_FILTER
+ * ioctl;
+ * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
+ * registered mapping, called for every new mmap(), with mm::mmap_sem down
+ * for reading;
+ * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
+ * of exec.
+ */
+void perf_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock(&ifh->lock);
+ if (event->addr_filters_gen != event->hw.addr_filters_gen) {
+ event->pmu->addr_filters_sync(event);
+ event->hw.addr_filters_gen = event->addr_filters_gen;
+ }
+ raw_spin_unlock(&ifh->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
@@ -2402,14 +2532,24 @@ static void ctx_sched_out(struct perf_event_context *ctx,
cpuctx->task_ctx = NULL;
}
- is_active ^= ctx->is_active; /* changed bits */
-
+ /*
+ * Always update time if it was set; not only when it changes.
+ * Otherwise we can 'forget' to update time for any but the last
+ * context we sched out. For example:
+ *
+ * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
+ * ctx_sched_out(.event_type = EVENT_PINNED)
+ *
+ * would only update time for the pinned events.
+ */
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
}
+ is_active ^= ctx->is_active; /* changed bits */
+
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
@@ -3182,16 +3322,6 @@ out:
put_ctx(clone_ctx);
}
-void perf_event_exec(void)
-{
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn)
- perf_event_enable_on_exec(ctxn);
- rcu_read_unlock();
-}
-
struct perf_read_data {
struct perf_event *event;
bool group;
@@ -3395,7 +3525,6 @@ static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
- int err;
rcu_read_lock();
if (!vpid)
@@ -3409,16 +3538,7 @@ find_lively_task_by_vpid(pid_t vpid)
if (!task)
return ERR_PTR(-ESRCH);
- /* Reuse ptrace permission checks for now. */
- err = -EACCES;
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto errout;
-
return task;
-errout:
- put_task_struct(task);
- return ERR_PTR(err);
-
}
/*
@@ -3703,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
+static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head);
+
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3730,6 +3853,8 @@ static void _free_event(struct perf_event *event)
}
perf_event_free_bpf_prog(event);
+ perf_addr_filters_splice(event, NULL);
+ kfree(event->addr_filters_offs);
if (event->destroy)
event->destroy(event);
@@ -4210,6 +4335,14 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(ctx->pmu);
+ /*
+ * We could be throttled; unthrottle now to avoid the tick
+ * trying to unthrottle while we already re-started the event.
+ */
+ if (event->hw.interrupts == MAX_INTERRUPTS) {
+ event->hw.interrupts = 0;
+ perf_log_throttle(event, 1);
+ }
event->pmu->stop(event, PERF_EF_UPDATE);
}
@@ -4318,6 +4451,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb || !rb->nr_pages) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rb_toggle_paused(rb, !!arg);
+ rcu_read_unlock();
+ return 0;
+ }
default:
return -ENOTTY;
}
@@ -4634,6 +4780,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
event->pmu->event_mapped(event);
}
+static void perf_pmu_output_stop(struct perf_event *event);
+
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
@@ -4661,10 +4809,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ /*
+ * Stop all AUX events that are writing to this buffer,
+ * so that we can free its AUX pages and corresponding PMU
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ */
+ perf_pmu_output_stop(event);
+
+ /* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ /* this has to be the last one */
rb_free_aux(rb);
+ WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
mutex_unlock(&event->mmap_mutex);
}
@@ -5605,9 +5765,13 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void __always_inline
+__perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs,
+ int (*output_begin)(struct perf_output_handle *,
+ struct perf_event *,
+ unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5617,7 +5781,7 @@ void perf_event_output(struct perf_event *event,
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ if (output_begin(&handle, event, header.size))
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -5628,6 +5792,30 @@ exit:
rcu_read_unlock();
}
+/* Write a sample, reserving buffer space with perf_output_begin_forward(). */
+void
+perf_event_output_forward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_forward);
+}
+
+/* Write a sample, reserving buffer space with perf_output_begin_backward(). */
+void
+perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+/* Write a sample, reserving buffer space with perf_output_begin(). */
+void
+perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin);
+}
+
/*
* read event_id
*/
@@ -5673,15 +5861,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
static void
perf_event_aux_ctx(struct perf_event_context *ctx,
perf_event_aux_output_cb output,
- void *data)
+ void *data, bool all)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- continue;
- if (!event_filter_match(event))
- continue;
+ if (!all) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ }
+
output(event, data);
}
}
@@ -5692,7 +5883,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
{
rcu_read_lock();
preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data);
+ perf_event_aux_ctx(task_ctx, output, data, false);
preempt_enable();
rcu_read_unlock();
}
@@ -5722,13 +5913,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
ctxn = pmu->task_ctx_nr;
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data);
+ perf_event_aux_ctx(ctx, output, data, false);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
@@ -5736,6 +5927,134 @@ next:
}
/*
+ * Clear all file-based filters at exec, they'll have to be
+ * re-instated when/if these objects are mmapped again.
+ */
+/* @data is unused; the signature matches perf_event_aux_output_cb. */
+static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct perf_addr_filter *filter;
+ unsigned int restart = 0, count = 0;
+ unsigned long flags;
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ /* count is the filter's list position and its index in addr_filters_offs */
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (filter->inode) {
+ event->addr_filters_offs[count] = 0;
+ restart++;
+ }
+
+ count++;
+ }
+
+ /* only bump the generation if at least one offset was cleared */
+ if (restart)
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ /* the restart is issued after the filter lock has been dropped */
+ if (restart)
+ perf_event_restart(event);
+}
+
+/*
+ * For each of current's task contexts that exists: run
+ * perf_event_enable_on_exec() and then clear the file-based address
+ * filters of every event in that context.
+ */
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctxn);
+
+ /* all == true: visit events regardless of state or filter match */
+ perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+ true);
+ }
+ rcu_read_unlock();
+}
+
+/* Argument block passed to __perf_event_output_stop(). */
+struct remote_output {
+ struct ring_buffer *rb; /* buffer whose writers are to be stopped */
+ int err; /* result of the most recent __perf_event_stop() */
+};
+
+/*
+ * perf_event_aux_ctx() callback: stop @event if it has AUX data and the
+ * ring buffer of the event (or of its parent, for inherited events) is the
+ * one named in @data (a struct remote_output).
+ */
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+ struct perf_event *parent = event->parent;
+ struct remote_output *ro = data;
+ struct ring_buffer *rb = ro->rb;
+ struct stop_event_data sd = {
+ .event = event,
+ };
+
+ if (!has_aux(event))
+ return;
+
+ if (!parent)
+ parent = event;
+
+ /*
+ * In case of inheritance, it will be the parent that links to the
+ * ring-buffer, but it will be the child that's actually using it:
+ */
+ if (rcu_dereference(parent->rb) == rb)
+ ro->err = __perf_event_stop(&sd);
+}
+
+/*
+ * Cross-call target: stop every AUX event on this CPU that writes to the
+ * ring buffer of the event passed in @info.
+ *
+ * Both the CPU context and, if one is installed, the task context are
+ * walked. Returns the status left in remote_output::err by
+ * __perf_event_output_stop(), or 0 if no event matched.
+ */
+static int __perf_pmu_output_stop(void *info)
+{
+	struct perf_event *event = info;
+	struct pmu *pmu = event->pmu;
+	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct remote_output ro = {
+		.rb	= event->rb,
+	};
+
+	rcu_read_lock();
+	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+	if (cpuctx->task_ctx)
+		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+				   &ro, false);
+	rcu_read_unlock();
+
+	/* pair with get_cpu_ptr() above so the preempt count stays balanced */
+	put_cpu_ptr(pmu->pmu_cpu_context);
+
+	return ro.err;
+}
+
+/*
+ * Stop all events attached to @event's ring buffer by running
+ * __perf_pmu_output_stop() on the CPU of each event on the buffer's list.
+ */
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int err, cpu;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+ /*
+ * For per-CPU events, we need to make sure that neither they
+ * nor their children are running; for cpu==-1 events it's
+ * sufficient to stop the event itself if it's active, since
+ * it can't have children.
+ */
+ cpu = iter->cpu;
+ if (cpu == -1)
+ cpu = READ_ONCE(iter->oncpu);
+
+ /* still -1: the event is not on any CPU right now, nothing to stop */
+ if (cpu == -1)
+ continue;
+
+ err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+ /* -EAGAIN: drop the RCU read lock and rescan the list from the top */
+ if (err == -EAGAIN) {
+ rcu_read_unlock();
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
* task tracking -- fork/exit
*
* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
@@ -6144,6 +6463,87 @@ got_name:
kfree(buf);
}
+/*
+ * Tell whether @filter still has to be resolved against a mapping: this is
+ * the case when its 'filter' flag is set and it is bound to an inode, i.e.
+ * it depends on a dynamic object which is not loaded yet or whose load
+ * addresses are not known.
+ */
+static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
+{
+	if (!filter->filter)
+		return false;
+
+	return filter->inode != NULL;
+}
+
+/*
+ * Return true when @file is backed by the filter's inode and the file
+ * range [@offset, @offset + @size] touches the filter's
+ * [filter->offset, filter->offset + filter->size] range.
+ */
+static bool perf_addr_filter_match(struct perf_addr_filter *filter,
+				     struct file *file, unsigned long offset,
+				     unsigned long size)
+{
+	unsigned long map_end = offset + size;
+	unsigned long filter_end = filter->offset + filter->size;
+
+	return filter->inode == file->f_inode &&
+	       filter->offset <= map_end &&
+	       filter_end >= offset;
+}
+
+/*
+ * perf_event_aux_ctx() callback: @data is the vma being considered. For every
+ * filter of @event that matches the vma's file range, record the vma's start
+ * address as the filter's offset; if anything changed, bump the generation
+ * and restart the event.
+ */
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct vm_area_struct *vma = data;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
+ struct file *file = vma->vm_file;
+ struct perf_addr_filter *filter;
+ unsigned int restart = 0, count = 0;
+
+ if (!has_addr_filter(event))
+ return;
+
+ /* vmas without a backing file are ignored */
+ if (!file)
+ return;
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ /* count is the filter's list position and its index in addr_filters_offs */
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (perf_addr_filter_match(filter, file, off,
+ vma->vm_end - vma->vm_start)) {
+ event->addr_filters_offs[count] = vma->vm_start;
+ restart++;
+ }
+
+ count++;
+ }
+
+ if (restart)
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ /* the restart is issued after the filter lock has been dropped */
+ if (restart)
+ perf_event_restart(event);
+}
+
+/*
+ * Adjust all task's events' filters to the new vma
+ *
+ * Walks every task context of current and runs __perf_addr_filters_adjust()
+ * on each event in it, whatever the event's state.
+ */
+static void perf_addr_filters_adjust(struct vm_area_struct *vma)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (!ctx)
+ continue;
+
+ perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ }
+ rcu_read_unlock();
+}
+
void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
@@ -6175,6 +6575,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .flags (attr_mmap2 only) */
};
+ perf_addr_filters_adjust(vma);
perf_event_mmap_event(&mmap_event);
}
@@ -6466,10 +6867,7 @@ static int __perf_event_overflow(struct perf_event *event,
irq_work_queue(&event->pending);
}
- if (event->overflow_handler)
- event->overflow_handler(event, data, regs);
- else
- perf_event_output(event, data, regs);
+ event->overflow_handler(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
@@ -6702,7 +7100,7 @@ int perf_swevent_get_recursion_context(void)
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
-inline void perf_swevent_put_recursion_context(int rctx)
+void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -6964,7 +7362,26 @@ static int perf_tp_event_match(struct perf_event *event,
return 1;
}
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+/*
+ * Run the BPF program attached to @call, if any, then hand the record to
+ * perf_tp_event(). The record is dropped, and the recursion context @rctx
+ * released here, when the program returns 0 or @head is empty.
+ */
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+ struct trace_event_call *call, u64 count,
+ struct pt_regs *regs, struct hlist_head *head,
+ struct task_struct *task)
+{
+ struct bpf_prog *prog = call->prog;
+
+ if (prog) {
+ /* store @regs at the start of the record before invoking the program */
+ *(struct pt_regs **)raw_data = regs;
+ if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ perf_swevent_put_recursion_context(rctx);
+ return;
+ }
+ }
+ /* NOTE(review): @rctx is passed on; presumably perf_tp_event() releases it - confirm */
+ perf_tp_event(call->event.type, count, raw_data, size, regs, head,
+ rctx, task);
+}
+EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
{
@@ -6976,9 +7393,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
.data = record,
};
- perf_sample_data_init(&data, addr, 0);
+ perf_sample_data_init(&data, 0, 0);
data.raw = &raw;
+ perf_trace_buf_update(record, event_type);
+
hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
@@ -7056,24 +7475,6 @@ static inline void perf_tp_register(void)
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- char *filter_str;
- int ret;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- filter_str = strndup_user(arg, PAGE_SIZE);
- if (IS_ERR(filter_str))
- return PTR_ERR(filter_str);
-
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
- kfree(filter_str);
- return ret;
-}
-
static void perf_event_free_filter(struct perf_event *event)
{
ftrace_profile_free_filter(event);
@@ -7081,6 +7482,7 @@ static void perf_event_free_filter(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
+ bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -7089,20 +7491,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->tp_event->prog)
return -EEXIST;
- if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
- /* bpf programs can only be attached to u/kprobes */
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+ if (!is_kprobe && !is_tracepoint)
+ /* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
prog = bpf_prog_get(prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->type != BPF_PROG_TYPE_KPROBE) {
+ if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
+ if (is_tracepoint) {
+ int off = trace_event_get_offsets(event->tp_event);
+
+ if (prog->aux->max_ctx_offset > off) {
+ bpf_prog_put(prog);
+ return -EACCES;
+ }
+ }
event->tp_event->prog = prog;
return 0;
@@ -7128,11 +7541,6 @@ static inline void perf_tp_register(void)
{
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- return -ENOENT;
-}
-
static void perf_event_free_filter(struct perf_event *event)
{
}
@@ -7161,6 +7569,387 @@ void perf_bp_event(struct perf_event *bp, void *data)
#endif
/*
+ * Allocate a new address filter
+ */
+/*
+ * The filter is zero-initialized and appended to @filters. Returns the new
+ * filter, or NULL if the allocation failed.
+ */
+static struct perf_addr_filter *
+perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
+{
+ /* events not bound to a CPU (cpu == -1) allocate on CPU 0's node */
+ int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
+ struct perf_addr_filter *filter;
+
+ filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
+ if (!filter)
+ return NULL;
+
+ INIT_LIST_HEAD(&filter->entry);
+ list_add_tail(&filter->entry, filters);
+
+ return filter;
+}
+
+/*
+ * Release every filter on @filters: drop the inode reference if one is
+ * held, unlink the filter from the list and free it.
+ */
+static void free_filters_list(struct list_head *filters)
+{
+ struct perf_addr_filter *filter, *iter;
+
+ /* _safe variant: entries are deleted while the list is being walked */
+ list_for_each_entry_safe(filter, iter, filters, entry) {
+ if (filter->inode)
+ iput(filter->inode);
+ list_del(&filter->entry);
+ kfree(filter);
+ }
+}
+
+/*
+ * Free existing address filters and optionally install new ones
+ *
+ * @head may be NULL, in which case the event is simply left without filters.
+ * Otherwise the entries of @head are moved onto the event's filter list.
+ */
+static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head)
+{
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ if (!has_addr_filter(event))
+ return;
+
+ /* don't bother with children, they don't have their own filters */
+ if (event->parent)
+ return;
+
+ raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
+
+ /* move the old filters to a private list so they can be freed unlocked */
+ list_splice_init(&event->addr_filters.list, &list);
+ if (head)
+ list_splice(head, &event->addr_filters.list);
+
+ raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
+
+ free_filters_list(&list);
+}
+
+/*
+ * Walk the vma list of @mm looking for the first file-backed mapping that
+ * matches @filter. Returns that mapping's start address, or 0 if no
+ * mapping matches.
+ * Called with mm::mmap_sem down for reading.
+ */
+static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
+					    struct mm_struct *mm)
+{
+	struct vm_area_struct *vma = mm->mmap;
+
+	while (vma) {
+		if (vma->vm_file &&
+		    perf_addr_filter_match(filter, vma->vm_file,
+					   vma->vm_pgoff << PAGE_SHIFT,
+					   vma->vm_end - vma->vm_start))
+			return vma->vm_start;
+
+		vma = vma->vm_next;
+	}
+
+	return 0;
+}
+
+/*
+ * Update event's address range filters based on the
+ * task's existing mappings, if any.
+ *
+ * Every offset is reset to 0 first; filters for which
+ * perf_addr_filter_needs_mmap() holds are then resolved against the task's
+ * vmas and the filter generation is bumped. If the task has no mm the
+ * offsets are left untouched. In both cases the event is restarted.
+ */
+static void perf_event_addr_filters_apply(struct perf_event *event)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct task_struct *task = READ_ONCE(event->ctx->task);
+	struct perf_addr_filter *filter;
+	struct mm_struct *mm = NULL;
+	unsigned int count = 0;
+	unsigned long flags;
+
+	/*
+	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
+	 * will stop on the parent's child_mutex that our caller is also holding
+	 */
+	if (task == TASK_TOMBSTONE)
+		return;
+
+	/*
+	 * Use the snapshot that was just validated; re-reading
+	 * event->ctx->task here could observe TASK_TOMBSTONE after the check.
+	 */
+	mm = get_task_mm(task);
+	if (!mm)
+		goto restart;
+
+	down_read(&mm->mmap_sem);
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	/* count is the filter's list position and its index in addr_filters_offs */
+	list_for_each_entry(filter, &ifh->list, entry) {
+		event->addr_filters_offs[count] = 0;
+
+		if (perf_addr_filter_needs_mmap(filter))
+			event->addr_filters_offs[count] =
+				perf_addr_filter_apply(filter, mm);
+
+		count++;
+	}
+
+	event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	up_read(&mm->mmap_sem);
+
+	mmput(mm);
+
+restart:
+	perf_event_restart(event);
+}
+
+/*
+ * Address range filtering: limiting the data to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ascii strings.
+ *
+ * Filter string format:
+ *
+ * ACTION RANGE_SPEC
+ * where ACTION is one of the
+ * * "filter": limit the trace to this region
+ * * "start": start tracing from this address
+ * * "stop": stop tracing at this address/region;
+ * RANGE_SPEC is
+ * * for kernel addresses: <start address>[/<size>]
+ * * for object files: <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified, the range is treated as a single address.
+ */
+/* Token ids returned by match_token() for the entries of if_tokens. */
+enum {
+ IF_ACT_FILTER,
+ IF_ACT_START,
+ IF_ACT_STOP,
+ IF_SRC_FILE,
+ IF_SRC_KERNEL,
+ IF_SRC_FILEADDR,
+ IF_SRC_KERNELADDR,
+};
+
+/* Parser states: expecting an ACTION, expecting a RANGE_SPEC, definition done. */
+enum {
+ IF_STATE_ACTION = 0,
+ IF_STATE_SOURCE,
+ IF_STATE_END,
+};
+
+/* Action keywords, followed by the four RANGE_SPEC shapes. */
+static const match_table_t if_tokens = {
+ { IF_ACT_FILTER, "filter" },
+ { IF_ACT_START, "start" },
+ { IF_ACT_STOP, "stop" },
+ { IF_SRC_FILE, "%u/%u@%s" },
+ { IF_SRC_KERNEL, "%u/%u" },
+ { IF_SRC_FILEADDR, "%u@%s" },
+ { IF_SRC_KERNELADDR, "%u" },
+};
+
+/*
+ * Address filter string parser
+ *
+ * Splits a private copy of @fstr on spaces, commas and newlines and runs a
+ * small state machine over the tokens: each filter definition is an ACTION
+ * token followed by a RANGE_SPEC token. Every definition becomes one
+ * perf_addr_filter appended to @filters.
+ *
+ * Returns 0 on success. On any failure a negative errno is returned and
+ * @filters has been emptied with free_filters_list().
+ */
+static int
+perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
+			     struct list_head *filters)
+{
+	struct perf_addr_filter *filter = NULL;
+	char *start, *orig, *filename = NULL;
+	struct path path;
+	substring_t args[MAX_OPT_ARGS];
+	int state = IF_STATE_ACTION, token;
+	unsigned int kernel = 0;
+	int ret = -EINVAL;
+
+	/* strsep() and the NUL-termination below modify the string */
+	orig = fstr = kstrdup(fstr, GFP_KERNEL);
+	if (!fstr)
+		return -ENOMEM;
+
+	while ((start = strsep(&fstr, " ,\n")) != NULL) {
+		ret = -EINVAL;
+
+		if (!*start)
+			continue;
+
+		/* filter definition begins */
+		if (state == IF_STATE_ACTION) {
+			filter = perf_addr_filter_new(event, filters);
+			if (!filter)
+				goto fail;
+		}
+
+		token = match_token(start, if_tokens, args);
+		switch (token) {
+		case IF_ACT_FILTER:
+		case IF_ACT_START:
+			filter->filter = 1;
+			/* fall through */
+		case IF_ACT_STOP:
+			if (state != IF_STATE_ACTION)
+				goto fail;
+
+			state = IF_STATE_SOURCE;
+			break;
+
+		case IF_SRC_KERNELADDR:
+		case IF_SRC_KERNEL:
+			kernel = 1;
+			/* fall through */
+		case IF_SRC_FILEADDR:
+		case IF_SRC_FILE:
+			if (state != IF_STATE_SOURCE)
+				goto fail;
+
+			if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
+				filter->range = 1;
+
+			*args[0].to = 0;
+			ret = kstrtoul(args[0].from, 0, &filter->offset);
+			if (ret)
+				goto fail;
+
+			if (filter->range) {
+				*args[1].to = 0;
+				ret = kstrtoul(args[1].from, 0, &filter->size);
+				if (ret)
+					goto fail;
+			}
+
+			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+				/*
+				 * "%u/%u@%s" carries the path in args[2],
+				 * "%u@%s" carries it in args[1].
+				 */
+				int fpos = filter->range ? 2 : 1;
+
+				filename = match_strdup(&args[fpos]);
+				if (!filename) {
+					ret = -ENOMEM;
+					goto fail;
+				}
+			}
+
+			state = IF_STATE_END;
+			break;
+
+		default:
+			goto fail;
+		}
+
+		/*
+		 * Filter definition is fully parsed, validate and install it.
+		 * Make sure that it doesn't contradict itself or the event's
+		 * attribute.
+		 */
+		if (state == IF_STATE_END) {
+			/*
+			 * kstrtoul() left ret at 0; reset it so that a
+			 * rejected filter is not reported as success.
+			 */
+			ret = -EINVAL;
+			if (kernel && event->attr.exclude_kernel)
+				goto fail;
+
+			if (!kernel) {
+				if (!filename)
+					goto fail;
+
+				/* look up the path and grab its inode */
+				ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+				if (ret)
+					goto fail;
+
+				filter->inode = igrab(d_inode(path.dentry));
+				path_put(&path);
+				kfree(filename);
+				filename = NULL;
+
+				ret = -EINVAL;
+				if (!filter->inode ||
+				    !S_ISREG(filter->inode->i_mode))
+					/* free_filters_list() will iput() */
+					goto fail;
+			}
+
+			/* ready to consume more filters */
+			state = IF_STATE_ACTION;
+			filter = NULL;
+			/* don't let a kernel filter taint the next definition */
+			kernel = 0;
+		}
+	}
+
+	/* an ACTION without a RANGE_SPEC leaves the definition incomplete */
+	if (state != IF_STATE_ACTION)
+		goto fail;
+
+	kfree(orig);
+
+	return 0;
+
+fail:
+	/* kfree(NULL) is a no-op, so this is safe on every failure path */
+	kfree(filename);
+	free_filters_list(filters);
+	kfree(orig);
+
+	return ret;
+}
+
+/*
+ * Parse @filter_str, have the PMU validate the resulting filters, replace
+ * the event's current filters with them and apply them to the event and
+ * its children. Returns 0 on success or a negative errno; on failure the
+ * event's existing filters are left in place.
+ */
+static int
+perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
+{
+ LIST_HEAD(filters);
+ int ret;
+
+ /*
+ * Since this is called in perf_ioctl() path, we're already holding
+ * ctx::mutex.
+ */
+ lockdep_assert_held(&event->ctx->mutex);
+
+ if (WARN_ON_ONCE(event->parent))
+ return -EINVAL;
+
+ /*
+ * For now, we only support filtering in per-task events; doing so
+ * for CPU-wide events requires additional context switching trickery,
+ * since same object code will be mapped at different virtual
+ * addresses in different processes.
+ */
+ if (!event->ctx->task)
+ return -EOPNOTSUPP;
+
+ /* on failure the parser has already emptied the list */
+ ret = perf_event_parse_addr_filter(event, filter_str, &filters);
+ if (ret)
+ return ret;
+
+ ret = event->pmu->addr_filters_validate(&filters);
+ if (ret) {
+ free_filters_list(&filters);
+ return ret;
+ }
+
+ /* remove existing filters, if any */
+ perf_addr_filters_splice(event, &filters);
+
+ /* install new filters */
+ perf_event_for_each_child(event, perf_event_addr_filters_apply);
+
+ return ret;
+}
+
+/*
+ * Set a filter from the user-supplied string @arg. Tracepoint events (when
+ * CONFIG_EVENT_TRACING is enabled) get an ftrace filter; other events that
+ * have address filters get an address filter. Anything else is -EINVAL.
+ */
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret = -EINVAL;
+
+ /* reject early, before copying the string, if neither kind applies */
+ if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
+ !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
+ !has_addr_filter(event))
+ return -EINVAL;
+
+ /* the copied string is limited to PAGE_SIZE */
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
+ event->attr.type == PERF_TYPE_TRACEPOINT)
+ ret = ftrace_profile_set_filter(event, event->attr.config,
+ filter_str);
+ else if (has_addr_filter(event))
+ ret = perf_event_set_addr_filter(event, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
+/*
* hrtimer based swevent callback
*/
@@ -7517,6 +8306,20 @@ static void free_pmu_context(struct pmu *pmu)
out:
mutex_unlock(&pmus_lock);
}
+
+/*
+ * Let userspace know that this PMU supports address range filtering:
+ * the attribute reads back pmu->nr_addr_filters as a decimal number
+ * followed by a newline.
+ */
+static ssize_t nr_addr_filters_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+
+ return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+}
+DEVICE_ATTR_RO(nr_addr_filters);
+
static struct idr pmu_idr;
static ssize_t
@@ -7618,9 +8421,19 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (ret)
goto free_dev;
+ /* For PMUs with address filters, throw in an extra attribute: */
+ if (pmu->nr_addr_filters)
+ ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
+
+ if (ret)
+ goto del_dev;
+
out:
return ret;
+del_dev:
+ device_del(pmu->dev);
+
free_dev:
put_device(pmu->dev);
goto out;
@@ -7660,6 +8473,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
+ if (pmu->task_ctx_nr == perf_hw_context) {
+ static int hw_context_taken = 0;
+
+ /*
+ * Other than systems with heterogeneous CPUs, it never makes
+ * sense for two PMUs to share perf_hw_context. PMUs which are
+ * uncore must use perf_invalid_context.
+ */
+ if (WARN_ON_ONCE(hw_context_taken &&
+ !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
+ pmu->task_ctx_nr = perf_invalid_context;
+
+ hw_context_taken = 1;
+ }
+
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;
@@ -7747,6 +8575,8 @@ void perf_pmu_unregister(struct pmu *pmu)
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
device_del(pmu->dev);
put_device(pmu->dev);
free_pmu_context(pmu);
@@ -7940,6 +8770,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
+ INIT_LIST_HEAD(&event->addr_filters.list);
INIT_HLIST_NODE(&event->hlist_entry);
@@ -7947,6 +8778,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
+ raw_spin_lock_init(&event->addr_filters.lock);
atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
@@ -7981,8 +8813,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
context = parent_event->overflow_handler_context;
}
- event->overflow_handler = overflow_handler;
- event->overflow_handler_context = context;
+ if (overflow_handler) {
+ event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
+ } else if (is_write_backward(event)){
+ event->overflow_handler = perf_event_output_backward;
+ event->overflow_handler_context = NULL;
+ } else {
+ event->overflow_handler = perf_event_output_forward;
+ event->overflow_handler_context = NULL;
+ }
perf_event__state_init(event);
@@ -8023,11 +8863,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (err)
goto err_pmu;
+ if (has_addr_filter(event)) {
+ event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (!event->addr_filters_offs)
+ goto err_per_task;
+
+ /* force hw sync on the address filters */
+ event->addr_filters_gen = 1;
+ }
+
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
if (err)
- goto err_per_task;
+ goto err_addr_filters;
}
}
@@ -8036,6 +8887,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
return event;
+err_addr_filters:
+ kfree(event->addr_filters_offs);
+
err_per_task:
exclusive_event_destroy(event);
@@ -8215,6 +9069,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
goto out;
/*
+ * Either writing ring buffer from beginning or from end.
+ * Mixing is not allowed.
+ */
+ if (is_write_backward(output_event) != is_write_backward(event))
+ goto out;
+
+ /*
* If both events generate aux data, they must be on the same PMU
*/
if (has_aux(event) && has_aux(output_event) &&
@@ -8380,6 +9241,24 @@ SYSCALL_DEFINE5(perf_event_open,
get_online_cpus();
+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ if (err)
+ goto err_cpus;
+
+ /*
+ * Reuse ptrace permission checks for now.
+ *
+ * We must hold cred_guard_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -8387,7 +9266,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cpus;
+ goto err_cred;
}
if (is_sampling_event(event)) {
@@ -8446,11 +9325,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
- if (task) {
- put_task_struct(task);
- task = NULL;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -8509,6 +9383,7 @@ SYSCALL_DEFINE5(perf_event_open,
f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
+ event_file = NULL;
goto err_context;
}
@@ -8547,6 +9422,11 @@ SYSCALL_DEFINE5(perf_event_open,
WARN_ON_ONCE(ctx->parent_ctx);
+ /*
+ * This is the point of no return; we cannot fail hereafter. This is
+ * where we start modifying current state.
+ */
+
if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
@@ -8618,6 +9498,11 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&gctx->mutex);
mutex_unlock(&ctx->mutex);
+ if (task) {
+ mutex_unlock(&task->signal->cred_guard_mutex);
+ put_task_struct(task);
+ }
+
put_online_cpus();
mutex_lock(&current->perf_event_mutex);
@@ -8650,6 +9535,9 @@ err_alloc:
*/
if (!event_file)
free_event(event);
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->cred_guard_mutex);
err_cpus:
put_online_cpus();
err_task:
@@ -8934,6 +9822,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
{
@@ -9426,10 +10317,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
+ /*
+ * This must be done before the CPU comes alive, because the
+ * moment we can run tasks we can encounter (software) events.
+ *
+ * Specifically, someone can have inherited events on kthreadd
+ * or a pre-existing worker thread that gets re-bound.
+ */
perf_event_init_cpu(cpu);
break;
case CPU_DOWN_PREPARE:
+ /*
+ * This must be done before the CPU dies because after that an
+ * active event might want to IPI the CPU and that'll not work
+ * so great for dead CPUs.
+ *
+ * XXX smp_call_function_single() returns -ENXIO without a warning,
+ * so we could possibly deal with this.
+ *
+ * This is safe against new events arriving because
+ * sys_perf_event_open() serializes against hotplug using
+ * get_online_cpus().
+ */
perf_event_exit_cpu(cpu);
break;
default:
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c1274c..05f9f6d626df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,13 +11,13 @@
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
- struct irq_work irq_work;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
+ int paused; /* can write into ring buffer */
atomic_t poll; /* POLL_ for wakeups */
@@ -65,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
rb_free(rb);
}
+/*
+ * Pause or resume output into @rb.
+ *
+ * The buffer is un-paused only when the caller asks for that (@pause is
+ * false) and the buffer actually has data pages; in every other case it
+ * ends up paused.
+ */
+static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
+{
+	rb->paused = (pause || !rb->nr_pages) ? 1 : 0;
+}
+
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
@@ -182,8 +190,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
extern struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs);
-extern int get_callchain_buffers(void);
-extern void put_callchain_buffers(void);
static inline int get_recursion_context(int *recursion)
{
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 1faad2cfdb9e..ae9b90dc9a5a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -102,8 +102,21 @@ out:
preempt_enable();
}
-int perf_output_begin(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size)
+/*
+ * Report whether a record of @size bytes still fits into a buffer of
+ * @data_size bytes, given the current @head and @tail positions.  For a
+ * backward-written buffer @head and @tail swap roles in the CIRC_SPACE()
+ * computation.
+ */
+static bool __always_inline
+ring_buffer_has_space(unsigned long head, unsigned long tail,
+		      unsigned long data_size, unsigned int size,
+		      bool backward)
+{
+	if (backward)
+		return CIRC_SPACE(tail, head, data_size) >= size;
+
+	return CIRC_SPACE(head, tail, data_size) >= size;
+}
+
+static int __always_inline
+__perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size,
+ bool backward)
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
@@ -125,8 +138,11 @@ int perf_output_begin(struct perf_output_handle *handle,
if (unlikely(!rb))
goto out;
- if (unlikely(!rb->nr_pages))
+ if (unlikely(rb->paused)) {
+ if (rb->nr_pages)
+ local_inc(&rb->lost);
goto out;
+ }
handle->rb = rb;
handle->event = event;
@@ -143,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle,
do {
tail = READ_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
- if (!rb->overwrite &&
- unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
- goto fail;
+ if (!rb->overwrite) {
+ if (unlikely(!ring_buffer_has_space(head, tail,
+ perf_data_size(rb),
+ size, backward)))
+ goto fail;
+ }
/*
* The above forms a control dependency barrier separating the
@@ -159,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle,
* See perf_output_put_handle().
*/
- head += size;
+ if (!backward)
+ head += size;
+ else
+ head -= size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
+ if (backward) {
+ offset = head;
+ head = (u64)(-head);
+ }
+
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
@@ -203,6 +230,26 @@ out:
return -ENOSPC;
}
+/* Start an output transaction that always writes forward. */
+int perf_output_begin_forward(struct perf_output_handle *handle,
+			      struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, false);
+}
+
+/* Start an output transaction that always writes backward. */
+int perf_output_begin_backward(struct perf_output_handle *handle,
+			       struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, true);
+}
+
+/*
+ * Start an output transaction, picking the write direction from the event
+ * itself via is_write_backward().
+ */
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event, unsigned int size)
+{
+	bool backward = unlikely(is_write_backward(event));
+
+	return __perf_output_begin(handle, event, size, backward);
+}
+
unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
@@ -221,8 +268,6 @@ void perf_output_end(struct perf_output_handle *handle)
rcu_read_unlock();
}
-static void rb_irq_work(struct irq_work *work);
-
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
@@ -243,16 +288,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
INIT_LIST_HEAD(&rb->event_list);
spin_lock_init(&rb->event_lock);
- init_irq_work(&rb->irq_work, rb_irq_work);
-}
-static void ring_buffer_put_async(struct ring_buffer *rb)
-{
- if (!atomic_dec_and_test(&rb->refcount))
- return;
-
- rb->rcu_head.next = (void *)rb;
- irq_work_queue(&rb->irq_work);
+ /*
+ * perf_output_begin() only checks rb->paused, therefore
+ * rb->paused must be true if we have no pages for output.
+ */
+ if (!rb->nr_pages)
+ rb->paused = 1;
}
/*
@@ -264,6 +306,10 @@ static void ring_buffer_put_async(struct ring_buffer *rb)
* The ordering is similar to that of perf_output_{begin,end}, with
* the exception of (B), which should be taken care of by the pmu
* driver, since ordering rules will differ depending on hardware.
+ *
+ * Call this from pmu::start(); see the comment in perf_aux_output_end()
+ * about its use in pmu callbacks. Both can also be called from the PMI
+ * handler if needed.
*/
void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *event)
@@ -288,6 +334,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
goto err;
/*
+ * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
+ * the aux buffer is in perf_mmap_close(), about to get freed.
+ */
+ if (!atomic_read(&rb->aux_mmap_count))
+ goto err_put;
+
+ /*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
@@ -328,10 +381,11 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
return handle->rb->aux_priv;
err_put:
+ /* can't be last */
rb_free_aux(rb);
err:
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
handle->event = NULL;
return NULL;
@@ -342,11 +396,16 @@ err:
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
* pmu driver's responsibility to observe ordering rules of the hardware,
* so that all the data is externally visible before this is called.
+ *
+ * Note: this has to be called from pmu::stop() callback, as the assumption
+ * of the AUX buffer management code is that after pmu::stop(), the AUX
+ * transaction must be stopped and therefore drop the AUX reference count.
*/
void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
bool truncated)
{
struct ring_buffer *rb = handle->rb;
+ bool wakeup = truncated;
unsigned long aux_head;
u64 flags = 0;
@@ -375,14 +434,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
- perf_output_wakeup(handle);
+ wakeup = true;
local_add(rb->aux_watermark, &rb->aux_wakeup);
}
+
+ if (wakeup) {
+ if (truncated)
+ handle->event->pending_disable = 1;
+ perf_output_wakeup(handle);
+ }
+
handle->event = NULL;
local_set(&rb->aux_nest, 0);
+ /* can't be last */
rb_free_aux(rb);
- ring_buffer_put_async(rb);
+ ring_buffer_put(rb);
}
/*
@@ -463,6 +530,14 @@ static void __rb_free_aux(struct ring_buffer *rb)
{
int pg;
+ /*
+ * Should never happen, the last reference should be dropped from
+ * perf_mmap_close() path, which first stops aux transactions (which
+ * in turn are the atomic holders of aux_refcount) and then does the
+ * last rb_free_aux().
+ */
+ WARN_ON_ONCE(in_atomic());
+
if (rb->aux_priv) {
rb->free_aux(rb->aux_priv);
rb->free_aux = NULL;
@@ -574,18 +649,7 @@ out:
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
- irq_work_queue(&rb->irq_work);
-}
-
-static void rb_irq_work(struct irq_work *work)
-{
- struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
-
- if (!atomic_read(&rb->aux_refcount))
__rb_free_aux(rb);
-
- if (rb->rcu_head.next == (void *)rb)
- call_rcu(&rb->rcu_head, rb_free_rcu);
}
#ifndef CONFIG_PERF_USE_VMALLOC
@@ -746,8 +810,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
- rb->page_order = ilog2(nr_pages);
- rb->nr_pages = !!nr_pages;
+ if (nr_pages) {
+ rb->nr_pages = 1;
+ rb->page_order = ilog2(nr_pages);
+ }
ring_buffer_init(rb, watermark, flags);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 5f6ce931f1ea..7edc95edfaee 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -299,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
retry:
/* Read the page with vaddr into memory */
- ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
+ ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma);
if (ret <= 0)
return ret;
@@ -321,7 +321,7 @@ retry:
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = __replace_page(vma, vaddr, old_page, new_page);
- page_cache_release(new_page);
+ put_page(new_page);
put_old:
put_page(old_page);
@@ -539,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
* see uprobe_register().
*/
if (mapping->a_ops->readpage)
- page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
else
- page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
copy_from_page(page, offset, insn, nbytes);
- page_cache_release(page);
+ put_page(page);
return 0;
}
@@ -1701,7 +1701,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
- result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
+ /*
+ * The NULL 'tsk' here ensures that any faults that occur here
+ * will not be accounted to the task. 'mm' *is* current->mm,
+ * but we treat this as a 'remote' access since it is
+ * essentially a kernel access to the memory.
+ */
+ result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL);
if (result < 0)
return result;
diff --git a/kernel/exit.c b/kernel/exit.c
index 10e088237fed..fd90195667e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,7 @@
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
+#include <linux/kcov.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -434,7 +435,7 @@ static void exit_mm(struct task_struct *tsk)
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
- exit_oom_victim();
+ exit_oom_victim(tsk);
}
static struct task_struct *find_alive_thread(struct task_struct *p)
@@ -655,6 +656,7 @@ void do_exit(long code)
TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
+ kcov_task_exit(tsk);
WARN_ON(blk_needs_flush_plug(tsk));
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e391c754ae7..3e8451527cbe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,6 +75,7 @@
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
+#include <linux/kcov.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -164,12 +165,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
+ if (page)
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ 1 << THREAD_SIZE_ORDER);
+
return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
{
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ struct page *page = virt_to_page(ti);
+
+ memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
+ -(1 << THREAD_SIZE_ORDER));
+ __free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -384,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
account_kernel_stack(ti, 1);
+ kcov_task_init(tsk);
+
return tsk;
free_ti:
@@ -1483,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* sigaltstack should be cleared when sharing the same VM
*/
if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
- p->sas_ss_sp = p->sas_ss_size = 0;
+ sas_ss_reset(p);
/*
* Syscall tracing and stepping should be turned off in the
@@ -1884,7 +1895,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
diff --git a/kernel/futex.c b/kernel/futex.c
index a5d2e74c89e0..c20f06f38ef3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1295,10 +1295,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (unlikely(should_fail_futex(true)))
ret = -EFAULT;
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
- else if (curval != uval)
- ret = -EINVAL;
+ } else if (curval != uval) {
+ /*
+ * If an unconditional UNLOCK_PI operation (user space did not
+ * try the TID->0 transition) raced with a waiter setting the
+ * FUTEX_WAITERS flag between get_user() and locking the hash
+ * bucket lock, retry the operation.
+ */
+ if ((FUTEX_TID_MASK & curval) == uval)
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
if (ret) {
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
@@ -1525,8 +1535,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
hb_waiters_dec(hb1);
- plist_add(&q->list, &hb2->chain);
hb_waiters_inc(hb2);
+ plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -2623,6 +2633,15 @@ retry:
if (ret == -EFAULT)
goto pi_faulted;
/*
+ * An unconditional UNLOCK_PI op raced against a waiter
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN) {
+ spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ goto retry;
+ }
+ /*
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index e0f90c2b57aa..d234022805dc 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
rcu_read_unlock();
}
-static unsigned long timeout_jiffies(unsigned long timeout)
+static long hung_timeout_jiffies(unsigned long last_checked,
+ unsigned long timeout)
{
/* timeout of 0 will disable the watchdog */
- return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+ return timeout ? last_checked - jiffies + timeout * HZ :
+ MAX_SCHEDULE_TIMEOUT;
}
/*
@@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector);
*/
static int watchdog(void *dummy)
{
+ unsigned long hung_last_checked = jiffies;
+
set_user_nice(current, 0);
for ( ; ; ) {
unsigned long timeout = sysctl_hung_task_timeout_secs;
+ long t = hung_timeout_jiffies(hung_last_checked, timeout);
- while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
- timeout = sysctl_hung_task_timeout_secs;
-
- if (atomic_xchg(&reset_hung_task, 0))
+ if (t <= 0) {
+ if (!atomic_xchg(&reset_hung_task, 0))
+ check_hung_uninterruptible_tasks(timeout);
+ hung_last_checked = jiffies;
continue;
-
- check_hung_uninterruptible_tasks(timeout);
+ }
+ schedule_timeout_interruptible(t);
}
return 0;
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index c37f34b00a11..c42742208e5e 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -19,9 +19,9 @@
*
* Allocate a virq that can be used to send IPI to any CPU in dest mask.
*
- * On success it'll return linux irq number and 0 on failure
+ * On success it'll return linux irq number and error code on failure
*/
-unsigned int irq_reserve_ipi(struct irq_domain *domain,
+int irq_reserve_ipi(struct irq_domain *domain,
const struct cpumask *dest)
{
unsigned int nr_irqs, offset;
@@ -30,18 +30,18 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
if (!domain ||!irq_domain_is_ipi(domain)) {
pr_warn("Reservation on a non IPI domain\n");
- return 0;
+ return -EINVAL;
}
if (!cpumask_subset(dest, cpu_possible_mask)) {
pr_warn("Reservation is not in possible_cpu_mask\n");
- return 0;
+ return -EINVAL;
}
nr_irqs = cpumask_weight(dest);
if (!nr_irqs) {
pr_warn("Reservation for empty destination mask\n");
- return 0;
+ return -EINVAL;
}
if (irq_domain_is_ipi_single(domain)) {
@@ -72,14 +72,14 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
next = cpumask_next(next, dest);
if (next < nr_cpu_ids) {
pr_warn("Destination mask has holes\n");
- return 0;
+ return -EINVAL;
}
}
virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc descs\n");
- return 0;
+ return -ENOMEM;
}
virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
@@ -94,22 +94,26 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
data = irq_get_irq_data(virq + i);
cpumask_copy(data->common->affinity, dest);
data->common->ipi_offset = offset;
+ irq_set_status_flags(virq + i, IRQ_NO_BALANCING);
}
return virq;
free_descs:
irq_free_descs(virq, nr_irqs);
- return 0;
+ return -EBUSY;
}
/**
* irq_destroy_ipi() - unreserve an IPI that was previously allocated
* @irq: linux irq number to be destroyed
+ * @dest: cpumask of cpus which should have the IPI removed
*
- * Return the IPIs allocated with irq_reserve_ipi() to the system destroying
- * all virqs associated with them.
+ * The IPIs allocated with irq_reserve_ipi() are returned to the system
+ * destroying all virqs associated with them.
+ *
+ * Return 0 on success or error code on failure.
*/
-void irq_destroy_ipi(unsigned int irq)
+int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
@@ -117,7 +121,7 @@ void irq_destroy_ipi(unsigned int irq)
unsigned int nr_irqs;
if (!irq || !data || !ipimask)
- return;
+ return -EINVAL;
domain = data->domain;
if (WARN_ON(domain == NULL))
@@ -125,15 +129,25 @@ void irq_destroy_ipi(unsigned int irq)
if (!irq_domain_is_ipi(domain)) {
pr_warn("Trying to destroy a non IPI domain!\n");
- return;
+ return -EINVAL;
}
- if (irq_domain_is_ipi_per_cpu(domain))
- nr_irqs = cpumask_weight(ipimask);
- else
+ if (WARN_ON(!cpumask_subset(dest, ipimask)))
+ /*
+ * Must be destroying a subset of CPUs to which this IPI
+ * was set up to target
+ */
+ return -EINVAL;
+
+ if (irq_domain_is_ipi_per_cpu(domain)) {
+ irq = irq + cpumask_first(dest) - data->common->ipi_offset;
+ nr_irqs = cpumask_weight(dest);
+ } else {
nr_irqs = 1;
+ }
irq_domain_free_irqs(irq, nr_irqs);
+ return 0;
}
/**
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0ccd028817d7..8731e1c5d1e7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -595,7 +595,8 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
chip_bus_sync_unlock(desc);
}
-int irq_set_percpu_devid(unsigned int irq)
+int irq_set_percpu_devid_partition(unsigned int irq,
+ const struct cpumask *affinity)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -610,10 +611,33 @@ int irq_set_percpu_devid(unsigned int irq)
if (!desc->percpu_enabled)
return -ENOMEM;
+ if (affinity)
+ desc->percpu_affinity = affinity;
+ else
+ desc->percpu_affinity = cpu_possible_mask;
+
irq_set_percpu_devid_flags(irq);
return 0;
}
+int irq_set_percpu_devid(unsigned int irq)
+{
+ return irq_set_percpu_devid_partition(irq, NULL); /* NULL affinity: the partition defaults to cpu_possible_mask */
+}
+
+/*
+ * Look up the per-cpu devid partition of @irq.
+ *
+ * Returns -EINVAL when @irq has no descriptor or was never set up as a
+ * per-cpu devid interrupt, 0 otherwise.  When @affinity is non-NULL the
+ * descriptor's percpu_affinity mask is copied into it.
+ */
+int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc)
+		return -EINVAL;
+	if (!desc->percpu_enabled)
+		return -EINVAL;
+
+	if (!affinity)
+		return 0;
+
+	cpumask_copy(affinity, desc->percpu_affinity);
+	return 0;
+}
+
void kstat_incr_irq_this_cpu(unsigned int irq)
{
kstat_incr_irqs_this_cpu(irq_to_desc(irq));
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3a519a01118b..503c5b9dd030 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -243,14 +243,15 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
/**
- * irq_find_matching_fwnode() - Locates a domain for a given fwnode
- * @fwnode: FW descriptor of the interrupt controller
+ * irq_find_matching_fwspec() - Locates a domain for a given fwspec
+ * @fwspec: FW specifier for an interrupt
* @bus_token: domain-specific data
*/
-struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
+struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
enum irq_domain_bus_token bus_token)
{
struct irq_domain *h, *found = NULL;
+ struct fwnode_handle *fwnode = fwspec->fwnode;
int rc;
/* We might want to match the legacy controller last since
@@ -264,7 +265,9 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
- if (h->ops->match)
+ if (h->ops->select && fwspec->param_count)
+ rc = h->ops->select(h, fwspec, bus_token);
+ else if (h->ops->match)
rc = h->ops->match(h, to_of_node(fwnode), bus_token);
else
rc = ((fwnode != NULL) && (h->fwnode == fwnode) &&
@@ -279,7 +282,7 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
mutex_unlock(&irq_domain_mutex);
return found;
}
-EXPORT_SYMBOL_GPL(irq_find_matching_fwnode);
+EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
/**
* irq_set_default_host() - Set a "default" irq domain
@@ -574,11 +577,9 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
int virq;
if (fwspec->fwnode) {
- domain = irq_find_matching_fwnode(fwspec->fwnode,
- DOMAIN_BUS_WIRED);
+ domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
if (!domain)
- domain = irq_find_matching_fwnode(fwspec->fwnode,
- DOMAIN_BUS_ANY);
+ domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY);
} else {
domain = irq_default_domain;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 64731e84c982..ef0bc02c3a70 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1322,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
- pr_warning("irq %d uses trigger mode %u; requested %u\n",
- irq, nmsk, omsk);
+ pr_warn("irq %d uses trigger mode %u; requested %u\n",
+ irq, nmsk, omsk);
}
*old_ptr = new;
@@ -1407,7 +1407,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
int retval;
struct irq_desc *desc = irq_to_desc(irq);
- if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return -EINVAL;
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..a02f2dddd1d7
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,274 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#define DISABLE_BRANCH_PROFILING
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ * - initial state after open()
+ * - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ * - then, mmap() call (several calls are allowed but not useful)
+ * - then, repeated enable/disable for a task (only one task at a time
+ *   is allowed)
+ */
+struct kcov {
+ /*
+ * Reference counter. We keep one for:
+ * - opened file descriptor
+ * - task with enabled coverage (we can't unwire it from another task)
+ */
+ atomic_t refcount;
+ /* The lock protects mode, size, area and t. */
+ spinlock_t lock;
+ enum kcov_mode mode;
+ /* Size of the coverage area (in unsigned longs for KCOV_MODE_TRACE). */
+ unsigned size;
+ /* Coverage buffer shared with user space. */
+ void *area;
+ /* Task for which we collect coverage, or NULL. */
+ struct task_struct *t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ *
+ * When the current task is in KCOV_MODE_TRACE, appends _RET_IP_ to the
+ * task's coverage area; in any other mode it does nothing.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ enum kcov_mode mode;
+
+ t = current;
+ /*
+ * We are interested in code coverage as a function of syscall inputs,
+ * so we ignore code executed in interrupts.
+ */
+ if (!t || in_interrupt())
+ return;
+ mode = READ_ONCE(t->kcov_mode);
+ if (mode == KCOV_MODE_TRACE) {
+ unsigned long *area;
+ unsigned long pos;
+
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ area = t->kcov_area;
+ /*
+ * The first word is number of subsequent PCs. A new PC is only
+ * stored while its slot is below kcov_size; once the area is
+ * full further PCs are dropped. The PC is written first and the
+ * count in area[0] is updated afterwards.
+ */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = _RET_IP_;
+ WRITE_ONCE(area[0], pos);
+ }
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+ atomic_inc(&kcov->refcount); /* take one reference; released again through kcov_put() */
+}
+
+/*
+ * Drop one reference to @kcov.  When the last reference goes away, free
+ * the coverage area and then the descriptor itself.
+ */
+static void kcov_put(struct kcov *kcov)
+{
+	if (!atomic_dec_and_test(&kcov->refcount))
+		return;
+
+	vfree(kcov->area);
+	kfree(kcov);
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+ t->kcov_mode = KCOV_MODE_DISABLED; /* __sanitizer_cov_trace_pc() only records in KCOV_MODE_TRACE */
+ t->kcov_size = 0; /* the remaining stores clear the per-task cache filled in by KCOV_ENABLE */
+ t->kcov_area = NULL;
+ t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+ struct kcov *kcov;
+
+ kcov = t->kcov;
+ if (kcov == NULL) /* no descriptor attached to this task: nothing to undo */
+ return;
+ spin_lock(&kcov->lock);
+ if (WARN_ON(kcov->t != t)) { /* descriptor and task disagree: warn and leave both untouched */
+ spin_unlock(&kcov->lock);
+ return;
+ }
+ /*
+ * Just to not leave dangling references behind: reset the task's
+ * kcov fields and detach the task from the descriptor.
+ */
+ kcov_task_init(t);
+ kcov->t = NULL;
+ spin_unlock(&kcov->lock);
+ kcov_put(kcov); /* drops the reference taken by KCOV_ENABLE */
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+ int res = 0;
+ void *area;
+ struct kcov *kcov = vma->vm_file->private_data;
+ unsigned long size, off;
+ struct page *page;
+
+ area = vmalloc_user(vma->vm_end - vma->vm_start); /* allocated before the spinlock is taken; freed below if it ends up unused */
+ if (!area)
+ return -ENOMEM;
+
+ spin_lock(&kcov->lock);
+ size = kcov->size * sizeof(unsigned long); /* byte size derived from the count set by KCOV_INIT_TRACE */
+ if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+ vma->vm_end - vma->vm_start != size) { /* requires a prior KCOV_INIT_TRACE, page offset 0 and an exact-length mapping */
+ res = -EINVAL;
+ goto exit;
+ }
+ if (!kcov->area) { /* first successful mmap(): publish the buffer, then map its pages outside the lock */
+ kcov->area = area;
+ vma->vm_flags |= VM_DONTEXPAND;
+ spin_unlock(&kcov->lock);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
+ }
+ return 0;
+ }
+exit: /* also reached with res == 0 when an area already exists: that mapping gets no pages inserted */
+ spin_unlock(&kcov->lock);
+ vfree(area);
+ return res;
+}
+
+/*
+ * open() handler: allocate a zeroed descriptor that starts out with a
+ * single reference (dropped again in kcov_close()) and attach it to the
+ * file.  Returns -ENOMEM if the allocation fails, otherwise the result
+ * of nonseekable_open().
+ */
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+	struct kcov *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+
+	if (desc == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&desc->lock);
+	atomic_set(&desc->refcount, 1);
+	filep->private_data = desc;
+
+	return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+ kcov_put(filep->private_data); /* drops the reference set up in kcov_open(); a task with coverage enabled holds its own */
+ return 0;
+}
+
+/*
+ * Execute one kcov ioctl command on @kcov.  kcov_ioctl() calls this with
+ * kcov->lock held.
+ *
+ * @cmd: KCOV_INIT_TRACE, KCOV_ENABLE or KCOV_DISABLE.
+ * @arg: buffer size in unsigned longs for KCOV_INIT_TRACE; must be 0 for
+ *       the other two commands.
+ *
+ * Returns 0 on success, -EBUSY when the descriptor is already initialized
+ * or when the descriptor or the calling task is already collecting
+ * coverage, -EINVAL for a bad argument or a call made in the wrong state,
+ * and -ENOTTY for an unknown command.
+ */
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct task_struct *t;
+	unsigned long size, unused;
+
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 */
+		if (kcov->mode != KCOV_MODE_DISABLED)
+			return -EBUSY;
+		/*
+		 * Size must be at least 2 to hold current position and one PC.
+		 * Later we allocate size * sizeof(unsigned long) memory,
+		 * that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_TRACE;
+		return 0;
+	case KCOV_ENABLE:
+		/*
+		 * Enable coverage for the current task.
+		 * At this point user must have enabled trace mode,
+		 * and mmapped the file. Coverage collection is disabled only
+		 * at task exit or voluntarily by KCOV_DISABLE. After that it
+		 * can be enabled for another task.
+		 */
+		unused = arg;
+		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
+		    kcov->area == NULL)
+			return -EINVAL;
+		t = current;
+		/*
+		 * Refuse a second association on either side.  Without the
+		 * t->kcov check a task that already collects coverage could
+		 * enable another descriptor: t->kcov would be overwritten and
+		 * the first descriptor would keep its reference and its ->t
+		 * pointer with nothing left to undo them.
+		 */
+		if (kcov->t != NULL || t->kcov != NULL)
+			return -EBUSY;
+		/* Cache in task struct for performance. */
+		t->kcov_size = kcov->size;
+		t->kcov_area = kcov->area;
+		/* See comment in __sanitizer_cov_trace_pc(). */
+		barrier();
+		WRITE_ONCE(t->kcov_mode, kcov->mode);
+		t->kcov = kcov;
+		kcov->t = t;
+		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+		kcov_get(kcov);
+		return 0;
+	case KCOV_DISABLE:
+		/* Disable coverage for the current task. */
+		unused = arg;
+		if (unused != 0 || current->kcov != kcov)
+			return -EINVAL;
+		t = current;
+		if (WARN_ON(kcov->t != t))
+			return -EINVAL;
+		kcov_task_init(t);
+		kcov->t = NULL;
+		/* Drop the reference taken by KCOV_ENABLE. */
+		kcov_put(kcov);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+/*
+ * unlocked_ioctl handler: run the command through kcov_ioctl_locked()
+ * while holding the descriptor's spinlock and hand back its result.
+ */
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct kcov *kcov = filep->private_data;
+	int ret;
+
+	spin_lock(&kcov->lock);
+	ret = kcov_ioctl_locked(kcov, cmd, arg);
+	spin_unlock(&kcov->lock);
+
+	return ret;
+}
+
+static const struct file_operations kcov_fops = {
+ .open = kcov_open, /* allocates the per-file descriptor */
+ .unlocked_ioctl = kcov_ioctl, /* KCOV_INIT_TRACE / KCOV_ENABLE / KCOV_DISABLE */
+ .mmap = kcov_mmap, /* maps the coverage buffer */
+ .release = kcov_close, /* drops the file's descriptor reference */
+};
+
+/*
+ * Create the "kcov" file (mode 0600) with a NULL parent in debugfs.
+ * Returns 0 on success and -ENOMEM, after logging an error, when
+ * debugfs_create_file() returns NULL.
+ */
+static int __init kcov_init(void)
+{
+	if (debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops))
+		return 0;
+
+	pr_err("failed to create kcov in debugfs\n");
+	return -ENOMEM;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 8d34308ea449..1391d3ee3b86 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1415,6 +1415,9 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(page, lru);
VMCOREINFO_OFFSET(page, _mapcount);
VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(page, compound_dtor);
+ VMCOREINFO_OFFSET(page, compound_order);
+ VMCOREINFO_OFFSET(page, compound_head);
VMCOREINFO_OFFSET(pglist_data, node_zones);
VMCOREINFO_OFFSET(pglist_data, nr_zones);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1447,8 +1450,8 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_X86
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
#endif
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
+#ifdef CONFIG_HUGETLB_PAGE
+ VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
#endif
arch_crash_save_vmcoreinfo();
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8e96f6cc2a4a..31322a4275cd 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 53ab2f85d77e..81f1a7107c0e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -45,6 +45,7 @@
#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/kmemcheck.h>
+#include <linux/random.h>
#include <asm/sections.h>
@@ -708,7 +709,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
* yet. Otherwise we look it up. We cache the result in the lock object
* itself, so actual lookup of the hash should be once per lock object.
*/
-static inline struct lock_class *
+static struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
@@ -1999,6 +2000,79 @@ static inline int get_first_held_lock(struct task_struct *curr,
return ++i;
}
+#ifdef CONFIG_DEBUG_LOCKDEP
+/*
+ * Returns the next chain_key iteration
+ */
+static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+{
+ u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+
+ printk(" class_idx:%d -> chain_key:%016Lx",
+ class_idx,
+ (unsigned long long)new_chain_key);
+ return new_chain_key;
+}
+
+static void
+print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
+{
+ struct held_lock *hlock;
+ u64 chain_key = 0;
+ int depth = curr->lockdep_depth;
+ int i;
+
+ printk("depth: %u\n", depth + 1);
+ for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ hlock = curr->held_locks + i;
+ chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+
+ print_lock(hlock);
+ }
+
+ print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_lock(hlock_next);
+}
+
+static void print_chain_keys_chain(struct lock_chain *chain)
+{
+ int i;
+ u64 chain_key = 0;
+ int class_id;
+
+ printk("depth: %u\n", chain->depth);
+ for (i = 0; i < chain->depth; i++) {
+ class_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+
+ print_lock_name(lock_classes + class_id);
+ printk("\n");
+ }
+}
+
+static void print_collision(struct task_struct *curr,
+ struct held_lock *hlock_next,
+ struct lock_chain *chain)
+{
+ printk("\n");
+ printk("======================\n");
+ printk("[chain_key collision ]\n");
+ print_kernel_ident();
+ printk("----------------------\n");
+ printk("%s/%d: ", current->comm, task_pid_nr(current));
+ printk("Hash chain already cached but the contents don't match!\n");
+
+ printk("Held locks:");
+ print_chain_keys_held_locks(curr, hlock_next);
+
+ printk("Locks in cached chain:");
+ print_chain_keys_chain(chain);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+}
+#endif
+
/*
* Checks whether the chain and the current held locks are consistent
* in depth and also in content. If they are not it most likely means
@@ -2014,14 +2088,18 @@ static int check_no_collision(struct task_struct *curr,
i = get_first_held_lock(curr, hlock);
- if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+ if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) {
+ print_collision(curr, hlock, chain);
return 0;
+ }
for (j = 0; j < chain->depth - 1; j++, i++) {
id = curr->held_locks[i].class_idx - 1;
- if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+ if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
+ print_collision(curr, hlock, chain);
return 0;
+ }
}
#endif
return 1;
@@ -2099,15 +2177,37 @@ cache_hit:
chain->irq_context = hlock->irq_context;
i = get_first_held_lock(curr, hlock);
chain->depth = curr->lockdep_depth + 1 - i;
+
+ BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+ BUILD_BUG_ON((1UL << 6) <= ARRAY_SIZE(curr->held_locks));
+ BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
- nr_chain_hlocks += chain->depth;
for (j = 0; j < chain->depth - 1; j++, i++) {
int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
+
+ if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
+ nr_chain_hlocks += chain->depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+ /*
+ * Important for check_no_collision().
+ */
+ if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ if (debug_locks_off_graph_unlock())
+ return 0;
+
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+ dump_stack();
+ return 0;
+ }
+#endif
+
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -2855,6 +2955,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 2 * !!task->hardirq_context + !!task->softirq_context;
+}
+
static int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -2863,8 +2968,6 @@ static int separate_irq_context(struct task_struct *curr,
/*
* Keep track of points where we cross into an interrupt context:
*/
- hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
- curr->softirq_context;
if (depth) {
struct held_lock *prev_hlock;
@@ -2896,6 +2999,11 @@ static inline int mark_irqflags(struct task_struct *curr,
return 1;
}
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
static inline int separate_irq_context(struct task_struct *curr,
struct held_lock *hlock)
{
@@ -3164,6 +3272,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->acquire_ip = ip;
hlock->instance = lock;
hlock->nest_lock = nest_lock;
+ hlock->irq_context = task_irq_context(curr);
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
@@ -3477,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock)
return 0;
}
-static void __lock_pin_lock(struct lockdep_map *lock)
+static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
+{
+ struct pin_cookie cookie = NIL_COOKIE;
+ struct task_struct *curr = current;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return cookie;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ /*
+ * Grab 16 bits of randomness; this is enough that the cookie
+ * cannot be guessed, yet still leaves room for some pin nesting
+ * in our u32 pin_count.
+ */
+ cookie.val = 1 + (prandom_u32() >> 16);
+ hlock->pin_count += cookie.val;
+ return cookie;
+ }
+ }
+
+ WARN(1, "pinning an unheld lock\n");
+ return cookie;
+}
+
+static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
struct task_struct *curr = current;
int i;
@@ -3489,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)
struct held_lock *hlock = curr->held_locks + i;
if (match_held_lock(hlock, lock)) {
- hlock->pin_count++;
+ hlock->pin_count += cookie.val;
return;
}
}
@@ -3497,7 +3634,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)
WARN(1, "pinning an unheld lock\n");
}
-static void __lock_unpin_lock(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
struct task_struct *curr = current;
int i;
@@ -3512,7 +3649,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock)
if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
return;
- hlock->pin_count--;
+ hlock->pin_count -= cookie.val;
+
+ if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n"))
+ hlock->pin_count = 0;
+
return;
}
}
@@ -3643,24 +3784,44 @@ int lock_is_held(struct lockdep_map *lock)
}
EXPORT_SYMBOL_GPL(lock_is_held);
-void lock_pin_lock(struct lockdep_map *lock)
+struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
{
+ struct pin_cookie cookie = NIL_COOKIE;
unsigned long flags;
if (unlikely(current->lockdep_recursion))
- return;
+ return cookie;
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
- __lock_pin_lock(lock);
+ cookie = __lock_pin_lock(lock);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
+
+ return cookie;
}
EXPORT_SYMBOL_GPL(lock_pin_lock);
-void lock_unpin_lock(struct lockdep_map *lock)
+void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_repin_lock(lock, cookie);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_repin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
@@ -3671,7 +3832,7 @@ void lock_unpin_lock(struct lockdep_map *lock)
check_flags(flags);
current->lockdep_recursion = 1;
- __lock_unpin_lock(lock);
+ __lock_unpin_lock(lock, cookie);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dbb61a302548..a0f61effad25 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v)
int i;
if (v == SEQ_START_TOKEN) {
+ if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+ seq_printf(m, "(buggered) ");
seq_printf(m, "all lock chains:\n");
return 0;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ef1919d63b2..f8c5af52a131 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -75,12 +75,7 @@ struct lock_stress_stats {
long n_lock_acquired;
};
-#if defined(MODULE)
-#define LOCKTORTURE_RUNNABLE_INIT 1
-#else
-#define LOCKTORTURE_RUNNABLE_INIT 0
-#endif
-int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+int torture_runnable = IS_ENABLED(MODULE);
module_param(torture_runnable, int, 0444);
MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
@@ -394,12 +389,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
if (!rt_task(current)) {
/*
- * (1) Boost priority once every ~50k operations. When the
+ * Boost priority once every ~50k operations. When the
* task tries to take the lock, the rtmutex it will account
* for the new priority, and do any corresponding pi-dance.
*/
- if (!(torture_random(trsp) %
- (cxt.nrealwriters_stress * factor))) {
+ if (trsp && !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
policy = SCHED_FIFO;
param.sched_priority = MAX_RT_PRIO - 1;
} else /* common case, do nothing */
@@ -748,6 +743,15 @@ static void lock_torture_cleanup(void)
if (torture_cleanup_begin())
return;
+ /*
+ * Indicates early cleanup, meaning that the test has not run,
+ * such as when passing bogus args when loading the module. As
+ * such, only perform the underlying torture-specific cleanups,
+ * and avoid anything related to locktorture.
+ */
+ if (!cxt.lwsa)
+ goto end;
+
if (writer_tasks) {
for (i = 0; i < cxt.nrealwriters_stress; i++)
torture_stop_kthread(lock_torture_writer,
@@ -776,6 +780,7 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+end:
torture_cleanup_end();
}
@@ -870,6 +875,7 @@ static int __init lock_torture_init(void)
VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
firsterr = -ENOMEM;
kfree(cxt.lwsa);
+ cxt.lwsa = NULL;
goto unwind;
}
@@ -878,6 +884,7 @@ static int __init lock_torture_init(void)
cxt.lrsa[i].n_lock_acquired = 0;
}
}
+
lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
/* Prepare torture context. */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index eb2a2c9bc3fc..22e025309845 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -136,10 +136,12 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
}
if (counter == qstat_pv_hash_hops) {
- u64 frac;
+ u64 frac = 0;
- frac = 100ULL * do_div(stat, kicks);
- frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ if (kicks) {
+ frac = 100ULL * do_div(stat, kicks);
+ frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+ }
/*
* Return a X.XX decimal number
@@ -189,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf,
for (i = 0 ; i < qstat_num; i++)
WRITE_ONCE(ptr[i], 0);
- for (i = 0 ; i < qstat_num; i++)
- WRITE_ONCE(ptr[i], 0);
}
return count;
}
@@ -212,10 +212,8 @@ static int __init init_qspinlock_stat(void)
struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
int i;
- if (!d_qstat) {
- pr_warn("Could not create 'qlockstat' debugfs directory\n");
- return 0;
- }
+ if (!d_qstat)
+ goto out;
/*
* Create the debugfs files
@@ -225,12 +223,20 @@ static int __init init_qspinlock_stat(void)
* performance.
*/
for (i = 0; i < qstat_num; i++)
- debugfs_create_file(qstat_names[i], 0400, d_qstat,
- (void *)(long)i, &fops_qstat);
+ if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
+ (void *)(long)i, &fops_qstat))
+ goto fail_undo;
+
+ if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+ (void *)(long)qstat_reset_cnts, &fops_qstat))
+ goto fail_undo;
- debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
- (void *)(long)qstat_reset_cnts, &fops_qstat);
return 0;
+fail_undo:
+ debugfs_remove_recursive(d_qstat);
+out:
+ pr_warn("Could not create 'qlockstat' debugfs entries\n");
+ return -ENOMEM;
}
fs_initcall(init_qspinlock_stat);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 3a5048572065..1591f6b3539f 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem)
/*
* get a write lock on the semaphore
*/
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+int __sched __down_write_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
struct task_struct *tsk;
unsigned long flags;
+ int ret = 0;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
@@ -215,21 +216,33 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
*/
if (sem->count == 0)
break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ set_task_state(tsk, state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
/* got the lock */
sem->count = -1;
+out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
}
void __sched __down_write(struct rw_semaphore *sem)
{
- __down_write_nested(sem, 0);
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
}
/*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a4d4de05b2d1..09e30c6225e5 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
/*
* Wait until we successfully acquire the write lock
*/
-__visible
-struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore *
+__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
{
long count;
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
+ struct rw_semaphore *ret = sem;
/* undo write bias from down_write operation, stop active locking */
count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
@@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
/* wait until we successfully acquire the lock */
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
while (true) {
if (rwsem_try_write_lock(count, sem))
break;
@@ -486,21 +487,48 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
/* Block until there are no active lockers. */
do {
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
+ list_del(&waiter.list);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ return ret;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+ else
+ __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
raw_spin_unlock_irq(&sem->wait_lock);
- return sem;
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(rwsem_down_write_failed);
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_write_failed_killable);
+
/*
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 205be0ce34de..c817216c1615 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -55,6 +55,25 @@ void __sched down_write(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_write);
/*
+ * lock for writing, killable: returns 0 on success, -EINTR if not acquired
+ */
+int __sched down_write_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_owner(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable);
+
+/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
int down_write_trylock(struct rw_semaphore *sem)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 584febd13e2e..a6d382312e6f 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -41,11 +41,13 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* memremap() - remap an iomem_resource as cacheable memory
* @offset: iomem resource start address
* @size: size of remap
- * @flags: either MEMREMAP_WB or MEMREMAP_WT
+ * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC
*
* memremap() is "ioremap" for cases where it is known that the resource
* being mapped does not have i/o side effects and the __iomem
- * annotation is not applicable.
+ * annotation is not applicable. In the case of multiple flags, the different
+ * mapping types will be attempted in the order listed below until one of
+ * them succeeds.
*
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
@@ -57,6 +59,10 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* cache or are written through to memory and never exist in a
* cache-dirty state with respect to program visibility. Attempts to
* map System RAM with this mapping type will fail.
+ *
+ * MEMREMAP_WC - establish a writecombine mapping, whereby writes may
+ * be coalesced together (e.g. in the CPU's write buffers), but is otherwise
+ * uncached. Attempts to map System RAM with this mapping type will fail.
*/
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
@@ -64,6 +70,9 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
void *addr = NULL;
+ if (!flags)
+ return NULL;
+
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
&offset, (unsigned long) size);
@@ -72,7 +81,6 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
/* Try all mapping types requested until one returns non-NULL */
if (flags & MEMREMAP_WB) {
- flags &= ~MEMREMAP_WB;
/*
* MEMREMAP_WB is special in that it can be satisifed
* from the direct map. Some archs depend on the
@@ -86,21 +94,22 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
}
/*
- * If we don't have a mapping yet and more request flags are
- * pending then we will be attempting to establish a new virtual
+ * If we don't have a mapping yet and other request flags are
+ * present then we will be attempting to establish a new virtual
* address mapping. Enforce that this mapping is not aliasing
* System RAM.
*/
- if (!addr && is_ram == REGION_INTERSECTS && flags) {
+ if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) {
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
&offset, (unsigned long) size);
return NULL;
}
- if (!addr && (flags & MEMREMAP_WT)) {
- flags &= ~MEMREMAP_WT;
+ if (!addr && (flags & MEMREMAP_WT))
addr = ioremap_wt(offset, size);
- }
+
+ if (!addr && (flags & MEMREMAP_WC))
+ addr = ioremap_wc(offset, size);
return addr;
}
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 49746c81ad8d..782102e59eed 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -25,6 +25,7 @@
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
+#include <linux/cgroup.h>
static struct kmem_cache *nsproxy_cachep;
@@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_NET
.net_ns = &init_net,
#endif
+#ifdef CONFIG_CGROUPS
+ .cgroup_ns = &init_cgroup_ns,
+#endif
};
static inline struct nsproxy *create_nsproxy(void)
@@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_pid;
}
+ new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
+ tsk->nsproxy->cgroup_ns);
+ if (IS_ERR(new_nsp->cgroup_ns)) {
+ err = PTR_ERR(new_nsp->cgroup_ns);
+ goto out_cgroup;
+ }
+
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
if (IS_ERR(new_nsp->net_ns)) {
err = PTR_ERR(new_nsp->net_ns);
@@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
return new_nsp;
out_net:
+ put_cgroup_ns(new_nsp->cgroup_ns);
+out_cgroup:
if (new_nsp->pid_ns_for_children)
put_pid_ns(new_nsp->pid_ns_for_children);
out_pid:
@@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *new_ns;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWPID | CLONE_NEWNET)))) {
+ CLONE_NEWPID | CLONE_NEWNET |
+ CLONE_NEWCGROUP)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
}
@@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/panic.c b/kernel/panic.c
index d96469de72dc..535c96510a44 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,7 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
+#include <linux/bug.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -72,6 +73,26 @@ void __weak nmi_panic_self_stop(struct pt_regs *regs)
atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+/*
+ * A variant of panic() called from NMI context. We return if we've already
+ * panicked on this CPU. If another CPU has already panicked, loop in
+ * nmi_panic_self_stop(), which architectures may override to do extra work
+ * such as saving register state for the crash dump.
+ */
+void nmi_panic(struct pt_regs *regs, const char *msg)
+{
+ int old_cpu, cpu;
+
+ cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+
+ if (old_cpu == PANIC_CPU_INVALID)
+ panic("%s", msg);
+ else if (old_cpu != cpu)
+ nmi_panic_self_stop(regs);
+}
+EXPORT_SYMBOL(nmi_panic);
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -449,20 +470,25 @@ void oops_exit(void)
kmsg_dump(KMSG_DUMP_OOPS);
}
-#ifdef WANT_WARN_ON_SLOWPATH
-struct slowpath_args {
+struct warn_args {
const char *fmt;
va_list args;
};
-static void warn_slowpath_common(const char *file, int line, void *caller,
- unsigned taint, struct slowpath_args *args)
+void __warn(const char *file, int line, void *caller, unsigned taint,
+ struct pt_regs *regs, struct warn_args *args)
{
disable_trace_on_warning();
pr_warn("------------[ cut here ]------------\n");
- pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
- raw_smp_processor_id(), current->pid, file, line, caller);
+
+ if (file)
+ pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
+ raw_smp_processor_id(), current->pid, file, line,
+ caller);
+ else
+ pr_warn("WARNING: CPU: %d PID: %d at %pS\n",
+ raw_smp_processor_id(), current->pid, caller);
if (args)
vprintk(args->fmt, args->args);
@@ -479,20 +505,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller,
}
print_modules();
- dump_stack();
+
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
print_oops_end_marker();
+
/* Just a warning, don't kill lockdep. */
add_taint(taint, LOCKDEP_STILL_OK);
}
+#ifdef WANT_WARN_ON_SLOWPATH
void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
{
- struct slowpath_args args;
+ struct warn_args args;
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, &args);
+ __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL,
+ &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt);
@@ -500,20 +533,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt);
void warn_slowpath_fmt_taint(const char *file, int line,
unsigned taint, const char *fmt, ...)
{
- struct slowpath_args args;
+ struct warn_args args;
args.fmt = fmt;
va_start(args.args, fmt);
- warn_slowpath_common(file, line, __builtin_return_address(0),
- taint, &args);
+ __warn(file, line, __builtin_return_address(0), taint, NULL, &args);
va_end(args.args);
}
EXPORT_SYMBOL(warn_slowpath_fmt_taint);
void warn_slowpath_null(const char *file, int line)
{
- warn_slowpath_common(file, line, __builtin_return_address(0),
- TAINT_WARN, NULL);
+ __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
}
EXPORT_SYMBOL(warn_slowpath_null);
#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa0f26b58426..fca9254280ee 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode)
pm_message_t msg;
int error;
+ pm_suspend_clear_flags();
error = platform_begin(platform_mode);
if (error)
goto Close;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 230a77225e2e..5b70d64b871e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
- pr_warning("PM: Unsupported test mode for suspend to idle,"
- "please choose none/freezer/devices/platform.\n");
+ pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
return -EAGAIN;
}
#endif
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 12cd989dadf6..160e1006640d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -37,6 +37,14 @@
#define HIBERNATE_SIG "S1SUSPEND"
/*
+ * When reading an {un,}compressed image, we may restore pages in place,
+ * in which case some architectures need these pages cleaning before they
+ * can be executed. We don't know which pages these may be, so clean the lot.
+ */
+static bool clean_pages_on_read;
+static bool clean_pages_on_decompress;
+
+/*
* The swap map is a data structure used for keeping track of each page
* written to a swap partition. It consists of many swap_map_page
* structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
@@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio)
if (bio_data_dir(bio) == WRITE)
put_page(page);
+ else if (clean_pages_on_read)
+ flush_icache_range((unsigned long)page_address(page),
+ (unsigned long)page_address(page) + PAGE_SIZE);
if (bio->bi_error && !hb->error)
hb->error = bio->bi_error;
@@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle,
hib_init_batch(&hb);
+ clean_pages_on_read = true;
printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
nr_to_read);
m = nr_to_read / 10;
@@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data)
d->unc_len = LZO_UNC_SIZE;
d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
d->unc, &d->unc_len);
+ if (clean_pages_on_decompress)
+ flush_icache_range((unsigned long)d->unc,
+ (unsigned long)d->unc + d->unc_len);
+
atomic_set(&d->stop, 1);
wake_up(&d->done);
}
@@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
memset(crc, 0, offsetof(struct crc_data, go));
+ clean_pages_on_decompress = true;
+
/*
* Start the decompression threads.
*/
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index c963ba534a78..bfbf284e4218 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -367,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty)
static int log_make_free_space(u32 msg_size)
{
- while (log_first_seq < log_next_seq) {
- if (logbuf_has_space(msg_size, false))
- return 0;
+ while (log_first_seq < log_next_seq &&
+ !logbuf_has_space(msg_size, false)) {
/* drop old messages until we have enough contiguous space */
log_first_idx = log_next(log_first_idx);
log_first_seq++;
}
+ if (clear_seq < log_first_seq) {
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ }
+
/* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, true))
+ if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
return 0;
return -ENOMEM;
@@ -854,6 +858,7 @@ void log_buf_kexec_setup(void)
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(log_first_idx);
+ VMCOREINFO_SYMBOL(clear_idx);
VMCOREINFO_SYMBOL(log_next_idx);
/*
* Export struct printk_log size and field offsets. User space tools can
@@ -1216,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
u32 idx;
enum log_flags prev;
- if (clear_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
@@ -1483,58 +1482,6 @@ static void zap_locks(void)
sema_init(&console_sem, 1);
}
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if (con->flags & CON_ANYTIME)
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(unsigned int cpu)
-{
- return cpu_online(cpu) || have_callable_console();
-}
-
-/*
- * Try to get console ownership to actually show the kernel
- * messages from a 'printk'. Return true (and with the
- * console_lock held, and 'console_locked' set) if it
- * is successful, false otherwise.
- */
-static int console_trylock_for_printk(void)
-{
- unsigned int cpu = smp_processor_id();
-
- if (!console_trylock())
- return 0;
- /*
- * If we can't use the console, we need to release the console
- * semaphore by hand to avoid flushing the buffer. We need to hold the
- * console semaphore in order to do this test safely.
- */
- if (!can_use_console(cpu)) {
- console_locked = 0;
- up_console_sem();
- return 0;
- }
- return 1;
-}
-
int printk_delay_msec __read_mostly;
static inline void printk_delay(void)
@@ -1681,7 +1628,6 @@ asmlinkage int vprintk_emit(int facility, int level,
boot_delay_msec(level);
printk_delay();
- /* This stops the holder of console_sem just where we want him */
local_irq_save(flags);
this_cpu = smp_processor_id();
@@ -1705,6 +1651,7 @@ asmlinkage int vprintk_emit(int facility, int level,
}
lockdep_off();
+ /* This stops the holder of console_sem just where we want him */
raw_spin_lock(&logbuf_lock);
logbuf_cpu = this_cpu;
@@ -1810,20 +1757,12 @@ asmlinkage int vprintk_emit(int facility, int level,
if (!in_sched) {
lockdep_off();
/*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
-
- /*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock_for_printk())
+ if (console_trylock())
console_unlock();
- preempt_enable();
lockdep_on();
}
@@ -2174,7 +2113,20 @@ int console_trylock(void)
return 0;
}
console_locked = 1;
- console_may_schedule = 0;
+ /*
+ * When PREEMPT_COUNT is disabled we can't reliably detect whether
+ * it's safe to schedule (e.g. calling printk while holding a
+ * spin_lock), because preempt_disable()/preempt_enable() are just
+ * barriers there and preempt_count() is always 0.
+ *
+ * RCU read sections have a separate preemption counter when
+ * PREEMPT_RCU is enabled, so we must take extra care and check
+ * rcu_preempt_depth(); otherwise, RCU read sections modify
+ * preempt_count().
+ */
+ console_may_schedule = !oops_in_progress &&
+ preemptible() &&
+ !rcu_preempt_depth();
return 1;
}
EXPORT_SYMBOL(console_trylock);
@@ -2184,6 +2136,34 @@ int is_console_locked(void)
return console_locked;
}
+/*
+ * Check if we have any console that is capable of printing while cpu is
+ * booting or shutting down. Requires console_sem.
+ */
+static int have_callable_console(void)
+{
+ struct console *con;
+
+ for_each_console(con)
+ if ((con->flags & CON_ENABLED) &&
+ (con->flags & CON_ANYTIME))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Can we actually use the console at this time on this cpu?
+ *
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
+ */
+static inline int can_use_console(void)
+{
+ return cpu_online(raw_smp_processor_id()) || have_callable_console();
+}
+
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2254,9 +2234,21 @@ void console_unlock(void)
do_cond_resched = console_may_schedule;
console_may_schedule = 0;
+again:
+ /*
+ * We released the console_sem lock, so we need to recheck if
+ * cpu is online and (if not) is there at least one CON_ANYTIME
+ * console.
+ */
+ if (!can_use_console()) {
+ console_locked = 0;
+ up_console_sem();
+ return;
+ }
+
/* flush buffered message fragment immediately to console */
console_cont_flush(text, sizeof(text));
-again:
+
for (;;) {
struct printk_log *msg;
size_t ext_len = 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 51369697466e..c2199e9901c9 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -44,7 +44,7 @@ int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
static cpumask_var_t prof_cpu_mask;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
@@ -202,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
* Each cpu has a pair of open-addressed hashtables for pending
* profile hits. read_profile() IPI's all cpus to request them
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2341efe7fe02..d49bfa1e53e6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child)
{
BUG_ON(!child->ptrace);
- child->ptrace = 0;
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
spin_lock(&child->sighand->siglock);
-
+ child->ptrace = 0;
/*
* Clear all pending traps and TRAPPING. TRAPPING should be
* cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly.
@@ -681,7 +680,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
break;
#ifdef CONFIG_COMPAT
- if (unlikely(is_compat_task())) {
+ if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
if (copy_siginfo_to_user32(uinfo, &info) ||
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 61a16569ffbf..18dfc485225c 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,11 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
new file mode 100644
index 000000000000..3cee0d8393ed
--- /dev/null
+++ b/kernel/rcu/rcuperf.c
@@ -0,0 +1,655 @@
+/*
+ * Read-Copy Update module-based performance-test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2015
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+
+#define PERF_FLAG "-perf:"
+#define PERFOUT_STRING(s) \
+ pr_alert("%s" PERF_FLAG s "\n", perf_type)
+#define VERBOSE_PERFOUT_STRING(s) \
+ do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
+#define VERBOSE_PERFOUT_ERRSTRING(s) \
+ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+
+torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nwriters, -1, "Number of RCU updater threads");
+torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+
+static char *perf_type = "rcu";
+module_param(perf_type, charp, 0444);
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+
+/* Effective kthread counts, derived from nreaders/nwriters by compute_real(). */
+static int nrealreaders;
+static int nrealwriters;
+/* kthread handles; the arrays are allocated in rcu_perf_init(). */
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *shutdown_task;
+
+/* writer_durations[w] is writer w's array of MAX_MEAS grace-period durations. */
+static u64 **writer_durations;
+/* writer_n_durations[w] is the last slot index writer w stored into. */
+static int *writer_n_durations;
+/* Progress counters used to detect "all started" and "all finished". */
+static atomic_t n_rcu_perf_reader_started;
+static atomic_t n_rcu_perf_writer_started;
+static atomic_t n_rcu_perf_writer_finished;
+/* The shutdown kthread sleeps here until the last writer finishes. */
+static wait_queue_head_t shutdown_wq;
+/* ktime_get_mono_fast_ns() timestamps taken by the last writer to start/finish. */
+static u64 t_rcu_perf_writer_started;
+static u64 t_rcu_perf_writer_finished;
+/* Batch counts sampled from cur_ops at those same two points. */
+static unsigned long b_rcu_perf_writer_started;
+static unsigned long b_rcu_perf_writer_finished;
+
+/*
+ * Most recent activity recorded by any writer kthread (debugging aid;
+ * only ever written in this file).  Each state has its own value so that
+ * a writer blocked in a normal grace-period wait can be told apart from
+ * one that is idle between waits.
+ */
+static int rcu_perf_writer_state;
+#define RTWS_INIT		0	/* Static initial value. */
+#define RTWS_EXP_SYNC		1	/* Inside cur_ops->exp_sync(). */
+#define RTWS_SYNC		2	/* Inside cur_ops->sync(). */
+#define RTWS_IDLE		3	/* Between grace-period waits. */
+#define RTWS_STOPPING		4	/* Writer loop has exited. */
+
+#define MAX_MEAS 10000
+#define MIN_MEAS 100
+
+#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
+#define RCUPERF_RUNNABLE_INIT 1
+#else
+#define RCUPERF_RUNNABLE_INIT 0
+#endif
+static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+module_param(perf_runnable, int, 0444);
+MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+/*
+ * Per-flavor operations.  rcu_perf_init() picks the entry whose ->name
+ * matches the perf_type module parameter and stores it in cur_ops.
+ */
+struct rcu_perf_ops {
+ int ptype; /* Flavor identifier; set but never read in this file. */
+ void (*init)(void); /* Optional; called once from rcu_perf_init(). */
+ void (*cleanup)(void); /* Optional; called from rcu_perf_cleanup(). */
+ int (*readlock)(void); /* Begin reader section; result is passed to ->readunlock(). */
+ void (*readunlock)(int idx); /* End reader section. */
+ unsigned long (*started)(void); /* Never called in this file. */
+ unsigned long (*completed)(void); /* Batch count sampled when !gp_exp. */
+ unsigned long (*exp_completed)(void); /* Batch count sampled (and halved) when gp_exp. */
+ void (*sync)(void); /* Grace-period wait timed when !gp_exp. */
+ void (*exp_sync)(void); /* Grace-period wait timed when gp_exp. */
+ const char *name; /* Compared against perf_type. */
+};
+
+static struct rcu_perf_ops *cur_ops;
+
+/*
+ * Definitions for rcu perf testing.
+ */
+
+/* Reader entry for the "rcu" flavor; the returned index is a dummy 0. */
+static int rcu_perf_read_lock(void) __acquires(RCU)
+{
+ rcu_read_lock();
+ return 0;
+}
+
+/* Reader exit for the "rcu" flavor; idx is ignored. */
+static void rcu_perf_read_unlock(int idx) __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+/* Placeholder batch counter that always reports zero. */
+static unsigned long __maybe_unused rcu_no_completed(void)
+{
+ return 0;
+}
+
+/* Shared ->init hook; intentionally does nothing. */
+static void rcu_sync_perf_init(void)
+{
+}
+
+/* Operations vector selected by perf_type=rcu. */
+static struct rcu_perf_ops rcu_ops = {
+ .ptype = RCU_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = rcu_perf_read_lock,
+ .readunlock = rcu_perf_read_unlock,
+ .started = rcu_batches_started,
+ .completed = rcu_batches_completed,
+ .exp_completed = rcu_exp_batches_completed,
+ .sync = synchronize_rcu,
+ .exp_sync = synchronize_rcu_expedited,
+ .name = "rcu"
+};
+
+/*
+ * Definitions for rcu_bh perf testing.
+ */
+
+/* Reader entry for the "rcu_bh" flavor; the returned index is a dummy 0. */
+static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH)
+{
+ rcu_read_lock_bh();
+ return 0;
+}
+
+/* Reader exit for the "rcu_bh" flavor; idx is ignored. */
+static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH)
+{
+ rcu_read_unlock_bh();
+}
+
+/*
+ * Operations vector selected by perf_type=rcu_bh.
+ * NOTE(review): ->exp_completed reads the RCU-sched expedited counter,
+ * presumably because expedited rcu_bh waits are served by RCU-sched;
+ * confirm against synchronize_rcu_bh_expedited().
+ */
+static struct rcu_perf_ops rcu_bh_ops = {
+ .ptype = RCU_BH_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = rcu_bh_perf_read_lock,
+ .readunlock = rcu_bh_perf_read_unlock,
+ .started = rcu_batches_started_bh,
+ .completed = rcu_batches_completed_bh,
+ .exp_completed = rcu_exp_batches_completed_sched,
+ .sync = synchronize_rcu_bh,
+ .exp_sync = synchronize_rcu_bh_expedited,
+ .name = "rcu_bh"
+};
+
+/*
+ * Definitions for srcu perf testing.
+ */
+
+/* The srcu_struct that every "srcu" flavor operation below acts on. */
+DEFINE_STATIC_SRCU(srcu_ctl_perf);
+static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf;
+
+/* Reader entry; returns the index that must be handed back on unlock. */
+static int srcu_perf_read_lock(void) __acquires(srcu_ctlp)
+{
+ return srcu_read_lock(srcu_ctlp);
+}
+
+/* Reader exit; idx is the value returned by srcu_perf_read_lock(). */
+static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp)
+{
+ srcu_read_unlock(srcu_ctlp, idx);
+}
+
+/* Batch counter for srcu_ctlp, used for both normal and expedited runs. */
+static unsigned long srcu_perf_completed(void)
+{
+ return srcu_batches_completed(srcu_ctlp);
+}
+
+/* Normal grace-period wait on srcu_ctlp. */
+static void srcu_perf_synchronize(void)
+{
+ synchronize_srcu(srcu_ctlp);
+}
+
+/* Expedited grace-period wait on srcu_ctlp. */
+static void srcu_perf_synchronize_expedited(void)
+{
+ synchronize_srcu_expedited(srcu_ctlp);
+}
+
+/*
+ * Operations vector selected by perf_type=srcu.  ->started is NULL,
+ * which is harmless here because nothing in this file calls it.
+ * NOTE(review): rcu_perf_writer() halves ->exp_completed() when gp_exp
+ * is set; whether that is right for srcu_batches_completed() should be
+ * confirmed.
+ */
+static struct rcu_perf_ops srcu_ops = {
+ .ptype = SRCU_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = srcu_perf_read_lock,
+ .readunlock = srcu_perf_read_unlock,
+ .started = NULL,
+ .completed = srcu_perf_completed,
+ .exp_completed = srcu_perf_completed,
+ .sync = srcu_perf_synchronize,
+ .exp_sync = srcu_perf_synchronize_expedited,
+ .name = "srcu"
+};
+
+/*
+ * Definitions for sched perf testing.
+ */
+
+/* Reader entry for the "sched" flavor: disables preemption; returns a dummy 0. */
+static int sched_perf_read_lock(void)
+{
+ preempt_disable();
+ return 0;
+}
+
+/* Reader exit for the "sched" flavor: re-enables preemption; idx is ignored. */
+static void sched_perf_read_unlock(int idx)
+{
+ preempt_enable();
+}
+
+/* Operations vector selected by perf_type=sched. */
+static struct rcu_perf_ops sched_ops = {
+ .ptype = RCU_SCHED_FLAVOR,
+ .init = rcu_sync_perf_init,
+ .readlock = sched_perf_read_lock,
+ .readunlock = sched_perf_read_unlock,
+ .started = rcu_batches_started_sched,
+ .completed = rcu_batches_completed_sched,
+ .exp_completed = rcu_exp_batches_completed_sched,
+ .sync = synchronize_sched,
+ .exp_sync = synchronize_sched_expedited,
+ .name = "sched"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks perf testing.
+ */
+
+/* Reader entry for the "tasks" flavor: no-op that returns a dummy 0. */
+static int tasks_perf_read_lock(void)
+{
+ return 0;
+}
+
+/* Reader exit for the "tasks" flavor: no-op; idx is ignored. */
+static void tasks_perf_read_unlock(int idx)
+{
+}
+
+/*
+ * Operations vector selected by perf_type=tasks.  The same
+ * synchronize_rcu_tasks() serves as both ->sync and ->exp_sync, and all
+ * batch counters report zero.
+ *
+ * ->exp_completed must be populated: rcu_perf_writer() calls it
+ * unconditionally whenever gp_exp is set (the default), so leaving it
+ * NULL would oops as soon as a writer starts.
+ */
+static struct rcu_perf_ops tasks_ops = {
+	.ptype		= RCU_TASKS_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= tasks_perf_read_lock,
+	.readunlock	= tasks_perf_read_unlock,
+	.started	= rcu_no_completed,
+	.completed	= rcu_no_completed,
+	.exp_completed	= rcu_no_completed,
+	.sync		= synchronize_rcu_tasks,
+	.exp_sync	= synchronize_rcu_tasks,
+	.name		= "tasks"
+};
+
+#define RCUPERF_TASKS_OPS &tasks_ops,
+
+/* True when the selected flavor is RCU-tasks.  Not called in this file. */
+static bool __maybe_unused torturing_tasks(void)
+{
+ return cur_ops == &tasks_ops;
+}
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUPERF_TASKS_OPS
+
+/* Without CONFIG_TASKS_RCU there is no tasks flavor, so always false. */
+static bool __maybe_unused torturing_tasks(void)
+{
+ return false;
+}
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+/*
+ * Called once per loop iteration by readers and writers.  Always gives
+ * RCU a chance to note a quiescent state; then, once every writer has
+ * reported that it finished, parks the caller (one jiffy at a time)
+ * until the torture framework asks the kthreads to stop.
+ */
+static void rcu_perf_wait_shutdown(void)
+{
+	cond_resched_rcu_qs();
+	if (atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) {
+		while (!torture_must_stop())
+			schedule_timeout_uninterruptible(1);
+	}
+}
+
+/*
+ * RCU perf reader kthread.  After binding itself to a CPU chosen from
+ * its index and dropping to the lowest nice priority, it loops over
+ * empty read-side critical sections of the selected flavor, each one
+ * entered and left with interrupts disabled, until told to stop.
+ */
+static int
+rcu_perf_reader(void *arg)
+{
+	long reader_id = (long)arg;
+	unsigned long irqflags;
+	int rl_idx;
+
+	VERBOSE_PERFOUT_STRING("rcu_perf_reader task started");
+	set_cpus_allowed_ptr(current, cpumask_of(reader_id % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+	atomic_inc(&n_rcu_perf_reader_started);
+
+	for (;;) {
+		local_irq_save(irqflags);
+		rl_idx = cur_ops->readlock();
+		cur_ops->readunlock(rl_idx);
+		local_irq_restore(irqflags);
+		rcu_perf_wait_shutdown();
+		if (torture_must_stop())
+			break;
+	}
+	torture_kthread_stopping("rcu_perf_reader");
+	return 0;
+}
+
+/*
+ * RCU perf writer kthread.  Repeatedly waits for a grace period, using
+ * cur_ops->exp_sync() when gp_exp is set and cur_ops->sync() otherwise,
+ * and stores how long each wait took (in ktime_get_mono_fast_ns() units)
+ * in writer_durations[me][].
+ *
+ * arg is the writer's index.  On exit the index of the last slot written
+ * is left in writer_n_durations[me] for rcu_perf_cleanup() to report.
+ */
+static int
+rcu_perf_writer(void *arg)
+{
+ int i = 0;
+ int i_max;
+ long me = (long)arg;
+ struct sched_param sp;
+ bool started = false, done = false, alldone = false;
+ u64 t;
+ u64 *wdp;
+ u64 *wdpp = writer_durations[me];
+
+ VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
+ /* Complain if gp_exp disagrees with the system-wide GP mode. */
+ WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
+ WARN_ON(rcu_gp_is_normal() && gp_exp);
+ WARN_ON(!wdpp);
+ /* Bind to one CPU and measure at SCHED_FIFO priority 1. */
+ set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+ sp.sched_priority = 1;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+ /* holdoff is given in seconds. */
+ if (holdoff)
+ schedule_timeout_uninterruptible(holdoff * HZ);
+
+ /* The last writer to arrive records the test's start time and batch count. */
+ t = ktime_get_mono_fast_ns();
+ if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
+ t_rcu_perf_writer_started = t;
+ if (gp_exp) {
+ /* NOTE(review): halved presumably because the expedited counter advances twice per batch; confirm. */
+ b_rcu_perf_writer_started =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_perf_writer_started =
+ cur_ops->completed();
+ }
+ }
+
+ do {
+ /* Slot i holds the start time until the wait ends, then the elapsed time. */
+ wdp = &wdpp[i];
+ *wdp = ktime_get_mono_fast_ns();
+ if (gp_exp) {
+ rcu_perf_writer_state = RTWS_EXP_SYNC;
+ cur_ops->exp_sync();
+ } else {
+ rcu_perf_writer_state = RTWS_SYNC;
+ cur_ops->sync();
+ }
+ rcu_perf_writer_state = RTWS_IDLE;
+ t = ktime_get_mono_fast_ns();
+ *wdp = t - *wdp;
+ i_max = i;
+ if (!started &&
+ atomic_read(&n_rcu_perf_writer_started) >= nrealwriters)
+ started = true;
+ /* Once slot MIN_MEAS is reached, drop to SCHED_NORMAL and report done. */
+ if (!done && i >= MIN_MEAS) {
+ done = true;
+ sp.sched_priority = 0;
+ sched_setscheduler_nocheck(current,
+ SCHED_NORMAL, &sp);
+ pr_alert("%s" PERF_FLAG
+ "rcu_perf_writer %ld has %d measurements\n",
+ perf_type, me, MIN_MEAS);
+ /* The last writer to finish records the end point and wakes the shutdown kthread. */
+ if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
+ nrealwriters) {
+ schedule_timeout_interruptible(10);
+ rcu_ftrace_dump(DUMP_ALL);
+ PERFOUT_STRING("Test complete");
+ t_rcu_perf_writer_finished = t;
+ if (gp_exp) {
+ b_rcu_perf_writer_finished =
+ cur_ops->exp_completed() / 2;
+ } else {
+ b_rcu_perf_writer_finished =
+ cur_ops->completed();
+ }
+ if (shutdown) {
+ smp_mb(); /* Assign before wake. */
+ wake_up(&shutdown_wq);
+ }
+ }
+ }
+ if (done && !alldone &&
+ atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters)
+ alldone = true;
+ /*
+ * Move to the next slot only while every writer has started and
+ * not all have finished; otherwise keep overwriting slot i.
+ * Never advance past the last slot of the array.
+ */
+ if (started && !alldone && i < MAX_MEAS - 1)
+ i++;
+ rcu_perf_wait_shutdown();
+ } while (!torture_must_stop());
+ rcu_perf_writer_state = RTWS_STOPPING;
+ /* Publish the index of the last slot written. */
+ writer_n_durations[me] = i_max;
+ torture_kthread_stopping("rcu_perf_writer");
+ return 0;
+}
+
+/*
+ * Print the effective reader/writer counts and the verbose and shutdown
+ * settings, prefixed by perf_type and the caller-supplied tag.  The
+ * cur_ops parameter (which shadows the file-scope cur_ops) is not used.
+ */
+static inline void
+rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag)
+{
+ pr_alert("%s" PERF_FLAG
+ "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
+ perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
+}
+
+/*
+ * Stop all reader and writer kthreads, print the collected measurements
+ * and free everything rcu_perf_init() allocated.  This is the module
+ * exit handler, the tail of the shutdown kthread, and the error-unwind
+ * path of rcu_perf_init(), so any of the arrays may still be NULL.
+ */
+static void
+rcu_perf_cleanup(void)
+{
+	int i;
+	int j;
+	int ngps = 0;
+	u64 *wdp;
+	u64 *wdpp;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (reader_tasks) {
+		for (i = 0; i < nrealreaders; i++)
+			torture_stop_kthread(rcu_perf_reader,
+					     reader_tasks[i]);
+		kfree(reader_tasks);
+	}
+
+	if (writer_tasks) {
+		/* Stop each writer, then report its last slot index. */
+		for (i = 0; i < nrealwriters; i++) {
+			torture_stop_kthread(rcu_perf_writer,
+					     writer_tasks[i]);
+			if (!writer_n_durations)
+				continue;
+			j = writer_n_durations[i];
+			pr_alert("%s%s writer %d gps: %d\n",
+				 perf_type, PERF_FLAG, i, j);
+			ngps += j;
+		}
+		pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+			 perf_type, PERF_FLAG,
+			 t_rcu_perf_writer_started, t_rcu_perf_writer_finished,
+			 t_rcu_perf_writer_finished -
+			 t_rcu_perf_writer_started,
+			 ngps,
+			 b_rcu_perf_writer_finished -
+			 b_rcu_perf_writer_started);
+		/* Dump and free every writer's duration array. */
+		for (i = 0; i < nrealwriters; i++) {
+			if (!writer_durations)
+				break;
+			if (!writer_n_durations)
+				continue;
+			wdpp = writer_durations[i];
+			if (!wdpp)
+				continue;
+			for (j = 0; j <= writer_n_durations[i]; j++) {
+				wdp = &wdpp[j];
+				pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+					 perf_type, PERF_FLAG,
+					 i, j, *wdp);
+				/* Pause every 100 lines of output. */
+				if (j % 100 == 0)
+					schedule_timeout_uninterruptible(1);
+			}
+			kfree(writer_durations[i]);
+		}
+		kfree(writer_tasks);
+	}
+
+	/*
+	 * Free these outside the writer_tasks check: rcu_perf_init() may
+	 * have allocated them even though the writer_tasks allocation
+	 * failed, and kfree(NULL) is a no-op.
+	 */
+	kfree(writer_durations);
+	kfree(writer_n_durations);
+
+	/* Do flavor-specific cleanup operations. */
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
+
+	torture_cleanup_end();
+}
+
+/*
+ * Translate a requested kthread count into an actual one.  Zero and
+ * positive requests are used unchanged.  A request of -1 means one per
+ * online CPU, -2 means one fewer than that, and so on, but the result
+ * is never allowed to fall below one.
+ */
+static int compute_real(int n)
+{
+	int scaled;
+
+	if (n >= 0)
+		return n;
+
+	scaled = num_online_cpus() + 1 + n;
+	return scaled > 0 ? scaled : 1;
+}
+
+/*
+ * RCU perf shutdown kthread.  Sleeps on shutdown_wq until every writer
+ * has reported completion, then runs the normal cleanup (which prints
+ * the results) and powers the system off.
+ */
+static int
+rcu_perf_shutdown(void *arg)
+{
+	for (;;) {
+		wait_event(shutdown_wq,
+			   atomic_read(&n_rcu_perf_writer_finished) >=
+			   nrealwriters);
+		/* Re-check after waking before going ahead. */
+		if (atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters)
+			break;
+	}
+	smp_mb(); /* Wake before output. */
+	rcu_perf_cleanup();
+	kernel_power_off();
+	return -EINVAL;
+}
+
+/*
+ * Module initialization.  Selects the operations vector named by
+ * perf_type, computes the real reader/writer counts, then starts the
+ * optional shutdown kthread, the readers (waiting until all have
+ * started), and finally the writers with their measurement arrays.
+ *
+ * Returns 0 on success, -EBUSY if torture_init_begin() refuses to start,
+ * -EINVAL for an unknown perf_type, -ENOMEM on allocation failure, or
+ * the error from kthread creation.  On failure everything set up so far
+ * is torn down through rcu_perf_cleanup().
+ */
+static int __init
+rcu_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct rcu_perf_ops *perf_ops[] = {
+		&rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+		RCUPERF_TASKS_OPS
+	};
+
+	if (!torture_init_begin(perf_type, verbose, &perf_runnable))
+		return -EBUSY;
+
+	/* Process args and tell the world that the perf'er is on the job. */
+	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
+		cur_ops = perf_ops[i];
+		if (strcmp(perf_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(perf_ops)) {
+		pr_alert("rcu-perf: invalid perf type: \"%s\"\n",
+			 perf_type);
+		/* Continue the same log line rather than starting new records. */
+		pr_alert("rcu-perf types:");
+		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
+			pr_cont(" %s", perf_ops[i]->name);
+		pr_cont("\n");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	nrealwriters = compute_real(nwriters);
+	nrealreaders = compute_real(nreaders);
+	atomic_set(&n_rcu_perf_reader_started, 0);
+	atomic_set(&n_rcu_perf_writer_started, 0);
+	atomic_set(&n_rcu_perf_writer_finished, 0);
+	rcu_perf_print_module_parms(cur_ops, "Start of test");
+
+	/* Start up the kthreads. */
+
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(rcu_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+	reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		firsterr = torture_create_kthread(rcu_perf_reader, (void *)i,
+						  reader_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	/* Let every reader get going before any writer is created. */
+	while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders)
+		schedule_timeout_uninterruptible(1);
+	writer_tasks = kcalloc(nrealwriters, sizeof(writer_tasks[0]),
+			       GFP_KERNEL);
+	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
+				   GFP_KERNEL);
+	writer_n_durations =
+		kcalloc(nrealwriters, sizeof(*writer_n_durations),
+			GFP_KERNEL);
+	if (!writer_tasks || !writer_durations || !writer_n_durations) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealwriters; i++) {
+		writer_durations[i] =
+			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
+				GFP_KERNEL);
+		if (!writer_durations[i]) {
+			/* Report the failure instead of returning 0 after unwinding. */
+			firsterr = -ENOMEM;
+			goto unwind;
+		}
+		firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
+						  writer_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	rcu_perf_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_perf_init);
+module_exit(rcu_perf_cleanup);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 250ea67c1615..084a28a732eb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
@@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg)
static int
rcu_torture_writer(void *arg)
{
- bool can_expedite = !rcu_gp_is_expedited();
+ bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
int expediting = 0;
unsigned long gp_snap;
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
@@ -932,7 +932,7 @@ rcu_torture_writer(void *arg)
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
if (!can_expedite) {
pr_alert("%s" TORTURE_FLAG
- " Grace periods expedited from boot/sysfs for %s,\n",
+ " GP expediting controlled from boot/sysfs for %s,\n",
torture_type, cur_ops->name);
pr_alert("%s" TORTURE_FLAG
" Disabled dynamic grace-period expediting.\n",
@@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
-static void rcutorture_trace_dump(void)
-{
- static atomic_t beenhere = ATOMIC_INIT(0);
-
- if (atomic_read(&beenhere))
- return;
- if (atomic_xchg(&beenhere, 1) != 0)
- return;
- ftrace_dump(DUMP_ALL);
-}
-
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused)
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
started, completed);
- rcutorture_trace_dump();
+ rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
completed = completed - started;
@@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg)
if (pipe_count > 1) {
do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
ts, started, completed);
- rcutorture_trace_dump();
+ rcu_ftrace_dump(DUMP_ALL);
}
__this_cpu_inc(rcu_torture_count[pipe_count]);
completed = completed - started;
@@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void)
rcu_torture_writer_state,
gpnum, completed, flags);
show_rcu_gp_kthreads();
- rcutorture_trace_dump();
+ rcu_ftrace_dump(DUMP_ALL);
}
rtcv_snap = rcu_torture_current_version;
}
@@ -1489,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the following ->call().
*/
+ local_irq_disable(); /* Just to test no-irq call_rcu(). */
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ local_irq_enable();
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -1596,7 +1587,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self,
{
long cpu = (long)hcpu;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
(void)rcutorture_booster_init(cpu);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9a535a86e732..c7f1bc4f817c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,6 +102,8 @@ struct rcu_state sname##_state = { \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
+ .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
+ .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
}
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -370,6 +372,21 @@ void rcu_all_qs(void)
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
+ if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) {
+ /*
+ * Yes, we just checked a per-CPU variable with preemption
+ * enabled, so we might be migrated to some other CPU at
+ * this point. That is OK because in that case, the
+ * migration will supply the needed quiescent state.
+ * We might end up needlessly disabling preemption and
+ * invoking rcu_sched_qs() on the destination CPU, but
+ * the probability and cost are both quite low, so this
+ * should not be a problem in practice.
+ */
+ preempt_disable();
+ rcu_sched_qs();
+ preempt_enable();
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -385,9 +402,11 @@ module_param(qlowmark, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
+static bool rcu_kick_kthreads;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+module_param(rcu_kick_kthreads, bool, 0644);
/*
* How long the grace period must be before we start recruiting
@@ -460,6 +479,28 @@ unsigned long rcu_batches_completed_bh(void)
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
/*
+ * Return the number of RCU expedited batches completed thus far for
+ * debug & stats. Odd numbers mean that a batch is in progress, even
+ * numbers mean idle. The value returned will thus be roughly double
+ * the cumulative batches since boot.
+ */
+unsigned long rcu_exp_batches_completed(void)
+{
+ return rcu_state_p->expedited_sequence;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
+
+/*
+ * Return the number of RCU-sched expedited batches completed thus far
+ * for debug & stats. Similar to rcu_exp_batches_completed().
+ */
+unsigned long rcu_exp_batches_completed_sched(void)
+{
+ return rcu_sched_state.expedited_sequence;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
+
+/*
* Force a quiescent state.
*/
void rcu_force_quiescent_state(void)
@@ -637,7 +678,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
idle_task(smp_processor_id());
trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
- ftrace_dump(DUMP_ORIG);
+ rcu_ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -799,7 +840,7 @@ static void rcu_eqs_exit_common(long long oldval, int user)
trace_rcu_dyntick(TPS("Error on exit: not idle task"),
oldval, rdtp->dynticks_nesting);
- ftrace_dump(DUMP_ORIG);
+ rcu_ftrace_dump(DUMP_ORIG);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
@@ -1224,8 +1265,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
- if (rsp->gp_kthread)
+ if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
+ wake_up_process(rsp->gp_kthread);
+ }
}
}
@@ -1249,6 +1292,25 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
}
}
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
+{
+ unsigned long j;
+
+ if (!rcu_kick_kthreads)
+ return;
+ j = READ_ONCE(rsp->jiffies_kick_kthreads);
+ if (time_after(jiffies, j) && rsp->gp_kthread) {
+ WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
+ rcu_ftrace_dump(DUMP_ALL);
+ wake_up_process(rsp->gp_kthread);
+ WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ);
+ }
+}
+
static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
int cpu;
@@ -1260,6 +1322,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads(rsp);
+ if (rcu_cpu_stall_suppress)
+ return;
+
/* Only let one CPU complain about others per time interval. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1333,6 +1400,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
+ /* Kick and suppress, if so configured. */
+ rcu_stall_kick_kthreads(rsp);
+ if (rcu_cpu_stall_suppress)
+ return;
+
/*
* OK, time to rat on ourselves...
* See Documentation/RCU/stallwarn.txt for info on how to debug
@@ -1377,8 +1449,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
unsigned long js;
struct rcu_node *rnp;
- if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
+ if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+ !rcu_gp_in_progress(rsp))
return;
+ rcu_stall_kick_kthreads(rsp);
j = jiffies;
/*
@@ -2117,8 +2191,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
ret = 0;
for (;;) {
- if (!ret)
+ if (!ret) {
rsp->jiffies_force_qs = jiffies + j;
+ WRITE_ONCE(rsp->jiffies_kick_kthreads,
+ jiffies + 3 * j);
+ }
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
@@ -2144,6 +2221,15 @@ static int __noreturn rcu_gp_kthread(void *arg)
TPS("fqsend"));
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
+ ret = 0; /* Force full wait till next FQS. */
+ j = jiffies_till_next_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_next_fqs = HZ;
+ } else if (j < 1) {
+ j = 1;
+ jiffies_till_next_fqs = 1;
+ }
} else {
/* Deal with stray signal. */
cond_resched_rcu_qs();
@@ -2152,14 +2238,12 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqswaitsig"));
- }
- j = jiffies_till_next_fqs;
- if (j > HZ) {
- j = HZ;
- jiffies_till_next_fqs = HZ;
- } else if (j < 1) {
- j = 1;
- jiffies_till_next_fqs = 1;
+ ret = 1; /* Keep old FQS timing. */
+ j = jiffies;
+ if (time_after(jiffies, rsp->jiffies_force_qs))
+ j = 1;
+ else
+ j = rsp->jiffies_force_qs - j;
}
}
@@ -3376,8 +3460,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
}
static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
{
+ unsigned long s;
+
smp_mb(); /* Caller's modifications seen first by other CPUs. */
- return rcu_seq_snap(&rsp->expedited_sequence);
+ s = rcu_seq_snap(&rsp->expedited_sequence);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
+ return s;
}
static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
{
@@ -3469,7 +3557,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
* for the current expedited grace period. Works only for preemptible
* RCU -- other RCU implementation use other means.
*
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
*/
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
@@ -3485,8 +3573,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
- * Caller must hold the root rcu_node's exp_funnel_mutex and the
- * specified rcu_node structure's ->lock.
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
*/
static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake, unsigned long flags)
@@ -3523,7 +3611,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
* Report expedited quiescent state for specified node. This is a
* lock-acquisition wrapper function for __rcu_report_exp_rnp().
*
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
*/
static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
struct rcu_node *rnp, bool wake)
@@ -3536,8 +3624,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure. Caller must hold the root
- * rcu_node's exp_funnel_mutex.
+ * specified leaf rcu_node structure. Caller must hold the rcu_state's
+ * exp_mutex.
*/
static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
unsigned long mask, bool wake)
@@ -3555,7 +3643,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
/*
* Report expedited quiescent state for specified rcu_data (CPU).
- * Caller must hold the root rcu_node's exp_funnel_mutex.
*/
static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
bool wake)
@@ -3564,15 +3651,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
}
/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
- struct rcu_data *rdp,
- atomic_long_t *stat, unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+ unsigned long s)
{
if (rcu_exp_gp_seq_done(rsp, s)) {
- if (rnp)
- mutex_unlock(&rnp->exp_funnel_mutex);
- else if (rdp)
- mutex_unlock(&rdp->exp_funnel_mutex);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
/* Ensure test happens before caller kfree(). */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(stat);
@@ -3582,59 +3665,65 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
}
/*
- * Funnel-lock acquisition for expedited grace periods. Returns a
- * pointer to the root rcu_node structure, or NULL if some other
- * task did the expedited grace period for us.
+ * Funnel-lock acquisition for expedited grace periods. Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held. Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
*/
-static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
{
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
- struct rcu_node *rnp0;
- struct rcu_node *rnp1 = NULL;
+ struct rcu_node *rnp = rdp->mynode;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+ /* Low-contention fastpath. */
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+ (rnp == rnp_root ||
+ ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+ !mutex_is_locked(&rsp->exp_mutex) &&
+ mutex_trylock(&rsp->exp_mutex))
+ goto fastpath;
/*
- * First try directly acquiring the root lock in order to reduce
- * latency in the common case where expedited grace periods are
- * rare. We check mutex_is_locked() to avoid pathological levels of
- * memory contention on ->exp_funnel_mutex in the heavy-load case.
+ * Each pass through the following loop works its way up
+ * the rcu_node tree, returning if others have done the work or
+ * otherwise falls through to acquire rsp->exp_mutex. The mapping
+ * from CPU to rcu_node structure can be inexact, as it is just
+ * promoting locality and is not strictly needed for correctness.
*/
- rnp0 = rcu_get_root(rsp);
- if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
- if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
- if (sync_exp_work_done(rsp, rnp0, NULL,
- &rdp->expedited_workdone0, s))
- return NULL;
- return rnp0;
+ for (; rnp != NULL; rnp = rnp->parent) {
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+ return true;
+
+ /* Work not done, either wait here or go up. */
+ spin_lock(&rnp->exp_lock);
+ if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+ /* Someone else doing GP, so wait for them. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+ rnp->grplo, rnp->grphi,
+ TPS("wait"));
+ wait_event(rnp->exp_wq[(s >> 1) & 0x3],
+ sync_exp_work_done(rsp,
+ &rdp->exp_workdone2, s));
+ return true;
}
+ rnp->exp_seq_rq = s; /* Followers can wait on us. */
+ spin_unlock(&rnp->exp_lock);
+ trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+ rnp->grphi, TPS("nxtlvl"));
}
-
- /*
- * Each pass through the following loop works its way
- * up the rcu_node tree, returning if others have done the
- * work or otherwise falls through holding the root rnp's
- * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
- * can be inexact, as it is just promoting locality and is not
- * strictly needed for correctness.
- */
- if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
- return NULL;
- mutex_lock(&rdp->exp_funnel_mutex);
- rnp0 = rdp->mynode;
- for (; rnp0 != NULL; rnp0 = rnp0->parent) {
- if (sync_exp_work_done(rsp, rnp1, rdp,
- &rdp->expedited_workdone2, s))
- return NULL;
- mutex_lock(&rnp0->exp_funnel_mutex);
- if (rnp1)
- mutex_unlock(&rnp1->exp_funnel_mutex);
- else
- mutex_unlock(&rdp->exp_funnel_mutex);
- rnp1 = rnp0;
+ mutex_lock(&rsp->exp_mutex);
+fastpath:
+ if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+ mutex_unlock(&rsp->exp_mutex);
+ return true;
}
- if (sync_exp_work_done(rsp, rnp1, rdp,
- &rdp->expedited_workdone3, s))
- return NULL;
- return rnp1;
+ rcu_exp_gp_seq_start(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+ return false;
}
/* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -3649,6 +3738,11 @@ static void sync_sched_exp_handler(void *data)
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
return;
+ if (rcu_is_cpu_rrupt_from_idle()) {
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
+ return;
+ }
__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
resched_cpu(smp_processor_id());
}
@@ -3773,7 +3867,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
rsp->name);
ndetected = 0;
rcu_for_each_leaf_node(rsp, rnp) {
- ndetected = rcu_print_task_exp_stall(rnp);
+ ndetected += rcu_print_task_exp_stall(rnp);
mask = 1;
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
struct rcu_data *rdp;
@@ -3783,7 +3877,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
ndetected++;
rdp = per_cpu_ptr(rsp->rda, cpu);
pr_cont(" %d-%c%c%c", cpu,
- "O."[cpu_online(cpu)],
+ "O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rnp->expmaskinit)],
"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
}
@@ -3792,7 +3886,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
jiffies - jiffies_start, rsp->expedited_sequence,
rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
- if (!ndetected) {
+ if (ndetected) {
pr_err("blocking rcu_node structures:");
rcu_for_each_node_breadth_first(rsp, rnp) {
if (rnp == rnp_root)
@@ -3818,6 +3912,41 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
}
}
+/*
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
+ * grace period. Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_node *rnp;
+
+ synchronize_sched_expedited_wait(rsp);
+ rcu_exp_gp_seq_end(rsp);
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+
+ /*
+ * Switch over to wakeup mode, allowing the next GP, but -only- the
+ * next GP, to proceed.
+ */
+ mutex_lock(&rsp->exp_wake_mutex);
+ mutex_unlock(&rsp->exp_mutex);
+
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+ spin_lock(&rnp->exp_lock);
+ /* Recheck, avoid hang in case someone just arrived. */
+ if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+ rnp->exp_seq_rq = s;
+ spin_unlock(&rnp->exp_lock);
+ }
+ wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
+ }
+ trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+ mutex_unlock(&rsp->exp_wake_mutex);
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3837,7 +3966,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
void synchronize_sched_expedited(void)
{
unsigned long s;
- struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
/* If only one CPU, this is automatically a grace period. */
@@ -3852,17 +3980,14 @@ void synchronize_sched_expedited(void)
/* Take a snapshot of the sequence number. */
s = rcu_exp_gp_seq_snap(rsp);
-
- rnp = exp_funnel_lock(rsp, s);
- if (rnp == NULL)
+ if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- rcu_exp_gp_seq_start(rsp);
+ /* Initialize the rcu_node tree in preparation for the wait. */
sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
- synchronize_sched_expedited_wait(rsp);
- rcu_exp_gp_seq_end(rsp);
- mutex_unlock(&rnp->exp_funnel_mutex);
+ /* Wait and clean up, including waking everyone. */
+ rcu_exp_wait_wake(rsp, s);
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
@@ -4162,7 +4287,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
- mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -4420,10 +4544,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
- static const char * const exp[] = RCU_EXP_NAME_INIT;
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
- static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4482,9 +4604,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
- mutex_init(&rnp->exp_funnel_mutex);
- lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
- &rcu_exp_class[i], exp[i]);
+ init_waitqueue_head(&rnp->exp_wq[0]);
+ init_waitqueue_head(&rnp->exp_wq[1]);
+ init_waitqueue_head(&rnp->exp_wq[2]);
+ init_waitqueue_head(&rnp->exp_wq[3]);
+ spin_lock_init(&rnp->exp_lock);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df668c0f9e64..e3959f5e6ddf 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -70,7 +70,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
@@ -79,7 +78,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
@@ -89,7 +87,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
@@ -100,7 +97,6 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -252,7 +248,9 @@ struct rcu_node {
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
- struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
+ spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
+ unsigned long exp_seq_rq;
+ wait_queue_head_t exp_wq[4];
} ____cacheline_internodealigned_in_smp;
/*
@@ -387,11 +385,9 @@ struct rcu_data {
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
- struct mutex exp_funnel_mutex;
- atomic_long_t expedited_workdone0; /* # done by others #0. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_workdone3; /* # done by others #3. */
+ atomic_long_t exp_workdone1; /* # done by others #1. */
+ atomic_long_t exp_workdone2; /* # done by others #2. */
+ atomic_long_t exp_workdone3; /* # done by others #3. */
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -505,6 +501,8 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex exp_mutex; /* Serialize expedited GP. */
+ struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
@@ -513,6 +511,8 @@ struct rcu_state {
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
+ unsigned long jiffies_kick_kthreads; /* Time at which to kick */
+ /* kthreads, if configured. */
unsigned long n_force_qs; /* Number of calls to */
/* force_quiescent_state(). */
unsigned long n_force_qs_lh; /* ~Number of calls leaving */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index efdf7b61ce12..ff1cd4e1188d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -722,18 +722,22 @@ static void sync_rcu_exp_handler(void *info)
* synchronize_rcu_expedited - Brute-force RCU grace period
*
* Wait for an RCU-preempt grace period, but expedite it. The basic
- * idea is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blkd_tasks lists and wait for this list to drain. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.
- * In fact, if you are using synchronize_rcu_expedited() in a loop,
- * please restructure your code to batch your updates, and then Use a
- * single synchronize_rcu() instead.
+ * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler
+ * checks whether the CPU is in an RCU-preempt critical section, and
+ * if so, it sets a flag that causes the outermost rcu_read_unlock()
+ * to report the quiescent state. On the other hand, if the CPU is
+ * not in an RCU read-side critical section, the IPI handler reports
+ * the quiescent state immediately.
+ *
+ * Although this is a great improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code. In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then use a single synchronize_rcu()
+ * instead.
*/
void synchronize_rcu_expedited(void)
{
- struct rcu_node *rnp;
- struct rcu_node *rnp_unlock;
struct rcu_state *rsp = rcu_state_p;
unsigned long s;
@@ -744,23 +748,14 @@ void synchronize_rcu_expedited(void)
}
s = rcu_exp_gp_seq_snap(rsp);
-
- rnp_unlock = exp_funnel_lock(rsp, s);
- if (rnp_unlock == NULL)
+ if (exp_funnel_lock(rsp, s))
return; /* Someone else did our work for us. */
- rcu_exp_gp_seq_start(rsp);
-
/* Initialize the rcu_node tree in preparation for the wait. */
sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
- /* Wait for snapshotted ->blkd_tasks lists to drain. */
- rnp = rcu_get_root(rsp);
- synchronize_sched_expedited_wait(rsp);
-
- /* Clean up and exit. */
- rcu_exp_gp_seq_end(rsp);
- mutex_unlock(&rnp_unlock->exp_funnel_mutex);
+ /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
+ rcu_exp_wait_wake(rsp, s);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 1088e64f01ad..86782f9a4604 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v)
int cpu;
struct rcu_state *rsp = (struct rcu_state *)m->private;
struct rcu_data *rdp;
- unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+ unsigned long s1 = 0, s2 = 0, s3 = 0;
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- s0 += atomic_long_read(&rdp->expedited_workdone0);
- s1 += atomic_long_read(&rdp->expedited_workdone1);
- s2 += atomic_long_read(&rdp->expedited_workdone2);
- s3 += atomic_long_read(&rdp->expedited_workdone3);
+ s1 += atomic_long_read(&rdp->exp_workdone1);
+ s2 += atomic_long_read(&rdp->exp_workdone2);
+ s3 += atomic_long_read(&rdp->exp_workdone3);
}
- seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
- rsp->expedited_sequence, s0, s1, s2, s3,
+ seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence, s1, s2, s3,
atomic_long_read(&rsp->expedited_normal),
atomic_read(&rsp->expedited_need_qs),
rsp->expedited_sequence / 2);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index ca828b41c938..3ccdc8eebc5a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -67,7 +67,7 @@ static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
#endif /* #ifndef CONFIG_TINY_RCU */
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
* rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
*
@@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void)
return 0;
if (debug_locks)
lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
+ return lockdep_opinion || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
diff --git a/kernel/resource.c b/kernel/resource.c
index 2e78ead30934..9b5f04404152 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
+ unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
+
+ if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+ start = r->start;
+ end = r->end;
+ } else {
+ start = end = 0;
+ }
+
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, (unsigned long long) r->start,
- width, (unsigned long long) r->end,
+ width, start,
+ width, end,
r->name ? r->name : "<BAD>");
return 0;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 302d6ebd64f7..5e59b832ae2b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
@@ -20,3 +24,4 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index fedb967a9841..e85a725e5c34 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu)
return clock;
}
+EXPORT_SYMBOL_GPL(sched_clock_cpu);
void sched_clock_tick(void)
{
@@ -363,39 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
-/*
- * As outlined at the top, provides a fast, high resolution, nanosecond
- * time source that is monotonic per cpu argument and has bounded drift
- * between cpus.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !! #
- * ####################################################################
- */
-u64 cpu_clock(int cpu)
-{
- if (!sched_clock_stable())
- return sched_clock_cpu(cpu);
-
- return sched_clock();
-}
-
-/*
- * Similar to cpu_clock() for the current cpu. Time will only be observed
- * to be monotonic if care is taken to only compare timestampt taken on the
- * same CPU.
- *
- * See cpu_clock().
- */
-u64 local_clock(void)
-{
- if (!sched_clock_stable())
- return sched_clock_cpu(raw_smp_processor_id());
-
- return sched_clock();
-}
-
#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
void sched_clock_init(void)
@@ -410,22 +378,8 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
-
-u64 cpu_clock(int cpu)
-{
- return sched_clock();
-}
-
-u64 local_clock(void)
-{
- return sched_clock();
-}
-
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-EXPORT_SYMBOL_GPL(cpu_clock);
-EXPORT_SYMBOL_GPL(local_clock);
-
/*
* Running clock - returns the time that has elapsed while a guest has been
* running.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ea8f49ae0062..404c0784b1fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/frame.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -169,6 +170,71 @@ static struct rq *this_rq_lock(void)
return rq;
}
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ for (;;) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+ return rq;
+ }
+ raw_spin_unlock(&rq->lock);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ for (;;) {
+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ /*
+ * move_queued_task() task_rq_lock()
+ *
+ * ACQUIRE (rq->lock)
+ * [S] ->on_rq = MIGRATING [L] rq = task_rq()
+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
+ * [S] ->cpu = new_cpu [L] task_rq()
+ * [L] ->on_rq
+ * RELEASE (rq->lock)
+ *
+ * If we observe the old cpu in task_rq_lock, the acquire of
+ * the old rq->lock will fully serialize against the stores.
+ *
+ * If we observe the new cpu in task_rq_lock, the acquire will
+ * pair with the WMB to ensure we must then also see migrating.
+ */
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+ return rq;
+ }
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
+
+ while (unlikely(task_on_rq_migrating(p)))
+ cpu_relax();
+ }
+}
+
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -248,29 +314,6 @@ void hrtick_start(struct rq *rq, u64 delay)
}
}
-static int
-hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
- switch (action) {
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- hrtick_clear(cpu_rq(cpu));
- return NOTIFY_OK;
- }
-
- return NOTIFY_DONE;
-}
-
-static __init void init_hrtick(void)
-{
- hotcpu_notifier(hotplug_hrtick, 0);
-}
#else
/*
* Called to set the hrtick timer state.
@@ -287,10 +330,6 @@ void hrtick_start(struct rq *rq, u64 delay)
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED);
}
-
-static inline void init_hrtick(void)
-{
-}
#endif /* CONFIG_SMP */
static void init_rq_hrtick(struct rq *rq)
@@ -314,12 +353,26 @@ static inline void hrtick_clear(struct rq *rq)
static inline void init_rq_hrtick(struct rq *rq)
{
}
-
-static inline void init_hrtick(void)
-{
-}
#endif /* CONFIG_SCHED_HRTICK */
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _old, _val = *_ptr; \
+ \
+ for (;;) { \
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
+ if (_old == _val) \
+ break; \
+ _val = _old; \
+ } \
+ _old; \
+})
+
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -381,7 +434,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
* wakeup due to that.
*
* This cmpxchg() implies a full barrier, which pairs with the write
- * barrier implied by the wakeup in wake_up_list().
+ * barrier implied by the wakeup in wake_up_q().
*/
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;
@@ -480,7 +533,10 @@ int get_nohz_timer_target(void)
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
cpu = i;
goto unlock;
}
@@ -577,17 +633,8 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * FIFO realtime policy runs the highest priority task (after DEADLINE).
- * Other runnable tasks are of a lower priority. The scheduler tick
- * isn't needed.
- */
- fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
- if (fifo_nr_running)
- return true;
-
- /*
- * Round-robin realtime tasks time slice with other tasks at the same
- * realtime priority.
+ * If there is more than one RR task, we need the tick to effect the
+ * actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
@@ -596,8 +643,20 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
}
- /* Normal multitasking need periodic preemption checks */
- if (rq->cfs.nr_running > 1)
+ /*
+ * If there are no RR tasks, but there are FIFO tasks, we can skip the
+ * tick: there is no forced preemption between FIFO tasks.
+ */
+ fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+ if (fifo_nr_running)
+ return true;
+
+ /*
+ * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
+ * if there's more than one we need the tick for involuntary
+ * preemption.
+ */
+ if (rq->nr_running > 1)
return false;
return true;
@@ -1063,12 +1122,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
{
- unsigned long flags;
- struct rq *rq;
+ const struct cpumask *cpu_valid_mask = cpu_active_mask;
unsigned int dest_cpu;
+ struct rq_flags rf;
+ struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
+
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * Kernel threads are allowed on online && !active CPUs
+ */
+ cpu_valid_mask = cpu_online_mask;
+ }
/*
* Must re-check here, to close a race against __kthread_bind(),
@@ -1082,22 +1149,32 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
ret = -EINVAL;
goto out;
}
do_set_cpus_allowed(p, new_mask);
+ if (p->flags & PF_KTHREAD) {
+ /*
+ * For kernel threads that do indeed end up on online &&
+ * !active we want to ensure they are strict per-cpu threads.
+ */
+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+ !cpumask_intersects(new_mask, cpu_active_mask) &&
+ p->nr_cpus_allowed != 1);
+ }
+
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
@@ -1106,12 +1183,12 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = move_queued_task(rq, p, dest_cpu);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, rf.cookie);
}
out:
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -1295,8 +1372,8 @@ out:
*/
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
- unsigned long flags;
int running, queued;
+ struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
@@ -1331,14 +1408,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
/*
* If it changed from the expected state, bail out now.
@@ -1412,6 +1489,25 @@ EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ * - cpu_active must be a subset of cpu_online
+ *
+ * - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ * see __set_cpus_allowed_ptr(). At this point the newly online
+ * cpu isn't yet part of the sched domains, and balancing will not
+ * see it.
+ *
+ * - on cpu-down we clear cpu_active() to mask the sched domains and
+ * prevent the load balancer from placing new tasks on the to-be-removed
+ * cpu. Existing tasks will remain running there and will be taken
+ * off.
+ *
+ * This means that fallback selection must not select !active CPUs.
+ * And can assume that any active CPU must be online. Conversely
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -1430,8 +1526,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_online(dest_cpu))
- continue;
if (!cpu_active(dest_cpu))
continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
@@ -1442,8 +1536,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- if (!cpu_online(dest_cpu))
- continue;
if (!cpu_active(dest_cpu))
continue;
goto out;
@@ -1493,8 +1585,10 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
- if (p->nr_cpus_allowed > 1)
+ if (tsk_nr_cpus_allowed(p) > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ else
+ cpu = cpumask_any(tsk_cpus_allowed(p));
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1582,8 +1676,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
/*
* Mark the task runnable and perform wakeup-preemption.
*/
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct pin_cookie cookie)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -1595,9 +1689,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
* Our task @p is fully woken up and running; so its safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
p->sched_class->task_woken(rq, p);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
}
if (rq->idle_stamp) {
@@ -1615,17 +1709,23 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
}
static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct pin_cookie cookie)
{
+ int en_flags = ENQUEUE_WAKEUP;
+
lockdep_assert_held(&rq->lock);
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
+
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
#endif
- ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
- ttwu_do_wakeup(rq, p, wake_flags);
+ ttwu_activate(rq, p, en_flags);
+ ttwu_do_wakeup(rq, p, wake_flags, cookie);
}
/*
@@ -1636,17 +1736,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
*/
static int ttwu_remote(struct task_struct *p, int wake_flags)
{
+ struct rq_flags rf;
struct rq *rq;
int ret = 0;
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags);
+ ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
ret = 1;
}
- __task_rq_unlock(rq);
+ __task_rq_unlock(rq, &rf);
return ret;
}
@@ -1656,6 +1757,7 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct pin_cookie cookie;
struct task_struct *p;
unsigned long flags;
@@ -1663,15 +1765,19 @@ void sched_ttwu_pending(void)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
- lockdep_pin_lock(&rq->lock);
+ cookie = lockdep_pin_lock(&rq->lock);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
llist = llist_next(llist);
- ttwu_do_activate(rq, p, 0);
+ /*
+ * See ttwu_queue(); we only call ttwu_queue_remote() when
+ * it's a cross-cpu wakeup.
+ */
+ ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
}
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1755,9 +1861,10 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
}
#endif /* CONFIG_SMP */
-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
+ struct pin_cookie cookie;
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
@@ -1768,9 +1875,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
#endif
raw_spin_lock(&rq->lock);
- lockdep_pin_lock(&rq->lock);
- ttwu_do_activate(rq, p, 0);
- lockdep_unpin_lock(&rq->lock);
+ cookie = lockdep_pin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, wake_flags, cookie);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
}
@@ -1939,9 +2046,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
- if (p->sched_class->task_waking)
- p->sched_class->task_waking(p);
-
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
@@ -1949,7 +2053,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
}
#endif /* CONFIG_SMP */
- ttwu_queue(p, cpu);
+ ttwu_queue(p, cpu, wake_flags);
stat:
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
@@ -1967,7 +2071,7 @@ out:
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task.
*/
-static void try_to_wake_up_local(struct task_struct *p)
+static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
{
struct rq *rq = task_rq(p);
@@ -1984,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p)
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
}
if (!(p->state & TASK_NORMAL))
@@ -1999,7 +2103,7 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- ttwu_do_wakeup(rq, p, 0);
+ ttwu_do_wakeup(rq, p, 0, cookie);
if (schedstat_enabled())
ttwu_stat(p, smp_processor_id(), 0);
out:
@@ -2359,7 +2463,8 @@ static int dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
- if (new_bw == p->dl.dl_bw)
+ /* !deadline task may carry old deadline bandwidth */
+ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0;
/*
@@ -2398,12 +2503,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);
*/
void wake_up_new_task(struct task_struct *p)
{
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
- raw_spin_lock_irqsave(&p->pi_lock, flags);
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
+ raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2412,8 +2517,10 @@ void wake_up_new_task(struct task_struct *p)
*/
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
+ /* Post initialize new task's util average when its cfs_rq is set */
+ post_init_entity_util_avg(&p->se);
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2424,12 +2531,12 @@ void wake_up_new_task(struct task_struct *p)
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
p->sched_class->task_woken(rq, p);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, rf.cookie);
}
#endif
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2689,9 +2796,9 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
-static inline struct rq *
+static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next)
+ struct task_struct *next, struct pin_cookie cookie)
{
struct mm_struct *mm, *oldmm;
@@ -2711,7 +2818,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm(oldmm, mm, next);
+ switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
@@ -2723,7 +2830,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
@@ -2845,7 +2952,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
*/
unsigned long long task_sched_runtime(struct task_struct *p)
{
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
u64 ns;
@@ -2865,7 +2972,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return p->se.sum_exec_runtime;
#endif
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* Must be ->curr _and_ ->on_rq. If dequeued, we would
* project cycles that may never be accounted to this
@@ -2876,7 +2983,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
p->sched_class->update_curr(rq);
}
ns = p->se.sum_exec_runtime;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return ns;
}
@@ -2896,7 +3003,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
- update_cpu_load_active(rq);
+ cpu_load_update_active(rq);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
@@ -2939,6 +3046,20 @@ u64 scheduler_tick_max_deferment(void)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+ if (preempt_count() == val) {
+ unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
+}
void preempt_count_add(int val)
{
@@ -2957,17 +3078,21 @@ void preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val) {
- unsigned long ip = get_lock_parent_ip();
-#ifdef CONFIG_DEBUG_PREEMPT
- current->preempt_disable_ip = ip;
-#endif
- trace_preempt_off(CALLER_ADDR0, ip);
- }
+ preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+ if (preempt_count() == val)
+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
@@ -2984,13 +3109,15 @@ void preempt_count_sub(int val)
return;
#endif
- if (preempt_count() == val)
- trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+ preempt_latency_stop(val);
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
#endif
/*
@@ -3043,7 +3170,7 @@ static inline void schedule_debug(struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
@@ -3054,20 +3181,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
*/
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev);
+ p = fair_sched_class.pick_next_task(rq, prev, cookie);
if (unlikely(p == RETRY_TASK))
goto again;
/* assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev);
+ p = idle_sched_class.pick_next_task(rq, prev, cookie);
return p;
}
again:
for_each_class(class) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, cookie);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
@@ -3121,6 +3248,7 @@ static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
+ struct pin_cookie cookie;
struct rq *rq;
int cpu;
@@ -3154,7 +3282,7 @@ static void __sched notrace __schedule(bool preempt)
*/
smp_mb__before_spinlock();
raw_spin_lock(&rq->lock);
- lockdep_pin_lock(&rq->lock);
+ cookie = lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3174,9 +3302,9 @@ static void __sched notrace __schedule(bool preempt)
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
- to_wakeup = wq_worker_sleeping(prev, cpu);
+ to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
- try_to_wake_up_local(to_wakeup);
+ try_to_wake_up_local(to_wakeup, cookie);
}
}
switch_count = &prev->nvcsw;
@@ -3185,7 +3313,7 @@ static void __sched notrace __schedule(bool preempt)
if (task_on_rq_queued(prev))
update_rq_clock(rq);
- next = pick_next_task(rq, prev);
+ next = pick_next_task(rq, prev, cookie);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
@@ -3196,14 +3324,15 @@ static void __sched notrace __schedule(bool preempt)
++*switch_count;
trace_sched_switch(preempt, prev, next);
- rq = context_switch(rq, prev, next); /* unlocks the rq */
+ rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
} else {
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock_irq(&rq->lock);
}
balance_callback(rq);
}
+STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
static inline void sched_submit_work(struct task_struct *tsk)
{
@@ -3264,8 +3393,23 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
preempt_disable_notrace();
+ preempt_latency_start(1);
__schedule(true);
+ preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
/*
@@ -3317,7 +3461,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
return;
do {
+ /*
+ * Because the function tracer can trace preempt_count_sub()
+ * and it also uses preempt_enable/disable_notrace(), if
+ * NEED_RESCHED is set, the preempt_enable_notrace() called
+ * by the function tracer will call this function again and
+ * cause infinite recursion.
+ *
+ * Preemption must be disabled here before the function
+ * tracer can trace. Break up preempt_disable() into two
+ * calls. One to disable preemption without fear of being
+ * traced. The other to still record the preemption latency,
+ * which can also be traced by the function tracer.
+ */
preempt_disable_notrace();
+ preempt_latency_start(1);
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -3327,6 +3485,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
__schedule(true);
exception_exit(prev_ctx);
+ preempt_latency_stop(1);
preempt_enable_no_resched_notrace();
} while (need_resched());
}
@@ -3383,12 +3542,13 @@ EXPORT_SYMBOL(default_wake_function);
void rt_mutex_setprio(struct task_struct *p, int prio)
{
int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
- struct rq *rq;
const struct sched_class *prev_class;
+ struct rq_flags rf;
+ struct rq *rq;
BUG_ON(prio > MAX_PRIO);
- rq = __task_rq_lock(p);
+ rq = __task_rq_lock(p, &rf);
/*
* Idle task boosting is a nono in general. There is one
@@ -3464,7 +3624,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
preempt_disable(); /* avoid rq from going away on us */
- __task_rq_unlock(rq);
+ __task_rq_unlock(rq, &rf);
balance_callback(rq);
preempt_enable();
@@ -3474,7 +3634,7 @@ out_unlock:
void set_user_nice(struct task_struct *p, long nice)
{
int old_prio, delta, queued;
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -3483,7 +3643,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3514,7 +3674,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
out_unlock:
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
}
EXPORT_SYMBOL(set_user_nice);
@@ -3811,11 +3971,11 @@ static int __sched_setscheduler(struct task_struct *p,
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
- unsigned long flags;
const struct sched_class *prev_class;
- struct rq *rq;
+ struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ struct rq *rq;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -3910,13 +4070,13 @@ recheck:
* To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EINVAL;
}
@@ -3933,7 +4093,7 @@ recheck:
goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return 0;
}
change:
@@ -3947,7 +4107,7 @@ change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EPERM;
}
#endif
@@ -3962,7 +4122,7 @@ change:
*/
if (!cpumask_subset(span, &p->cpus_allowed) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EPERM;
}
}
@@ -3972,7 +4132,7 @@ change:
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
goto recheck;
}
@@ -3982,7 +4142,7 @@ change:
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
@@ -4027,7 +4187,7 @@ change:
check_class_changed(rq, p, prev_class, oldprio);
preempt_disable(); /* avoid rq from going away on us */
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
if (pi)
rt_mutex_adjust_pi(p);
@@ -4880,10 +5040,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
{
struct task_struct *p;
unsigned int time_slice;
- unsigned long flags;
+ struct rq_flags rf;
+ struct timespec t;
struct rq *rq;
int retval;
- struct timespec t;
if (pid < 0)
return -EINVAL;
@@ -4898,11 +5058,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
if (retval)
goto out_unlock;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
time_slice = 0;
if (p->sched_class->get_rr_interval)
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -4978,7 +5138,8 @@ void show_state_filter(unsigned long state_filter)
touch_all_softlockup_watchdogs();
#ifdef CONFIG_SCHED_DEBUG
- sysrq_sched_debug_show();
+ if (!state_filter)
+ sysrq_sched_debug_show();
#endif
rcu_read_unlock();
/*
@@ -5140,6 +5301,8 @@ out:
#ifdef CONFIG_SMP
+static bool sched_smp_initialized __read_mostly;
+
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
int migrate_task_to(struct task_struct *p, int target_cpu)
@@ -5165,11 +5328,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- struct rq *rq;
- unsigned long flags;
bool queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
@@ -5184,7 +5347,7 @@ void sched_setnuma(struct task_struct *p, int nid)
p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE);
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -5200,7 +5363,7 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ switch_mm_irqs_off(mm, &init_mm, current);
finish_arch_post_lock_switch();
}
mmdrop(mm);
@@ -5248,6 +5411,7 @@ static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
+ struct pin_cookie cookie;
int dest_cpu;
/*
@@ -5279,8 +5443,8 @@ static void migrate_tasks(struct rq *dead_rq)
/*
* pick_next_task assumes pinned rq->lock.
*/
- lockdep_pin_lock(&rq->lock);
- next = pick_next_task(rq, &fake_task);
+ cookie = lockdep_pin_lock(&rq->lock);
+ next = pick_next_task(rq, &fake_task, cookie);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -5293,7 +5457,7 @@ static void migrate_tasks(struct rq *dead_rq)
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
@@ -5354,126 +5518,13 @@ static void set_rq_offline(struct rq *rq)
}
}
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static void set_cpu_rq_start_time(unsigned int cpu)
{
- int cpu = (long)hcpu;
- unsigned long flags;
struct rq *rq = cpu_rq(cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- rq->calc_load_update = calc_load_update;
- break;
-
- case CPU_ONLINE:
- /* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
- set_rq_online(rq);
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
-
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DYING:
- sched_ttwu_pending();
- /* Update our root-domain */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->rd) {
- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
- set_rq_offline(rq);
- }
- migrate_tasks(rq);
- BUG_ON(rq->nr_running != 1); /* the migration thread */
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- break;
-
- case CPU_DEAD:
- calc_load_migrate(rq);
- break;
-#endif
- }
-
- update_max_interval();
-
- return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else. This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block migration_notifier = {
- .notifier_call = migration_call,
- .priority = CPU_PRI_MIGRATION,
-};
-
-static void set_cpu_rq_start_time(void)
-{
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
rq->age_stamp = sched_clock_cpu(cpu);
}
-static int sched_cpu_active(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
- set_cpu_rq_start_time();
- return NOTIFY_OK;
-
- case CPU_DOWN_FAILED:
- set_cpu_active(cpu, true);
- return NOTIFY_OK;
-
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- set_cpu_active((long)hcpu, false);
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int __init migration_init(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- /* Initialize migration for the boot CPU */
- err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
- BUG_ON(err == NOTIFY_BAD);
- migration_call(&migration_notifier, CPU_ONLINE, cpu);
- register_cpu_notifier(&migration_notifier);
-
- /* Register cpu active notifiers */
- cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
- cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
- return 0;
-}
-early_initcall(migration_init);
-
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
#ifdef CONFIG_SCHED_DEBUG
@@ -6621,10 +6672,10 @@ static void sched_init_numa(void)
init_numa_topology_type();
}
-static void sched_domains_numa_masks_set(int cpu)
+static void sched_domains_numa_masks_set(unsigned int cpu)
{
- int i, j;
int node = cpu_to_node(cpu);
+ int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
@@ -6634,51 +6685,20 @@ static void sched_domains_numa_masks_set(int cpu)
}
}
-static void sched_domains_numa_masks_clear(int cpu)
+static void sched_domains_numa_masks_clear(unsigned int cpu)
{
int i, j;
+
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++)
cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
}
}
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- sched_domains_numa_masks_set(cpu);
- break;
-
- case CPU_DEAD:
- sched_domains_numa_masks_clear(cpu);
- break;
-
- default:
- return NOTIFY_DONE;
- }
-
- return NOTIFY_OK;
-}
#else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
-{
- return 0;
-}
+static inline void sched_init_numa(void) { }
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -7068,13 +7088,9 @@ static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
* If we come here as part of a suspend/resume, don't touch cpusets because we
* want to restore it back to its original state upon resume anyway.
*/
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static void cpuset_cpu_active(void)
{
- switch (action) {
- case CPU_ONLINE_FROZEN:
- case CPU_DOWN_FAILED_FROZEN:
-
+ if (cpuhp_tasks_frozen) {
/*
* num_cpus_frozen tracks how many CPUs are involved in suspend
* resume sequence. As long as this is not the last online
@@ -7084,35 +7100,25 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
num_cpus_frozen--;
if (likely(num_cpus_frozen)) {
partition_sched_domains(1, NULL, NULL);
- break;
+ return;
}
-
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
-
- case CPU_ONLINE:
- cpuset_update_active_cpus(true);
- break;
- default:
- return NOTIFY_DONE;
}
- return NOTIFY_OK;
+ cpuset_update_active_cpus(true);
}
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
+static int cpuset_cpu_inactive(unsigned int cpu)
{
unsigned long flags;
- long cpu = (long)hcpu;
struct dl_bw *dl_b;
bool overflow;
int cpus;
- switch (action) {
- case CPU_DOWN_PREPARE:
+ if (!cpuhp_tasks_frozen) {
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -7124,19 +7130,120 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
rcu_read_unlock_sched();
if (overflow)
- return notifier_from_errno(-EBUSY);
+ return -EBUSY;
cpuset_update_active_cpus(false);
- break;
- case CPU_DOWN_PREPARE_FROZEN:
+ } else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
- break;
- default:
- return NOTIFY_DONE;
}
- return NOTIFY_OK;
+ return 0;
}
+int sched_cpu_activate(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ set_cpu_active(cpu, true);
+
+ if (sched_smp_initialized) {
+ sched_domains_numa_masks_set(cpu);
+ cpuset_cpu_active();
+ }
+
+ /*
+ * Put the rq online, if not already. This happens:
+ *
+ * 1) In the early boot process, because we build the real domains
+ * after all cpus have been brought up.
+ *
+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+ * domains.
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_online(rq);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ update_max_interval();
+
+ return 0;
+}
+
+int sched_cpu_deactivate(unsigned int cpu)
+{
+ int ret;
+
+ set_cpu_active(cpu, false);
+ /*
+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+ * users of this state to go away such that all new such users will
+ * observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so wait for both.
+ *
+ * Do the sync before parking the smpboot threads to take care of the rcu boost case.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
+
+ if (!sched_smp_initialized)
+ return 0;
+
+ ret = cpuset_cpu_inactive(cpu);
+ if (ret) {
+ set_cpu_active(cpu, true);
+ return ret;
+ }
+ sched_domains_numa_masks_clear(cpu);
+ return 0;
+}
+
+static void sched_rq_cpu_starting(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
+ update_max_interval();
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+ set_cpu_rq_start_time(cpu);
+ sched_rq_cpu_starting(cpu);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /* Handle pending wakeups and then migrate everything off */
+ sched_ttwu_pending();
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+ set_rq_offline(rq);
+ }
+ migrate_tasks(rq);
+ BUG_ON(rq->nr_running != 1);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ calc_load_migrate(rq);
+ update_max_interval();
+ nohz_balance_exit_idle(cpu);
+ hrtick_clear(rq);
+ return 0;
+}
+#endif
+
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7158,12 +7265,6 @@ void __init sched_init_smp(void)
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
- hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
- hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
- hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
- init_hrtick();
-
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
BUG();
@@ -7172,7 +7273,16 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
+ sched_smp_initialized = true;
+}
+
+static int __init migration_init(void)
+{
+ sched_rq_cpu_starting(smp_processor_id());
+ return 0;
}
+early_initcall(migration_init);
+
#else
void __init sched_init_smp(void)
{
@@ -7307,8 +7417,6 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
- rq->last_load_update_tick = jiffies;
-
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -7327,12 +7435,13 @@ void __init sched_init(void)
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_load_update_tick = jiffies;
rq->nohz_flags = 0;
#endif
#ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
#endif
-#endif
+#endif /* CONFIG_SMP */
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
}
@@ -7370,7 +7479,7 @@ void __init sched_init(void)
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
- set_cpu_rq_start_time();
+ set_cpu_rq_start_time(smp_processor_id());
#endif
init_sched_fair_class();
@@ -7535,7 +7644,7 @@ void set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
@@ -7561,7 +7670,7 @@ struct task_group *sched_create_group(struct task_group *parent)
return tg;
err:
- free_sched_group(tg);
+ sched_free_group(tg);
return ERR_PTR(-ENOMEM);
}
@@ -7581,17 +7690,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
{
/* now it should be safe to free those cfs_rqs */
- free_sched_group(container_of(rhp, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
}
-/* Destroy runqueue etc associated with a task group */
void sched_destroy_group(struct task_group *tg)
{
/* wait for possible concurrent references to cfs_rqs complete */
- call_rcu(&tg->rcu, free_sched_group_rcu);
+ call_rcu(&tg->rcu, sched_free_group_rcu);
}
void sched_offline_group(struct task_group *tg)
@@ -7616,10 +7724,10 @@ void sched_move_task(struct task_struct *tsk)
{
struct task_group *tg;
int queued, running;
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(tsk, &flags);
+ rq = task_rq_lock(tsk, &rf);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
@@ -7651,7 +7759,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
- task_rq_unlock(rq, tsk, &flags);
+ task_rq_unlock(rq, tsk, &rf);
}
#endif /* CONFIG_CGROUP_SCHED */
@@ -7871,7 +7979,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
static int sched_rt_global_constraints(void)
{
unsigned long flags;
- int i, ret = 0;
+ int i;
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
for_each_possible_cpu(i) {
@@ -7883,7 +7991,7 @@ static int sched_rt_global_constraints(void)
}
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- return ret;
+ return 0;
}
#endif /* CONFIG_RT_GROUP_SCHED */
@@ -8050,31 +8158,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
+ sched_online_group(tg, parent);
+
return &tg->css;
}
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);
- if (parent)
- sched_online_group(tg, parent);
- return 0;
+ sched_offline_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
-
- sched_offline_group(tg);
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
}
static void cpu_cgroup_fork(struct task_struct *task)
@@ -8434,14 +8537,13 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
- .css_online = cpu_cgroup_css_online,
- .css_offline = cpu_cgroup_css_offline,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.legacy_cftypes = cpu_files,
- .early_init = 1,
+ .early_init = true,
};
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb55bbf2..41f85c4d0938 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,11 +25,22 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS,
};
+/*
+ * Index into the per-cpu usages[] array of struct cpuacct_usage.
+ * CPUACCT_USAGE_NRUSAGE is the number of real entries; the read helpers
+ * also accept it as a request for the sum of all entries.
+ */
+enum cpuacct_usage_index {
+ CPUACCT_USAGE_USER, /* ... user mode */
+ CPUACCT_USAGE_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_USAGE_NRUSAGE,
+};
+
+/* Per-cpu usage counters of a group, one slot per enum cpuacct_usage_index. */
+struct cpuacct_usage {
+ u64 usages[CPUACCT_USAGE_NRUSAGE];
+};
+
/* track cpu usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
+ struct cpuacct_usage __percpu *cpuusage;
struct kernel_cpustat __percpu *cpustat;
};
@@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
return css_ca(ca->css.parent);
}
-static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
@@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
if (!ca)
goto out;
- ca->cpuusage = alloc_percpu(u64);
+ ca->cpuusage = alloc_percpu(struct cpuacct_usage);
if (!ca->cpuusage)
goto out_free_ca;
@@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
kfree(ca);
}
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
+ enum cpuacct_usage_index index)
{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
+ /*
+ * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+ * the sum of usages.
+ */
+ BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit read safe on 32-bit platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
+#endif
+
+ if (index == CPUACCT_USAGE_NRUSAGE) {
+ int i = 0;
+
+ data = 0;
+ for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ data += cpuusage->usages[i];
+ } else {
+ data = cpuusage->usages[index];
+ }
+
+#ifndef CONFIG_64BIT
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
#endif
return data;
@@ -117,66 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ int i;
#ifndef CONFIG_64BIT
/*
* Take rq->lock to make 64-bit write safe on 32-bit platforms.
*/
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
+#endif
+
+ for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ cpuusage->usages[i] = val;
+
+#ifndef CONFIG_64BIT
raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
#endif
}
/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+static u64 __cpuusage_read(struct cgroup_subsys_state *css,
+ enum cpuacct_usage_index index)
{
struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
int i;
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
+ for_each_possible_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i, index);
return totalcpuusage;
}
+/* read_u64 handler for "usage_user": user-mode usage summed over all possible CPUs. */
+static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_USAGE_USER);
+}
+
+/* read_u64 handler for "usage_sys": kernel-mode usage summed over all possible CPUs. */
+static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+}
+
+/*
+ * Total usage of the group: passing CPUACCT_USAGE_NRUSAGE makes the per-cpu
+ * read helper return the sum of all usage indexes.
+ */
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+}
+
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 reset)
+ u64 val)
{
struct cpuacct *ca = css_ca(css);
- int err = 0;
- int i;
+ int cpu;
- if (reset) {
- err = -EINVAL;
- goto out;
- }
+ /*
+ * Only allow '0' here to do a reset.
+ */
+ if (val)
+ return -EINVAL;
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
+ for_each_possible_cpu(cpu)
+ cpuacct_cpuusage_write(ca, cpu, 0);
-out:
- return err;
+ return 0;
}
-static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+static int __cpuacct_percpu_seq_show(struct seq_file *m,
+ enum cpuacct_usage_index index)
{
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
int i;
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
+ for_each_possible_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i, index);
seq_printf(m, "%llu ", (unsigned long long) percpu);
}
seq_printf(m, "\n");
return 0;
}
+/* seq_show handler for "usage_percpu_user": per-cpu user-mode usage. */
+static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+}
+
+/* seq_show handler for "usage_percpu_sys": per-cpu kernel-mode usage. */
+static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+}
+
+/*
+ * seq_show handler for "usage_percpu": per-cpu total usage, i.e. the sum of
+ * all usage indexes for each CPU.
+ */
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+ return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+}
+
static const char * const cpuacct_stat_desc[] = {
[CPUACCT_STAT_USER] = "user",
[CPUACCT_STAT_SYSTEM] = "system",
@@ -188,7 +253,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
int cpu;
s64 val = 0;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
val += kcpustat->cpustat[CPUTIME_USER];
val += kcpustat->cpustat[CPUTIME_NICE];
@@ -197,7 +262,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
val = 0;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
val += kcpustat->cpustat[CPUTIME_SYSTEM];
val += kcpustat->cpustat[CPUTIME_IRQ];
@@ -217,10 +282,26 @@ static struct cftype files[] = {
.write_u64 = cpuusage_write,
},
{
+ .name = "usage_user",
+ .read_u64 = cpuusage_user_read,
+ },
+ {
+ .name = "usage_sys",
+ .read_u64 = cpuusage_sys_read,
+ },
+ {
.name = "usage_percpu",
.seq_show = cpuacct_percpu_seq_show,
},
{
+ .name = "usage_percpu_user",
+ .seq_show = cpuacct_percpu_user_seq_show,
+ },
+ {
+ .name = "usage_percpu_sys",
+ .seq_show = cpuacct_percpu_sys_seq_show,
+ },
+ {
.name = "stat",
.seq_show = cpuacct_stats_show,
},
@@ -235,22 +316,16 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int cpu;
+ int index = CPUACCT_USAGE_SYSTEM;
+ struct pt_regs *regs = task_pt_regs(tsk);
- cpu = task_cpu(tsk);
+ if (regs && user_mode(regs))
+ index = CPUACCT_USAGE_USER;
rcu_read_lock();
- ca = task_ca(tsk);
-
- while (true) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
-
- ca = parent_ca(ca);
- if (!ca)
- break;
- }
+ for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+ this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
rcu_read_unlock();
}
@@ -260,18 +335,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
*
* Note: it's the caller that updates the account of the root cgroup.
*/
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
- struct kernel_cpustat *kcpustat;
struct cpuacct *ca;
rcu_read_lock();
- ca = task_ca(p);
- while (ca != &root_cpuacct) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += val;
- ca = parent_ca(ca);
- }
+ for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+ this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
rcu_read_unlock();
}
@@ -279,5 +349,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
.legacy_cftypes = files,
- .early_init = 1,
+ .early_init = true,
};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
#ifdef CONFIG_CGROUP_CPUACCT
extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
}
static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5a75b08cfd85..5be58820465c 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
best_cpu = cpumask_any(later_mask);
goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+ } else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
best_cpu = cpudl_maximum(cp);
if (later_mask)
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 928c4ba32f68..1141954e73b4 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -14,24 +14,50 @@
DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
/**
- * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
* @cpu: The CPU to set the pointer for.
* @data: New pointer value.
+ * @func: Callback function to set for the CPU.
*
- * Set and publish the update_util_data pointer for the given CPU. That pointer
- * points to a struct update_util_data object containing a callback function
- * to call from cpufreq_update_util(). That function will be called from an RCU
- * read-side critical section, so it must not sleep.
+ * Set and publish the update_util_data pointer for the given CPU.
*
- * Callers must use RCU-sched callbacks to free any memory that might be
- * accessed via the old update_util_data pointer or invoke synchronize_sched()
- * right after this function to avoid use-after-free.
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
*/
-void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned long util, unsigned long max))
{
- if (WARN_ON(data && !data->func))
+ if (WARN_ON(!data || !func))
return;
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
}
-EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ /* Only publishes NULL; waiting for the grace period is left to the caller. */
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..154ae3a51e86
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,530 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/power.h>
+
+#include "sched.h"
+
+/* Governor tunables exposed through sysfs; may be shared by several policies. */
+struct sugov_tunables {
+ struct gov_attr_set attr_set; /* kobject and list of attached policies */
+ unsigned int rate_limit_us; /* minimum time between frequency updates, in us */
+};
+
+/* Per-policy governor state, stored in policy->governor_data. */
+struct sugov_policy {
+ struct cpufreq_policy *policy; /* the cpufreq policy being governed */
+
+ struct sugov_tunables *tunables; /* tunables in effect for this policy */
+ struct list_head tunables_hook; /* link in the tunables' policy list */
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time; /* time of the last committed update */
+ s64 freq_update_delay_ns; /* rate_limit_us converted to ns */
+ unsigned int next_freq; /* last requested frequency; UINT_MAX forces an update */
+
+ /* The next fields are only needed if fast switch cannot be used. */
+ struct irq_work irq_work; /* queued from the update hook */
+ struct work_struct work; /* performs the actual frequency change */
+ struct mutex work_lock; /* serializes __cpufreq_driver_target() calls */
+ bool work_in_progress; /* set when irq_work is queued, cleared by sugov_work() */
+
+ bool need_freq_update; /* set by sugov_limits() to force the next update */
+};
+
+/* Per-cpu governor state; embeds the hook registered with the scheduler. */
+struct sugov_cpu {
+ struct update_util_data update_util; /* hook passed back to the update callbacks */
+ struct sugov_policy *sg_policy; /* policy state this CPU belongs to */
+
+ /* The fields below are only needed when sharing a policy. */
+ unsigned long util; /* last utilization reported for this CPU */
+ unsigned long max; /* capacity reported along with util */
+ u64 last_update; /* time of that report */
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+/*
+ * Decide whether a new frequency should be evaluated at @time.
+ *
+ * Nothing is done while a slow-path frequency change is still in flight.
+ * A pending limits change forces an update right away; otherwise updates
+ * are rate-limited by freq_update_delay_ns.
+ */
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+	s64 elapsed_ns;
+
+	if (sg_policy->work_in_progress)
+		return false;
+
+	if (likely(!sg_policy->need_freq_update)) {
+		elapsed_ns = time - sg_policy->last_freq_update_time;
+		return elapsed_ns >= sg_policy->freq_update_delay_ns;
+	}
+
+	/*
+	 * The limits have changed: invalidate the cached next_freq so the
+	 * coming commit is not filtered out as a duplicate request.
+	 */
+	sg_policy->need_freq_update = false;
+	sg_policy->next_freq = UINT_MAX;
+	return true;
+}
+
+/*
+ * Apply @next_freq to the policy and record @time as the last update time.
+ *
+ * With fast switching the driver is called directly; otherwise the change
+ * is handed to process context through irq_work, and only when the request
+ * differs from the previous one.
+ */
+static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+				unsigned int next_freq)
+{
+	struct cpufreq_policy *policy = sg_policy->policy;
+	unsigned int resolved;
+
+	sg_policy->last_freq_update_time = time;
+
+	if (!policy->fast_switch_enabled) {
+		/* Slow path: defer to sugov_work() unless nothing changed. */
+		if (sg_policy->next_freq == next_freq)
+			return;
+
+		sg_policy->next_freq = next_freq;
+		sg_policy->work_in_progress = true;
+		irq_work_queue(&sg_policy->irq_work);
+		return;
+	}
+
+	if (sg_policy->next_freq == next_freq) {
+		/* Same request as before: only emit the trace event. */
+		trace_cpu_frequency(policy->cur, smp_processor_id());
+		return;
+	}
+
+	sg_policy->next_freq = next_freq;
+	resolved = cpufreq_driver_fast_switch(policy, next_freq);
+	if (resolved == CPUFREQ_ENTRY_INVALID)
+		return;
+
+	policy->cur = resolved;
+	trace_cpu_frequency(resolved, smp_processor_id());
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @policy: cpufreq policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * With frequency-invariant utilization the result is proportional to the
+ * maximum frequency:
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Without it, util_raw * (curr_freq / max_freq) stands in for the
+ * frequency-invariant utilization, giving:
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * C is 1.25, which puts the frequency tipping point at (util / max) = 0.8.
+ */
+static unsigned int get_next_freq(struct cpufreq_policy *policy,
+				  unsigned long util, unsigned long max)
+{
+	unsigned int base;
+
+	if (arch_scale_freq_invariant())
+		base = policy->cpuinfo.max_freq;
+	else
+		base = policy->cur;
+
+	/* base + base/4 is the C = 1.25 factor from the formula above. */
+	return (base + (base >> 2)) * util / max;
+}
+
+/*
+ * Utilization update hook for policies that are not shared between CPUs.
+ * A @util of ULONG_MAX requests the maximum frequency directly.
+ */
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+				unsigned long util, unsigned long max)
+{
+	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	struct cpufreq_policy *policy = sg_policy->policy;
+	unsigned int target;
+
+	if (!sugov_should_update_freq(sg_policy, time))
+		return;
+
+	if (util == ULONG_MAX)
+		target = policy->cpuinfo.max_freq;
+	else
+		target = get_next_freq(policy, util, max);
+
+	sugov_update_commit(sg_policy, time, target);
+}
+
+/*
+ * Pick the frequency for a policy shared by several CPUs.
+ *
+ * @util and @max describe the CPU running the update. The other CPUs of
+ * the policy contribute their last recorded values, and the largest
+ * util/max ratio wins. Any CPU asking for ULONG_MAX utilization selects
+ * the maximum frequency.
+ */
+static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+					   unsigned long util, unsigned long max)
+{
+	struct cpufreq_policy *policy = sg_policy->policy;
+	u64 last_freq_update_time = sg_policy->last_freq_update_time;
+	unsigned int max_f = policy->cpuinfo.max_freq;
+	unsigned int cpu;
+
+	if (util == ULONG_MAX)
+		return max_f;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct sugov_cpu *other;
+		unsigned long other_util, other_max;
+		s64 age_ns;
+
+		/* The caller's own values were passed in as @util/@max. */
+		if (cpu == smp_processor_id())
+			continue;
+
+		other = &per_cpu(sugov_cpu, cpu);
+
+		/*
+		 * Skip a CPU whose utilization was last updated more than
+		 * a tick before the previous frequency update: it is
+		 * probably idle now.
+		 */
+		age_ns = last_freq_update_time - other->last_update;
+		if (age_ns > TICK_NSEC)
+			continue;
+
+		other_util = other->util;
+		if (other_util == ULONG_MAX)
+			return max_f;
+
+		/* Compare other_util/other_max with util/max, without dividing. */
+		other_max = other->max;
+		if (other_util * max > other_max * util) {
+			util = other_util;
+			max = other_max;
+		}
+	}
+
+	return get_next_freq(policy, util, max);
+}
+
+/*
+ * Utilization update hook for policies shared between CPUs.
+ *
+ * Records this CPU's utilization sample under update_lock, then
+ * re-evaluates the policy-wide frequency if an update is due.
+ */
+static void sugov_update_shared(struct update_util_data *hook, u64 time,
+				unsigned long util, unsigned long max)
+{
+	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	unsigned int target;
+
+	raw_spin_lock(&sg_policy->update_lock);
+
+	/* Always store the sample, even when no update follows. */
+	sg_cpu->util = util;
+	sg_cpu->max = max;
+	sg_cpu->last_update = time;
+
+	if (!sugov_should_update_freq(sg_policy, time))
+		goto unlock;
+
+	target = sugov_next_freq_shared(sg_policy, util, max);
+	sugov_update_commit(sg_policy, time, target);
+
+unlock:
+	raw_spin_unlock(&sg_policy->update_lock);
+}
+
+/*
+ * Process-context half of the slow path: ask the driver for the frequency
+ * stored in next_freq, then clear work_in_progress so that
+ * sugov_should_update_freq() accepts new updates again.
+ */
+static void sugov_work(struct work_struct *work)
+{
+ struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+
+ mutex_lock(&sg_policy->work_lock);
+ __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+ CPUFREQ_RELATION_L);
+ mutex_unlock(&sg_policy->work_lock);
+
+ /* Cleared only after the transition request has been issued. */
+ sg_policy->work_in_progress = false;
+}
+
+/*
+ * irq_work callback: hand the pending frequency change over to the
+ * workqueue on the CPU that is running this callback.
+ */
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+	struct sugov_policy *sg_policy =
+		container_of(irq_work, struct sugov_policy, irq_work);
+
+	schedule_work_on(smp_processor_id(), &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+/*
+ * Tunables shared by all policies when have_governor_per_policy() is false.
+ * sugov_init() and sugov_exit() access them under global_tunables_lock.
+ */
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+/* Map an embedded attribute set back to the sugov_tunables containing it. */
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+/* sysfs show handler: print the current rate limit in microseconds. */
+static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	unsigned int limit_us = to_sugov_tunables(attr_set)->rate_limit_us;
+
+	return sprintf(buf, "%u\n", limit_us);
+}
+
+/*
+ * sysfs store handler: parse a decimal rate limit in microseconds, save it
+ * in the tunables and propagate the nanosecond equivalent to every policy
+ * attached to this attribute set.
+ *
+ * Returns @count on success or -EINVAL if @buf is not a valid unsigned int.
+ */
+static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
+				   size_t count)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+	struct sugov_policy *sg_policy;
+	unsigned int rate_limit_us;
+	u64 delay_ns;
+
+	if (kstrtouint(buf, 10, &rate_limit_us))
+		return -EINVAL;
+
+	tunables->rate_limit_us = rate_limit_us;
+
+	/*
+	 * Widen before multiplying: NSEC_PER_USEC is a long, so on 32-bit
+	 * kernels the product would otherwise wrap for limits above ~4.29 s.
+	 */
+	delay_ns = (u64)rate_limit_us * NSEC_PER_USEC;
+
+	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+		sg_policy->freq_update_delay_ns = delay_ns;
+
+	return count;
+}
+
+/* Read/write sysfs attribute backed by rate_limit_us_show()/_store(). */
+static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+
+/* NULL-terminated list of attributes created for the tunables kobject. */
+static struct attribute *sugov_attributes[] = {
+ &rate_limit_us.attr,
+ NULL
+};
+
+/* kobject type used for the tunables in sugov_init(). */
+static struct kobj_type sugov_tunables_ktype = {
+ .default_attrs = sugov_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+/********************** cpufreq governor interface *********************/
+
+static struct cpufreq_governor schedutil_gov;
+
+/*
+ * Allocate zeroed governor state for @policy and initialize its locks and
+ * deferred-work items. Returns NULL if the allocation fails.
+ */
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+
+	if (sg_policy) {
+		sg_policy->policy = policy;
+		init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+		INIT_WORK(&sg_policy->work, sugov_work);
+		mutex_init(&sg_policy->work_lock);
+		raw_spin_lock_init(&sg_policy->update_lock);
+	}
+	return sg_policy;
+}
+
+/* Counterpart of sugov_policy_alloc(): destroy the mutex and free the state. */
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+ mutex_destroy(&sg_policy->work_lock);
+ kfree(sg_policy);
+}
+
+/*
+ * Allocate zeroed tunables and initialize their attribute set with
+ * @sg_policy's list hook. When tunables are not per-policy they are also
+ * published as global_tunables. Returns NULL if the allocation fails.
+ */
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+	struct sugov_tunables *tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+
+	if (!tunables)
+		return NULL;
+
+	gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+	if (!have_governor_per_policy())
+		global_tunables = tunables;
+
+	return tunables;
+}
+
+/*
+ * Free @tunables; when tunables are global, also drop the global_tunables
+ * reference so that the next sugov_init() allocates a fresh set.
+ */
+static void sugov_tunables_free(struct sugov_tunables *tunables)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+
+ kfree(tunables);
+}
+
+/*
+ * Handle CPUFREQ_GOV_POLICY_INIT: allocate the per-policy state and either
+ * attach it to the existing global tunables or create new tunables together
+ * with their sysfs kobject.
+ *
+ * Returns 0 on success, -EBUSY if the policy already has governor data,
+ * -ENOMEM on allocation failure, -EINVAL if global tunables exist although
+ * per-policy governors are in use, or the kobject_init_and_add() error.
+ */
+static int sugov_init(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+ struct sugov_tunables *tunables;
+ unsigned int lat;
+ int ret = 0;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ sg_policy = sugov_policy_alloc(policy);
+ if (!sg_policy)
+ return -ENOMEM;
+
+ mutex_lock(&global_tunables_lock);
+
+ /* Shared tunables already exist: just attach this policy to them. */
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto free_sg_policy;
+ }
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+ goto out;
+ }
+
+ tunables = sugov_tunables_alloc(sg_policy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto free_sg_policy;
+ }
+
+ /*
+ * Default rate limit: LATENCY_MULTIPLIER, scaled by the transition
+ * latency in microseconds when that is non-zero.
+ */
+ tunables->rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat)
+ tunables->rate_limit_us *= lat;
+
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = tunables;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ schedutil_gov.name);
+ if (ret)
+ goto fail;
+
+ out:
+ mutex_unlock(&global_tunables_lock);
+
+ cpufreq_enable_fast_switch(policy);
+ return 0;
+
+ fail:
+ /*
+ * NOTE(review): the kobject documentation asks for kobject_put() after
+ * a failed kobject_init_and_add(); here the tunables are freed directly
+ * and the ktype has no release method - confirm this is intended.
+ */
+ policy->governor_data = NULL;
+ sugov_tunables_free(tunables);
+
+ free_sg_policy:
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_policy_free(sg_policy);
+ pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+ return ret;
+}
+
+/*
+ * Handle CPUFREQ_GOV_POLICY_EXIT: disable fast switching, detach the policy
+ * from its tunables (freeing them when gov_attr_set_put() reports no
+ * remaining users) and free the per-policy state. Always returns 0.
+ */
+static int sugov_exit(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ unsigned int count;
+
+ cpufreq_disable_fast_switch(policy);
+
+ mutex_lock(&global_tunables_lock);
+
+ count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+ policy->governor_data = NULL;
+ /* A zero count means this was the last policy using the tunables. */
+ if (!count)
+ sugov_tunables_free(tunables);
+
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_policy_free(sg_policy);
+ return 0;
+}
+
+/*
+ * Handle CPUFREQ_GOV_START: reset the per-policy update state and install
+ * the utilization update hook on every CPU of the policy. Shared policies
+ * get sugov_update_shared(), all others sugov_update_single().
+ * Always returns 0.
+ */
+static int sugov_start(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	unsigned int cpu;
+
+	/*
+	 * Widen before multiplying: NSEC_PER_USEC is a long, so on 32-bit
+	 * kernels the product would otherwise wrap for limits above ~4.29 s.
+	 */
+	sg_policy->freq_update_delay_ns =
+		(u64)sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+	sg_policy->last_freq_update_time = 0;
+	/* UINT_MAX never matches a real request, so the first update is applied. */
+	sg_policy->next_freq = UINT_MAX;
+	sg_policy->work_in_progress = false;
+	sg_policy->need_freq_update = false;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+		sg_cpu->sg_policy = sg_policy;
+		if (policy_is_shared(policy)) {
+			/*
+			 * Until a CPU reports real data, its ULONG_MAX
+			 * utilization makes sugov_next_freq_shared() pick the
+			 * maximum frequency.
+			 */
+			sg_cpu->util = ULONG_MAX;
+			sg_cpu->max = 0;
+			sg_cpu->last_update = 0;
+			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+						     sugov_update_shared);
+		} else {
+			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+						     sugov_update_single);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Handle CPUFREQ_GOV_STOP: remove the update hooks from all CPUs of the
+ * policy, wait until no hook can still be running, then flush the deferred
+ * work. Always returns 0.
+ */
+static int sugov_stop(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ cpufreq_remove_update_util_hook(cpu);
+
+ /* Required after removing the hooks, before their data may go away. */
+ synchronize_sched();
+
+ /* No new irq_work can be queued now; wait for what is in flight. */
+ irq_work_sync(&sg_policy->irq_work);
+ cancel_work_sync(&sg_policy->work);
+ return 0;
+}
+
+/*
+ * Handle CPUFREQ_GOV_LIMITS: without fast switching, immediately bring the
+ * current frequency back inside the new [min, max] range; in all cases flag
+ * that the next utilization update must re-evaluate the frequency.
+ * Always returns 0.
+ */
+static int sugov_limits(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+
+ if (!policy->fast_switch_enabled) {
+ /* work_lock serializes this against sugov_work(). */
+ mutex_lock(&sg_policy->work_lock);
+
+ if (policy->max < policy->cur)
+ __cpufreq_driver_target(policy, policy->max,
+ CPUFREQ_RELATION_H);
+ else if (policy->min > policy->cur)
+ __cpufreq_driver_target(policy, policy->min,
+ CPUFREQ_RELATION_L);
+
+ mutex_unlock(&sg_policy->work_lock);
+ }
+
+ /* Consumed and cleared by sugov_should_update_freq(). */
+ sg_policy->need_freq_update = true;
+ return 0;
+}
+
+/*
+ * Governor event dispatcher.
+ *
+ * CPUFREQ_GOV_POLICY_INIT is always accepted; every other event requires
+ * that the policy already has governor data. Unknown events, or events on
+ * an uninitialized policy, return -EINVAL.
+ */
+int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
+{
+	if (event == CPUFREQ_GOV_POLICY_INIT)
+		return sugov_init(policy);
+
+	if (!policy->governor_data)
+		return -EINVAL;
+
+	switch (event) {
+	case CPUFREQ_GOV_POLICY_EXIT:
+		return sugov_exit(policy);
+	case CPUFREQ_GOV_START:
+		return sugov_start(policy);
+	case CPUFREQ_GOV_STOP:
+		return sugov_stop(policy);
+	case CPUFREQ_GOV_LIMITS:
+		return sugov_limits(policy);
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Governor descriptor registered with the cpufreq core under the name "schedutil". */
+static struct cpufreq_governor schedutil_gov = {
+ .name = "schedutil",
+ .governor = sugov_governor,
+ .owner = THIS_MODULE,
+};
+
+/* Register the governor; returns the result of cpufreq_register_governor(). */
+static int __init sugov_module_init(void)
+{
+ return cpufreq_register_governor(&schedutil_gov);
+}
+
+/* Unregister the governor on module unload. */
+static void __exit sugov_module_exit(void)
+{
+ cpufreq_unregister_governor(&schedutil_gov);
+}
+
+MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
+MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+/* Return schedutil as the default governor when it is configured as such. */
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return &schedutil_gov;
+}
+
+/* As the default governor, register via fs_initcall() instead of module_init(). */
+fs_initcall(sugov_module_init);
+#else
+module_init(sugov_module_init);
+#endif
+module_exit(sugov_module_exit);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..11e9705bf937 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97ec9f65..fcb7f0217ff4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (p->nr_cpus_allowed > 1)
+ if (tsk_nr_cpus_allowed(p) > 1)
dl_rq->dl_nr_migratory++;
update_dl_migration(dl_rq);
@@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- if (p->nr_cpus_allowed > 1)
+ if (tsk_nr_cpus_allowed(p) > 1)
dl_rq->dl_nr_migratory--;
update_dl_migration(dl_rq);
@@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity,
dl_timer);
struct task_struct *p = dl_task_of(dl_se);
- unsigned long flags;
+ struct rq_flags rf;
struct rq *rq;
- rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &rf);
/*
* The task might have changed its scheduling policy to something
@@ -670,14 +670,14 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* Nothing relies on rq->lock after this, so its safe to drop
* rq->lock.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
push_dl_task(rq);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, rf.cookie);
}
#endif
unlock:
- task_rq_unlock(rq, p, &flags);
+ task_rq_unlock(rq, p, &rf);
/*
* This can free the task_struct, including this hrtimer, do not touch
@@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq)
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
- /* Kick cpufreq (see the comment in linux/cpufreq.h). */
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_trigger_update(rq_clock(rq));
-
/*
* Consumed budget is computed considering the time as
* observed by schedulable tasks (excluding time spent
@@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq)
return;
}
+ /* kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -966,7 +966,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
enqueue_dl_entity(&p->dl, pi_se, flags);
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1040,9 +1040,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
* try to make it stay here, it might be important.
*/
if (unlikely(dl_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
+ (tsk_nr_cpus_allowed(curr) < 2 ||
!dl_entity_preempt(&p->dl, &curr->dl)) &&
- (p->nr_cpus_allowed > 1)) {
+ (tsk_nr_cpus_allowed(p) > 1)) {
int target = find_later_rq(p);
if (target != -1 &&
@@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (rq->curr->nr_cpus_allowed == 1 ||
+ if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
return;
@@ -1071,7 +1071,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1 &&
+ if (tsk_nr_cpus_allowed(p) != 1 &&
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
@@ -1125,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
+struct task_struct *
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
@@ -1140,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
pull_dl_task(rq);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
@@ -1185,7 +1186,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
{
update_curr_dl(rq);
- if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+ if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
enqueue_pushable_dl_task(rq, p);
}
@@ -1286,7 +1287,7 @@ static int find_later_rq(struct task_struct *task)
if (unlikely(!later_mask))
return -1;
- if (task->nr_cpus_allowed == 1)
+ if (tsk_nr_cpus_allowed(task) == 1)
return -1;
/*
@@ -1392,8 +1393,9 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(later_rq->cpu,
- &task->cpus_allowed) ||
+ tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
+ !dl_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
@@ -1431,7 +1433,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
+ BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!dl_task(p));
@@ -1470,7 +1472,7 @@ retry:
*/
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
- rq->curr->nr_cpus_allowed > 1) {
+ tsk_nr_cpus_allowed(rq->curr) > 1) {
resched_curr(rq);
return 0;
}
@@ -1617,9 +1619,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
+ tsk_nr_cpus_allowed(p) > 1 &&
dl_task(rq->curr) &&
- (rq->curr->nr_cpus_allowed < 2 ||
+ (tsk_nr_cpus_allowed(rq->curr) < 2 ||
!dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
@@ -1723,7 +1725,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+ if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
#else
if (dl_task(rq->curr))
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fbc3bd5ff60..cf905f655ba1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -626,15 +626,16 @@ do { \
#undef P
#undef PN
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
-#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
-
#ifdef CONFIG_SMP
+#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P64(avg_idle);
P64(max_idle_balance_cost);
+#undef P64
#endif
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
+
if (schedstat_enabled()) {
P(yld_count);
P(sched_count);
@@ -644,7 +645,6 @@ do { \
}
#undef P
-#undef P64
#endif
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d64e4ccfde..218f8e83db73 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw)
* OR
* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
*
- * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
* we're guaranteed shift stays positive because inv_weight is guaranteed to
* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
*
@@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se)
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, a simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+}
+
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
}
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
#endif
/*
@@ -2437,10 +2488,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
list_del_init(&se->group_node);
}
+#endif
cfs_rq->nr_running--;
}
@@ -2550,6 +2603,16 @@ static const u32 runnable_avg_yN_sum[] = {
};
/*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0 }. Values are rolled down to
+ * lower integers. See Documentation/scheduler/sched-avg.txt how these
+ * were generated:
+ */
+static const u32 __accumulated_sum_N32[] = {
+ 0, 23371, 35056, 40899, 43820, 45281,
+ 46011, 46376, 46559, 46650, 46696, 46719,
+};
+
+/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
@@ -2597,22 +2660,13 @@ static u32 __compute_runnable_contrib(u64 n)
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
- /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
- do {
- contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
- contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
- n -= LOAD_AVG_PERIOD;
- } while (n > LOAD_AVG_PERIOD);
-
+ /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+ contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+ n %= LOAD_AVG_PERIOD;
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
}
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
-
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
/*
@@ -2821,23 +2875,54 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ unsigned long max = rq->cpu_capacity_orig;
+
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_clock(rq),
+ min(cfs_rq->avg.util_avg, max), max);
+ }
+}
+
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed_load = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
- removed = 1;
+ removed_load = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sa->util_avg = max_t(long, sa->util_avg - r, 0);
sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ removed_util = 1;
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2848,7 +2933,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- return decayed || removed;
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
+ return decayed || removed_load;
}
/* Update task and its cfs_rq load average */
@@ -2867,31 +2955,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
update_tg_load_avg(cfs_rq, 0);
-
- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
- unsigned long max = rq->cpu_capacity_orig;
-
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
- }
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2919,6 +2984,8 @@ skip_aging:
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+
+ cfs_rq_util_change(cfs_rq);
}
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2931,6 +2998,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -2948,7 +3017,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->curr == se, NULL);
}
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
@@ -3030,7 +3099,14 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct rq *rq = rq_of(cfs_rq);
+
+ cpufreq_trigger_update(rq_clock(rq));
+}
+
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
@@ -3178,20 +3254,61 @@ static inline void check_schedstat_required(void)
#endif
}
+
+/*
+ * MIGRATION
+ *
+ * dequeue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
+ * vruntime -= min_vruntime
+ *
+ * enqueue
+ * update_curr()
+ * update_min_vruntime()
+ * vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU, but do have an up-to-date min_vruntime on the destination CPU.
+ */
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
+ bool curr = cfs_rq->curr == se;
+
/*
- * Update the normalized vruntime before updating min_vruntime
- * through calling update_curr().
+ * If we're the current task, we must renormalise before calling
+ * update_curr().
*/
- if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+ if (renorm && curr)
se->vruntime += cfs_rq->min_vruntime;
+ update_curr(cfs_rq);
+
/*
- * Update run-time statistics of the 'current'.
+ * Otherwise, renormalise after, such that we're placed at the current
+ * moment in time, instead of some random moment in the past. Being
+ * placed in the past could significantly boost this task to the
+ * fairness detriment of existing tasks.
*/
- update_curr(cfs_rq);
+ if (renorm && !curr)
+ se->vruntime += cfs_rq->min_vruntime;
+
enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3207,7 +3324,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
- if (se != cfs_rq->curr)
+ if (!curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -4415,7 +4532,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
/*
* per rq 'load' arrray crap; XXX kill this.
*/
@@ -4481,13 +4598,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
}
return load;
}
+#endif /* CONFIG_NO_HZ_COMMON */
/**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * cpu_load_update - update the rq->cpu_load[] statistics
* @this_rq: The rq to update statistics for
* @this_load: The current load
* @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
*
* Update rq->cpu_load[] statistics. This function is usually called every
* scheduler tick (TICK_NSEC).
@@ -4516,12 +4633,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
* load[i]_n = (1 - 1/2^i)^n * load[i]_0
*
* see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
*/
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+ unsigned long pending_updates)
{
- unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+ unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
int i, scale;
this_rq->nr_load_updates++;
@@ -4534,6 +4651,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
old_load = decay_load_missed(old_load, pending_updates - 1, i);
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
@@ -4544,6 +4662,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
*/
old_load += tickless_load;
}
+#endif
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4566,10 +4685,23 @@ static unsigned long weighted_cpuload(const int cpu)
}
#ifdef CONFIG_NO_HZ_COMMON
-static void __update_cpu_load_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load,
- int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load)
{
unsigned long pending_updates;
@@ -4581,28 +4713,15 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
* In the NOHZ_FULL case, we were non-idle, we should consider
* its weighted load.
*/
- __update_cpu_load(this_rq, load, pending_updates, active);
+ cpu_load_update(this_rq, load, pending_updates);
}
}
/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
-static void update_cpu_load_idle(struct rq *this_rq)
+static void cpu_load_update_idle(struct rq *this_rq)
{
/*
* bail if there's load or we're actually up-to-date.
@@ -4610,38 +4729,71 @@ static void update_cpu_load_idle(struct rq *this_rq)
if (weighted_cpuload(cpu_of(this_rq)))
return;
- __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+ cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
}
/*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely on synchronized cpu_load[*] updates.
*/
-void update_cpu_load_nohz(int active)
+void cpu_load_update_nohz_start(void)
{
struct rq *this_rq = this_rq();
+
+ /*
+ * This is all lockless but should be fine. If weighted_cpuload changes
+ * concurrently we'll exit nohz. And cpu_load write can race with
+ * cpu_load_update_idle() but both updaters would be writing the same value.
+ */
+ this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+}
+
+/*
+ * Account the tickless load at the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
+ struct rq *this_rq = this_rq();
+ unsigned long load;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
+ load = weighted_cpuload(cpu_of(this_rq));
raw_spin_lock(&this_rq->lock);
- __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
+ update_rq_clock(this_rq);
+ cpu_load_update_nohz(this_rq, curr_jiffies, load);
raw_spin_unlock(&this_rq->lock);
}
-#endif /* CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+ unsigned long curr_jiffies,
+ unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ /* See the mess around cpu_load_update_nohz(). */
+ this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
+ cpu_load_update(this_rq, load, 1);
+}
/*
* Called from scheduler_tick()
*/
-void update_cpu_load_active(struct rq *this_rq)
+void cpu_load_update_active(struct rq *this_rq)
{
unsigned long load = weighted_cpuload(cpu_of(this_rq));
- /*
- * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
- */
- this_rq->last_load_update_tick = jiffies;
- __update_cpu_load(this_rq, load, 1, 1);
+
+ if (tick_nohz_tick_stopped())
+ cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+ else
+ cpu_load_update_periodic(this_rq, load);
}
/*
@@ -4699,46 +4851,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
}
-static void record_wakee(struct task_struct *p)
-{
- /*
- * Rough decay (wiping) for cost saving, don't worry
- * about the boundary, really active task won't care
- * about the loss.
- */
- if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
- current->wakee_flips >>= 1;
- current->wakee_flip_decay_ts = jiffies;
- }
-
- if (current->last_wakee != p) {
- current->last_wakee = p;
- current->wakee_flips++;
- }
-}
-
-static void task_waking_fair(struct task_struct *p)
-{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
-
- se->vruntime -= min_vruntime;
- record_wakee(p);
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
@@ -4854,17 +4966,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
+static void record_wakee(struct task_struct *p)
+{
+ /*
+ * Only decay a single time; tasks that have less than 1 wakeup per
+ * jiffy will not have built up many flips.
+ */
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+ current->wakee_flips >>= 1;
+ current->wakee_flip_decay_ts = jiffies;
+ }
+
+ if (current->last_wakee != p) {
+ current->last_wakee = p;
+ current->wakee_flips++;
+ }
+}
+
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
* A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees. In order
- * to determine whether we should let the load spread vs consolodating to
- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other. With
- * both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size. Waker/wakee
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
*/
static int wake_wide(struct task_struct *p)
{
@@ -5071,7 +5205,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
return i;
/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an eligible idle cpu.
+ *
+ * A completely idle sched group at higher domains is more
+ * desirable than an idle group at a lower level, because lower
+ * domains have smaller groups and usually share hardware
+ * resources which causes tasks to contend on them, e.g. x86
+ * hyperthread siblings in the lowest domain (SMT) can contend
+ * on the shared cpu pipeline.
+ *
+ * However, while we prefer idle groups at higher domains
+ * finding an idle cpu at the lowest domain is still better than
+ * returning 'target', which we've already established, isn't
+ * idle.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
for_each_lower_domain(sd) {
@@ -5081,11 +5227,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;
+ /* Ensure the entire group is idle */
for_each_cpu(i, sched_group_cpus(sg)) {
if (i == target || !idle_cpu(i))
goto next;
}
+ /*
+ * It doesn't matter which cpu we pick, the
+ * whole group is idle.
+ */
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
goto done;
@@ -5152,8 +5303,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (sd_flag & SD_BALANCE_WAKE)
+ if (sd_flag & SD_BALANCE_WAKE) {
+ record_wakee(p);
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ }
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -5233,6 +5386,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
static void migrate_task_rq_fair(struct task_struct *p)
{
/*
+ * As blocked tasks retain absolute vruntime the migration needs to
+ * deal with this by subtracting the old and adding the new
+ * min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
+ if (p->state == TASK_WAKING) {
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
+ }
+
+ /*
* We are supposed to update the task to "current" time, then its up to date
* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
* what current time is, so simply throw away the out-of-date time. This
@@ -5415,7 +5594,7 @@ preempt:
}
static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
@@ -5528,9 +5707,9 @@ idle:
* further scheduler activity on it and we're being very careful to
* re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
new_tasks = idle_balance(rq);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -5629,7 +5808,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
* W_i,0 = \Sum_j w_i,j (2)
*
* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
- * is derived from the nice value as per prio_to_weight[].
+ * is derived from the nice value as per sched_prio_to_weight[].
*
* The weight average is an exponential decay average of the instantaneous
* weight:
@@ -6131,7 +6310,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6192,7 +6371,7 @@ static inline void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6601,6 +6780,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
+ /* No ASYM_PACKING if target cpu is already busy */
+ if (env->idle == CPU_NOT_IDLE)
+ return true;
/*
* ASYM_PACKING needs to move all the work to the lowest
* numbered CPUs in the group, therefore mark all groups
@@ -6610,7 +6792,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (!sds->busiest)
return true;
- if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+ /* Prefer to move from highest possible cpu's work */
+ if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
return true;
}
@@ -6756,6 +6939,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
if (!(env->sd->flags & SD_ASYM_PACKING))
return 0;
+ if (env->idle == CPU_NOT_IDLE)
+ return 0;
+
if (!sds->busiest)
return 0;
@@ -6864,9 +7050,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
/*
- * In the presence of smp nice balancing, certain scenarios can have
- * max load less than avg load(as we skip the groups at or below
- * its cpu_capacity, while calculating max_load..)
+ * Avg load of busiest sg can be less and avg load of local sg can
+ * be greater than avg load across all sgs of sd because avg load
+ * factors in sg capacity and sgs with smaller group_type are
+ * skipped when updating the busiest sg:
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
@@ -6879,11 +7066,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
- load_above_capacity = busiest->sum_nr_running *
- SCHED_LOAD_SCALE;
- if (load_above_capacity > busiest->group_capacity)
+ load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
+ if (load_above_capacity > busiest->group_capacity) {
load_above_capacity -= busiest->group_capacity;
- else
+ load_above_capacity *= NICE_0_LOAD;
+ load_above_capacity /= busiest->group_capacity;
+ } else
load_above_capacity = ~0UL;
}
@@ -6891,9 +7079,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load. At the same time,
- * we also don't want to reduce the group load below the group capacity
- * (so that we can implement power-savings policies etc). Thus we look
- * for the minimum possible imbalance.
+ * we also don't want to reduce the group load below the group
+ * capacity. Thus we look for the minimum possible imbalance.
*/
max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
@@ -6917,10 +7104,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
+ * if there is an imbalance.
*
* Also calculates the amount of weighted load which should be moved
* to restore balance.
@@ -6928,9 +7112,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* @env: The load balancing environment.
*
* Return: - The busiest group if imbalance exists.
- * - If no imbalance and user has opted for power-savings balance,
- * return the least loaded group whose CPUs can be
- * put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
{
@@ -6948,8 +7129,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest = &sds.busiest_stat;
/* ASYM feature bypasses nice load balance check */
- if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
- check_asym_packing(env, &sds))
+ if (check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
@@ -7374,10 +7554,7 @@ more_balance:
&busiest->active_balance_work);
}
- /*
- * We've kicked active balancing, reset the failure
- * counter.
- */
+ /* We've kicked active balancing, force task migration. */
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
@@ -7612,10 +7789,13 @@ static int active_load_balance_cpu_stop(void *data)
schedstat_inc(sd, alb_count);
p = detach_one_task(&env);
- if (p)
+ if (p) {
schedstat_inc(sd, alb_pushed);
- else
+ /* Active balancing done, reset the failure counter. */
+ sd->nr_balance_failed = 0;
+ } else {
schedstat_inc(sd, alb_failed);
+ }
}
rcu_read_unlock();
out_unlock:
@@ -7686,7 +7866,7 @@ static void nohz_balancer_kick(void)
return;
}
-static inline void nohz_balance_exit_idle(int cpu)
+void nohz_balance_exit_idle(unsigned int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
/*
@@ -7759,18 +7939,6 @@ void nohz_balance_enter_idle(int cpu)
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DYING:
- nohz_balance_exit_idle(smp_processor_id());
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
#endif
static DEFINE_SPINLOCK(balancing);
@@ -7932,7 +8100,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
- update_cpu_load_idle(rq);
+ cpu_load_update_idle(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}
@@ -8357,6 +8525,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
+ post_init_entity_util_avg(se);
}
return 1;
@@ -8513,7 +8682,6 @@ const struct sched_class fair_sched_class = {
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
- .task_waking = task_waking_fair,
.task_dead = task_dead_fair,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
@@ -8575,7 +8743,6 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce94931f1b..2ce5458bbe1d 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
}
static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev)
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
put_prev_task(rq, prev);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index ef7159012cf3..b0b93fd33af9 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq)
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
- load *= exp;
- load += active * (FIXED_1 - exp);
- load += 1UL << (FSHIFT - 1);
- return load >> FSHIFT;
+ unsigned long newload;
+
+ newload = load * exp + active * (FIXED_1 - exp);
+ if (active >= load)
+ newload += FIXED_1-1;
+
+ return newload / FIXED_1;
}
#ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c41ea7ac1764..d5690b722691 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -334,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total++;
- if (p->nr_cpus_allowed > 1)
+ if (tsk_nr_cpus_allowed(p) > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
@@ -351,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq = &rq_of_rt_rq(rt_rq)->rt;
rt_rq->rt_nr_total--;
- if (p->nr_cpus_allowed > 1)
+ if (tsk_nr_cpus_allowed(p) > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
@@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq)
if (curr->sched_class != &rt_sched_class)
return;
- /* Kick cpufreq (see the comment in linux/cpufreq.h). */
- if (cpu_of(rq) == smp_processor_id())
- cpufreq_trigger_update(rq_clock(rq));
-
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
enqueue_rt_entity(rt_se, flags);
- if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
enqueue_pushable_task(rq, p);
}
@@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* will have to sort it out.
*/
if (curr && unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
+ (tsk_nr_cpus_allowed(curr) < 2 ||
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
@@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* Current can't be migrated, useless to reschedule,
* let's hope p can move out.
*/
- if (rq->curr->nr_cpus_allowed == 1 ||
+ if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
@@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* p is migratable, so let's not schedule it and
* see if it is pushed or pulled somewhere else.
*/
- if (p->nr_cpus_allowed != 1
+ if (tsk_nr_cpus_allowed(p) != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
@@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
}
static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev)
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
@@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
* disabled avoiding further scheduler activity on it and we're
* being very careful to re-start the picking loop.
*/
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, cookie);
pull_rt_task(rq);
- lockdep_pin_lock(&rq->lock);
+ lockdep_repin_lock(&rq->lock, cookie);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
enqueue_pushable_task(rq, p);
}
@@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task)
if (unlikely(!lowest_mask))
return -1;
- if (task->nr_cpus_allowed == 1)
+ if (tsk_nr_cpus_allowed(task) == 1)
return -1; /* No other targets possible */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1729,6 +1729,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
+ !rt_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
@@ -1761,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
+ BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
BUG_ON(!task_on_rq_queued(p));
BUG_ON(!rt_task(p));
@@ -2121,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
+ tsk_nr_cpus_allowed(p) > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
- (rq->curr->nr_cpus_allowed < 2 ||
+ (tsk_nr_cpus_allowed(rq->curr) < 2 ||
rq->curr->prio <= p->prio))
push_rt_tasks(rq);
}
@@ -2196,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
#else
if (p->prio < rq->curr->prio)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 382848a24ed9..72f1f3087b04 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq);
#ifdef CONFIG_SMP
-extern void update_cpu_load_active(struct rq *this_rq);
+extern void cpu_load_update_active(struct rq *this_rq);
#else
-static inline void update_cpu_load_active(struct rq *this_rq) { }
+static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif
/*
@@ -49,25 +49,32 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
* and does not change the user-interface for setting shares/weights.
*
* We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- * increased costs.
+ * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
+ * pretty high and the returns do not justify the increased costs.
+ *
+ * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
+ * increase coverage and consistency always enable it on 64bit platforms.
*/
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
-# define SCHED_LOAD_RESOLUTION 10
-# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
-# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#ifdef CONFIG_64BIT
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w) ((w) >> SCHED_FIXEDPOINT_SHIFT)
#else
-# define SCHED_LOAD_RESOLUTION 0
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) (w)
# define scale_load_down(w) (w)
#endif
-#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
-#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
-
-#define NICE_0_LOAD SCHED_LOAD_SCALE
-#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+/*
+ * Task weight (visible to users) and its load (invisible to users) have
+ * independent resolution, but they should be well calibrated. We use
+ * scale_load() and scale_load_down(w) to convert between them. The
+ * following must be true:
+ *
+ * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ *
+ */
+#define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT)
/*
* Single value that decides SCHED_DEADLINE internal math precision.
@@ -585,11 +592,13 @@ struct rq {
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- unsigned long last_load_update_tick;
#ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
+ unsigned long last_load_update_tick;
+#endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
@@ -854,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym);
struct sched_group_capacity {
atomic_t ref;
/*
- * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned int capacity;
@@ -1159,7 +1168,7 @@ extern const u32 sched_prio_to_wmult[40];
*
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
- * ENQUEUE_WAKING - sched_class::task_waking was called
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
*
*/
@@ -1174,9 +1183,9 @@ extern const u32 sched_prio_to_wmult[40];
#define ENQUEUE_HEAD 0x08
#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 0x20
+#define ENQUEUE_MIGRATED 0x20
#else
-#define ENQUEUE_WAKING 0x00
+#define ENQUEUE_MIGRATED 0x00
#endif
#define RETRY_TASK ((void *)-1UL)
@@ -1200,14 +1209,14 @@ struct sched_class {
* tasks.
*/
struct task_struct * (*pick_next_task) (struct rq *rq,
- struct task_struct *prev);
+ struct task_struct *prev,
+ struct pin_cookie cookie);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p);
- void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
@@ -1313,6 +1322,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
@@ -1448,86 +1458,32 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
static inline void sched_avg_update(struct rq *rq) { }
#endif
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
- __acquires(rq->lock)
-{
- struct rq *rq;
-
- lockdep_assert_held(&p->pi_lock);
-
- for (;;) {
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- lockdep_pin_lock(&rq->lock);
- return rq;
- }
- raw_spin_unlock(&rq->lock);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
+struct rq_flags {
+ unsigned long flags;
+ struct pin_cookie cookie;
+};
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
+ __acquires(rq->lock);
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
- __acquires(rq->lock)
-{
- struct rq *rq;
+ __acquires(rq->lock);
- for (;;) {
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- rq = task_rq(p);
- raw_spin_lock(&rq->lock);
- /*
- * move_queued_task() task_rq_lock()
- *
- * ACQUIRE (rq->lock)
- * [S] ->on_rq = MIGRATING [L] rq = task_rq()
- * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
- * [S] ->cpu = new_cpu [L] task_rq()
- * [L] ->on_rq
- * RELEASE (rq->lock)
- *
- * If we observe the old cpu in task_rq_lock, the acquire of
- * the old rq->lock will fully serialize against the stores.
- *
- * If we observe the new cpu in task_rq_lock, the acquire will
- * pair with the WMB to ensure we must then also see migrating.
- */
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- lockdep_pin_lock(&rq->lock);
- return rq;
- }
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
- while (unlikely(task_on_rq_migrating(p)))
- cpu_relax();
- }
-}
-
-static inline void __task_rq_unlock(struct rq *rq)
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf->cookie);
raw_spin_unlock(&rq->lock);
}
static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
__releases(rq->lock)
__releases(p->pi_lock)
{
- lockdep_unpin_lock(&rq->lock);
+ lockdep_unpin_lock(&rq->lock, rf->cookie);
raw_spin_unlock(&rq->lock);
- raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
}
#ifdef CONFIG_SMP
@@ -1743,6 +1699,10 @@ enum rq_nohz_flag_bits {
};
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+
+extern void nohz_balance_exit_idle(unsigned int cpu);
+#else
+static inline void nohz_balance_exit_idle(unsigned int cpu) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1841,3 +1801,24 @@ static inline void cpufreq_trigger_update(u64 time)
static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
static inline void cpufreq_trigger_update(u64 time) {}
#endif /* CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant() (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant() (false)
+#endif
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+ rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ rq->prev_steal_time_rq = 0;
+#endif
+}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index cbc67da10954..604297a08b3a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
}
static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev)
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
{
struct task_struct *stop = rq->stop;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15a1795bbba1..e1e5a354854e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter)
struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
- if (is_compat_task()) {
+ if (in_compat_syscall()) {
struct compat_sock_fprog fprog32;
if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
goto out;
@@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall)
{
int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
- if (is_compat_task())
+ if (in_compat_syscall())
syscall_whitelist = mode1_syscalls_32;
#endif
do {
diff --git a/kernel/signal.c b/kernel/signal.c
index 0508544c8ced..ab122a2cee41 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2709,6 +2709,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_upper, &to->si_upper);
}
#endif
+#ifdef SEGV_PKUERR
+ if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR)
+ err |= __put_user(from->si_pkey, &to->si_pkey);
+#endif
break;
case __SI_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
@@ -3095,12 +3099,14 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
oss.ss_sp = (void __user *) current->sas_ss_sp;
oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp);
+ oss.ss_flags = sas_ss_flags(sp) |
+ (current->sas_ss_flags & SS_FLAG_BITS);
if (uss) {
void __user *ss_sp;
size_t ss_size;
- int ss_flags;
+ unsigned ss_flags;
+ int ss_mode;
error = -EFAULT;
if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
@@ -3115,18 +3121,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
if (on_sig_stack(sp))
goto out;
+ ss_mode = ss_flags & ~SS_FLAG_BITS;
error = -EINVAL;
- /*
- * Note - this code used to test ss_flags incorrectly:
- * old code may have been written using ss_flags==0
- * to mean ss_flags==SS_ONSTACK (as this was the only
- * way that worked) - this fix preserves that older
- * mechanism.
- */
- if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+ if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+ ss_mode != 0)
goto out;
- if (ss_flags == SS_DISABLE) {
+ if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
@@ -3137,6 +3138,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
current->sas_ss_sp = (unsigned long) ss_sp;
current->sas_ss_size = ss_size;
+ current->sas_ss_flags = ss_flags;
}
error = 0;
@@ -3167,9 +3169,14 @@ int restore_altstack(const stack_t __user *uss)
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
struct task_struct *t = current;
- return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
- __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+ __put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
+ if (err)
+ return err;
+ if (t->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(t);
+ return 0;
}
#ifdef CONFIG_COMPAT
@@ -3581,6 +3588,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma)
void __init signals_init(void)
{
+ /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */
+ BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE
+ != offsetof(struct siginfo, _sifields._pad));
+
sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8aae49dd7da8..17caf4b63342 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage __visible void __do_softirq(void)
+asmlinkage __visible void __softirq_entry __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
diff --git a/kernel/sys.c b/kernel/sys.c
index 78947de6f969..cf8ba545c7d3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = perf_event_task_enable();
break;
case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
+ if (current->timer_slack_ns > ULONG_MAX)
+ error = ULONG_MAX;
+ else
+ error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
if (arg2 <= 0)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5102fabef7f..c8b318663525 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,9 +126,13 @@ static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
+static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1143,6 +1147,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one_hundred,
},
+ {
+ .procname = "perf_event_max_stack",
+ .data = NULL, /* filled in by handler */
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = &zero,
+ .extra2 = &six_hundred_forty_kb,
+ },
#endif
#ifdef CONFIG_KMEMCHECK
{
@@ -1404,6 +1417,15 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "watermark_scale_factor",
+ .data = &watermark_scale_factor,
+ .maxlen = sizeof(watermark_scale_factor),
+ .mode = 0644,
+ .proc_handler = watermark_scale_factor_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &one_thousand,
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 7e7746a42a62..10a1d7dc9313 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
}
mnt = task_active_pid_ns(current)->proc_mnt;
- file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
+ file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
result = PTR_ERR(file);
if (IS_ERR(file))
goto out_putname;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 21f82c29c914..b3f05ee20d18 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -357,10 +357,6 @@ static int parse(struct nlattr *na, struct cpumask *mask)
return ret;
}
-#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define TASKSTATS_NEEDS_PADDING 1
-#endif
-
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
struct nlattr *na, *ret;
@@ -370,29 +366,6 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
? TASKSTATS_TYPE_AGGR_PID
: TASKSTATS_TYPE_AGGR_TGID;
- /*
- * The taskstats structure is internally aligned on 8 byte
- * boundaries but the layout of the aggregrate reply, with
- * two NLA headers and the pid (each 4 bytes), actually
- * force the entire structure to be unaligned. This causes
- * the kernel to issue unaligned access warnings on some
- * architectures like ia64. Unfortunately, some software out there
- * doesn't properly unroll the NLA packet and assumes that the start
- * of the taskstats structure will always be 20 bytes from the start
- * of the netlink payload. Aligning the start of the taskstats
- * structure breaks this software, which we don't want. So, for now
- * the alignment only happens on architectures that require it
- * and those users will have to update to fixed versions of those
- * packages. Space is reserved in the packet only when needed.
- * This ifdef should be removed in several years e.g. 2012 once
- * we can be confident that fixed versions are installed on most
- * systems. We add the padding before the aggregate since the
- * aggregate is already a defined type.
- */
-#ifdef TASKSTATS_NEEDS_PADDING
- if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
- goto err;
-#endif
na = nla_nest_start(skb, aggr);
if (!na)
goto err;
@@ -401,7 +374,8 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
nla_nest_cancel(skb, na);
goto err;
}
- ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+ ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS,
+ sizeof(struct taskstats), TASKSTATS_TYPE_NULL);
if (!ret) {
nla_nest_cancel(skb, na);
goto err;
@@ -500,10 +474,9 @@ static size_t taskstats_packet_size(void)
size_t size;
size = nla_total_size(sizeof(u32)) +
- nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
-#ifdef TASKSTATS_NEEDS_PADDING
- size += nla_total_size(0); /* Padding for alignment */
-#endif
+ nla_total_size_64bit(sizeof(struct taskstats)) +
+ nla_total_size(0);
+
return size;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa909f9fd559..fa0b983290cf 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
/*
* High resolution timer enabled ?
*/
-static int hrtimer_hres_enabled __read_mostly = 1;
+static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
@@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution);
*/
static int __init setup_hrtimer_hres(char *str)
{
- if (!strcmp(str, "off"))
- hrtimer_hres_enabled = 0;
- else if (!strcmp(str, "on"))
- hrtimer_hres_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
__setup("highres=", setup_hrtimer_hres);
@@ -979,7 +973,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
* relative (HRTIMER_MODE_REL)
*/
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
+ u64 delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -1548,7 +1542,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- unsigned long slack;
+ u64 slack;
slack = current->timer_slack_ns;
if (dl_task(current) || rt_task(current))
@@ -1724,7 +1718,7 @@ void __init hrtimers_init(void)
* @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
*/
int __sched
-schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
+schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode, int clock)
{
struct hrtimer_sleeper t;
@@ -1792,7 +1786,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
*
* Returns 0 when the timer has expired otherwise -EINTR
*/
-int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
{
return schedule_hrtimeout_range_clock(expires, delta, mode,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 969e6704c3c9..536ada80f6dd 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -157,52 +157,50 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
-static unsigned long tick_dep_mask;
+static atomic_t tick_dep_mask;
-static void trace_tick_dependency(unsigned long dep)
+static bool check_tick_dependency(atomic_t *dep)
{
- if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+ int val = atomic_read(dep);
+
+ if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
- return;
+ return true;
}
- if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+ if (val & TICK_DEP_MASK_PERF_EVENTS) {
trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
- return;
+ return true;
}
- if (dep & TICK_DEP_MASK_SCHED) {
+ if (val & TICK_DEP_MASK_SCHED) {
trace_tick_stop(0, TICK_DEP_MASK_SCHED);
- return;
+ return true;
}
- if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+ if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+ return true;
+ }
+
+ return false;
}
static bool can_stop_full_tick(struct tick_sched *ts)
{
WARN_ON_ONCE(!irqs_disabled());
- if (tick_dep_mask) {
- trace_tick_dependency(tick_dep_mask);
+ if (check_tick_dependency(&tick_dep_mask))
return false;
- }
- if (ts->tick_dep_mask) {
- trace_tick_dependency(ts->tick_dep_mask);
+ if (check_tick_dependency(&ts->tick_dep_mask))
return false;
- }
- if (current->tick_dep_mask) {
- trace_tick_dependency(current->tick_dep_mask);
+ if (check_tick_dependency(&current->tick_dep_mask))
return false;
- }
- if (current->signal->tick_dep_mask) {
- trace_tick_dependency(current->signal->tick_dep_mask);
+ if (check_tick_dependency(&current->signal->tick_dep_mask))
return false;
- }
return true;
}
@@ -259,12 +257,12 @@ static void tick_nohz_full_kick_all(void)
preempt_enable();
}
-static void tick_nohz_dep_set_all(unsigned long *dep,
+static void tick_nohz_dep_set_all(atomic_t *dep,
enum tick_dep_bits bit)
{
- unsigned long prev;
+ int prev;
- prev = fetch_or(dep, BIT_MASK(bit));
+ prev = atomic_fetch_or(BIT(bit), dep);
if (!prev)
tick_nohz_full_kick_all();
}
@@ -280,7 +278,7 @@ void tick_nohz_dep_set(enum tick_dep_bits bit)
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
- clear_bit(bit, &tick_dep_mask);
+ atomic_andnot(BIT(bit), &tick_dep_mask);
}
/*
@@ -289,12 +287,12 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
- unsigned long prev;
+ int prev;
struct tick_sched *ts;
ts = per_cpu_ptr(&tick_cpu_sched, cpu);
- prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+ prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
if (!prev) {
preempt_disable();
/* Perf needs local kick that is NMI safe */
@@ -313,7 +311,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
- clear_bit(bit, &ts->tick_dep_mask);
+ atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
/*
@@ -331,7 +329,7 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
- clear_bit(bit, &tsk->tick_dep_mask);
+ atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
/*
@@ -345,7 +343,7 @@ void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
- clear_bit(bit, &sig->tick_dep_mask);
+ atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}
/*
@@ -366,7 +364,8 @@ void __tick_nohz_task_switch(void)
ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->tick_stopped) {
- if (current->tick_dep_mask || current->signal->tick_dep_mask)
+ if (atomic_read(&current->tick_dep_mask) ||
+ atomic_read(&current->signal->tick_dep_mask))
tick_nohz_full_kick();
}
out:
@@ -378,7 +377,7 @@ static int __init tick_nohz_full_setup(char *str)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
- pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
@@ -446,8 +445,7 @@ void __init tick_nohz_init(void)
* interrupts to avoid circular dependency on the tick
*/
if (!arch_irq_work_has_interrupt()) {
- pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
- "support irq work self-IPIs\n");
+ pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
cpumask_copy(housekeeping_mask, cpu_possible_mask);
tick_nohz_full_running = false;
@@ -457,7 +455,8 @@ void __init tick_nohz_init(void)
cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
+ cpu);
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
@@ -486,20 +485,14 @@ void __init tick_nohz_init(void)
/*
* NO HZ enabled ?
*/
-int tick_nohz_enabled __read_mostly = 1;
+bool tick_nohz_enabled __read_mostly = true;
unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
static int __init setup_tick_nohz(char *str)
{
- if (!strcmp(str, "off"))
- tick_nohz_enabled = 0;
- else if (!strcmp(str, "on"))
- tick_nohz_enabled = 1;
- else
- return 0;
- return 1;
+ return (kstrtobool(str, &tick_nohz_enabled) == 0);
}
__setup("nohz=", setup_tick_nohz);
@@ -783,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (!ts->tick_stopped) {
nohz_balance_enter_idle(cpu);
calc_load_enter_idle();
+ cpu_load_update_nohz_start();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -809,11 +803,11 @@ out:
return tick;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
- update_cpu_load_nohz(active);
+ cpu_load_update_nohz_stop();
calc_load_exit_idle();
touch_softlockup_watchdog_sched();
@@ -840,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (can_stop_full_tick(ts))
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
- tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
+ tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
}
@@ -1031,7 +1025,7 @@ void tick_nohz_idle_exit(void)
tick_nohz_stop_idle(ts, now);
if (ts->tick_stopped) {
- tick_nohz_restart_sched_tick(ts, now, 0);
+ tick_nohz_restart_sched_tick(ts, now);
tick_nohz_account_idle_ticks(ts);
}
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index eb4e32566a83..bf38226e5c17 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -60,7 +60,7 @@ struct tick_sched {
u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
- unsigned long tick_dep_mask;
+ atomic_t tick_dep_mask;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index be115b020d27..a4064b612066 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -160,15 +160,15 @@ static inline void warp_clock(void)
* various programs will get confused when the clock gets warped.
*/
-int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
+int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
{
static int firsttime = 1;
int error = 0;
- if (tv && !timespec_valid(tv))
+ if (tv && !timespec64_valid(tv))
return -EINVAL;
- error = security_settime(tv, tz);
+ error = security_settime64(tv, tz);
if (error)
return error;
@@ -186,7 +186,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
}
}
if (tv)
- return do_settimeofday(tv);
+ return do_settimeofday64(tv);
return 0;
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index bbc5d1114583..73164c3aa56b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+/*
+ * Like schedule_timeout_uninterruptible(), except this task will not contribute
+ * to load average.
+ *
+ * Sets the current task state to TASK_IDLE and then calls schedule_timeout()
+ * with @timeout unchanged, returning whatever schedule_timeout() returns.
+ */
+signed long __sched schedule_timeout_idle(signed long timeout)
+{
+ __set_current_state(TASK_IDLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_idle);
+
#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
{
@@ -1698,10 +1709,10 @@ EXPORT_SYMBOL(msleep_interruptible);
static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
- unsigned long delta;
+ u64 delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
- delta = (max - min) * NSEC_PER_USEC;
+ delta = (u64)(max - min) * NSEC_PER_USEC;
schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
diff --git a/kernel/torture.c b/kernel/torture.c
index 44aa462d033f..fa0bdeee17ac 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -451,6 +451,7 @@ static int torture_shutdown(void *arg)
torture_shutdown_hook();
else
VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+ ftrace_dump(DUMP_ALL);
kernel_power_off(); /* Shut down the system. */
return 0;
}
@@ -602,8 +603,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable)
{
mutex_lock(&fullstop_mutex);
if (torture_type != NULL) {
- pr_alert("torture_init_begin: refusing %s init: %s running",
+ pr_alert("torture_init_begin: Refusing %s init: %s running.\n",
ttype, torture_type);
+ pr_alert("torture_init_begin: One torture test at a time!\n");
mutex_unlock(&fullstop_mutex);
return false;
}
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2aeb6ffc0a1e..9aef8654e90d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1349,6 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
if (t->action == BLK_TN_MESSAGE) {
log_action(iter, long_act ? "message" : "m");
blk_log_msg(s, iter->ent);
+ return trace_handle_return(s);
}
if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
@@ -1437,12 +1438,12 @@ static struct trace_event trace_blk_event = {
static int __init init_blk_tracer(void)
{
if (!register_trace_event(&trace_blk_event)) {
- pr_warning("Warning: could not register block events\n");
+ pr_warn("Warning: could not register block events\n");
return 1;
}
if (register_tracer(&blk_tracer) != 0) {
- pr_warning("Warning: could not register the block tracer\n");
+ pr_warn("Warning: could not register the block tracer\n");
unregister_trace_event(&trace_blk_event);
return 1;
}
@@ -1551,6 +1552,7 @@ static const struct {
{ BLK_TC_COMPLETE, "complete" },
{ BLK_TC_FS, "fs" },
{ BLK_TC_PC, "pc" },
+ { BLK_TC_NOTIFY, "notify" },
{ BLK_TC_AHEAD, "ahead" },
{ BLK_TC_META, "meta" },
{ BLK_TC_DISCARD, "discard" },
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 326a75e884db..780bcbe1d4de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,8 +13,6 @@
#include <linux/ctype.h>
#include "trace.h"
-static DEFINE_PER_CPU(int, bpf_prog_active);
-
/**
* trace_call_bpf - invoke BPF program
* @prog: BPF program
@@ -64,17 +62,21 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
void *dst = (void *) (long) r1;
- int size = (int) r2;
+ int ret, size = (int) r2;
void *unsafe_ptr = (void *) (long) r3;
- return probe_kernel_read(dst, unsafe_ptr, size);
+ ret = probe_kernel_read(dst, unsafe_ptr, size);
+ if (unlikely(ret < 0))
+ memset(dst, 0, size);
+
+ return ret;
}
static const struct bpf_func_proto bpf_probe_read_proto = {
.func = bpf_probe_read,
.gpl_only = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
+ .arg1_type = ARG_PTR_TO_RAW_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
.arg3_type = ARG_ANYTHING,
};
@@ -223,11 +225,12 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
{
struct pt_regs *regs = (struct pt_regs *) (long) r1;
struct bpf_map *map = (struct bpf_map *) (long) r2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u64 index = flags & BPF_F_INDEX_MASK;
void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
struct perf_event *event;
@@ -237,6 +240,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
.data = data,
};
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+ return -EINVAL;
+ if (index == BPF_F_CURRENT_CPU)
+ index = raw_smp_processor_id();
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
@@ -270,7 +277,34 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
.arg5_type = ARG_CONST_STACK_SIZE,
};
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); /* per-CPU scratch pt_regs, filled in by bpf_event_output() */
+
+static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); /* r1 is not used; this CPU's scratch slot is passed on in its place */
+
+ perf_fetch_caller_regs(regs); /* overwrites the slot on every call; NOTE(review): a nested call on the same CPU would clobber it - confirm callers cannot nest */
+
+ return bpf_perf_event_output((long)regs, r2, flags, r4, size); /* r2, flags, r4 and size are forwarded unchanged */
+}
+
+static const struct bpf_func_proto bpf_event_output_proto = {
+ .func = bpf_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
+
+const struct bpf_func_proto *bpf_get_event_output_proto(void)
+{
+ return &bpf_event_output_proto;
+}
+
+static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -297,10 +331,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ default:
+ return NULL;
+ }
+}
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
case BPF_FUNC_perf_event_output:
return &bpf_perf_event_output_proto;
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto;
default:
- return NULL;
+ return tracing_func_proto(func_id);
}
}
@@ -332,9 +376,82 @@ static struct bpf_prog_type_list kprobe_tl = {
.type = BPF_PROG_TYPE_KPROBE,
};
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+ /*
+ * r1 points to the perf tracepoint buffer. Its first 8 bytes are hidden
+ * from the bpf program and contain a pointer to 'struct pt_regs'. Load
+ * that pointer and pass it as the first argument of the common
+ * bpf_perf_event_output() helper; r2, index, r4 and size are forwarded
+ * unchanged (index is received by the helper as its 'flags' parameter).
+ */
+ u64 ctx = *(long *)(uintptr_t)r1;
+
+ return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+ .func = bpf_perf_event_output_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ u64 ctx = *(long *)(uintptr_t)r1; /* long-sized value at the start of the buffer r1 points to; presumably the hidden pt_regs pointer, as in bpf_perf_event_output_tp() - confirm */
+
+ return bpf_get_stackid(ctx, r2, r3, r4, r5); /* r2..r5 are forwarded unchanged */
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+ .func = bpf_get_stackid_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto_tp; /* tracepoint wrapper that first loads the context value from r1 */
+ case BPF_FUNC_get_stackid:
+ return &bpf_get_stackid_proto_tp; /* tracepoint wrapper that first loads the context value from r1 */
+ default:
+ return tracing_func_proto(func_id); /* every other id goes to the shared tracing lookup, which returns NULL for ids it does not handle */
+ }
+}
+
+/*
+ * Decide whether a tracepoint program may access @size bytes at offset @off
+ * of its context.
+ *
+ * An access is accepted only when it is a read (BPF_READ), @off is at least
+ * sizeof(void *) and below PERF_MAX_TRACE_SIZE, and @off is a multiple of
+ * @size. Returns true when the access is allowed, false otherwise.
+ */
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ /*
+ * @off is a signed int but sizeof() yields an unsigned type, so in
+ * "off < sizeof(void *)" a negative offset is converted to a very large
+ * unsigned value and slips past the lower bound. Reject negative
+ * offsets explicitly before the mixed-sign comparison.
+ */
+ if (off < 0 || off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+ return false;
+ /* the context is read-only for the program */
+ if (type != BPF_READ)
+ return false;
+ /* the offset must be aligned to the access size */
+ if (off % size != 0)
+ return false;
+ return true;
+}
+
+static const struct bpf_verifier_ops tracepoint_prog_ops = {
+ .get_func_proto = tp_prog_func_proto,
+ .is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tracepoint_tl = {
+ .ops = &tracepoint_prog_ops,
+ .type = BPF_PROG_TYPE_TRACEPOINT,
+};
+
static int __init register_kprobe_prog_ops(void)
{
bpf_register_prog_type(&kprobe_tl);
+ bpf_register_prog_type(&tracepoint_tl);
return 0;
}
late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f4e6aae6ebe7..7e8d792da963 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1030,8 +1030,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
for_each_possible_cpu(cpu) {
stat = &per_cpu(ftrace_profile_stats, cpu);
- /* allocate enough for function name + cpu number */
- name = kmalloc(32, GFP_KERNEL);
+ name = kasprintf(GFP_KERNEL, "function%d", cpu);
if (!name) {
/*
* The files created are permanent, if something happens
@@ -1043,7 +1042,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
return;
}
stat->stat = function_stats;
- snprintf(name, 32, "function%d", cpu);
stat->stat.name = name;
ret = register_stat_tracer(&stat->stat);
if (ret) {
@@ -1058,8 +1056,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer)
entry = tracefs_create_file("function_profile_enabled", 0644,
d_tracer, NULL, &ftrace_profile_fops);
if (!entry)
- pr_warning("Could not create tracefs "
- "'function_profile_enabled' entry\n");
+ pr_warn("Could not create tracefs 'function_profile_enabled' entry\n");
}
#else /* CONFIG_FUNCTION_PROFILER */
@@ -1622,7 +1619,7 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
return keep_regs;
}
-static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
+static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
{
@@ -1630,12 +1627,13 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
struct ftrace_hash *other_hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
+ bool update = false;
int count = 0;
int all = 0;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return;
+ return false;
/*
* In the filter_hash case:
@@ -1662,7 +1660,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* then there's nothing to do.
*/
if (ftrace_hash_empty(hash))
- return;
+ return false;
}
do_for_each_ftrace_rec(pg, rec) {
@@ -1706,7 +1704,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (inc) {
rec->flags++;
if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
- return;
+ return false;
/*
* If there's only a single callback registered to a
@@ -1732,7 +1730,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
rec->flags |= FTRACE_FL_REGS;
} else {
if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
- return;
+ return false;
rec->flags--;
/*
@@ -1765,22 +1763,28 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
*/
}
count++;
+
+ /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */
+ update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE;
+
/* Shortcut, if we handled all records, we are done. */
if (!all && count == hash->count)
- return;
+ return update;
} while_for_each_ftrace_rec();
+
+ return update;
}
-static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
+static bool ftrace_hash_rec_disable(struct ftrace_ops *ops,
int filter_hash)
{
- __ftrace_hash_rec_update(ops, filter_hash, 0);
+ return __ftrace_hash_rec_update(ops, filter_hash, 0);
}
-static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
+static bool ftrace_hash_rec_enable(struct ftrace_ops *ops,
int filter_hash)
{
- __ftrace_hash_rec_update(ops, filter_hash, 1);
+ return __ftrace_hash_rec_update(ops, filter_hash, 1);
}
static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
@@ -2326,8 +2330,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
if (rec->flags & FTRACE_FL_TRAMP_EN) {
ops = ftrace_find_tramp_ops_curr(rec);
if (FTRACE_WARN_ON(!ops)) {
- pr_warning("Bad trampoline accounting at: %p (%pS)\n",
- (void *)rec->ip, (void *)rec->ip);
+ pr_warn("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
/* Ftrace is shutting down, return anything */
return (unsigned long)FTRACE_ADDR;
}
@@ -2656,7 +2660,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return ret;
ftrace_start_up++;
- command |= FTRACE_UPDATE_CALLS;
/*
* Note that ftrace probes uses this to start up
@@ -2677,7 +2680,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return ret;
}
- ftrace_hash_rec_enable(ops, 1);
+ if (ftrace_hash_rec_enable(ops, 1))
+ command |= FTRACE_UPDATE_CALLS;
ftrace_startup_enable(command);
@@ -2707,11 +2711,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
/* Disabling ipmodify never fails */
ftrace_hash_ipmodify_disable(ops);
- ftrace_hash_rec_disable(ops, 1);
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ if (ftrace_hash_rec_disable(ops, 1))
+ command |= FTRACE_UPDATE_CALLS;
- command |= FTRACE_UPDATE_CALLS;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
if (saved_ftrace_func != ftrace_trace_function) {
saved_ftrace_func = ftrace_trace_function;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 81b87451c0ea..0c7dee221dca 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,5 +15,6 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 95181e36891a..9c143739b8d7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -437,7 +437,7 @@ struct ring_buffer_per_cpu {
raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
- unsigned int nr_pages;
+ unsigned long nr_pages;
unsigned int current_context;
struct list_head *pages;
struct buffer_page *head_page; /* read from head */
@@ -458,7 +458,7 @@ struct ring_buffer_per_cpu {
u64 write_stamp;
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
- int nr_pages_to_update;
+ long nr_pages_to_update;
struct list_head new_pages; /* new pages to add */
struct work_struct update_pages_work;
struct completion update_done;
@@ -1128,10 +1128,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
return 0;
}
-static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
{
- int i;
struct buffer_page *bpage, *tmp;
+ long i;
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -1168,7 +1168,7 @@ free_pages:
}
static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned nr_pages)
+ unsigned long nr_pages)
{
LIST_HEAD(pages);
@@ -1193,7 +1193,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
}
static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
@@ -1293,8 +1293,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
struct lock_class_key *key)
{
struct ring_buffer *buffer;
+ long nr_pages;
int bsize;
- int cpu, nr_pages;
+ int cpu;
/* keep it in its own cache line */
buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1420,12 +1421,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
}
static int
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
struct buffer_page *to_remove_page, *tmp_iter_page;
struct buffer_page *last_page, *first_page;
- unsigned int nr_removed;
+ unsigned long nr_removed;
unsigned long head_bit;
int page_entries;
@@ -1642,7 +1643,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
int cpu_id)
{
struct ring_buffer_per_cpu *cpu_buffer;
- unsigned nr_pages;
+ unsigned long nr_pages;
int cpu, err = 0;
/*
@@ -1656,14 +1657,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return size;
- size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
- size *= BUF_PAGE_SIZE;
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
/* we need a minimum of two pages */
- if (size < BUF_PAGE_SIZE * 2)
- size = BUF_PAGE_SIZE * 2;
+ if (nr_pages < 2)
+ nr_pages = 2;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size = nr_pages * BUF_PAGE_SIZE;
/*
* Don't succeed if resizing is disabled, as a reader might be
@@ -4640,8 +4640,9 @@ static int rb_cpu_notify(struct notifier_block *self,
struct ring_buffer *buffer =
container_of(self, struct ring_buffer, cpu_notify);
long cpu = (long)hcpu;
- int cpu_i, nr_pages_same;
- unsigned int nr_pages;
+ long nr_pages_same;
+ int cpu_i;
+ unsigned long nr_pages;
switch (action) {
case CPU_UP_PREPARE:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d9293402ee68..a2f0b9f33e9b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -74,11 +74,6 @@ static struct tracer_opt dummy_tracer_opt[] = {
{ }
};
-static struct tracer_flags dummy_tracer_flags = {
- .val = 0,
- .opts = dummy_tracer_opt
-};
-
static int
dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
@@ -1258,12 +1253,22 @@ int __init register_tracer(struct tracer *type)
if (!type->set_flag)
type->set_flag = &dummy_set_flag;
- if (!type->flags)
- type->flags = &dummy_tracer_flags;
- else
+ if (!type->flags) {
+ /*allocate a dummy tracer_flags*/
+ type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL);
+ if (!type->flags) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ type->flags->val = 0;
+ type->flags->opts = dummy_tracer_opt;
+ } else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
+ /* store the tracer for __set_tracer_option */
+ type->flags->trace = type;
+
ret = run_tracer_selftest(type);
if (ret < 0)
goto out;
@@ -1659,6 +1664,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#else
TRACE_FLAG_IRQS_NOSUPPORT |
#endif
+ ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
@@ -2071,20 +2077,20 @@ void trace_printk_init_buffers(void)
/* trace_printk() is for debug use only. Don't use it in production. */
- pr_warning("\n");
- pr_warning("**********************************************************\n");
- pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
- pr_warning("** **\n");
- pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
- pr_warning("** **\n");
- pr_warning("** This means that this is a DEBUG kernel and it is **\n");
- pr_warning("** unsafe for production use. **\n");
- pr_warning("** **\n");
- pr_warning("** If you see this message and you are not debugging **\n");
- pr_warning("** the kernel, report this immediately to your vendor! **\n");
- pr_warning("** **\n");
- pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
- pr_warning("**********************************************************\n");
+ pr_warn("\n");
+ pr_warn("**********************************************************\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("** **\n");
+ pr_warn("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warn("** **\n");
+ pr_warn("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warn("** unsafe for production use. **\n");
+ pr_warn("** **\n");
+ pr_warn("** If you see this message and you are not debugging **\n");
+ pr_warn("** the kernel, report this immediately to your vendor! **\n");
+ pr_warn("** **\n");
+ pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warn("**********************************************************\n");
/* Expand the buffers to set size */
tracing_update_buffers();
@@ -3505,7 +3511,7 @@ static int __set_tracer_option(struct trace_array *tr,
struct tracer_flags *tracer_flags,
struct tracer_opt *opts, int neg)
{
- struct tracer *trace = tr->current_trace;
+ struct tracer *trace = tracer_flags->trace;
int ret;
ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg);
@@ -4101,7 +4107,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
*/
map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
if (!map_array) {
- pr_warning("Unable to allocate trace enum mapping\n");
+ pr_warn("Unable to allocate trace enum mapping\n");
return;
}
@@ -4949,7 +4955,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
spd.nr_pages = i;
- ret = splice_to_pipe(pipe, &spd);
+ if (i)
+ ret = splice_to_pipe(pipe, &spd);
+ else
+ ret = 0;
out:
splice_shrink_spd(&spd);
return ret;
@@ -6131,7 +6140,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
snprintf(cpu_dir, 30, "cpu%ld", cpu);
d_cpu = tracefs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
- pr_warning("Could not create tracefs '%s' entry\n", cpu_dir);
+ pr_warn("Could not create tracefs '%s' entry\n", cpu_dir);
return;
}
@@ -6318,7 +6327,7 @@ struct dentry *trace_create_file(const char *name,
ret = tracefs_create_file(name, mode, parent, data, fops);
if (!ret)
- pr_warning("Could not create tracefs '%s' entry\n", name);
+ pr_warn("Could not create tracefs '%s' entry\n", name);
return ret;
}
@@ -6337,7 +6346,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr)
tr->options = tracefs_create_dir("options", d_tracer);
if (!tr->options) {
- pr_warning("Could not create tracefs directory 'options'\n");
+ pr_warn("Could not create tracefs directory 'options'\n");
return NULL;
}
@@ -6391,11 +6400,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
return;
for (i = 0; i < tr->nr_topts; i++) {
- /*
- * Check if these flags have already been added.
- * Some tracers share flags.
- */
- if (tr->topts[i].tracer->flags == tracer->flags)
+ /* Make sure there's no duplicate flags. */
+ if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags))
return;
}
@@ -7248,8 +7254,8 @@ __init static int tracer_alloc_buffers(void)
if (trace_boot_clock) {
ret = tracing_set_clock(&global_trace, trace_boot_clock);
if (ret < 0)
- pr_warning("Trace clock %s not defined, going back to default\n",
- trace_boot_clock);
+ pr_warn("Trace clock %s not defined, going back to default\n",
+ trace_boot_clock);
}
/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8414fa40bf27..3fff4adfd431 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -125,6 +125,7 @@ enum trace_flag_type {
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_PREEMPT_RESCHED = 0x20,
+ TRACE_FLAG_NMI = 0x40,
};
#define TRACE_BUF_SIZE 1024
@@ -345,6 +346,7 @@ struct tracer_opt {
struct tracer_flags {
u32 val;
struct tracer_opt *opts;
+ struct tracer *trace;
};
/* Makes more easy to define a tracer opt */
@@ -1111,6 +1113,18 @@ struct filter_pred {
unsigned short right;
};
+static inline bool is_string_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_DYN_STRING ||
+ field->filter_type == FILTER_STATIC_STRING ||
+ field->filter_type == FILTER_PTR_STRING;
+}
+
+static inline bool is_function_field(struct ftrace_event_field *field)
+{
+ return field->filter_type == FILTER_TRACE_FN;
+}
+
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct trace_event_file *file,
@@ -1159,9 +1173,24 @@ struct event_trigger_data {
struct event_filter __rcu *filter;
char *filter_str;
void *private_data;
+ bool paused;
struct list_head list;
};
+extern void trigger_data_free(struct event_trigger_data *data);
+extern int event_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data);
+extern int trace_event_trigger_enable_disable(struct trace_event_file *file,
+ int trigger_enable);
+extern void update_cond_flag(struct trace_event_file *file);
+extern void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct trace_event_file *file);
+extern int set_trigger_filter(char *filter_str,
+ struct event_trigger_data *trigger_data,
+ struct trace_event_file *file);
+extern int register_event_command(struct event_command *cmd);
+
/**
* struct event_trigger_ops - callbacks for trace event triggers
*
@@ -1174,7 +1203,8 @@ struct event_trigger_data {
* @func: The trigger 'probe' function called when the triggering
* event occurs. The data passed into this callback is the data
* that was supplied to the event_command @reg() function that
- * registered the trigger (see struct event_command).
+ * registered the trigger (see struct event_command) along with
+ * the trace record, rec.
*
* @init: An optional initialization function called for the trigger
* when the trigger is registered (via the event_command reg()
@@ -1199,7 +1229,8 @@ struct event_trigger_data {
* (see trace_event_triggers.c).
*/
struct event_trigger_ops {
- void (*func)(struct event_trigger_data *data);
+ void (*func)(struct event_trigger_data *data,
+ void *rec);
int (*init)(struct event_trigger_ops *ops,
struct event_trigger_data *data);
void (*free)(struct event_trigger_ops *ops,
@@ -1243,27 +1274,10 @@ struct event_trigger_ops {
* values are defined by adding new values to the trigger_type
* enum in include/linux/trace_events.h.
*
- * @post_trigger: A flag that says whether or not this command needs
- * to have its action delayed until after the current event has
- * been closed. Some triggers need to avoid being invoked while
- * an event is currently in the process of being logged, since
- * the trigger may itself log data into the trace buffer. Thus
- * we make sure the current event is committed before invoking
- * those triggers. To do that, the trigger invocation is split
- * in two - the first part checks the filter using the current
- * trace record; if a command has the @post_trigger flag set, it
- * sets a bit for itself in the return value, otherwise it
- * directly invokes the trigger. Once all commands have been
- * either invoked or set their return flag, the current record is
- * either committed or discarded. At that point, if any commands
- * have deferred their triggers, those commands are finally
- * invoked following the close of the current event. In other
- * words, if the event_trigger_ops @func() probe implementation
- * itself logs to the trace buffer, this flag should be set,
- * otherwise it can be left unspecified.
+ * @flags: See the enum event_command_flags below.
*
- * All the methods below, except for @set_filter(), must be
- * implemented.
+ * All the methods below, except for @set_filter() and @unreg_all(),
+ * must be implemented.
*
* @func: The callback function responsible for parsing and
* registering the trigger written to the 'trigger' file by the
@@ -1288,6 +1302,10 @@ struct event_trigger_ops {
* This is usually implemented by the generic utility function
* @unregister_trigger() (see trace_event_triggers.c).
*
+ * @unreg_all: An optional function called to remove all the triggers
+ * from the list of triggers associated with the event. Called
+ * when a trigger file is opened in truncate mode.
+ *
* @set_filter: An optional function called to parse and set a filter
* for the trigger. If no @set_filter() method is set for the
* event command, filters set by the user for the command will be
@@ -1301,7 +1319,7 @@ struct event_command {
struct list_head list;
char *name;
enum event_trigger_type trigger_type;
- bool post_trigger;
+ int flags;
int (*func)(struct event_command *cmd_ops,
struct trace_event_file *file,
char *glob, char *cmd, char *params);
@@ -1313,12 +1331,56 @@ struct event_command {
struct event_trigger_ops *ops,
struct event_trigger_data *data,
struct trace_event_file *file);
+ void (*unreg_all)(struct trace_event_file *file);
int (*set_filter)(char *filter_str,
struct event_trigger_data *data,
struct trace_event_file *file);
struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param);
};
+/**
+ * enum event_command_flags - flags for struct event_command
+ *
+ * @POST_TRIGGER: A flag that says whether or not this command needs
+ * to have its action delayed until after the current event has
+ * been closed. Some triggers need to avoid being invoked while
+ * an event is currently in the process of being logged, since
+ * the trigger may itself log data into the trace buffer. Thus
+ * we make sure the current event is committed before invoking
+ * those triggers. To do that, the trigger invocation is split
+ * in two - the first part checks the filter using the current
+ * trace record; if a command has the @POST_TRIGGER flag set, it
+ * sets a bit for itself in the return value, otherwise it
+ * directly invokes the trigger. Once all commands have been
+ * either invoked or set their return flag, the current record is
+ * either committed or discarded. At that point, if any commands
+ * have deferred their triggers, those commands are finally
+ * invoked following the close of the current event. In other
+ * words, if the event_trigger_ops @func() probe implementation
+ * itself logs to the trace buffer, this flag should be set,
+ * otherwise it can be left unspecified.
+ *
+ * @NEEDS_REC: A flag that says whether or not this command needs
+ * access to the trace record in order to perform its function,
+ * regardless of whether or not it has a filter associated with
+ * it (filters make a trigger require access to the trace record
+ * but are not always present).
+ */
+enum event_command_flags {
+ EVENT_CMD_FL_POST_TRIGGER = 1,
+ EVENT_CMD_FL_NEEDS_REC = 2,
+};
+
+static inline bool event_command_post_trigger(struct event_command *cmd_ops)
+{
+ return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER;
+}
+
+static inline bool event_command_needs_rec(struct event_command *cmd_ops)
+{
+ return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC;
+}
+
extern int trace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable);
extern int tracing_alloc_snapshot(void);
@@ -1365,8 +1427,13 @@ int perf_ftrace_event_register(struct trace_event_call *call,
#ifdef CONFIG_FTRACE_SYSCALLS
void init_ftrace_syscalls(void);
+const char *get_syscall_name(int syscall);
#else
static inline void init_ftrace_syscalls(void) { }
+static inline const char *get_syscall_name(int syscall)
+{
+ return NULL;
+}
#endif
#ifdef CONFIG_EVENT_TRACING
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..562fa69df5d3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!is_sampling_event(p_event))
+ return 0;
+
/*
* We don't allow user space callchains for function trace
* event, due to issues with page faults while tracing page
@@ -260,42 +263,43 @@ void perf_trace_del(struct perf_event *p_event, int flags)
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
-void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs **regs, int *rctxp)
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
- struct trace_entry *entry;
- unsigned long flags;
char *raw_data;
- int pc;
+ int rctx;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
- "perf buffer not large enough"))
+ "perf buffer not large enough"))
return NULL;
- pc = preempt_count();
-
- *rctxp = perf_swevent_get_recursion_context();
- if (*rctxp < 0)
+ *rctxp = rctx = perf_swevent_get_recursion_context();
+ if (rctx < 0)
return NULL;
if (regs)
- *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
- raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+ *regs = this_cpu_ptr(&__perf_regs[rctx]);
+ raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
/* zero the dead bytes from align to not leak stack to user */
memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+ return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
+NOKPROBE_SYMBOL(perf_trace_buf_alloc);
+
+void perf_trace_buf_update(void *record, u16 type)
+{
+ struct trace_entry *entry = record;
+ int pc = preempt_count();
+ unsigned long flags;
- entry = (struct trace_entry *)raw_data;
local_save_flags(flags);
tracing_generic_entry_update(entry, flags, pc);
entry->type = type;
-
- return raw_data;
}
-EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
-NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_update);
#ifdef CONFIG_FUNCTION_TRACER
static void
@@ -316,15 +320,16 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
+ memset(&regs, 0, sizeof(regs));
perf_fetch_caller_regs(&regs);
- entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+ entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
if (!entry)
return;
entry->ip = ip;
entry->parent_ip = parent_ip;
- perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+ perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
1, &regs, head, NULL);
#undef ENTRY_SIZE
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..b7b0760ba6ee 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
}
}
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+ struct ftrace_event_field *tail;
+ struct list_head *head;
+
+ head = trace_get_fields(call);
+ /*
+ * head->next points to the last field with the largest offset,
+ * since it was added last by trace_define_field()
+ */
+ tail = list_first_entry(head, struct ftrace_event_field, link);
+ return tail->offset + tail->size;
+}
+
int trace_event_raw_init(struct trace_event_call *call)
{
int id;
@@ -2095,8 +2113,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("filter", 0644, file->dir, file,
&ftrace_event_filter_fops);
- trace_create_file("trigger", 0644, file->dir, file,
- &event_trigger_fops);
+ /*
+ * Only event directories that can be enabled should have
+ * triggers.
+ */
+ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+ trace_create_file("trigger", 0644, file->dir, file,
+ &event_trigger_fops);
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 6816302542b2..b3f5051cd4e9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -961,18 +961,6 @@ int filter_assign_type(const char *type)
return FILTER_OTHER;
}
-static bool is_function_field(struct ftrace_event_field *field)
-{
- return field->filter_type == FILTER_TRACE_FN;
-}
-
-static bool is_string_field(struct ftrace_event_field *field)
-{
- return field->filter_type == FILTER_DYN_STRING ||
- field->filter_type == FILTER_STATIC_STRING ||
- field->filter_type == FILTER_PTR_STRING;
-}
-
static bool is_legal_op(struct ftrace_event_field *field, int op)
{
if (is_string_field(field) &&
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index b38f617b6181..d67992f3bb0e 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -28,8 +28,7 @@
static LIST_HEAD(trigger_commands);
static DEFINE_MUTEX(trigger_cmd_mutex);
-static void
-trigger_data_free(struct event_trigger_data *data)
+void trigger_data_free(struct event_trigger_data *data)
{
if (data->cmd_ops->set_filter)
data->cmd_ops->set_filter(NULL, data, NULL);
@@ -73,18 +72,20 @@ event_triggers_call(struct trace_event_file *file, void *rec)
return tt;
list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->paused)
+ continue;
if (!rec) {
- data->ops->func(data);
+ data->ops->func(data, rec);
continue;
}
filter = rcu_dereference_sched(data->filter);
if (filter && !filter_match_preds(filter, rec))
continue;
- if (data->cmd_ops->post_trigger) {
+ if (event_command_post_trigger(data->cmd_ops)) {
tt |= data->cmd_ops->trigger_type;
continue;
}
- data->ops->func(data);
+ data->ops->func(data, rec);
}
return tt;
}
@@ -94,6 +95,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
* event_triggers_post_call - Call 'post_triggers' for a trace event
* @file: The trace_event_file associated with the event
* @tt: enum event_trigger_type containing a set bit for each trigger to invoke
+ * @rec: The trace entry for the event
*
* For each trigger associated with an event, invoke the trigger
* function registered with the associated trigger command, if the
@@ -104,13 +106,16 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
*/
void
event_triggers_post_call(struct trace_event_file *file,
- enum event_trigger_type tt)
+ enum event_trigger_type tt,
+ void *rec)
{
struct event_trigger_data *data;
list_for_each_entry_rcu(data, &file->triggers, list) {
+ if (data->paused)
+ continue;
if (data->cmd_ops->trigger_type & tt)
- data->ops->func(data);
+ data->ops->func(data, rec);
}
}
EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -188,6 +193,19 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
return -ENODEV;
}
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC)) {
+ struct trace_event_file *event_file;
+ struct event_command *p;
+
+ event_file = event_file_data(file);
+
+ list_for_each_entry(p, &trigger_commands, list) {
+ if (p->unreg_all)
+ p->unreg_all(event_file);
+ }
+ }
+
if (file->f_mode & FMODE_READ) {
ret = seq_open(file, &event_triggers_seq_ops);
if (!ret) {
@@ -306,7 +324,7 @@ const struct file_operations event_trigger_fops = {
* Currently we only register event commands from __init, so mark this
* __init too.
*/
-static __init int register_event_command(struct event_command *cmd)
+__init int register_event_command(struct event_command *cmd)
{
struct event_command *p;
int ret = 0;
@@ -395,9 +413,8 @@ event_trigger_print(const char *name, struct seq_file *m,
*
* Return: 0 on success, errno otherwise
*/
-static int
-event_trigger_init(struct event_trigger_ops *ops,
- struct event_trigger_data *data)
+int event_trigger_init(struct event_trigger_ops *ops,
+ struct event_trigger_data *data)
{
data->ref++;
return 0;
@@ -425,8 +442,8 @@ event_trigger_free(struct event_trigger_ops *ops,
trigger_data_free(data);
}
-static int trace_event_trigger_enable_disable(struct trace_event_file *file,
- int trigger_enable)
+int trace_event_trigger_enable_disable(struct trace_event_file *file,
+ int trigger_enable)
{
int ret = 0;
@@ -483,13 +500,14 @@ clear_event_triggers(struct trace_array *tr)
* its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be
* cleared.
*/
-static void update_cond_flag(struct trace_event_file *file)
+void update_cond_flag(struct trace_event_file *file)
{
struct event_trigger_data *data;
bool set_cond = false;
list_for_each_entry_rcu(data, &file->triggers, list) {
- if (data->filter || data->cmd_ops->post_trigger) {
+ if (data->filter || event_command_post_trigger(data->cmd_ops) ||
+ event_command_needs_rec(data->cmd_ops)) {
set_cond = true;
break;
}
@@ -560,9 +578,9 @@ out:
* Usually used directly as the @unreg method in event command
* implementations.
*/
-static void unregister_trigger(char *glob, struct event_trigger_ops *ops,
- struct event_trigger_data *test,
- struct trace_event_file *file)
+void unregister_trigger(char *glob, struct event_trigger_ops *ops,
+ struct event_trigger_data *test,
+ struct trace_event_file *file)
{
struct event_trigger_data *data;
bool unregistered = false;
@@ -696,9 +714,9 @@ event_trigger_callback(struct event_command *cmd_ops,
*
* Return: 0 on success, errno otherwise
*/
-static int set_trigger_filter(char *filter_str,
- struct event_trigger_data *trigger_data,
- struct trace_event_file *file)
+int set_trigger_filter(char *filter_str,
+ struct event_trigger_data *trigger_data,
+ struct trace_event_file *file)
{
struct event_trigger_data *data = trigger_data;
struct event_filter *filter = NULL, *tmp;
@@ -747,7 +765,7 @@ static int set_trigger_filter(char *filter_str,
}
static void
-traceon_trigger(struct event_trigger_data *data)
+traceon_trigger(struct event_trigger_data *data, void *rec)
{
if (tracing_is_on())
return;
@@ -756,7 +774,7 @@ traceon_trigger(struct event_trigger_data *data)
}
static void
-traceon_count_trigger(struct event_trigger_data *data)
+traceon_count_trigger(struct event_trigger_data *data, void *rec)
{
if (tracing_is_on())
return;
@@ -771,7 +789,7 @@ traceon_count_trigger(struct event_trigger_data *data)
}
static void
-traceoff_trigger(struct event_trigger_data *data)
+traceoff_trigger(struct event_trigger_data *data, void *rec)
{
if (!tracing_is_on())
return;
@@ -780,7 +798,7 @@ traceoff_trigger(struct event_trigger_data *data)
}
static void
-traceoff_count_trigger(struct event_trigger_data *data)
+traceoff_count_trigger(struct event_trigger_data *data, void *rec)
{
if (!tracing_is_on())
return;
@@ -876,13 +894,13 @@ static struct event_command trigger_traceoff_cmd = {
#ifdef CONFIG_TRACER_SNAPSHOT
static void
-snapshot_trigger(struct event_trigger_data *data)
+snapshot_trigger(struct event_trigger_data *data, void *rec)
{
tracing_snapshot();
}
static void
-snapshot_count_trigger(struct event_trigger_data *data)
+snapshot_count_trigger(struct event_trigger_data *data, void *rec)
{
if (!data->count)
return;
@@ -890,7 +908,7 @@ snapshot_count_trigger(struct event_trigger_data *data)
if (data->count != -1)
(data->count)--;
- snapshot_trigger(data);
+ snapshot_trigger(data, rec);
}
static int
@@ -969,13 +987,13 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
#define STACK_SKIP 3
static void
-stacktrace_trigger(struct event_trigger_data *data)
+stacktrace_trigger(struct event_trigger_data *data, void *rec)
{
trace_dump_stack(STACK_SKIP);
}
static void
-stacktrace_count_trigger(struct event_trigger_data *data)
+stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
{
if (!data->count)
return;
@@ -983,7 +1001,7 @@ stacktrace_count_trigger(struct event_trigger_data *data)
if (data->count != -1)
(data->count)--;
- stacktrace_trigger(data);
+ stacktrace_trigger(data, rec);
}
static int
@@ -1017,7 +1035,7 @@ stacktrace_get_trigger_ops(char *cmd, char *param)
static struct event_command trigger_stacktrace_cmd = {
.name = "stacktrace",
.trigger_type = ETT_STACKTRACE,
- .post_trigger = true,
+ .flags = EVENT_CMD_FL_POST_TRIGGER,
.func = event_trigger_callback,
.reg = register_trigger,
.unreg = unregister_trigger,
@@ -1054,7 +1072,7 @@ struct enable_trigger_data {
};
static void
-event_enable_trigger(struct event_trigger_data *data)
+event_enable_trigger(struct event_trigger_data *data, void *rec)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1065,7 +1083,7 @@ event_enable_trigger(struct event_trigger_data *data)
}
static void
-event_enable_count_trigger(struct event_trigger_data *data)
+event_enable_count_trigger(struct event_trigger_data *data, void *rec)
{
struct enable_trigger_data *enable_data = data->private_data;
@@ -1079,7 +1097,7 @@ event_enable_count_trigger(struct event_trigger_data *data)
if (data->count != -1)
(data->count)--;
- event_enable_trigger(data);
+ event_enable_trigger(data, rec);
}
static int
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index fcd41a166405..5a095c2e4b69 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -219,6 +219,8 @@ static void tracing_stop_function_trace(struct trace_array *tr)
unregister_ftrace_function(tr->ops);
}
+static struct tracer function_trace;
+
static int
func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
{
@@ -228,6 +230,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
break;
+ /* We can change this flag when not running. */
+ if (tr->current_trace != &function_trace)
+ break;
+
unregister_ftrace_function(tr->ops);
if (set) {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index a663cbb84107..3a0244ff7ea8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -8,6 +8,7 @@
*/
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -1350,7 +1351,7 @@ void graph_trace_open(struct trace_iterator *iter)
out_err_free:
kfree(data);
out_err:
- pr_warning("function graph tracer: not enough memory\n");
+ pr_warn("function graph tracer: not enough memory\n");
}
void graph_trace_close(struct trace_iterator *iter)
@@ -1468,12 +1469,12 @@ static __init int init_graph_trace(void)
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
if (!register_trace_event(&graph_trace_entry_event)) {
- pr_warning("Warning: could not register graph trace events\n");
+ pr_warn("Warning: could not register graph trace events\n");
return 1;
}
if (!register_trace_event(&graph_trace_ret_event)) {
- pr_warning("Warning: could not register graph trace events\n");
+ pr_warn("Warning: could not register graph trace events\n");
return 1;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index e4e56589ec1d..03cdff84d026 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr,
return 0;
local_save_flags(*flags);
- /* slight chance to get a false positive on tracing_cpu */
- if (!irqs_disabled_flags(*flags))
+ /*
+ * Slight chance to get a false positive on tracing_cpu,
+ * although I'm starting to think there isn't a chance.
+ * Leave this for now just to be paranoid.
+ */
+ if (!irqs_disabled_flags(*flags) && !preempt_count())
return 0;
*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
@@ -622,7 +626,6 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
- tracing_reset_online_cpus(&tr->trace_buffer);
ftrace_init_array_ops(tr, irqsoff_tracer_call);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 21b81a41dae5..5546eec0505f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -459,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk)
if (ret == 0)
tk->tp.flags |= TP_FLAG_REGISTERED;
else {
- pr_warning("Could not insert probe at %s+%lu: %d\n",
- trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
+ pr_warn("Could not insert probe at %s+%lu: %d\n",
+ trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret);
if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) {
- pr_warning("This probe might be able to register after"
- "target module is loaded. Continue.\n");
+ pr_warn("This probe might be able to register after target module is loaded. Continue.\n");
ret = 0;
} else if (ret == -EILSEQ) {
- pr_warning("Probing address(0x%p) is not an "
- "instruction boundary.\n",
- tk->rp.kp.addr);
+ pr_warn("Probing address(0x%p) is not an instruction boundary.\n",
+ tk->rp.kp.addr);
ret = -EINVAL;
}
}
@@ -529,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
/* Register new event */
ret = register_kprobe_event(tk);
if (ret) {
- pr_warning("Failed to register probe event(%d)\n", ret);
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
@@ -564,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
__unregister_trace_kprobe(tk);
ret = __register_trace_kprobe(tk);
if (ret)
- pr_warning("Failed to re-register probe %s on"
- "%s: %d\n",
- trace_event_name(&tk->tp.call),
- mod->name, ret);
+ pr_warn("Failed to re-register probe %s on %s: %d\n",
+ trace_event_name(&tk->tp.call),
+ mod->name, ret);
}
}
mutex_unlock(&probe_lock);
@@ -1152,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
return;
entry->ip = (unsigned long)tk->rp.kp.addr;
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
- perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1187,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
return;
entry->func = (unsigned long)tk->rp.kp.addr;
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
- perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
@@ -1336,16 +1335,14 @@ static __init int init_kprobe_trace(void)
/* Event list interface */
if (!entry)
- pr_warning("Could not create tracefs "
- "'kprobe_events' entry\n");
+ pr_warn("Could not create tracefs 'kprobe_events' entry\n");
/* Profile interface */
entry = tracefs_create_file("kprobe_profile", 0444, d_tracer,
NULL, &kprobe_profile_ops);
if (!entry)
- pr_warning("Could not create tracefs "
- "'kprobe_profile' entry\n");
+ pr_warn("Could not create tracefs 'kprobe_profile' entry\n");
return 0;
}
fs_initcall(init_kprobe_trace);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 2be8c4f2403d..68f376ca6d3f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
/* XXX: This is later than where events were lost. */
trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
if (!overrun_detected)
- pr_warning("mmiotrace has lost events.\n");
+ pr_warn("mmiotrace has lost events\n");
overrun_detected = true;
goto print_out;
}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 8bb2071474dd..49f61fe96a6b 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -56,7 +56,7 @@ static void nop_trace_reset(struct trace_array *tr)
}
/* It only serves as a signal handler and a callback to
- * accept or refuse tthe setting of a flag.
+ * accept or refuse the setting of a flag.
* If you don't implement it, then the flag setting will be
* automatically accepted.
*/
@@ -75,7 +75,7 @@ static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (bit == TRACE_NOP_OPT_REFUSE) {
printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
- "Now cat trace_options to see the result\n",
+ " Now cat trace_options to see the result\n",
set);
return -EINVAL;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 282982195e09..0bb9cf2d53e6 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -389,7 +389,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
char irqs_off;
int hardirq;
int softirq;
+ int nmi;
+ nmi = entry->flags & TRACE_FLAG_NMI;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
@@ -415,10 +417,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
}
hardsoft_irq =
+ (nmi && hardirq) ? 'Z' :
+ nmi ? 'z' :
(hardirq && softirq) ? 'H' :
- hardirq ? 'h' :
- softirq ? 's' :
- '.';
+ hardirq ? 'h' :
+ softirq ? 's' :
+ '.' ;
trace_seq_printf(s, "%c%c%c",
irqs_off, need_resched, hardsoft_irq);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 060df67dbdd1..f96f0383f6c6 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -296,6 +296,9 @@ static int t_show(struct seq_file *m, void *v)
const char *str = *fmt;
int i;
+ if (!*fmt)
+ return 0;
+
seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
/*
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 1769a81da8a7..1d372fa6fefb 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
*tmp = '\0';
size = tmp - kbuf + 1;
} else if (done + size < count) {
- pr_warning("Line length is too long: "
- "Should be less than %d.", WRITE_BUFSIZE);
+ pr_warn("Line length is too long: Should be less than %d\n",
+ WRITE_BUFSIZE);
ret = -EINVAL;
goto out;
}
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 6cf935316769..413ff108fbd0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -281,8 +281,7 @@ static int tracing_stat_init(void)
stat_dir = tracefs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
- pr_warning("Could not create tracefs "
- "'trace_stat' entry\n");
+ pr_warn("Could not create tracefs 'trace_stat' entry\n");
return 0;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d1663083d903..b2b6efc083a4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -106,6 +106,17 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
return syscalls_metadata[nr];
}
+const char *get_syscall_name(int syscall)
+{
+ struct syscall_metadata *entry;
+
+ entry = syscall_nr_to_meta(syscall);
+ if (!entry)
+ return NULL;
+
+ return entry->name;
+}
+
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
@@ -576,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = ALIGN(size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
- sys_data->enter_event->event.type, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+ perf_trace_buf_submit(rec, size, rctx,
+ sys_data->enter_event->event.type, 1, regs,
+ head, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -649,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
- sys_data->exit_event->event.type, NULL, &rctx);
+ rec = perf_trace_buf_alloc(size, NULL, &rctx);
if (!rec)
return;
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+ perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+ 1, regs, head, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index d2f6d0be3503..c53485441c88 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
ret = register_uprobe_event(tu);
if (ret) {
- pr_warning("Failed to register probe event(%d)\n", ret);
+ pr_warn("Failed to register probe event(%d)\n", ret);
goto end;
}
@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
if (hlist_empty(head))
goto out;
- entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, NULL, &rctx);
if (!entry)
goto out;
@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
memset(data + len, 0, size - esize - len);
}
- perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+ perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+ head, NULL);
out:
preempt_enable();
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ecd536de603a..d0639d917899 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -491,7 +491,7 @@ static __init int init_tracepoints(void)
ret = register_module_notifier(&tracepoint_module_nb);
if (ret)
- pr_warning("Failed to register tracepoint module enter notifier\n");
+ pr_warn("Failed to register tracepoint module enter notifier\n");
return ret;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b3ace6ebbba3..9acb29f280ec 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
* both lockup detectors are disabled if proc_watchdog_update()
* returns an error.
*/
+ if (old == new)
+ goto out;
+
err = proc_watchdog_update();
}
out:
@@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write,
int proc_watchdog_thresh(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- int err, old;
+ int err, old, new;
get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
@@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
/*
* Update the sample period. Restore on failure.
*/
+ new = ACCESS_ONCE(watchdog_thresh);
+ if (old == new)
+ goto out;
+
set_sample_period();
err = proc_watchdog_update();
if (err) {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 16e13d8628a3..5f5068e94003 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -666,6 +666,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ /*
+ * The following mb guarantees that previous clear of a PENDING bit
+ * will not be reordered with any speculative LOADS or STORES from
+ * work->current_func, which is executed afterwards. This possible
+ * reordering can lead to a missed execution on attempt to queue
+ * the same @work. E.g. consider this case:
+ *
+ * CPU#0 CPU#1
+ * ---------------------------- --------------------------------
+ *
+ * 1 STORE event_indicated
+ * 2 queue_work_on() {
+ * 3 test_and_set_bit(PENDING)
+ * 4 } set_..._and_clear_pending() {
+ * 5 set_work_data() # clear bit
+ * 6 smp_mb()
+ * 7 work->current_func() {
+ * 8 LOAD event_indicated
+ * }
+ *
+ * Without an explicit full barrier speculative LOAD on line 8 can
+ * be executed before CPU#0 does STORE on line 1. If that happens,
+ * CPU#0 observes the PENDING bit is still set and a new execution of
+ * @work is not queued, in the hope that CPU#1 will eventually
+ * finish the queued @work. Meanwhile CPU#1 does not see
+ * event_indicated is set, because speculative LOAD was executed
+ * before actual STORE.
+ */
+ smp_mb();
}
static void clear_work_data(struct work_struct *work)
@@ -857,7 +886,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
/**
* wq_worker_sleeping - a worker is going to sleep
* @task: task going to sleep
- * @cpu: CPU in question, must be the current CPU number
*
* This function is called during schedule() when a busy worker is
* going to sleep. Worker on the same cpu can be woken up by
@@ -869,7 +897,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu)
* Return:
* Worker task on @cpu to wake up, %NULL if none.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
+struct task_struct *wq_worker_sleeping(struct task_struct *task)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
struct worker_pool *pool;
@@ -885,7 +913,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
pool = worker->pool;
/* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
+ if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
return NULL;
/*
@@ -4526,6 +4554,17 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+
+ /*
+ * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+ * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
+ * being reworked and this can go away in time.
+ */
+ if (!(pool->flags & POOL_DISASSOCIATED)) {
+ spin_unlock_irq(&pool->lock);
+ return;
+ }
+
pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
@@ -4695,7 +4734,7 @@ static void work_for_cpu_fn(struct work_struct *work)
}
/**
- * work_on_cpu - run a function in user context on a particular cpu
+ * work_on_cpu - run a function in thread context on a particular cpu
* @cpu: the cpu to run on
* @fn: the function to run
* @arg: the function arg
@@ -5221,8 +5260,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
wq_dev->wq = wq;
wq_dev->dev.bus = &wq_subsys;
- wq_dev->dev.init_name = wq->name;
wq_dev->dev.release = wq_device_release;
+ dev_set_name(&wq_dev->dev, "%s", wq->name);
/*
* unbound_attrs are created separately. Suppress uevent until
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 45215870ac6c..8635417c587b 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void)
* sched/core.c and workqueue.c.
*/
void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */