summary | refs | log | tree | commit | diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 12
-rw-r--r-- kernel/acct.c 2
-rw-r--r-- kernel/async.c 20
-rw-r--r-- kernel/audit.c 10
-rw-r--r-- kernel/audit_watch.c 12
-rw-r--r-- kernel/bpf/arraymap.c 37
-rw-r--r-- kernel/bpf/core.c 101
-rw-r--r-- kernel/bpf/syscall.c 54
-rw-r--r-- kernel/bpf/verifier.c 312
-rw-r--r-- kernel/cgroup.c 24
-rw-r--r-- kernel/configs/android-base.config 160
-rw-r--r-- kernel/configs/android-recommended.config 125
-rw-r--r-- kernel/cpu.c 27
-rw-r--r-- kernel/cpuset.c 55
-rw-r--r-- kernel/debug/kdb/kdb_io.c 2
-rw-r--r-- kernel/events/core.c 55
-rw-r--r-- kernel/exit.c 4
-rw-r--r-- kernel/extable.c 2
-rw-r--r-- kernel/fork.c 117
-rw-r--r-- kernel/futex.c 11
-rw-r--r-- kernel/gcov/Kconfig 1
-rw-r--r-- kernel/gcov/base.c 6
-rw-r--r-- kernel/gcov/gcc_4_7.c 4
-rw-r--r-- kernel/groups.c 5
-rw-r--r-- kernel/irq/cpuhotplug.c 24
-rw-r--r-- kernel/irq/manage.c 4
-rw-r--r-- kernel/irq/proc.c 5
-rw-r--r-- kernel/jump_label.c 2
-rw-r--r-- kernel/kcov.c 431
-rw-r--r-- kernel/kthread.c 104
-rw-r--r-- kernel/locking/Makefile 3
-rw-r--r-- kernel/locking/lockdep.c 11
-rw-r--r-- kernel/locking/locktorture.c 4
-rw-r--r-- kernel/locking/mutex-debug.c 12
-rw-r--r-- kernel/locking/mutex-debug.h 4
-rw-r--r-- kernel/locking/mutex.c 11
-rw-r--r-- kernel/locking/mutex.h 2
-rw-r--r-- kernel/locking/osq_lock.c 22
-rw-r--r-- kernel/locking/rwsem-xadd.c 35
-rw-r--r-- kernel/module.c 37
-rw-r--r-- kernel/panic.c 4
-rw-r--r-- kernel/power/process.c 5
-rw-r--r-- kernel/printk/printk.c 5
-rw-r--r-- kernel/profile.c 4
-rw-r--r-- kernel/ptrace.c 20
-rw-r--r-- kernel/rcu/Makefile 4
-rw-r--r-- kernel/rcu/tree.c 56
-rw-r--r-- kernel/rcu/tree_plugin.h 14
-rw-r--r-- kernel/resource.c 13
-rw-r--r-- kernel/sched/Makefile 7
-rw-r--r-- kernel/sched/auto_group.c 23
-rw-r--r-- kernel/sched/core.c 409
-rw-r--r-- kernel/sched/cpufreq.c 63
-rw-r--r-- kernel/sched/cpufreq_sched.c 499
-rw-r--r-- kernel/sched/cpufreq_schedutil.c 827
-rw-r--r-- kernel/sched/cpupri.c 48
-rw-r--r-- kernel/sched/deadline.c 162
-rw-r--r-- kernel/sched/debug.c 26
-rw-r--r-- kernel/sched/fair.c 2136
-rw-r--r-- kernel/sched/hmp.c 7
-rw-r--r-- kernel/sched/idle_task.c 3
-rw-r--r-- kernel/sched/loadavg.c 4
-rw-r--r-- kernel/sched/rt.c 525
-rw-r--r-- kernel/sched/sched.h 215
-rw-r--r-- kernel/sched/sched_avg.c 46
-rw-r--r-- kernel/sched/stats.c 26
-rw-r--r-- kernel/sched/stop_task.c 3
-rw-r--r-- kernel/sched/tune.c 12
-rw-r--r-- kernel/sched/walt.c 411
-rw-r--r-- kernel/sched/walt.h 4
-rw-r--r-- kernel/seccomp.c 23
-rw-r--r-- kernel/signal.c 42
-rw-r--r-- kernel/softirq.c 32
-rw-r--r-- kernel/sysctl.c 19
-rw-r--r-- kernel/sysctl_binary.c 1
-rw-r--r-- kernel/time/alarmtimer.c 15
-rw-r--r-- kernel/time/hrtimer.c 12
-rw-r--r-- kernel/time/posix-timers.c 34
-rw-r--r-- kernel/time/tick-sched.c 31
-rw-r--r-- kernel/time/timekeeping.c 102
-rw-r--r-- kernel/time/timer.c 2
-rw-r--r-- kernel/trace/Kconfig 11
-rw-r--r-- kernel/trace/Makefile 1
-rw-r--r-- kernel/trace/blktrace.c 32
-rw-r--r-- kernel/trace/ftrace.c 27
-rw-r--r-- kernel/trace/ring_buffer.c 6
-rw-r--r-- kernel/trace/trace.c 157
-rw-r--r-- kernel/trace/trace_events.c 16
-rw-r--r-- kernel/trace/trace_events_filter.c 4
-rw-r--r-- kernel/trace/trace_functions_graph.c 1
-rw-r--r-- kernel/trace/trace_irqsoff.c 133
-rw-r--r-- kernel/trace/trace_kprobe.c 21
-rw-r--r-- kernel/trace/trace_selftest.c 2
-rw-r--r-- kernel/uid16.c 1
-rw-r--r-- kernel/workqueue.c 61
-rw-r--r-- kernel/workqueue_internal.h 3
96 files changed, 5789 insertions, 2450 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf008ecb3..2dea801370f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,6 +19,17 @@ CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -69,6 +80,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 74963d192c5d..37f1dc696fbd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -99,7 +99,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
+ if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
diff --git a/kernel/async.c b/kernel/async.c
index 4c3773c0bf63..f1fd155abff6 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -84,20 +84,24 @@ static atomic_t entry_count;
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct list_head *pending;
+ struct async_entry *first = NULL;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
- if (domain)
- pending = &domain->pending;
- else
- pending = &async_global_pending;
+ if (domain) {
+ if (!list_empty(&domain->pending))
+ first = list_first_entry(&domain->pending,
+ struct async_entry, domain_list);
+ } else {
+ if (!list_empty(&async_global_pending))
+ first = list_first_entry(&async_global_pending,
+ struct async_entry, global_list);
+ }
- if (!list_empty(pending))
- ret = list_first_entry(pending, struct async_entry,
- domain_list)->cookie;
+ if (first)
+ ret = first->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
diff --git a/kernel/audit.c b/kernel/audit.c
index 34f690b9213a..e228b88dfd23 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -80,13 +80,13 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-u32 audit_enabled;
-u32 audit_ever_enabled;
+u32 audit_enabled = AUDIT_OFF;
+u32 audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default;
+static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
@@ -1185,8 +1185,6 @@ static int __init audit_init(void)
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
@@ -1203,6 +1201,8 @@ static int __init audit_enable(char *str)
audit_default = !!simple_strtol(str, NULL, 0);
if (!audit_default)
audit_initialized = AUDIT_DISABLED;
+ audit_enabled = audit_default;
+ audit_ever_enabled = !!audit_enabled;
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 939945a5649c..a162661c9d60 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..3608fa1aec8a 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -20,8 +20,10 @@
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
+ u32 elem_size, array_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u32 elem_size, array_size;
+ u64 mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -36,12 +38,33 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
/* check round_up into zero and u32 overflow */
if (elem_size == 0 ||
- attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+ max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
return ERR_PTR(-ENOMEM);
- array_size = sizeof(*array) + attr->max_entries * elem_size;
+ array_size = sizeof(*array) + max_entries * elem_size;
/* allocate all map elements and zero-initialize them */
array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -50,6 +73,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (!array)
return ERR_PTR(-ENOMEM);
}
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
array->map.key_size = attr->key_size;
@@ -70,7 +95,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (index >= array->map.max_entries)
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* Called from syscall */
@@ -111,7 +136,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
/* all elements already exist */
return -EEXIST;
- memcpy(array->value + array->elem_size * index, value, map->value_size);
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
+ value, map->value_size);
return 0;
}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd572c..eb52d11fdaa7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -137,6 +137,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(__bpf_prog_free);
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_JMP &&
+ /* Call and Exit are both special jumps with no
+ * target inside the BPF instruction image.
+ */
+ BPF_OP(insn->code) != BPF_CALL &&
+ BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 i, insn_cnt = prog->len;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_is_jmp_and_has_target(insn))
+ continue;
+
+ /* Adjust offset of jmps if we cross boundaries. */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+ insn->off -= delta;
+ }
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ struct bpf_prog *prog_adj;
+
+ /* Since our patchlet doesn't expand the image, we're done. */
+ if (insn_delta == 0) {
+ memcpy(prog->insnsi + off, patch, sizeof(*patch));
+ return prog;
+ }
+
+ insn_adj_cnt = prog->len + insn_delta;
+
+ /* Several new instructions need to be inserted. Make room
+ * for them. Likely, there's no need for a new allocation as
+ * last page could have large enough tailroom.
+ */
+ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+ GFP_USER);
+ if (!prog_adj)
+ return NULL;
+
+ prog_adj->len = insn_adj_cnt;
+
+ /* Patching happens in 3 steps:
+ *
+ * 1) Move over tail of insnsi from next instruction onwards,
+ * so we can patch the single target insn with one or more
+ * new ones (patching is always from 1 to n insns, n > 0).
+ * 2) Inject new instructions at the target location.
+ * 3) Adjust branch offsets if necessary.
+ */
+ insn_rest = insn_adj_cnt - off - len;
+
+ memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+ sizeof(*patch) * insn_rest);
+ memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+ bpf_adj_branches(prog_adj, off, insn_delta);
+
+ return prog_adj;
+}
+
#ifdef CONFIG_BPF_JIT
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
@@ -185,6 +256,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -372,7 +444,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -391,7 +463,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
@@ -446,7 +518,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -654,6 +726,13 @@ load_byte:
return 0;
}
+#else
+static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
@@ -700,9 +779,23 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
int bpf_prog_select_runtime(struct bpf_prog *fp)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
fp->bpf_func = (void *) __bpf_prog_run;
-
+#else
+ fp->bpf_func = (void *) __bpf_prog_ret0;
+#endif
+
+ /* eBPF JITs can rewrite the program in case constant
+ * blinding is active. However, in case of error during
+ * blinding, bpf_int_jit_compile() must always return a
+ * valid program, which in this case would simply not
+ * be JITed, but falls back to the interpreter.
+ */
bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited)
+ return -ENOTSUPP;
+#endif
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e32cc94edd9..424accd20c2d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -447,57 +447,6 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl)
list_add(&tl->list_node, &bpf_prog_types);
}
-/* fixup insn->imm field of bpf_call instructions:
- * if (insn->imm == BPF_FUNC_map_lookup_elem)
- * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
- * else if (insn->imm == BPF_FUNC_map_update_elem)
- * insn->imm = bpf_map_update_elem - __bpf_call_base;
- * else ...
- *
- * this function is called after eBPF program passed verification
- */
-static void fixup_bpf_calls(struct bpf_prog *prog)
-{
- const struct bpf_func_proto *fn;
- int i;
-
- for (i = 0; i < prog->len; i++) {
- struct bpf_insn *insn = &prog->insnsi[i];
-
- if (insn->code == (BPF_JMP | BPF_CALL)) {
- /* we reach here when program has bpf_call instructions
- * and it passed bpf_check(), means that
- * ops->get_func_proto must have been supplied, check it
- */
- BUG_ON(!prog->aux->ops->get_func_proto);
-
- if (insn->imm == BPF_FUNC_get_route_realm)
- prog->dst_needed = 1;
- if (insn->imm == BPF_FUNC_get_prandom_u32)
- bpf_user_rnd_init_once();
- if (insn->imm == BPF_FUNC_tail_call) {
- /* mark bpf_tail_call as different opcode
- * to avoid conditional branch in
- * interpeter for every normal call
- * and to prevent accidental JITing by
- * JIT compiler that doesn't support
- * bpf_tail_call yet
- */
- insn->imm = 0;
- insn->code |= BPF_X;
- continue;
- }
-
- fn = prog->aux->ops->get_func_proto(insn->imm);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- BUG_ON(!fn->func);
- insn->imm = fn->func - __bpf_call_base;
- }
- }
-}
-
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
@@ -680,9 +629,6 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- /* fixup BPF_CALL->imm field */
- fixup_bpf_calls(prog);
-
/* eBPF program is ready to be JITed */
err = bpf_prog_select_runtime(prog);
if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cbfba78d3db..c14003840bc5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -186,6 +186,14 @@ struct verifier_stack_elem {
struct verifier_stack_elem *next;
};
+struct bpf_insn_aux_data {
+ union {
+ enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
+ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
+ };
+ bool seen; /* this insn was processed by the verifier */
+};
+
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
/* single container for all structs
@@ -200,6 +208,7 @@ struct verifier_env {
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
bool allow_ptr_leaks;
+ struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
};
/* verbose verifier prints what it's seeing
@@ -313,7 +322,8 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(struct bpf_insn *insn)
+static void print_bpf_insn(const struct verifier_env *env,
+ const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -377,9 +387,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
} else {
verbose("BUG_ld_%02x\n", insn->code);
return;
@@ -663,6 +683,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
+static bool is_ctx_reg(struct verifier_env *env, int regno)
+{
+ const struct reg_state *reg = &env->cur_state.regs[regno];
+
+ return reg->type == PTR_TO_CTX;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -754,6 +781,17 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -929,7 +967,7 @@ error:
return -EINVAL;
}
-static int check_call(struct verifier_env *env, int func_id)
+static int check_call(struct verifier_env *env, int func_id, int insn_idx)
{
struct verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
@@ -965,6 +1003,13 @@ static int check_call(struct verifier_env *env, int func_id)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (map == NULL) {
+ verbose("verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = map;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
if (err)
return err;
@@ -1028,7 +1073,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ BPF_CLASS(insn->code) == BPF_ALU64) {
verbose("BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -1132,6 +1178,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose("BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -1758,16 +1809,17 @@ static int do_check(struct verifier_env *env)
if (log_level) {
verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
+ print_bpf_insn(env, insn);
}
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
+ enum bpf_reg_type *prev_src_type, src_reg_type;
/* check for reserved fields is already done */
@@ -1796,16 +1848,18 @@ static int do_check(struct verifier_env *env)
continue;
}
- if (insn->imm == 0) {
+ prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
* dst_reg = *(u32 *)(src_reg + off)
- * use reserved 'imm' field to mark this insn
+ * save type to validate intersecting paths
*/
- insn->imm = src_reg_type;
+ *prev_src_type = src_reg_type;
- } else if (src_reg_type != insn->imm &&
+ } else if (src_reg_type != *prev_src_type &&
(src_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_src_type == PTR_TO_CTX)) {
/* ABuser program is trying to use the same insn
* dst_reg = *(u32*) (src_reg + off)
* with different pointer types:
@@ -1818,7 +1872,7 @@ static int do_check(struct verifier_env *env)
}
} else if (class == BPF_STX) {
- enum bpf_reg_type dst_reg_type;
+ enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
err = check_xadd(env, insn);
@@ -1846,11 +1900,13 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
- if (insn->imm == 0) {
- insn->imm = dst_reg_type;
- } else if (dst_reg_type != insn->imm &&
+ prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_dst_type == NOT_INIT) {
+ *prev_dst_type = dst_reg_type;
+ } else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_dst_type == PTR_TO_CTX)) {
verbose("same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -1866,6 +1922,12 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -1885,7 +1947,7 @@ static int do_check(struct verifier_env *env)
return -EINVAL;
}
- err = check_call(env, insn->imm);
+ err = check_call(env, insn->imm, insn_idx);
if (err)
return err;
@@ -1952,6 +2014,7 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
verbose("invalid BPF_LD mode\n");
return -EINVAL;
@@ -2081,23 +2144,60 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
insn->src_reg = 0;
}
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+/* single env->prog->insnsi[off] instruction was replaced with the range
+ * insnsi[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
+ u32 off, u32 cnt)
{
- struct bpf_insn *insn = prog->insnsi;
- int insn_cnt = prog->len;
+ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
int i;
- for (i = 0; i < insn_cnt; i++, insn++) {
- if (BPF_CLASS(insn->code) != BPF_JMP ||
- BPF_OP(insn->code) == BPF_CALL ||
- BPF_OP(insn->code) == BPF_EXIT)
- continue;
+ if (cnt == 1)
+ return 0;
+ new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ if (!new_data)
+ return -ENOMEM;
+ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
+ memcpy(new_data + off + cnt - 1, old_data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
+ env->insn_aux_data = new_data;
+ vfree(old_data);
+ return 0;
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (!new_prog)
+ return NULL;
+ if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ return NULL;
+ return new_prog;
+}
- /* adjust offset of jmps if necessary */
- if (i < pos && i + insn->off + 1 > pos)
- insn->off += delta;
- else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
- insn->off -= delta;
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
}
}
@@ -2107,17 +2207,18 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
static int convert_ctx_accesses(struct verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
+ const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
- u32 cnt;
- int i;
enum bpf_access_type type;
+ int i, delta = 0;
if (!env->prog->aux->ops->convert_ctx_access)
return 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ u32 cnt;
+
if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
type = BPF_READ;
else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
@@ -2125,11 +2226,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
else
continue;
- if (insn->imm != PTR_TO_CTX) {
- /* clear internal mark */
- insn->imm = 0;
+ if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- }
cnt = env->prog->aux->ops->
convert_ctx_access(type, insn->dst_reg, insn->src_reg,
@@ -2139,34 +2237,107 @@ static int convert_ctx_accesses(struct verifier_env *env)
return -EINVAL;
}
- if (cnt == 1) {
- memcpy(insn, insn_buf, sizeof(*insn));
- continue;
- }
-
- /* several new insns need to be inserted. Make room for them */
- insn_cnt += cnt - 1;
- new_prog = bpf_prog_realloc(env->prog,
- bpf_prog_size(insn_cnt),
- GFP_USER);
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
- new_prog->len = insn_cnt;
+ delta += cnt - 1;
- memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
- sizeof(*insn) * (insn_cnt - i - cnt));
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
- /* copy substitute insns in place of load instruction */
- memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+ return 0;
+}
- /* adjust branches in the whole program */
- adjust_branches(new_prog, i, cnt - 1);
+/* fixup insn->imm field of bpf_call instructions
+ *
+ * this function is called after eBPF program passed verification
+ */
+static int fixup_bpf_calls(struct verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, cnt, delta = 0;
- /* keep walking new program and skip insns we just inserted */
- env->prog = new_prog;
- insn = new_prog->insnsi + i + cnt - 1;
- i += cnt - 1;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpreter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verbose("kernel subsystem misconfigured func %d\n",
+ insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
}
return 0;
@@ -2210,6 +2381,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ (*prog)->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
env->prog = *prog;
/* grab the mutex to protect few globals used by verifier */
@@ -2228,12 +2404,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* log_* values have to be sane */
if (log_size < 128 || log_size > UINT_MAX >> 8 ||
log_level == 0 || log_ubuf == NULL)
- goto free_env;
+ goto err_unlock;
ret = -ENOMEM;
log_buf = vmalloc(log_size);
if (!log_buf)
- goto free_env;
+ goto err_unlock;
} else {
log_level = 0;
}
@@ -2262,9 +2438,15 @@ skip_full_check:
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
+ if (ret == 0)
+ ret = fixup_bpf_calls(env);
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -2302,14 +2484,16 @@ skip_full_check:
free_log_buf:
if (log_level)
vfree(log_buf);
-free_env:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
*/
release_maps(env);
*prog = env->prog;
- kfree(env);
+err_unlock:
mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
+ kfree(env);
return ret;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25cf44889559..3fdb7545852e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -717,10 +717,10 @@ static void css_set_move_task(struct task_struct *task,
if (to_cset) {
/*
- * We are synchronized through cgroup_threadgroup_rwsem
- * against PF_EXITING setting such that we can't race
- * against cgroup_exit() changing the css_set to
- * init_css_set and dropping the old one.
+ * We are synchronized through css_set_lock against
+ * PF_EXITING setting such that we can't race against
+ * cgroup_exit() disassociating the task from the
+ * css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -2799,6 +2799,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -2813,6 +2814,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -4072,6 +4074,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* all tasks in @from are being moved, all csets are source */
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
@@ -4100,6 +4104,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
mutex_unlock(&cgroup_mutex);
return ret;
}
@@ -5701,19 +5706,22 @@ void cgroup_exit(struct task_struct *tsk)
int i;
/*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check css_set and cg_list without synchronization.
+ * Avoid potential race with the migrate path.
+ */
+ spin_lock_irq(&css_set_lock);
+ /*
+ * Unlink @tsk from its css_set.
*/
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
- spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
- spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
+ spin_unlock_irq(&css_set_lock);
+
/* see cgroup_post_fork() for details */
for_each_subsys_which(ss, i, &have_exit_callback)
ss->exit(tsk);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100644
index 000000000000..d70829033bb7
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,160 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_FHANDLE is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFS_FS is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+# CONFIG_USELIB is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_BPF=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DEFAULT_SECURITY_SELINUX=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECCOMP=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100644
index 000000000000..297756be369c
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,125 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e822cb0e18d5..5b4440d57f89 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -91,6 +91,11 @@ static struct {
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+void cpu_hotplug_mutex_held(void)
+{
+ lockdep_assert_held(&cpu_hotplug.lock);
+}
+EXPORT_SYMBOL(cpu_hotplug_mutex_held);
void get_online_cpus(void)
{
@@ -361,6 +366,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
+ if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1)
+ return -EBUSY;
+
cpu_hotplug_begin();
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
@@ -372,6 +380,21 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
goto out_release;
}
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so wait for both.
+ *
+ * Do the sync before parking the smpboot threads to take care of the RCU boost case.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
+
smpboot_park_threads(cpu);
/*
@@ -505,8 +528,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- pr_warn("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 29c7240172d3..a599351997ad 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -60,6 +60,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+struct static_key cpusets_pre_enable_key __read_mostly = STATIC_KEY_INIT_FALSE;
struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
/* See "Frequency meter" comments, below. */
@@ -174,9 +175,9 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
-static inline bool is_cpuset_online(const struct cpuset *cs)
+static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags);
+ return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -806,16 +807,15 @@ done:
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes get_online_cpus().
*/
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_unlocked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
+ cpu_hotplug_mutex_held();
lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -823,27 +823,27 @@ static void rebuild_sched_domains_locked(void)
* Anyways, hotplug work item will rebuild sched domains.
*/
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
}
#else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_unlocked(void)
{
}
#endif /* CONFIG_SMP */
void rebuild_sched_domains(void)
{
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
/**
@@ -875,7 +875,6 @@ static void update_tasks_cpumask(struct cpuset *cs)
*
* On legacy hierachy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
@@ -930,7 +929,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
rcu_read_unlock();
if (need_rebuild_sched_domains)
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
}
/**
@@ -1289,7 +1288,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
}
return 0;
@@ -1320,7 +1319,6 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1355,7 +1353,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
- rebuild_sched_domains_locked();
+ rebuild_sched_domains_unlocked();
if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1620,6 +1618,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1657,6 +1656,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1667,6 +1667,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1681,6 +1682,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
return retval;
}
@@ -1719,6 +1721,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -1744,6 +1747,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_trial_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -1912,6 +1916,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -2049,13 +2054,14 @@ out_unlock:
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_unlocked().
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ get_online_cpus();
mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
@@ -2065,6 +2071,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2293,6 +2300,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2367,8 +2381,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(bool cpu_online)
@@ -2387,6 +2403,11 @@ void cpuset_update_active_cpus(bool cpu_online)
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 0b891286a150..3990c1f73e45 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -357,7 +357,7 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (kallsyms_symbol_next(p_tmp, i) < 0)
+ if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
break;
kdb_printf("%s ", p_tmp);
*(p_tmp + len) = '\0';
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95c447e658f7..322f63370038 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3437,22 +3437,27 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
+ event_cpu = READ_ONCE(event->oncpu);
+
if (event->state == PERF_EVENT_STATE_ACTIVE &&
- !cpu_isolated(event->oncpu)) {
+ !cpu_isolated(event_cpu)) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
+
+ if ((unsigned int)event_cpu >= nr_cpu_ids)
+ return 0;
if (!event->attr.exclude_idle ||
- !per_cpu(is_idle, event->oncpu)) {
- smp_call_function_single(event->oncpu,
+ !per_cpu(is_idle, event_cpu)) {
+ smp_call_function_single(event_cpu,
__perf_event_read, &data, 1);
ret = data.ret;
}
@@ -7297,6 +7302,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
}
event->tp_event->prog = prog;
+ event->tp_event->bpf_prog_owner = event;
return 0;
}
@@ -7309,7 +7315,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
return;
prog = event->tp_event->prog;
- if (prog) {
+ if (prog && event->tp_event->bpf_prog_owner == event) {
event->tp_event->prog = NULL;
bpf_prog_put_rcu(prog);
}
@@ -8693,28 +8699,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken, since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow attaching to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
diff --git a/kernel/exit.c b/kernel/exit.c
index d8a12cc06aee..06d54f550c36 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,7 @@
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
+#include <linux/kcov.h>
#include "sched/tune.h"
@@ -669,6 +670,7 @@ void do_exit(long code)
TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
+ kcov_task_exit(tsk);
WARN_ON(blk_needs_flush_plug(tsk));
@@ -762,7 +764,7 @@ void do_exit(long code)
disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
- exit_thread();
+ exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..4f06fc34313f 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,7 +66,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2845c5bdc8e3..4251e3806640 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,6 +59,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -76,6 +77,7 @@
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
+#include <linux/kcov.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -148,18 +150,18 @@ static inline void free_task_struct(struct task_struct *tsk)
}
#endif
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
{
}
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
# if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
@@ -168,30 +170,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
return page ? page_address(page) : NULL;
}
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(unsigned long *stack)
{
- kasan_alloc_pages(virt_to_page(ti), THREAD_SIZE_ORDER);
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ struct page *page = virt_to_page(stack);
+
+ kasan_alloc_pages(page, THREAD_SIZE_ORDER);
+ kaiser_unmap_thread_stack(stack);
+ __free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+ return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
}
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(unsigned long *stack)
{
- kmem_cache_free(thread_info_cache, ti);
+ kmem_cache_free(thread_stack_cache, stack);
}
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
{
- thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
THREAD_SIZE, 0, NULL);
- BUG_ON(thread_info_cache == NULL);
+ BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif
@@ -214,9 +219,9 @@ struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(unsigned long *stack, int account)
{
- struct zone *zone = page_zone(virt_to_page(ti));
+ struct zone *zone = page_zone(virt_to_page(stack));
mod_zone_page_state(zone, NR_KERNEL_STACK, account);
}
@@ -224,8 +229,8 @@ static void account_kernel_stack(struct thread_info *ti, int account)
void free_task(struct task_struct *tsk)
{
account_kernel_stack(tsk->stack, -1);
- arch_release_thread_info(tsk->stack);
- free_thread_info(tsk->stack);
+ arch_release_thread_stack(tsk->stack);
+ free_thread_stack(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
put_seccomp_filter(tsk);
@@ -336,7 +341,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- struct thread_info *ti;
+ unsigned long *stack;
int err;
if (node == NUMA_NO_NODE)
@@ -345,15 +350,19 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- ti = alloc_thread_info_node(tsk, node);
- if (!ti)
+ stack = alloc_thread_stack_node(tsk, node);
+ if (!stack)
goto free_tsk;
err = arch_dup_task_struct(tsk, orig);
if (err)
- goto free_ti;
+ goto free_stack;
+
+ tsk->stack = stack;
- tsk->stack = ti;
+ err = kaiser_map_thread_stack(tsk->stack);
+ if (err)
+ goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -370,7 +379,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_int();
+ tsk->stack_canary = get_random_long();
#endif
/*
@@ -385,12 +394,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- account_kernel_stack(ti, 1);
+ account_kernel_stack(stack, 1);
+
+ kcov_task_init(tsk);
return tsk;
-free_ti:
- free_thread_info(ti);
+free_stack:
+ free_thread_stack(stack);
free_tsk:
free_task_struct(tsk);
return NULL;
@@ -695,6 +706,26 @@ void __mmdrop(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(__mmdrop);
+static inline void __mmput(struct mm_struct *mm)
+{
+ VM_BUG_ON(atomic_read(&mm->mm_users));
+
+ uprobe_clear_state(mm);
+ exit_aio(mm);
+ ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
+ exit_mmap(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+ mmdrop(mm);
+}
+
/*
* Decrement the use count and release all resources for an mm.
*/
@@ -704,26 +735,27 @@ int mmput(struct mm_struct *mm)
might_sleep();
if (atomic_dec_and_test(&mm->mm_users)) {
- uprobe_clear_state(mm);
- exit_aio(mm);
- ksm_exit(mm);
- khugepaged_exit(mm); /* must run before exit_mmap */
- exit_mmap(mm);
- set_mm_exe_file(mm, NULL);
- if (!list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- list_del(&mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
- mmdrop(mm);
+ __mmput(mm);
mm_freed = 1;
}
return mm_freed;
}
EXPORT_SYMBOL_GPL(mmput);
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
@@ -832,8 +864,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mm = get_task_mm(task);
if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode) &&
- !capable(CAP_SYS_RESOURCE)) {
+ !ptrace_may_access(task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index af29863f3349..a09c1dd1f659 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1621,6 +1621,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_q *this, *next;
WAKE_Q(wake_q);
+ if (nr_wake < 0 || nr_requeue < 0)
+ return -EINVAL;
+
if (requeue_pi) {
/*
* Requeue PI only works on two distinct uaddrs. This
@@ -1939,8 +1942,12 @@ static int unqueue_me(struct futex_q *q)
/* In the common case we don't take the spinlock, which is nice. */
retry:
- lock_ptr = q->lock_ptr;
- barrier();
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
if (lock_ptr != NULL) {
spin_lock(lock_ptr);
/*
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index c92e44855ddd..1276aabaab55 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
+ depends on !COMPILE_TEST
depends on GCOV_KERNEL
depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 7080ae1eb6c1..f850e906564b 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_icall_topn);
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index e25e92fb44fa..46a18e72bce6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1
+#if (__GNUC__ >= 7)
+#define GCOV_COUNTERS 9
+#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
diff --git a/kernel/groups.c b/kernel/groups.c
index 74d431d25251..5ea9847f172f 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -101,7 +101,7 @@ static int groups_from_user(struct group_info *group_info,
}
/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
{
int base, max, stride;
int gidsetsize = group_info->ngroups;
@@ -128,6 +128,7 @@ static void groups_sort(struct group_info *group_info)
stride /= 3;
}
}
+EXPORT_SYMBOL(groups_sort);
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -159,7 +160,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
- groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
@@ -243,6 +243,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6c8e154c7384..4684b7595e63 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -36,10 +36,32 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = &available_cpus;
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ /*
+ * The order of preference for selecting a fallback CPU is
+ *
+ * (1) online and un-isolated CPU from default affinity
+ * (2) online and un-isolated CPU
+ * (3) online CPU
+ */
cpumask_andnot(&available_cpus, cpu_online_mask,
cpu_isolated_mask);
- if (cpumask_empty(affinity))
+ if (cpumask_intersects(&available_cpus, irq_default_affinity))
+ cpumask_and(&available_cpus, &available_cpus,
+ irq_default_affinity);
+ else if (cpumask_empty(&available_cpus))
affinity = cpu_online_mask;
+
+ /*
+ * We are overriding the affinity with all online and
+ * un-isolated CPUs. The irq_set_affinity_locked() call
+ * below passes this mask on to the PM QOS affinity
+ * listener, which then applies the CPU_DMA_LATENCY QOS
+ * to every CPU in the mask. However, the low-level
+ * irqchip driver sets the affinity of an irq to only
+ * one CPU, so pick a single CPU from the prepared mask
+ * when overriding the user affinity.
+ */
+ affinity = cpumask_of(cpumask_any(affinity));
ret = true;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e5c70dcb7f8e..2c2effdb4437 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1305,8 +1305,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret)
+ if (ret) {
+ irq_release_resources(desc);
goto out_mask;
+ }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a24c5b909047..b05509af0352 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
goto free_cpumask;
}
+ if (cpumask_subset(new_value, cpu_isolated_mask)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 453ec4232852..e863b2339174 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -553,7 +553,7 @@ static __init int jump_label_test(void)
return 0;
}
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */
#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..5813e9375a93
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,431 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+#include <asm/setup.h>
+
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ * - initial state after open()
+ * - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ * - then, mmap() call (several calls are allowed but not useful)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task at a time allowed).
+ */
+struct kcov {
+ /*
+ * Reference counter. We keep one for:
+ * - opened file descriptor
+ * - task with enabled coverage (we can't unwire it from another task)
+ */
+ atomic_t refcount;
+ /* The lock protects mode, size, area and t. */
+ spinlock_t lock;
+ /*
+ * Descriptor state: KCOV_MODE_DISABLED after open(), KCOV_MODE_INIT
+ * after KCOV_INIT_TRACE and whenever no task is attached, and
+ * KCOV_MODE_TRACE_PC or KCOV_MODE_TRACE_CMP while a task is enabled.
+ */
+ enum kcov_mode mode;
+ /* Size of the area in unsigned longs, as passed to KCOV_INIT_TRACE. */
+ unsigned size;
+ /* Coverage buffer shared with user space; NULL until the first mmap(). */
+ void *area;
+ /* Task for which we collect coverage, or NULL. */
+ struct task_struct *t;
+};
+
+/*
+ * Decide whether coverage of kind @needed_mode must be recorded for task @t
+ * at the current point of execution.
+ *
+ * Returns true only when we are running in task context and @t->kcov_mode
+ * equals @needed_mode.
+ *
+ * Marked notrace: this helper is called from the notrace instrumentation
+ * hooks, so it must not become an ftrace entry point itself.
+ */
+static notrace bool check_kcov_mode(enum kcov_mode needed_mode,
+				    struct task_struct *t)
+{
+	enum kcov_mode mode;
+
+	/*
+	 * We are interested in code coverage as a function of syscall inputs,
+	 * so we ignore code executed in interrupts.
+	 */
+	if (!in_task())
+		return false;
+	mode = READ_ONCE(t->kcov_mode);
+	/*
+	 * There is some code that runs in interrupts but for which
+	 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+	 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+	 * interrupts, there are paired barrier()/WRITE_ONCE() in
+	 * kcov_ioctl_locked().
+	 */
+	barrier();
+	return mode == needed_mode;
+}
+
+/*
+ * Return @ip with the KASLR offset subtracted when CONFIG_RANDOMIZE_BASE is
+ * set, and @ip unchanged otherwise.
+ *
+ * Marked notrace: it is called from the notrace instrumentation hooks and
+ * must not be an ftrace entry point itself.
+ */
+static notrace unsigned long canonicalize_ip(unsigned long ip)
+{
+#ifdef CONFIG_RANDOMIZE_BASE
+	ip -= kaslr_offset();
+#endif
+	return ip;
+}
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ *
+ * Appends the caller's (canonicalized) return address to the current task's
+ * coverage area, provided the task is in KCOV_MODE_TRACE_PC and the area
+ * still has room. Otherwise it does nothing.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* area[0] holds the number of PCs stored in area[1..]. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ /* Store the PC first, then publish it by bumping the counter. */
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+/*
+ * Append one comparison record to the current task's coverage area.
+ *
+ * @type: KCOV_CMP_SIZE()/KCOV_CMP_CONST bits describing the comparison
+ * @arg1: first operand, widened to u64
+ * @arg2: second operand, widened to u64
+ * @ip:   return address of the hook's caller (canonicalized here)
+ *
+ * Layout: area[0] counts the records; record i occupies the
+ * KCOV_WORDS_PER_CMP words starting at area[1 + i * KCOV_WORDS_PER_CMP].
+ * Nothing is written unless current is in KCOV_MODE_TRACE_CMP and the whole
+ * record fits into the area.
+ *
+ * Marked notrace: it is called from the notrace comparison hooks and must
+ * not be an ftrace entry point itself.
+ */
+static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+	struct task_struct *t;
+	u64 *area;
+	u64 count, start_index, end_pos, max_pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+		return;
+
+	ip = canonicalize_ip(ip);
+
+	/*
+	 * We write all comparison arguments and types as u64.
+	 * The buffer was allocated for t->kcov_size unsigned longs.
+	 */
+	area = (u64 *)t->kcov_area;
+	max_pos = t->kcov_size * sizeof(unsigned long);
+
+	count = READ_ONCE(area[0]);
+
+	/* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+	start_index = 1 + count * KCOV_WORDS_PER_CMP;
+	end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+	if (likely(end_pos <= max_pos)) {
+		/* Fill in the record first, then publish it via the counter. */
+		area[start_index] = type;
+		area[start_index + 1] = arg1;
+		area[start_index + 2] = arg2;
+		area[start_index + 3] = ip;
+		WRITE_ONCE(area[0], count + 1);
+	}
+}
+
+/*
+ * Comparison hooks, one per operand width. Each forwards both operands and
+ * the caller's return address to write_comp_data(); KCOV_CMP_SIZE(0..3)
+ * tags the u8, u16, u32 and u64 variants respectively.
+ * NOTE(review): presumably called by compiler-inserted trace-cmp
+ * instrumentation - confirm against the compiler documentation.
+ */
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+/*
+ * Comparison hooks that additionally tag the record with KCOV_CMP_CONST.
+ * KCOV_CMP_SIZE(0..3) tags the u8, u16, u32 and u64 variants respectively;
+ * operands and the caller's return address go to write_comp_data().
+ * NOTE(review): presumably used by the compiler when one operand is a
+ * compile-time constant - confirm against the compiler documentation.
+ */
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+/*
+ * Comparison hook for switch statements.
+ *
+ * @val:   the value being switched on
+ * @cases: cases[0] is the number of case values, cases[1] the operand width
+ *         in bits, and cases[2..] the case values themselves
+ *
+ * Records one KCOV_CMP_CONST comparison of @val against every case value.
+ * Widths other than 8, 16, 32 or 64 bits are silently ignored.
+ */
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+	u64 nr_cases = cases[0];
+	u64 width_bits = cases[1];
+	u64 type;
+	u64 idx;
+
+	if (width_bits == 8)
+		type = KCOV_CMP_CONST | KCOV_CMP_SIZE(0);
+	else if (width_bits == 16)
+		type = KCOV_CMP_CONST | KCOV_CMP_SIZE(1);
+	else if (width_bits == 32)
+		type = KCOV_CMP_CONST | KCOV_CMP_SIZE(2);
+	else if (width_bits == 64)
+		type = KCOV_CMP_CONST | KCOV_CMP_SIZE(3);
+	else
+		return;
+
+	for (idx = 0; idx < nr_cases; idx++)
+		write_comp_data(type, cases[idx + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
+/* Take one additional reference on @kcov; dropped again with kcov_put(). */
+static void kcov_get(struct kcov *kcov)
+{
+ atomic_inc(&kcov->refcount);
+}
+
+/*
+ * Drop one reference on @kcov. Whoever drops the last one frees both the
+ * coverage area and the descriptor itself.
+ */
+static void kcov_put(struct kcov *kcov)
+{
+	if (!atomic_dec_and_test(&kcov->refcount))
+		return;
+
+	vfree(kcov->area);
+	kfree(kcov);
+}
+
+/*
+ * Reset the per-task kcov state of @t to "no coverage collection".
+ *
+ * The mode is cleared first, using WRITE_ONCE() followed by a compiler
+ * barrier, so the compiler cannot move the stores that tear down kcov_size
+ * and kcov_area ahead of it. check_kcov_mode() reads the mode with
+ * READ_ONCE()/barrier(); code interrupting this function must never see an
+ * enabled mode together with an already cleared area.
+ */
+void kcov_task_init(struct task_struct *t)
+{
+	WRITE_ONCE(t->kcov_mode, KCOV_MODE_DISABLED);
+	barrier();
+	t->kcov_size = 0;
+	t->kcov_area = NULL;
+	t->kcov = NULL;
+}
+
+/*
+ * Detach task @t from its kcov descriptor, if it has one.
+ *
+ * Resets the task's kcov fields, puts the descriptor back into
+ * KCOV_MODE_INIT with no owning task, and drops the reference that
+ * KCOV_ENABLE took. Warns and bails out if the descriptor does not point
+ * back at @t.
+ */
+void kcov_task_exit(struct task_struct *t)
+{
+ struct kcov *kcov;
+
+ kcov = t->kcov;
+ if (kcov == NULL)
+ return;
+ spin_lock(&kcov->lock);
+ if (WARN_ON(kcov->t != t)) {
+ spin_unlock(&kcov->lock);
+ return;
+ }
+ /* Just to not leave dangling references behind. */
+ kcov_task_init(t);
+ kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock(&kcov->lock);
+ /* May free the descriptor, so this must come after the unlock. */
+ kcov_put(kcov);
+}
+
+/*
+ * mmap() handler: allocate the coverage area and map it into user space.
+ *
+ * Returns -ENOMEM if the buffer cannot be allocated, and -EINVAL unless the
+ * descriptor is in KCOV_MODE_INIT, the mapping starts at page offset 0 and
+ * its length equals kcov->size * sizeof(unsigned long). Returns 0 otherwise.
+ *
+ * If an area was already installed by an earlier mmap(), the freshly
+ * allocated buffer is freed again and 0 is returned without inserting any
+ * pages into @vma.
+ */
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+ int res = 0;
+ void *area;
+ struct kcov *kcov = vma->vm_file->private_data;
+ unsigned long size, off;
+ struct page *page;
+
+ /*
+ * Allocate before taking the spinlock.
+ * NOTE(review): presumably because vmalloc_user() may sleep - confirm.
+ */
+ area = vmalloc_user(vma->vm_end - vma->vm_start);
+ if (!area)
+ return -ENOMEM;
+
+ spin_lock(&kcov->lock);
+ size = kcov->size * sizeof(unsigned long);
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ vma->vm_end - vma->vm_start != size) {
+ res = -EINVAL;
+ goto exit;
+ }
+ if (!kcov->area) {
+ kcov->area = area;
+ vma->vm_flags |= VM_DONTEXPAND;
+ spin_unlock(&kcov->lock);
+ /* Insert the buffer into the user mapping one page at a time. */
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
+ }
+ return 0;
+ }
+exit:
+ /* Either validation failed or an area already exists: discard ours. */
+ spin_unlock(&kcov->lock);
+ vfree(area);
+ return res;
+}
+
+/*
+ * open() handler: allocate a zeroed descriptor in KCOV_MODE_DISABLED state
+ * holding one reference (dropped in kcov_close()) and store it in
+ * filep->private_data. Returns -ENOMEM if the allocation fails, otherwise
+ * the result of nonseekable_open().
+ */
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+ struct kcov *kcov;
+
+ kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+ if (!kcov)
+ return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
+ atomic_set(&kcov->refcount, 1);
+ spin_lock_init(&kcov->lock);
+ filep->private_data = kcov;
+ return nonseekable_open(inode, filep);
+}
+
+/*
+ * release() handler: drop the reference taken in kcov_open(). The
+ * descriptor is freed here only if no task still holds a reference.
+ */
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+ kcov_put(filep->private_data);
+ return 0;
+}
+
+/*
+ * Execute one kcov ioctl on @kcov. Called by kcov_ioctl() with kcov->lock
+ * held.
+ *
+ * @cmd: KCOV_INIT_TRACE, KCOV_ENABLE or KCOV_DISABLE
+ * @arg: buffer size in unsigned longs for KCOV_INIT_TRACE; KCOV_TRACE_PC or
+ *       KCOV_TRACE_CMP for KCOV_ENABLE; must be 0 for KCOV_DISABLE
+ *
+ * Returns 0 on success or a negative errno:
+ *   -EBUSY    KCOV_INIT_TRACE issued twice, or KCOV_ENABLE while either this
+ *             descriptor or the calling task is already in use for coverage
+ *   -EINVAL   bad size, wrong descriptor state, unknown trace mode, or
+ *             KCOV_DISABLE from a task not attached to this descriptor
+ *   -ENOTSUPP KCOV_TRACE_CMP without CONFIG_KCOV_ENABLE_COMPARISONS
+ *   -ENOTTY   unknown @cmd
+ */
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct task_struct *t;
+	unsigned long size, unused;
+
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 */
+		if (kcov->mode != KCOV_MODE_DISABLED)
+			return -EBUSY;
+		/*
+		 * Size must be at least 2 to hold current position and one PC.
+		 * Later we allocate size * sizeof(unsigned long) memory,
+		 * that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_INIT;
+		return 0;
+	case KCOV_ENABLE:
+		/*
+		 * Enable coverage for the current task.
+		 * At this point the user must have enabled trace mode and
+		 * mmapped the file. Coverage collection is disabled only
+		 * at task exit or voluntarily by KCOV_DISABLE. After that it
+		 * can be enabled for another task.
+		 */
+		if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
+			return -EINVAL;
+		t = current;
+		/*
+		 * Refuse if this descriptor already has a task, or if the
+		 * task is already attached to some descriptor: overwriting
+		 * t->kcov would leak that descriptor's reference and leave
+		 * its ->t pointing at a task that no longer uses it.
+		 */
+		if (kcov->t != NULL || t->kcov != NULL)
+			return -EBUSY;
+		if (arg == KCOV_TRACE_PC)
+			kcov->mode = KCOV_MODE_TRACE_PC;
+		else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+			kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+			return -ENOTSUPP;
+#endif
+		else
+			return -EINVAL;
+		/* Cache in task struct for performance. */
+		t->kcov_size = kcov->size;
+		t->kcov_area = kcov->area;
+		/* See comment in check_kcov_mode(). */
+		barrier();
+		WRITE_ONCE(t->kcov_mode, kcov->mode);
+		t->kcov = kcov;
+		kcov->t = t;
+		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+		kcov_get(kcov);
+		return 0;
+	case KCOV_DISABLE:
+		/* Disable coverage for the current task. */
+		unused = arg;
+		if (unused != 0 || current->kcov != kcov)
+			return -EINVAL;
+		t = current;
+		if (WARN_ON(kcov->t != t))
+			return -EINVAL;
+		kcov_task_init(t);
+		kcov->t = NULL;
+		kcov->mode = KCOV_MODE_INIT;
+		/* Drops the reference taken by KCOV_ENABLE. */
+		kcov_put(kcov);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+/*
+ * ioctl() entry point: run kcov_ioctl_locked() on the file's descriptor
+ * with kcov->lock held and return its result.
+ */
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct kcov *kcov;
+ int res;
+
+ kcov = filep->private_data;
+ spin_lock(&kcov->lock);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock(&kcov->lock);
+ return res;
+}
+
+/*
+ * File operations of the kcov debugfs file. The same handler serves both
+ * the native and the compat ioctl entry points.
+ */
+static const struct file_operations kcov_fops = {
+ .open = kcov_open,
+ .unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
+ .mmap = kcov_mmap,
+ .release = kcov_close,
+};
+
+/*
+ * Create the "kcov" file (mode 0600) at the debugfs root, backed by
+ * kcov_fops. Returns 0 on success, or logs an error and returns -ENOMEM
+ * when debugfs_create_file() hands back NULL.
+ */
+static int __init kcov_init(void)
+{
+	struct dentry *kcov_file;
+
+	kcov_file = debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops);
+	if (kcov_file)
+		return 0;
+
+	pr_err("failed to create kcov in debugfs\n");
+	return -ENOMEM;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 850b255649a2..d9b0be5c6a5f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -65,7 +65,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
static struct kthread *to_live_kthread(struct task_struct *k)
{
struct completion *vfork = ACCESS_ONCE(k->vfork_done);
- if (likely(vfork))
+ if (likely(vfork) && try_get_task_stack(k))
return __to_kthread(vfork);
return NULL;
}
@@ -427,8 +427,10 @@ void kthread_unpark(struct task_struct *k)
{
struct kthread *kthread = to_live_kthread(k);
- if (kthread)
+ if (kthread) {
__kthread_unpark(k, kthread);
+ put_task_stack(k);
+ }
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -457,6 +459,7 @@ int kthread_park(struct task_struct *k)
wait_for_completion(&kthread->parked);
}
}
+ put_task_stack(k);
ret = 0;
}
return ret;
@@ -492,6 +495,7 @@ int kthread_stop(struct task_struct *k)
__kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
+ put_task_stack(k);
}
ret = k->exit_code;
put_task_struct(k);
@@ -604,6 +608,19 @@ repeat:
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/*
+ * Tell whether @work must not be queued right now: either it already sits
+ * on a worker list, or a cancel operation is in flight (work->canceling is
+ * non-zero). The caller must hold worker->lock.
+ */
+static inline bool queuing_blocked(struct kthread_worker *worker,
+				   struct kthread_work *work)
+{
+	lockdep_assert_held(&worker->lock);
+
+	if (!list_empty(&work->node))
+		return true;
+
+	return work->canceling != 0;
+}
+
/* insert @work before @pos in @worker */
static void insert_kthread_work(struct kthread_worker *worker,
struct kthread_work *work,
@@ -633,7 +650,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
- if (list_empty(&work->node)) {
+ if (!queuing_blocked(worker, work)) {
insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
@@ -694,6 +711,87 @@ retry:
}
EXPORT_SYMBOL_GPL(flush_kthread_work);
+/*
+ * This function removes the work from the worker list it is queued on.
+ *
+ * NOTE(review): nothing here touches a delayed-work timer and @flags is
+ * unused in this version; it does not by itself prevent later queuing via
+ * a timer.
+ *
+ * The work might still be in use when this function finishes. See the
+ * current_work processed by the worker.
+ *
+ * Return: %true if @work was pending and successfully canceled,
+ * %false if @work was not pending
+ */
+static bool __kthread_cancel_work(struct kthread_work *work,
+ unsigned long *flags)
+{
+ /*
+ * Try to remove the work from a worker list. It might either
+ * be from worker->work_list or from worker->delayed_work_list.
+ */
+ if (!list_empty(&work->node)) {
+ list_del_init(&work->node);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Cancel @work and, if the worker is executing it right now, wait for that
+ * execution to finish.
+ *
+ * Return: %true if @work was removed from a worker list, %false otherwise
+ * (including when @work was never assigned to a worker).
+ */
+static bool __kthread_cancel_work_sync(struct kthread_work *work)
+{
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+ int ret = false;
+
+ if (!worker)
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see queue_kthread_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ ret = __kthread_cancel_work(work, &flags);
+
+ if (worker->current_work != work)
+ goto out_fast;
+
+ /*
+ * The work is in progress and we need to wait with the lock released.
+ * In the meantime, block any queuing by setting the canceling counter.
+ */
+ work->canceling++;
+ spin_unlock_irqrestore(&worker->lock, flags);
+ flush_kthread_work(work);
+ spin_lock_irqsave(&worker->lock, flags);
+ work->canceling--;
+
+out_fast:
+ spin_unlock_irqrestore(&worker->lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * This is the exported wrapper around __kthread_cancel_work_sync().
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+ return __kthread_cancel_work_sync(work);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
/**
* flush_kthread_worker - flush all current works on a kthread_worker
* @worker: worker to flush
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8e96f6cc2a4a..31322a4275cd 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace56618f6..0e2c4911ba61 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3128,10 +3128,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references)
+ if (hlock->references) {
+ /*
+ * Check: unsigned int references:12, overflow.
+ */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
+ return 0;
+
hlock->references++;
- else
+ } else {
hlock->references = 2;
+ }
return 1;
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ef1919d63b2..d580b7d6ee6d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -776,6 +776,8 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+ kfree(cxt.lwsa);
+ kfree(cxt.lrsa);
torture_cleanup_end();
}
@@ -917,6 +919,8 @@ static int __init lock_torture_init(void)
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
/* Mark the current thread as blocked on the lock: */
- ti->task->blocked_on = waiter;
+ task->blocked_on = waiter;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
- DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
- DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
- ti->task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(waiter->task != task);
+ DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+ task->blocked_on = NULL;
list_del_init(&waiter->list);
waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..d06ae3bb46c5 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
- struct thread_info *ti);
+ struct task_struct *task);
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti);
+ struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 14b9cca36b05..c61c56f05dfa 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -549,7 +549,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_add_waiter(lock, &waiter, task);
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -596,7 +596,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
__set_task_state(task, TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current_thread_info());
+ mutex_remove_waiter(lock, &waiter, task);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@@ -617,7 +617,7 @@ skip_wait:
return 0;
err:
- mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ mutex_remove_waiter(lock, &waiter, task);
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, 1, ip);
@@ -731,6 +731,7 @@ static inline void
__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
{
unsigned long flags;
+ WAKE_Q(wake_q);
/*
* As a performance measurement, release the lock before doing other
@@ -758,11 +759,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
struct mutex_waiter, list);
debug_mutex_wake_waiter(lock, waiter);
-
- wake_up_process(waiter->task);
+ wake_q_add(&wake_q, waiter->task);
}
spin_unlock_mutex(&lock->wait_lock, flags);
+ wake_up_q(&wake_q);
}
/*
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..a68bae5e852a 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,7 +13,7 @@
do { spin_lock(lock); (void)(flags); } while (0)
#define spin_unlock_mutex(lock, flags) \
do { spin_unlock(lock); (void)(flags); } while (0)
-#define mutex_remove_waiter(lock, waiter, ti) \
+#define mutex_remove_waiter(lock, waiter, task) \
__list_del((waiter)->list.prev, (waiter)->list.next)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a37857ab55..0befa20ce96e 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,6 +1,7 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
+#include <linux/sched/rt.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ struct task_struct *task = current;
int curr = encode_cpu(smp_processor_id());
int old;
@@ -104,6 +106,19 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * osq_lock() unqueue
+ *
+ * node->prev = prev osq_wait_next()
+ * WMB MB
+ * prev->next = node next->prev = prev // unqueue-C
+ *
+ * Here 'node->prev' and 'next->prev' are the same variable and we need
+ * to ensure these stores happen in-order to avoid corrupting the list.
+ */
+ smp_wmb();
+
WRITE_ONCE(prev->next, node);
/*
@@ -118,8 +133,13 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
+ * If a task spins on the owner on one CPU after acquiring
+ * osq_lock while an RT task spins on another CPU to
+ * acquire osq_lock, it will starve the owner from
+ * completing if the owner is to be scheduled on the same
+ * CPU. This results in a livelock.
*/
- if (need_resched())
+ if (need_resched() || rt_task(task))
goto unqueue;
cpu_relax_lowlatency();
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a4d4de05b2d1..75c950ede9c7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -511,6 +511,41 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
unsigned long flags;
/*
+ * If a spinner is present, there is a chance that the load of
+ * rwsem_has_spinner() in rwsem_wake() can be reordered with
+ * respect to decrement of rwsem count in __up_write() leading
+ * to wakeup being missed.
+ *
+ * spinning writer up_write caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * spin_lock(wait_lock)
+ * sem->count=0xFFFFFFFF00000001
+ * +0xFFFFFFFF00000000
+ * count=sem->count
+ * MB
+ * sem->count=0xFFFFFFFE00000001
+ * -0xFFFFFFFF00000001
+ * RMB
+ * spin_trylock(wait_lock)
+ * return
+ * rwsem_try_write_lock(count)
+ * spin_unlock(wait_lock)
+ * schedule()
+ *
+ * Reordering of atomic_long_sub_return_release() in __up_write()
+ * and rwsem_has_spinner() in rwsem_wake() can cause missing of
+ * wakeup in up_write() context. In spinning writer, sem->count
+ * and local variable count is 0XFFFFFFFE00000001. It would result
+ * in rwsem_try_write_lock() failing to acquire rwsem and spinning
+ * writer going to sleep in rwsem_down_write_failed().
+ *
+ * The smp_rmb() here is to make sure that the spinner state is
+ * consulted after sem->count is updated in up_write context.
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/module.c b/kernel/module.c
index ea5ba3e8d472..a0eeedb3e5cd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2404,7 +2404,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
if (sym->st_shndx == SHN_UNDEF)
return 'U';
- if (sym->st_shndx == SHN_ABS)
+ if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu)
return 'a';
if (sym->st_shndx >= SHN_LORESERVE)
return '?';
@@ -2433,7 +2433,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info)
}
static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
- unsigned int shnum)
+ unsigned int shnum, unsigned int pcpundx)
{
const Elf_Shdr *sec;
@@ -2442,6 +2442,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
|| !src->st_name)
return false;
+#ifdef CONFIG_KALLSYMS_ALL
+ if (src->st_shndx == pcpundx)
+ return true;
+#endif
+
sec = sechdrs + src->st_shndx;
if (!(sec->sh_flags & SHF_ALLOC)
#ifndef CONFIG_KALLSYMS_ALL
@@ -2479,7 +2484,8 @@ static void layout_symtab(struct module *mod, struct load_info *info)
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
strtab_size += strlen(&info->strtab[src[i].st_name])+1;
ndst++;
}
@@ -2537,7 +2543,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
src = mod->kallsyms->symtab;
for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
if (i == 0 ||
- is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
+ is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
+ info->index.pcpu)) {
dst[ndst] = src[i];
dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
@@ -2868,6 +2875,15 @@ static struct module *setup_load_info(struct load_info *info, int flags)
return mod;
}
+/*
+ * Warn, without failing the load, when retpoline_module_ok() rejects the
+ * module's "retpoline" modinfo entry.
+ */
+static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
+{
+	if (!retpoline_module_ok(get_modinfo(info, "retpoline")))
+		pr_warn("%s: loading module not compiled with retpoline compiler.\n",
+			mod->name);
+}
+
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
const char *modmagic = get_modinfo(info, "vermagic");
@@ -2887,8 +2903,14 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return -ENOEXEC;
}
- if (!get_modinfo(info, "intree"))
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
+
+ check_modinfo_retpoline(mod, info);
if (get_modinfo(info, "staging")) {
add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
@@ -3053,6 +3075,8 @@ static int move_module(struct module *mod, struct load_info *info)
static int check_module_license_and_versions(struct module *mod)
{
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+
/*
* ndiswrapper is under GPL by itself, but loads proprietary modules.
* Don't use add_taint_module(), as it would prevent ndiswrapper from
@@ -3071,6 +3095,9 @@ static int check_module_license_and_versions(struct module *mod)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
LOCKDEP_NOW_UNRELIABLE);
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
diff --git a/kernel/panic.c b/kernel/panic.c
index 982a52352cfc..75f564a94a82 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,6 +24,7 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
+#include <soc/qcom/minidump.h>
#define CREATE_TRACE_POINTS
#include <trace/events/exception.h>
@@ -108,6 +109,7 @@ void panic(const char *fmt, ...)
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
+ dump_stack_minidump(0);
pr_emerg("Kernel panic - not syncing: %s\n", buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
@@ -172,7 +174,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e7f1f736a5b6..cc177142a08f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -19,8 +19,9 @@
#include <linux/kmod.h>
#include <trace/events/power.h>
#include <linux/wakeup_reason.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -208,6 +209,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9fcb521fab0e..dca87791e9c1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3180,9 +3180,8 @@ void show_regs_print_info(const char *log_lvl)
{
dump_stack_print_info(log_lvl);
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
+ printk("%stask: %p task.stack: %p\n",
+ log_lvl, current, task_stack_page(current));
}
#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index 99513e1160e5..9cd8e18e6f18 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -44,7 +44,7 @@ int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
static cpumask_var_t prof_cpu_mask;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
static DEFINE_PER_CPU(int, cpu_profile_flip);
static DEFINE_MUTEX(profile_flip_mutex);
@@ -201,7 +201,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
}
EXPORT_SYMBOL_GPL(profile_event_unregister);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
/*
* Each cpu has a pair of open-addressed hashtables for pending
* profile hits. read_profile() IPI's all cpus to request them
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c7e8ed99c953..5e2cd1030702 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,19 +28,25 @@
#include <linux/compat.h>
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
rcu_read_lock();
- child->ptracer_cred = get_cred(__task_cred(new_parent));
+ __ptrace_link(child, new_parent, __task_cred(new_parent));
rcu_read_unlock();
}
@@ -353,7 +359,7 @@ static int ptrace_attach(struct task_struct *task, long request,
flags |= PT_SEIZED;
task->ptrace = flags;
- __ptrace_link(task, current);
+ ptrace_link(task, current);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -420,7 +426,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 61a16569ffbf..032b2c015beb 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,7 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2cb46d51d715..3decfbc88308 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -248,24 +248,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
- unsigned long flags;
-
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
- trace_rcu_grace_period(TPS("rcu_sched"),
- __this_cpu_read(rcu_sched_data.gpnum),
- TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
- if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
- return;
- local_irq_save(flags);
- if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
- __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
- rcu_report_exp_rdp(&rcu_sched_state,
- this_cpu_ptr(&rcu_sched_data),
- true);
- }
- local_irq_restore(flags);
- }
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
+ return;
+ trace_rcu_grace_period(TPS("rcu_sched"),
+ __this_cpu_read(rcu_sched_data.gpnum),
+ TPS("cpuqs"));
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data), true);
}
void rcu_bh_qs(void)
@@ -302,17 +295,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
* We inform the RCU core by emulating a zero-duration dyntick-idle
* period, which we in turn do by incrementing the ->dynticks counter
* by two.
+ *
+ * The caller must have disabled interrupts.
*/
static void rcu_momentary_dyntick_idle(void)
{
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_dynticks *rdtp;
int resched_mask;
struct rcu_state *rsp;
- local_irq_save(flags);
-
/*
* Yes, we can lose flag-setting operations. This is OK, because
* the flag will be set again after some delay.
@@ -342,13 +334,12 @@ static void rcu_momentary_dyntick_idle(void)
smp_mb__after_atomic(); /* Later stuff after QS. */
break;
}
- local_irq_restore(flags);
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
- * The caller must have disabled preemption.
+ * The caller must have disabled interrupts.
*/
void rcu_note_context_switch(void)
{
@@ -378,9 +369,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
*/
void rcu_all_qs(void)
{
+ unsigned long flags;
+
barrier(); /* Avoid RCU read-side critical sections leaking down. */
- if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
+ local_irq_save(flags);
rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ }
this_cpu_inc(rcu_qs_ctr);
barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
@@ -761,6 +757,12 @@ void rcu_irq_exit(void)
local_irq_save(flags);
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+ return;
+
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -889,6 +891,12 @@ void rcu_irq_enter(void)
local_irq_save(flags);
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+ return;
+
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 32cbe72bf545..c6fc11d626f8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -147,8 +147,8 @@ static void __init rcu_bootup_announce(void)
* the corresponding expedited grace period will also be the end of the
* normal grace period.
*/
-static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
- unsigned long flags) __releases(rnp->lock)
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
+ __releases(rnp->lock) /* But leaves rrupts disabled. */
{
int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
(rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
@@ -236,7 +236,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
- raw_spin_unlock(&rnp->lock);
+ raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
@@ -251,7 +251,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
} else {
WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
}
- local_irq_restore(flags);
}
/*
@@ -286,12 +285,11 @@ static void rcu_preempt_qs(void)
* predating the current grace period drain, in other words, until
* rnp->gp_tasks becomes NULL.
*
- * Caller must disable preemption.
+ * Caller must disable interrupts.
*/
static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
- unsigned long flags;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -301,7 +299,7 @@ static void rcu_preempt_note_context_switch(void)
/* Possibly blocking in an RCU read-side critical section. */
rdp = this_cpu_ptr(rcu_state_p->rda);
rnp = rdp->mynode;
- raw_spin_lock_irqsave(&rnp->lock, flags);
+ raw_spin_lock(&rnp->lock);
smp_mb__after_unlock_lock();
t->rcu_read_unlock_special.b.blocked = true;
t->rcu_blocked_node = rnp;
@@ -318,7 +316,7 @@ static void rcu_preempt_note_context_switch(void)
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
- rcu_preempt_ctxt_queue(rnp, rdp, flags);
+ rcu_preempt_ctxt_queue(rnp, rdp);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
diff --git a/kernel/resource.c b/kernel/resource.c
index 4c9835c09dcd..c09d484f7b5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
+ unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
+
+ if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+ start = r->start;
+ end = r->end;
+ } else {
+ start = end = 0;
+ }
+
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, (unsigned long long) r->start,
- width, (unsigned long long) r->end,
+ width, start,
+ width, end,
r->name ? r->name : "<BAD>");
return 0;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 308f80ce2e43..7dde1b9918e4 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
@@ -22,4 +26,5 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601ddf7..8620fd01b3d0 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
-
/*
- * We can only assume the task group can't go away on us if
- * autogroup_move_group() can see us on ->thread_group list.
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
*/
- if (p->flags & PF_EXITING)
- return false;
-
return true;
}
@@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
}
p->signal->autogroup = autogroup_kref_get(ag);
-
- if (!READ_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+ * can't be removed from thread list, we hold ->siglock.
+ */
for_each_thread(p, t)
sched_move_task(t);
-out:
+
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d2b8834dd3b..03b59c330bdd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -554,6 +554,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;
+ head->count++;
+
get_task_struct(task);
/*
@@ -563,6 +565,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
head->lastp = &node->next;
}
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ int sibling_count_hint);
+
void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;
@@ -577,10 +583,10 @@ void wake_up_q(struct wake_q_head *head)
task->wake_q.next = NULL;
/*
- * wake_up_process() implies a wmb() to pair with the queueing
+ * try_to_wake_up() implies a wmb() to pair with the queueing
* in wake_q_add() so as not to miss wakeups.
*/
- wake_up_process(task);
+ try_to_wake_up(task, TASK_NORMAL, 0, head->count);
put_task_struct(task);
}
}
@@ -621,8 +627,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1373,7 +1378,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(dst_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
check_preempt_curr(dst_rq, p, 0);
@@ -1701,14 +1708,16 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
+ int sibling_count_hint)
{
bool allow_isolated = (p->flags & PF_KTHREAD);
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
+ sibling_count_hint);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2006,6 +2015,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
+ * @sibling_count_hint: A hint at the number of threads that are being woken up
+ * in this event.
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
@@ -2017,7 +2028,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* or @state didn't match @p's state.
*/
static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ int sibling_count_hint)
{
unsigned long flags;
int cpu, src_cpu, success = 0;
@@ -2133,7 +2145,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
+ sibling_count_hint);
/* Refresh src_cpu as it could have changed since we last read it */
src_cpu = task_cpu(p);
@@ -2235,7 +2248,7 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_NORMAL, 0);
+ return try_to_wake_up(p, TASK_NORMAL, 0, 1);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2255,13 +2268,13 @@ EXPORT_SYMBOL(wake_up_process);
int wake_up_process_no_notif(struct task_struct *p)
{
WARN_ON(task_is_stopped_or_traced(p));
- return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
+ return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1);
}
EXPORT_SYMBOL(wake_up_process_no_notif);
int wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up(p, state, 0, 1);
}
/*
@@ -2276,6 +2289,7 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_period = 0;
dl_se->flags = 0;
dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
@@ -2312,11 +2326,11 @@ void sched_exit(struct task_struct *p)
reset_task_stats(p);
p->ravg.mark_start = wallclock;
p->ravg.sum_history[0] = EXITING_TASK_MARKER;
- free_task_load_ptrs(p);
enqueue_task(rq, p, 0);
clear_ed_task(p, rq);
task_rq_unlock(rq, p, &flags);
+ free_task_load_ptrs(p);
}
#endif /* CONFIG_SCHED_HMP */
@@ -2336,9 +2350,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SCHED_WALT
+ p->last_sleep_ts = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2347,6 +2368,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_dl_task_timer(&p->dl);
__dl_clear_params(p);
+ init_rt_schedtune_timer(&p->rt);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
@@ -2428,11 +2450,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2469,8 +2491,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2480,7 +2501,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2613,6 +2640,8 @@ void wake_up_new_task(struct task_struct *p)
add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ p->state = TASK_RUNNING;
+
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2620,12 +2649,16 @@ void wake_up_new_task(struct task_struct *p)
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
#endif
-
rq = __task_rq_lock(p);
mark_task_starting(p);
+ update_rq_clock(rq);
+ post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -2931,7 +2964,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm(oldmm, mm, next);
+ switch_mm_irqs_off(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
@@ -3070,7 +3103,7 @@ void sched_exec(void)
raw_spin_lock_irqsave(&p->pi_lock, flags);
curr_cpu = task_cpu(p);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -3136,93 +3169,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-#ifdef CONFIG_CPU_FREQ_GOV_SCHED
-
-static inline
-unsigned long add_capacity_margin(unsigned long cpu_capacity)
-{
- cpu_capacity = cpu_capacity * capacity_margin;
- cpu_capacity /= SCHED_CAPACITY_SCALE;
- return cpu_capacity;
-}
-
-static inline
-unsigned long sum_capacity_reqs(unsigned long cfs_cap,
- struct sched_capacity_reqs *scr)
-{
- unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
- return total += scr->dl;
-}
-
-static void sched_freq_tick_pelt(int cpu)
-{
- unsigned long cpu_utilization = capacity_max;
- unsigned long capacity_curr = capacity_curr_of(cpu);
- struct sched_capacity_reqs *scr;
-
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
- if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
- return;
-
- /*
- * To make free room for a task that is building up its "real"
- * utilization and to harm its performance the least, request
- * a jump to a higher OPP as soon as the margin of free capacity
- * is impacted (specified by capacity_margin).
- */
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
-}
-
-#ifdef CONFIG_SCHED_WALT
-static void sched_freq_tick_walt(int cpu)
-{
- unsigned long cpu_utilization = cpu_util(cpu);
- unsigned long capacity_curr = capacity_curr_of(cpu);
-
- if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
- return sched_freq_tick_pelt(cpu);
-
- /*
- * Add a margin to the WALT utilization.
- * NOTE: WALT tracks a single CPU signal for all the scheduling
- * classes, thus this margin is going to be added to the DL class as
- * well, which is something we do not do in sched_freq_tick_pelt case.
- */
- cpu_utilization = add_capacity_margin(cpu_utilization);
- if (cpu_utilization <= capacity_curr)
- return;
-
- /*
- * It is likely that the load is growing so we
- * keep the added margin in our request as an
- * extra boost.
- */
- set_cfs_cpu_capacity(cpu, true, cpu_utilization);
-
-}
-#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
-#else
-#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
-#endif /* CONFIG_SCHED_WALT */
-
-static void sched_freq_tick(int cpu)
-{
- unsigned long capacity_orig, capacity_curr;
-
- if (!sched_freq())
- return;
-
- capacity_orig = capacity_orig_of(cpu);
- capacity_curr = capacity_curr_of(cpu);
- if (capacity_curr == capacity_orig)
- return;
-
- _sched_freq_tick(cpu);
-}
-#else
-static inline void sched_freq_tick(int cpu) { }
-#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3249,7 +3195,6 @@ void scheduler_tick(void)
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
early_notif = early_detection_notify(rq, wallclock);
- sched_freq_tick(cpu);
raw_spin_unlock(&rq->lock);
if (early_notif)
@@ -3511,7 +3456,6 @@ static void __sched notrace __schedule(bool preempt)
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch();
prev = rq->curr;
/*
@@ -3530,13 +3474,16 @@ static void __sched notrace __schedule(bool preempt)
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ local_irq_disable();
+ rcu_note_context_switch();
+
/*
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- raw_spin_lock_irq(&rq->lock);
+ raw_spin_lock(&rq->lock);
lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -3582,6 +3529,10 @@ static void __sched notrace __schedule(bool preempt)
if (!is_idle_task(prev) && !prev->on_rq)
update_avg_burst(prev);
+#ifdef CONFIG_SCHED_WALT
+ if (!prev->on_rq)
+ prev->last_sleep_ts = wallclock;
+#endif
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3758,7 +3709,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
- return try_to_wake_up(curr->private, mode, wake_flags);
+ return try_to_wake_up(curr->private, mode, wake_flags, 1);
}
EXPORT_SYMBOL(default_wake_function);
@@ -3784,6 +3735,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3879,6 +3831,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -4039,6 +3993,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
dl_se->flags = attr->sched_flags;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
/*
* Changing the parameters of a task is 'tricky' and we're not doing
@@ -4306,6 +4261,7 @@ recheck:
* runqueue lock must be held.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
/*
* Changing the policy of the stop threads its a very bad idea
@@ -4961,6 +4917,15 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+
+ /*
+ * The userspace tasks are forbidden to run on
+ * isolated CPUs. So exclude isolated CPUs from
+ * the getaffinity.
+ */
+ if (!(p->flags & PF_KTHREAD))
+ cpumask_andnot(mask, mask, cpu_isolated_mask);
+
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -6460,9 +6425,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
return -1;
}
@@ -6555,8 +6517,12 @@ static inline bool sched_debug(void)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
+ if (cpumask_weight(sched_domain_span(sd)) == 1) {
+ if (sd->groups->sge)
+ sd->flags &= ~SD_LOAD_BALANCE;
+ else
+ return 1;
+ }
/* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE |
@@ -6564,6 +6530,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN |
SD_SHARE_CAP_STATES)) {
@@ -6595,11 +6562,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN |
SD_SHARE_CAP_STATES);
+ if (parent->groups->sge) {
+ parent->flags &= ~SD_LOAD_BALANCE;
+ return 0;
+ }
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -6659,6 +6631,19 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
+void sched_get_rd(struct root_domain *rd)
+{
+ atomic_inc(&rd->refcount);
+}
+
+void sched_put_rd(struct root_domain *rd)
+{
+ if (!atomic_dec_and_test(&rd->refcount))
+ return;
+
+ call_rcu_sched(&rd->rcu, free_rootdomain);
+}
+
static int init_rootdomain(struct root_domain *rd)
{
memset(rd, 0, sizeof(*rd));
@@ -6672,6 +6657,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_dlo_mask;
@@ -6680,6 +6671,9 @@ static int init_rootdomain(struct root_domain *rd)
goto free_rto_mask;
init_max_cpu_capacity(&rd->max_cpu_capacity);
+
+ rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
return 0;
free_rto_mask:
@@ -6913,6 +6907,9 @@ enum s_alloc {
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
*
+ * Only CPUs that can arrive at this group should be considered to continue
+ * balancing.
+ *
* Asymmetric node setups can result in situations where the domain tree is of
* unequal depth, make sure to skip domains that already cover the entire
* range.
@@ -6924,18 +6921,31 @@ enum s_alloc {
*/
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ const struct cpumask *sg_span = sched_group_cpus(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, sched_group_mask(sg));
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
}
/*
@@ -6996,6 +7006,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -7291,11 +7302,19 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- * SD_SHARE_CAP_STATES - describes shared capacity states
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES - describes shared capacity states
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -7305,11 +7324,13 @@ static int sched_domains_curr_level;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN | \
SD_SHARE_CAP_STATES)
static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+ struct sched_domain *child, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
int sd_weight, sd_flags = 0;
@@ -7361,6 +7382,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
+ .child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
@@ -7370,6 +7392,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
@@ -7816,16 +7845,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
+ struct sched_domain *sd = sd_init(tl, child, cpu);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
- sd->child = child;
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
@@ -7859,7 +7885,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
- struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7877,8 +7902,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
}
}
@@ -7914,8 +7937,19 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+ int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
+ if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+ cpu_rq(max_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+ if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+ cpu_rq(min_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
sd = *per_cpu_ptr(d.sd, i);
+
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
@@ -8136,17 +8170,16 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
break;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
case CPU_ONLINE:
cpuset_update_active_cpus(true);
@@ -8339,6 +8372,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
@@ -8737,27 +8771,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- unsigned long flags;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &flags);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -8770,11 +8786,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -9180,11 +9222,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -9202,15 +9253,28 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
- sched_move_task(task);
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &flags);
+
+ update_rq_clock(rq);
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &flags);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -9221,8 +9285,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if it's
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
@@ -9566,6 +9646,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..dbc51442ecbc
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,63 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned int flags))
+{
+ if (WARN_ON(!data || !func))
+ return;
+
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
deleted file mode 100644
index d751bc2d0d6e..000000000000
--- a/kernel/sched/cpufreq_sched.c
+++ /dev/null
@@ -1,499 +0,0 @@
-/*
- * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/cpufreq.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/percpu.h>
-#include <linux/irq_work.h>
-#include <linux/delay.h>
-#include <linux/string.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/cpufreq_sched.h>
-
-#include "sched.h"
-
-#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
-#define THROTTLE_UP_NSEC 500000 /* 500us default */
-
-struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
-static bool __read_mostly cpufreq_driver_slow;
-
-#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
-static struct cpufreq_governor cpufreq_gov_sched;
-#endif
-
-static DEFINE_PER_CPU(unsigned long, enabled);
-DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
-
-/**
- * gov_data - per-policy data internal to the governor
- * @up_throttle: next throttling period expiry if increasing OPP
- * @down_throttle: next throttling period expiry if decreasing OPP
- * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP
- * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP
- * @task: worker thread for dvfs transition that may block/sleep
- * @irq_work: callback used to wake up worker thread
- * @requested_freq: last frequency requested by the sched governor
- *
- * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
- * per-policy instance of it is created when the cpufreq_sched governor receives
- * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
- * member of struct cpufreq_policy.
- *
- * Readers of this data must call down_read(policy->rwsem). Writers must
- * call down_write(policy->rwsem).
- */
-struct gov_data {
- ktime_t up_throttle;
- ktime_t down_throttle;
- unsigned int up_throttle_nsec;
- unsigned int down_throttle_nsec;
- struct task_struct *task;
- struct irq_work irq_work;
- unsigned int requested_freq;
-};
-
-static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
- unsigned int freq)
-{
- struct gov_data *gd = policy->governor_data;
-
- /* avoid race with cpufreq_sched_stop */
- if (!down_write_trylock(&policy->rwsem))
- return;
-
- __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
-
- gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
- gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
- up_write(&policy->rwsem);
-}
-
-static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
-{
- ktime_t now = ktime_get();
-
- ktime_t throttle = gd->requested_freq < cur_freq ?
- gd->down_throttle : gd->up_throttle;
-
- if (ktime_after(now, throttle))
- return false;
-
- while (1) {
- int usec_left = ktime_to_ns(ktime_sub(throttle, now));
-
- usec_left /= NSEC_PER_USEC;
- trace_cpufreq_sched_throttled(usec_left);
- usleep_range(usec_left, usec_left + 100);
- now = ktime_get();
- if (ktime_after(now, throttle))
- return true;
- }
-}
-
-/*
- * we pass in struct cpufreq_policy. This is safe because changing out the
- * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
- * which tears down all of the data structures and __cpufreq_governor(policy,
- * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
- * new policy pointer
- */
-static int cpufreq_sched_thread(void *data)
-{
- struct sched_param param;
- struct cpufreq_policy *policy;
- struct gov_data *gd;
- unsigned int new_request = 0;
- unsigned int last_request = 0;
- int ret;
-
- policy = (struct cpufreq_policy *) data;
- gd = policy->governor_data;
-
- param.sched_priority = 50;
- ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
- if (ret) {
- pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
- do_exit(-EINVAL);
- } else {
- pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
- __func__, gd->task->pid);
- }
-
- do {
- new_request = gd->requested_freq;
- if (new_request == last_request) {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
- break;
- schedule();
- } else {
- /*
- * if the frequency thread sleeps while waiting to be
- * unthrottled, start over to check for a newer request
- */
- if (finish_last_request(gd, policy->cur))
- continue;
- last_request = new_request;
- cpufreq_sched_try_driver_target(policy, new_request);
- }
- } while (!kthread_should_stop());
-
- return 0;
-}
-
-static void cpufreq_sched_irq_work(struct irq_work *irq_work)
-{
- struct gov_data *gd;
-
- gd = container_of(irq_work, struct gov_data, irq_work);
- if (!gd)
- return;
-
- wake_up_process(gd->task);
-}
-
-static void update_fdomain_capacity_request(int cpu)
-{
- unsigned int freq_new, index_new, cpu_tmp;
- struct cpufreq_policy *policy;
- struct gov_data *gd;
- unsigned long capacity = 0;
-
- /*
- * Avoid grabbing the policy if possible. A test is still
- * required after locking the CPU's policy to avoid racing
- * with the governor changing.
- */
- if (!per_cpu(enabled, cpu))
- return;
-
- policy = cpufreq_cpu_get(cpu);
- if (IS_ERR_OR_NULL(policy))
- return;
-
- if (policy->governor != &cpufreq_gov_sched ||
- !policy->governor_data)
- goto out;
-
- gd = policy->governor_data;
-
- /* find max capacity requested by cpus in this policy */
- for_each_cpu(cpu_tmp, policy->cpus) {
- struct sched_capacity_reqs *scr;
-
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
- capacity = max(capacity, scr->total);
- }
-
- /* Convert the new maximum capacity request into a cpu frequency */
- freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
- if (cpufreq_frequency_table_target(policy, policy->freq_table,
- freq_new, CPUFREQ_RELATION_L,
- &index_new))
- goto out;
- freq_new = policy->freq_table[index_new].frequency;
-
- if (freq_new > policy->max)
- freq_new = policy->max;
-
- if (freq_new < policy->min)
- freq_new = policy->min;
-
- trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
- gd->requested_freq);
- if (freq_new == gd->requested_freq)
- goto out;
-
- gd->requested_freq = freq_new;
-
- /*
- * Throttling is not yet supported on platforms with fast cpufreq
- * drivers.
- */
- if (cpufreq_driver_slow)
- irq_work_queue_on(&gd->irq_work, cpu);
- else
- cpufreq_sched_try_driver_target(policy, freq_new);
-
-out:
- cpufreq_cpu_put(policy);
-}
-
-void update_cpu_capacity_request(int cpu, bool request)
-{
- unsigned long new_capacity;
- struct sched_capacity_reqs *scr;
-
- /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
- lockdep_assert_held(&cpu_rq(cpu)->lock);
-
- scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
-
- new_capacity = scr->cfs + scr->rt;
- new_capacity = new_capacity * capacity_margin
- / SCHED_CAPACITY_SCALE;
- new_capacity += scr->dl;
-
- if (new_capacity == scr->total)
- return;
-
- trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity);
-
- scr->total = new_capacity;
- if (request)
- update_fdomain_capacity_request(cpu);
-}
-
-static inline void set_sched_freq(void)
-{
- static_key_slow_inc(&__sched_freq);
-}
-
-static inline void clear_sched_freq(void)
-{
- static_key_slow_dec(&__sched_freq);
-}
-
-static struct attribute_group sched_attr_group_gov_pol;
-static struct attribute_group *get_sysfs_attr(void)
-{
- return &sched_attr_group_gov_pol;
-}
-
-static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
-{
- struct gov_data *gd;
- int cpu;
- int rc;
-
- for_each_cpu(cpu, policy->cpus)
- memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
- sizeof(struct sched_capacity_reqs));
-
- gd = kzalloc(sizeof(*gd), GFP_KERNEL);
- if (!gd)
- return -ENOMEM;
-
- gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
- policy->cpuinfo.transition_latency :
- THROTTLE_UP_NSEC;
- gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
- pr_debug("%s: throttle threshold = %u [ns]\n",
- __func__, gd->up_throttle_nsec);
-
- rc = sysfs_create_group(&policy->kobj, get_sysfs_attr());
- if (rc) {
- pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
- goto err;
- }
-
- policy->governor_data = gd;
- if (cpufreq_driver_is_slow()) {
- cpufreq_driver_slow = true;
- gd->task = kthread_create(cpufreq_sched_thread, policy,
- "kschedfreq:%d",
- cpumask_first(policy->related_cpus));
- if (IS_ERR_OR_NULL(gd->task)) {
- pr_err("%s: failed to create kschedfreq thread\n",
- __func__);
- goto err;
- }
- get_task_struct(gd->task);
- kthread_bind_mask(gd->task, policy->related_cpus);
- wake_up_process(gd->task);
- init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
- }
-
- set_sched_freq();
-
- return 0;
-
-err:
- policy->governor_data = NULL;
- kfree(gd);
- return -ENOMEM;
-}
-
-static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
-{
- struct gov_data *gd = policy->governor_data;
-
- clear_sched_freq();
- if (cpufreq_driver_slow) {
- kthread_stop(gd->task);
- put_task_struct(gd->task);
- }
-
- sysfs_remove_group(&policy->kobj, get_sysfs_attr());
-
- policy->governor_data = NULL;
-
- kfree(gd);
- return 0;
-}
-
-static int cpufreq_sched_start(struct cpufreq_policy *policy)
-{
- int cpu;
-
- for_each_cpu(cpu, policy->cpus)
- per_cpu(enabled, cpu) = 1;
-
- return 0;
-}
-
-static void cpufreq_sched_limits(struct cpufreq_policy *policy)
-{
- unsigned int clamp_freq;
- struct gov_data *gd = policy->governor_data;;
-
- pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
- policy->cpu, policy->min, policy->max,
- policy->cur);
-
- clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
-
- if (policy->cur != clamp_freq)
- __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
-}
-
-static int cpufreq_sched_stop(struct cpufreq_policy *policy)
-{
- int cpu;
-
- for_each_cpu(cpu, policy->cpus)
- per_cpu(enabled, cpu) = 0;
-
- return 0;
-}
-
-static int cpufreq_sched_setup(struct cpufreq_policy *policy,
- unsigned int event)
-{
- switch (event) {
- case CPUFREQ_GOV_POLICY_INIT:
- return cpufreq_sched_policy_init(policy);
- case CPUFREQ_GOV_POLICY_EXIT:
- return cpufreq_sched_policy_exit(policy);
- case CPUFREQ_GOV_START:
- return cpufreq_sched_start(policy);
- case CPUFREQ_GOV_STOP:
- return cpufreq_sched_stop(policy);
- case CPUFREQ_GOV_LIMITS:
- cpufreq_sched_limits(policy);
- break;
- }
- return 0;
-}
-
-/* Tunables */
-static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
-{
- return sprintf(buf, "%u\n", gd->up_throttle_nsec);
-}
-
-static ssize_t store_up_throttle_nsec(struct gov_data *gd,
- const char *buf, size_t count)
-{
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- gd->up_throttle_nsec = val;
- return count;
-}
-
-static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
-{
- return sprintf(buf, "%u\n", gd->down_throttle_nsec);
-}
-
-static ssize_t store_down_throttle_nsec(struct gov_data *gd,
- const char *buf, size_t count)
-{
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- gd->down_throttle_nsec = val;
- return count;
-}
-
-/*
- * Create show/store routines
- * - sys: One governor instance for complete SYSTEM
- * - pol: One governor instance per struct cpufreq_policy
- */
-#define show_gov_pol_sys(file_name) \
-static ssize_t show_##file_name##_gov_pol \
-(struct cpufreq_policy *policy, char *buf) \
-{ \
- return show_##file_name(policy->governor_data, buf); \
-}
-
-#define store_gov_pol_sys(file_name) \
-static ssize_t store_##file_name##_gov_pol \
-(struct cpufreq_policy *policy, const char *buf, size_t count) \
-{ \
- return store_##file_name(policy->governor_data, buf, count); \
-}
-
-#define gov_pol_attr_rw(_name) \
- static struct freq_attr _name##_gov_pol = \
- __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
-
-#define show_store_gov_pol_sys(file_name) \
- show_gov_pol_sys(file_name); \
- store_gov_pol_sys(file_name)
-#define tunable_handlers(file_name) \
- show_gov_pol_sys(file_name); \
- store_gov_pol_sys(file_name); \
- gov_pol_attr_rw(file_name)
-
-tunable_handlers(down_throttle_nsec);
-tunable_handlers(up_throttle_nsec);
-
-/* Per policy governor instance */
-static struct attribute *sched_attributes_gov_pol[] = {
- &up_throttle_nsec_gov_pol.attr,
- &down_throttle_nsec_gov_pol.attr,
- NULL,
-};
-
-static struct attribute_group sched_attr_group_gov_pol = {
- .attrs = sched_attributes_gov_pol,
- .name = "sched",
-};
-
-#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
-static
-#endif
-struct cpufreq_governor cpufreq_gov_sched = {
- .name = "sched",
- .governor = cpufreq_sched_setup,
- .owner = THIS_MODULE,
-};
-
-static int __init cpufreq_sched_init(void)
-{
- int cpu;
-
- for_each_cpu(cpu, cpu_possible_mask)
- per_cpu(enabled, cpu) = 0;
- return cpufreq_register_governor(&cpufreq_gov_sched);
-}
-
-/* Try to make this the default governor */
-fs_initcall(cpufreq_sched_init);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..6c84b4d28914
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,827 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <trace/events/power.h>
+
+#include "sched.h"
+#include "tune.h"
+
+unsigned long boosted_cpu_util(int cpu);
+
+/* Stub out fast switch routines present on mainline to reduce the backport
+ * overhead. */
+#define cpufreq_driver_fast_switch(x, y) 0
+#define cpufreq_enable_fast_switch(x)
+#define cpufreq_disable_fast_switch(x)
+#define LATENCY_MULTIPLIER (1000)
+#define SUGOV_KTHREAD_PRIORITY 50
+
+struct sugov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int up_rate_limit_us;
+ unsigned int down_rate_limit_us;
+};
+
+struct sugov_policy {
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time;
+ s64 min_rate_limit_ns;
+ s64 up_rate_delay_ns;
+ s64 down_rate_delay_ns;
+ unsigned int next_freq;
+ unsigned int cached_raw_freq;
+
+ /* The next fields are only needed if fast switch cannot be used. */
+ struct irq_work irq_work;
+ struct kthread_work work;
+ struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
+ bool work_in_progress;
+
+ bool need_freq_update;
+};
+
+struct sugov_cpu {
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+
+ bool iowait_boost_pending;
+ unsigned int iowait_boost;
+ unsigned int iowait_boost_max;
+ u64 last_update;
+
+ /* The fields below are only needed when sharing a policy. */
+ unsigned long util;
+ unsigned long max;
+ unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+ s64 delta_ns;
+
+ if (sg_policy->work_in_progress)
+ return false;
+
+ if (unlikely(sg_policy->need_freq_update)) {
+ sg_policy->need_freq_update = false;
+ /*
+ * This happens when limits change, so forget the previous
+ * next_freq value and force an update.
+ */
+ sg_policy->next_freq = UINT_MAX;
+ return true;
+ }
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ /* No need to recalculate next freq for at least min_rate_limit_ns */
+ return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ s64 delta_ns;
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ if (next_freq > sg_policy->next_freq &&
+ delta_ns < sg_policy->up_rate_delay_ns)
+ return true;
+
+ if (next_freq < sg_policy->next_freq &&
+ delta_ns < sg_policy->down_rate_delay_ns)
+ return true;
+
+ return false;
+}
+
+static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) {
+ /* Reset cached freq as next_freq isn't changed */
+ sg_policy->cached_raw_freq = 0;
+ return;
+ }
+
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+
+ if (policy->fast_switch_enabled) {
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (next_freq == CPUFREQ_ENTRY_INVALID)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+ } else {
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
+ }
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal to or greater than the
+ * raw next_freq (as calculated above) is returned, subject to policy min/max
+ * and cpufreq driver limitations.
+ */
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int freq = arch_scale_freq_invariant() ?
+ policy->cpuinfo.max_freq : policy->cur;
+
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_policy->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
+}
+
+static inline bool use_pelt(void)
+{
+#ifdef CONFIG_SCHED_WALT
+ return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
+#else
+ return true;
+#endif
+}
+
+static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long max_cap, rt;
+ s64 delta;
+
+ max_cap = arch_scale_cpu_capacity(NULL, cpu);
+
+ sched_avg_update(rq);
+ delta = time - rq->age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+ rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
+ rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
+
+ *util = boosted_cpu_util(cpu);
+ if (likely(use_pelt()))
+ *util = *util + rt;
+
+ *util = min(*util, max_cap);
+ *max = max_cap;
+}
+
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ if (flags & SCHED_CPUFREQ_IOWAIT) {
+ if (sg_cpu->iowait_boost_pending)
+ return;
+
+ sg_cpu->iowait_boost_pending = true;
+
+ if (sg_cpu->iowait_boost) {
+ sg_cpu->iowait_boost <<= 1;
+ if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else {
+ sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+ }
+ } else if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU appears to have been idle. */
+ if (delta_ns > TICK_NSEC) {
+ sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
+ }
+}
+
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
+ unsigned long *max)
+{
+ unsigned int boost_util, boost_max;
+
+ if (!sg_cpu->iowait_boost)
+ return;
+
+ if (sg_cpu->iowait_boost_pending) {
+ sg_cpu->iowait_boost_pending = false;
+ } else {
+ sg_cpu->iowait_boost >>= 1;
+ if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+ sg_cpu->iowait_boost = 0;
+ return;
+ }
+ }
+
+ boost_util = sg_cpu->iowait_boost;
+ boost_max = sg_cpu->iowait_boost_max;
+
+ if (*util * boost_max < *max * boost_util) {
+ *util = boost_util;
+ *max = boost_max;
+ }
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Utilization update hook installed by sugov_start() for policies that are
+ * not shared between CPUs.
+ *
+ * Records the update time and I/O-wait boost state, then, if a frequency
+ * update is currently allowed, picks the next frequency and commits it.
+ */
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+				unsigned int flags)
+{
+	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	unsigned long util, max;
+	unsigned int freq;
+	bool was_busy;
+
+	sugov_set_iowait_boost(sg_cpu, time, flags);
+	sg_cpu->last_update = time;
+
+	if (!sugov_should_update_freq(sg_policy, time))
+		return;
+
+	/* Sampled on every permitted update, including the deadline case. */
+	was_busy = sugov_cpu_is_busy(sg_cpu);
+
+	/* Deadline updates go straight to the maximum frequency. */
+	if (flags & SCHED_CPUFREQ_DL) {
+		sugov_update_commit(sg_policy, time,
+				    sg_policy->policy->cpuinfo.max_freq);
+		return;
+	}
+
+	sugov_get_util(&util, &max, time);
+	sugov_iowait_boost(sg_cpu, &util, &max);
+	freq = get_next_freq(sg_policy, util, max);
+
+	/*
+	 * A CPU that has not been idle recently keeps its current request
+	 * rather than dropping to a lower one, since lowering it now is
+	 * likely to be premature.
+	 */
+	if (was_busy && freq < sg_policy->next_freq) {
+		freq = sg_policy->next_freq;
+
+		/* next_freq was substituted, so the cached raw value is stale. */
+		sg_policy->cached_raw_freq = 0;
+	}
+
+	sugov_update_commit(sg_policy, time, freq);
+}
+
+/*
+ * Compute the next frequency for a policy shared by several CPUs.
+ *
+ * Walks every CPU in policy->cpus and keeps the largest util/max ratio
+ * (with each CPU's I/O-wait boost applied). Returns cpuinfo.max_freq as
+ * soon as a recently updated CPU carries SCHED_CPUFREQ_DL, otherwise the
+ * result of get_next_freq() for the retained ratio.
+ */
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
+{
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	struct cpufreq_policy *policy = sg_policy->policy;
+	unsigned long best_util = 0, best_max = 1;
+	unsigned int cpu;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct sugov_cpu *other = &per_cpu(sugov_cpu, cpu);
+		s64 age_ns = time - other->last_update;
+		unsigned long cpu_util, cpu_max;
+
+		/*
+		 * A CPU whose last utilization update is more than a tick
+		 * old is probably idle now: leave it out of the computation
+		 * and clear its iowait boost state.
+		 */
+		if (age_ns > TICK_NSEC) {
+			other->iowait_boost = 0;
+			other->iowait_boost_pending = false;
+			continue;
+		}
+
+		if (other->flags & SCHED_CPUFREQ_DL)
+			return policy->cpuinfo.max_freq;
+
+		/* Keep whichever ratio is larger (cross-multiplied compare). */
+		cpu_util = other->util;
+		cpu_max = other->max;
+		if (cpu_util * best_max > cpu_max * best_util) {
+			best_util = cpu_util;
+			best_max = cpu_max;
+		}
+
+		sugov_iowait_boost(other, &best_util, &best_max);
+	}
+
+	return get_next_freq(sg_policy, best_util, best_max);
+}
+
+/*
+ * Utilization update hook installed by sugov_start() for shared policies.
+ *
+ * Stores this CPU's utilization, capacity and flags under
+ * sg_policy->update_lock and, if a frequency update is currently allowed,
+ * commits the frequency derived from all CPUs of the policy.
+ */
+static void sugov_update_shared(struct update_util_data *hook, u64 time,
+				unsigned int flags)
+{
+	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	unsigned long cur_util, cur_max;
+	unsigned int freq;
+
+	/* Read utilization before taking the lock. */
+	sugov_get_util(&cur_util, &cur_max, time);
+
+	raw_spin_lock(&sg_policy->update_lock);
+
+	sg_cpu->util = cur_util;
+	sg_cpu->max = cur_max;
+	sg_cpu->flags = flags;
+
+	sugov_set_iowait_boost(sg_cpu, time, flags);
+	sg_cpu->last_update = time;
+
+	if (!sugov_should_update_freq(sg_policy, time))
+		goto unlock;
+
+	/* Deadline updates go straight to the maximum frequency. */
+	freq = (flags & SCHED_CPUFREQ_DL) ?
+		sg_policy->policy->cpuinfo.max_freq :
+		sugov_next_freq_shared(sg_cpu, time);
+
+	sugov_update_commit(sg_policy, time, freq);
+
+unlock:
+	raw_spin_unlock(&sg_policy->update_lock);
+}
+
+/*
+ * kthread work handler for the slow path: ask the cpufreq driver for
+ * sg_policy->next_freq while holding work_lock, then clear
+ * work_in_progress.
+ */
+static void sugov_work(struct kthread_work *work)
+{
+	struct sugov_policy *sg_policy;
+
+	sg_policy = container_of(work, struct sugov_policy, work);
+
+	mutex_lock(&sg_policy->work_lock);
+	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+				CPUFREQ_RELATION_L);
+	mutex_unlock(&sg_policy->work_lock);
+
+	/* Cleared only after the driver call has completed. */
+	sg_policy->work_in_progress = false;
+}
+
+/*
+ * irq_work handler: hand the pending frequency change over to the
+ * governor's kthread worker.
+ */
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+	struct sugov_policy *sg_policy =
+		container_of(irq_work, struct sugov_policy, irq_work);
+
+	/*
+	 * The schedutil governor drives the frequency to maximum for RT and
+	 * deadline tasks, so care is needed to keep this kthread from
+	 * triggering that behavior itself.
+	 *
+	 * The work_in_progress flag (mostly) guarantees this: it is updated
+	 * only at the end of sugov_work(), and until then the governor
+	 * rejects all other frequency scaling requests.
+	 *
+	 * In a very rare case the RT thread yields right after the
+	 * work_in_progress flag is cleared. The effects of that are
+	 * neglected for now.
+	 */
+	queue_kthread_work(&sg_policy->worker, &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+/* Tunables shared by all policies when governors are not per-policy. */
+static struct sugov_tunables *global_tunables;
+/* Held around accesses to global_tunables in sugov_init()/sugov_exit(). */
+static DEFINE_MUTEX(global_tunables_lock);
+
+/* Map an embedded gov_attr_set back to its containing sugov_tunables. */
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+	struct sugov_tunables *tunables;
+
+	tunables = container_of(attr_set, struct sugov_tunables, attr_set);
+	return tunables;
+}
+
+/* Held while min_rate_limit_ns is recomputed. */
+static DEFINE_MUTEX(min_rate_lock);
+
+/*
+ * Set sg_policy->min_rate_limit_ns to the smaller of the up and down
+ * rate-limit delays.
+ */
+static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
+{
+	mutex_lock(&min_rate_lock);
+	sg_policy->min_rate_limit_ns = min(sg_policy->down_rate_delay_ns,
+					   sg_policy->up_rate_delay_ns);
+	mutex_unlock(&min_rate_lock);
+}
+
+/* sysfs show: print the up rate limit (microseconds) followed by a newline. */
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	return sprintf(buf, "%u\n",
+		       to_sugov_tunables(attr_set)->up_rate_limit_us);
+}
+
+/* sysfs show: print the down rate limit (microseconds) followed by a newline. */
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	return sprintf(buf, "%u\n",
+		       to_sugov_tunables(attr_set)->down_rate_limit_us);
+}
+
+/*
+ * sysfs store: parse a decimal up rate limit in microseconds, record it in
+ * the tunables and push it to every policy attached to this attribute set.
+ *
+ * Returns @count on success or -EINVAL if @buf cannot be parsed.
+ */
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+				      const char *buf, size_t count)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+	struct sugov_policy *pos;
+	unsigned int usecs;
+
+	if (kstrtouint(buf, 10, &usecs) != 0)
+		return -EINVAL;
+
+	tunables->up_rate_limit_us = usecs;
+
+	/* Propagate the new limit and refresh each policy's minimum. */
+	list_for_each_entry(pos, &attr_set->policy_list, tunables_hook) {
+		pos->up_rate_delay_ns = usecs * NSEC_PER_USEC;
+		update_min_rate_limit_us(pos);
+	}
+
+	return count;
+}
+
+/*
+ * sysfs store: parse a decimal down rate limit in microseconds, record it
+ * in the tunables and push it to every policy attached to this attribute
+ * set.
+ *
+ * Returns @count on success or -EINVAL if @buf cannot be parsed.
+ */
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+					const char *buf, size_t count)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+	struct sugov_policy *pos;
+	unsigned int usecs;
+
+	if (kstrtouint(buf, 10, &usecs) != 0)
+		return -EINVAL;
+
+	tunables->down_rate_limit_us = usecs;
+
+	/* Propagate the new limit and refresh each policy's minimum. */
+	list_for_each_entry(pos, &attr_set->policy_list, tunables_hook) {
+		pos->down_rate_delay_ns = usecs * NSEC_PER_USEC;
+		update_min_rate_limit_us(pos);
+	}
+
+	return count;
+}
+
+/* Read/write attributes backed by the *_show()/*_store() pairs above. */
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
+
+/* NULL-terminated list of the tunables' attributes. */
+static struct attribute *sugov_attributes[] = {
+	&up_rate_limit_us.attr,
+	&down_rate_limit_us.attr,
+	NULL
+};
+
+/* kobject type used for the tunables kobject registered in sugov_init(). */
+static struct kobj_type sugov_tunables_ktype = {
+	.sysfs_ops = &governor_sysfs_ops,
+	.default_attrs = sugov_attributes,
+};
+
+/********************** cpufreq governor interface *********************/
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+static
+#endif
+struct cpufreq_governor cpufreq_gov_schedutil; /* declared early: sugov_init() reads .name; initialized definition follows below */
+
+/*
+ * Allocate a zeroed sugov_policy bound to @policy and initialize its
+ * update_lock. Returns NULL if the allocation fails.
+ */
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+
+	if (sg_policy) {
+		sg_policy->policy = policy;
+		raw_spin_lock_init(&sg_policy->update_lock);
+	}
+
+	return sg_policy;
+}
+
+/* Release a sugov_policy obtained from sugov_policy_alloc(). */
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+	kfree(sg_policy);
+}
+
+/*
+ * Create and start the SCHED_FIFO kthread worker that performs frequency
+ * changes on the slow path, and initialize the associated irq_work and
+ * work_lock.
+ *
+ * Does nothing and returns 0 when the policy uses fast switching.
+ * Otherwise returns 0 on success, or the error from kthread_create() or
+ * sched_setscheduler_nocheck().
+ */
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+	struct sched_param rt_param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct cpufreq_policy *policy = sg_policy->policy;
+	struct task_struct *worker_task;
+	int err;
+
+	/* The kthread is only needed for the slow path. */
+	if (policy->fast_switch_enabled)
+		return 0;
+
+	init_kthread_work(&sg_policy->work, sugov_work);
+	init_kthread_worker(&sg_policy->worker);
+
+	worker_task = kthread_create(kthread_worker_fn, &sg_policy->worker,
+				     "sugov:%d",
+				     cpumask_first(policy->related_cpus));
+	if (IS_ERR(worker_task)) {
+		pr_err("failed to create sugov thread: %ld\n",
+		       PTR_ERR(worker_task));
+		return PTR_ERR(worker_task);
+	}
+
+	err = sched_setscheduler_nocheck(worker_task, SCHED_FIFO, &rt_param);
+	if (err) {
+		kthread_stop(worker_task);
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		return err;
+	}
+
+	sg_policy->thread = worker_task;
+	/* Bind to the policy's related CPUs before the first wakeup. */
+	kthread_bind_mask(worker_task, policy->related_cpus);
+	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+	mutex_init(&sg_policy->work_lock);
+
+	wake_up_process(worker_task);
+
+	return 0;
+}
+
+/*
+ * Flush and stop the slow-path kthread worker and destroy work_lock.
+ * Does nothing when the policy uses fast switching.
+ */
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+	/* The kthread only exists for the slow path. */
+	if (!sg_policy->policy->fast_switch_enabled) {
+		flush_kthread_worker(&sg_policy->worker);
+		kthread_stop(sg_policy->thread);
+		mutex_destroy(&sg_policy->work_lock);
+	}
+}
+
+/*
+ * Allocate a zeroed tunables object and initialize its attribute set with
+ * @sg_policy's tunables_hook. When governors are not per-policy the new
+ * object also becomes global_tunables.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+	struct sugov_tunables *tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+
+	if (!tunables)
+		return NULL;
+
+	gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+	if (!have_governor_per_policy())
+		global_tunables = tunables;
+
+	return tunables;
+}
+
+/*
+ * Free @tunables, clearing global_tunables first when governors are not
+ * per-policy.
+ */
+static void sugov_tunables_free(struct sugov_tunables *tunables)
+{
+	bool shared = !have_governor_per_policy();
+
+	if (shared)
+		global_tunables = NULL;
+
+	kfree(tunables);
+}
+
+/*
+ * sugov_init - set up schedutil governor state for @policy.
+ *
+ * Allocates the per-policy data, creates the slow-path kthread (a no-op
+ * when fast switching is enabled) and attaches the policy to a tunables
+ * set: the existing global one if there is one, otherwise a newly
+ * allocated one. New tunables take their rate limits from the policy's
+ * up/down transition delays when both are set, else from
+ * LATENCY_MULTIPLIER scaled by the transition latency (if non-zero).
+ *
+ * Returns 0 on success; -EBUSY if @policy already has governor data,
+ * -ENOMEM on allocation failure, -EINVAL if global tunables exist while
+ * governors are per-policy, or the error returned by
+ * sugov_kthread_create() or kobject_init_and_add().
+ */
+static int sugov_init(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy;
+	struct sugov_tunables *tunables;
+	int ret = 0;
+
+	/* State should be equivalent to EXIT */
+	if (policy->governor_data)
+		return -EBUSY;
+
+	cpufreq_enable_fast_switch(policy);
+
+	sg_policy = sugov_policy_alloc(policy);
+	if (!sg_policy) {
+		ret = -ENOMEM;
+		goto disable_fast_switch;
+	}
+
+	/* Note: global_tunables_lock is NOT held yet on this error path. */
+	ret = sugov_kthread_create(sg_policy);
+	if (ret)
+		goto free_sg_policy;
+
+	mutex_lock(&global_tunables_lock);
+
+	if (global_tunables) {
+		if (WARN_ON(have_governor_per_policy())) {
+			ret = -EINVAL;
+			goto stop_kthread;
+		}
+		/* Share the existing tunables and take a reference on them. */
+		policy->governor_data = sg_policy;
+		sg_policy->tunables = global_tunables;
+
+		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+		goto out;
+	}
+
+	tunables = sugov_tunables_alloc(sg_policy);
+	if (!tunables) {
+		ret = -ENOMEM;
+		goto stop_kthread;
+	}
+
+	if (policy->up_transition_delay_us && policy->down_transition_delay_us) {
+		tunables->up_rate_limit_us = policy->up_transition_delay_us;
+		tunables->down_rate_limit_us = policy->down_transition_delay_us;
+	} else {
+		unsigned int lat;
+
+		tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
+		tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
+		lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+		if (lat) {
+			tunables->up_rate_limit_us *= lat;
+			tunables->down_rate_limit_us *= lat;
+		}
+	}
+
+	policy->governor_data = sg_policy;
+	sg_policy->tunables = tunables;
+
+	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+				   get_governor_parent_kobj(policy), "%s",
+				   cpufreq_gov_schedutil.name);
+	if (ret)
+		goto fail;
+
+out:
+	mutex_unlock(&global_tunables_lock);
+	return 0;
+
+fail:
+	policy->governor_data = NULL;
+	sugov_tunables_free(tunables);
+
+stop_kthread:
+	sugov_kthread_stop(sg_policy);
+	/*
+	 * Every path that reaches this point took global_tunables_lock.
+	 * Drop it here, above the free_sg_policy label, because that label
+	 * is also a target from before the lock was acquired.
+	 */
+	mutex_unlock(&global_tunables_lock);
+
+free_sg_policy:
+	sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+	cpufreq_disable_fast_switch(policy);
+
+	pr_err("initialization failed (error %d)\n", ret);
+	return ret;
+}
+
+/*
+ * sugov_exit - tear down the governor state created by sugov_init().
+ *
+ * Drops the policy's reference on its tunables (freeing them when the
+ * reference count reaches zero), stops the slow-path kthread, frees the
+ * per-policy data and disables fast switching. Always returns 0.
+ */
+static int sugov_exit(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	struct sugov_tunables *tunables = sg_policy->tunables;
+	unsigned int remaining;
+
+	mutex_lock(&global_tunables_lock);
+
+	remaining = gov_attr_set_put(&tunables->attr_set,
+				     &sg_policy->tunables_hook);
+	policy->governor_data = NULL;
+	if (remaining == 0)
+		sugov_tunables_free(tunables);
+
+	mutex_unlock(&global_tunables_lock);
+
+	sugov_kthread_stop(sg_policy);
+	sugov_policy_free(sg_policy);
+
+	cpufreq_disable_fast_switch(policy);
+
+	return 0;
+}
+
+/*
+ * sugov_start - begin governing @policy.
+ *
+ * Resets the per-policy bookkeeping from the current tunables, clears the
+ * per-CPU state of every CPU in policy->cpus and installs the
+ * utilization update hook matching the policy type. Always returns 0.
+ */
+static int sugov_start(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	struct sugov_tunables *tunables = sg_policy->tunables;
+	unsigned int cpu;
+
+	sg_policy->up_rate_delay_ns = tunables->up_rate_limit_us * NSEC_PER_USEC;
+	sg_policy->down_rate_delay_ns = tunables->down_rate_limit_us * NSEC_PER_USEC;
+	update_min_rate_limit_us(sg_policy);
+
+	sg_policy->last_freq_update_time = 0;
+	sg_policy->next_freq = UINT_MAX;
+	sg_policy->work_in_progress = false;
+	sg_policy->need_freq_update = false;
+	sg_policy->cached_raw_freq = 0;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+		/* Start from a clean slate, then fill in the initial state. */
+		memset(sg_cpu, 0, sizeof(*sg_cpu));
+		sg_cpu->sg_policy = sg_policy;
+		sg_cpu->flags = SCHED_CPUFREQ_DL;
+		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+
+		if (policy_is_shared(policy))
+			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+						     sugov_update_shared);
+		else
+			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+						     sugov_update_single);
+	}
+
+	return 0;
+}
+
+/*
+ * sugov_stop - stop governing @policy.
+ *
+ * Removes the utilization update hook from every CPU in policy->cpus,
+ * calls synchronize_sched(), and on the slow path additionally syncs the
+ * irq_work and cancels the kthread work. Always returns 0.
+ */
+static int sugov_stop(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	unsigned int cpu;
+
+	for_each_cpu(cpu, policy->cpus)
+		cpufreq_remove_update_util_hook(cpu);
+
+	synchronize_sched();
+
+	/* Nothing deferred to flush when fast switching is in use. */
+	if (policy->fast_switch_enabled)
+		return 0;
+
+	irq_work_sync(&sg_policy->irq_work);
+	kthread_cancel_work_sync(&sg_policy->work);
+
+	return 0;
+}
+
+/*
+ * sugov_limits - handle a change of the policy limits.
+ *
+ * On the slow path the limits are applied under work_lock. In both cases
+ * need_freq_update is set afterwards. Always returns 0.
+ */
+static int sugov_limits(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	bool slow_path = !policy->fast_switch_enabled;
+
+	if (slow_path) {
+		mutex_lock(&sg_policy->work_lock);
+		cpufreq_policy_apply_limits(policy);
+		mutex_unlock(&sg_policy->work_lock);
+	}
+
+	sg_policy->need_freq_update = true;
+
+	return 0;
+}
+
+/*
+ * Governor callback: dispatch a cpufreq governor @event to the matching
+ * sugov_*() handler and return its result. Any other event hits BUG().
+ */
+static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
+				unsigned int event)
+{
+	switch (event) {
+	case CPUFREQ_GOV_POLICY_INIT:
+		return sugov_init(policy);
+
+	case CPUFREQ_GOV_POLICY_EXIT:
+		return sugov_exit(policy);
+
+	case CPUFREQ_GOV_START:
+		return sugov_start(policy);
+
+	case CPUFREQ_GOV_STOP:
+		return sugov_stop(policy);
+
+	case CPUFREQ_GOV_LIMITS:
+		return sugov_limits(policy);
+
+	default:
+		BUG();
+	}
+}
+
+/*
+ * The governor object. It has external linkage only when
+ * CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is defined.
+ */
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+static
+#endif
+struct cpufreq_governor cpufreq_gov_schedutil = {
+	.owner = THIS_MODULE,
+	.name = "schedutil",
+	.governor = cpufreq_schedutil_cb,
+};
+
+/* Register the schedutil governor with cpufreq; run as an fs_initcall. */
+static int __init sugov_register(void)
+{
+	int ret = cpufreq_register_governor(&cpufreq_gov_schedutil);
+
+	return ret;
+}
+fs_initcall(sugov_register);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..14225d5d8617 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -27,6 +27,8 @@
* of the License.
*/
+#include "sched.h"
+
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -51,6 +53,27 @@ static int convert_prio(int prio)
}
/**
+ * drop_nopreempt_cpus - remove a cpu from the mask if it is likely
+ * non-preemptible
+ * @lowest_mask: mask with selected CPUs (non-NULL)
+ */
+static void
+drop_nopreempt_cpus(struct cpumask *lowest_mask)
+{
+	unsigned int cpu;
+
+	/*
+	 * Walk the set bits of the mask; clearing the current bit does not
+	 * disturb the search for the next one.
+	 */
+	for (cpu = cpumask_first(lowest_mask);
+	     cpu < nr_cpu_ids;
+	     cpu = cpumask_next(cpu, lowest_mask)) {
+		/* Unlocked read of the task currently running on @cpu. */
+		struct task_struct *curr = READ_ONCE(cpu_rq(cpu)->curr);
+
+		if (task_may_not_preempt(curr, cpu))
+			cpumask_clear_cpu(cpu, lowest_mask);
+	}
+}
+
+/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
@@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
{
int idx = 0;
int task_pri = convert_prio(p->prio);
+ bool drop_nopreempts = task_pri <= MAX_RT_PRIO;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+retry:
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
int skip = 0;
@@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
+ if (drop_nopreempts)
+ drop_nopreempt_cpus(lowest_mask);
/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
@@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
return 1;
}
-
+ /*
+ * If no CPU is left after dropping the likely non-preemptible ones,
+ * retry without that filter so we can still find the lowest priority
+ * target and avoid priority inversion.
+ */
+ if (drop_nopreempts) {
+ drop_nopreempts = false;
+ goto retry;
+ }
return 0;
}
@@ -246,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp)
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
+
+/*
+ * cpupri_check_rt - report whether the current CPU's recorded cpupri
+ * priority is above CPUPRI_NORMAL, i.e. whether it has an RT task.
+ * Should be called from an rcu-sched read section.
+ */
+bool cpupri_check_rt(void)
+{
+	int this_cpu = raw_smp_processor_id();
+	struct cpupri *cp = &cpu_rq(this_cpu)->rd->cpupri;
+
+	return cp->cpu_to_pri[this_cpu] > CPUPRI_NORMAL;
+}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3d55ec89c400..188c8388a63f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -18,6 +18,8 @@
#include <linux/slab.h>
+#include "walt.h"
+
struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -461,13 +463,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*
* This function returns true if:
*
- * runtime / (deadline - t) > dl_runtime / dl_period ,
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
*
* IOW we can't recycle current parameters.
*
- * Notice that the bandwidth check is done against the period. For
+ * Notice that the bandwidth check is done against the deadline. For
* task with deadline equal to period this is the same of using
- * dl_deadline instead of dl_period in the equation above.
+ * dl_period instead of dl_deadline in the equation above.
*/
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, u64 t)
@@ -492,7 +494,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE);
@@ -500,13 +502,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
}
/*
- * When a -deadline entity is queued back on the runqueue, its runtime and
- * deadline might need updating.
+ * Revised wakeup rule [1]: For self-suspending tasks, rather than
+ * re-initializing task's runtime and deadline, the revised wakeup
+ * rule adjusts the task's runtime to avoid the task to overrun its
+ * density.
+ *
+ * Reasoning: a task may overrun the density if:
+ * runtime / (deadline - t) > dl_runtime / dl_deadline
+ *
+ * Therefore, runtime can be adjusted to:
+ * runtime = (dl_runtime / dl_deadline) * (deadline - t)
+ *
+ * In such way that runtime will be equal to the maximum density
+ * the task can use without breaking any rule.
+ *
+ * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant
+ * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24.
+ */
+static void
+update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+	/* Time left until the entity's current absolute deadline. */
+	u64 remaining = dl_se->deadline - rq_clock(rq);
+
+	/*
+	 * A task with deadline < period whose deadline is already in the
+	 * past should have been throttled before getting here.
+	 *
+	 * See update_dl_entity() comments for further details.
+	 */
+	WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq)));
+
+	/* Scale the remaining time by dl_density, then shift right by 20. */
+	dl_se->runtime = (remaining * dl_se->dl_density) >> 20;
+}
+
+/*
+ * Regarding the deadline, a task with implicit deadline has a relative
+ * deadline == relative period. A task with constrained deadline has a
+ * relative deadline <= relative period.
+ *
+ * We support constrained deadline tasks. However, there are some restrictions
+ * applied only for tasks which do not have an implicit deadline. See
+ * update_dl_entity() to know more about such restrictions.
*
- * The policy here is that we update the deadline of the entity only if:
- * - the current deadline is in the past,
- * - using the remaining runtime with the current deadline would make
- * the entity exceed its bandwidth.
+ * The dl_is_implicit() returns true if the task has an implicit deadline.
+ */
+static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
+{
+	/* Implicit deadline: relative deadline and period coincide. */
+	return dl_se->dl_period == dl_se->dl_deadline;
+}
+
+/*
+ * When a deadline entity is placed in the runqueue, its runtime and deadline
+ * might need to be updated. This is done by a CBS wake up rule. There are two
+ * different rules: 1) the original CBS; and 2) the Revisited CBS.
+ *
+ * When the task is starting a new period, the Original CBS is used. In this
+ * case, the runtime is replenished and a new absolute deadline is set.
+ *
+ * When a task is queued before the begin of the next period, using the
+ * remaining runtime and deadline could make the entity to overflow, see
+ * dl_entity_overflow() to find more about runtime overflow. When such case
+ * is detected, the runtime and deadline need to be updated.
+ *
+ * If the task has an implicit deadline, i.e., deadline == period, the Original
+ * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * set, as in the previous cases.
+ *
+ * However, the Original CBS does not work properly for tasks with
+ * deadline < period, which are said to have a constrained deadline. By
+ * applying the Original CBS, a constrained deadline task would be able to run
+ * runtime/deadline in a period. With deadline < period, the task would
+ * overrun the runtime/period allowed bandwidth, breaking the admission test.
+ *
+ * In order to prevent this misbehave, the Revisited CBS is used for
+ * constrained deadline tasks when a runtime overflow is detected. In the
+ * Revisited CBS, rather than replenishing & setting a new absolute deadline,
+ * the remaining runtime of the task is reduced to avoid runtime overflow.
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -528,15 +601,28 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ !dl_se->dl_boosted)){
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
}
+/*
+ * Absolute deadline minus the relative deadline, plus the period: the
+ * instant used as the start of the entity's next period.
+ */
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+	u64 period_start = dl_se->deadline - dl_se->dl_deadline;
+
+	return period_start + dl_se->dl_period;
+}
+
/*
* If the entity depleted all its runtime, and if we want it to sleep
* while waiting for some new execution time to become available, we
- * set the bandwidth enforcement timer to the replenishment instant
+ * set the bandwidth replenishment timer to the replenishment instant
* and try to activate it.
*
* Notice that it is important for the caller to know if the timer
@@ -558,7 +644,7 @@ static int start_dl_timer(struct task_struct *p)
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
- act = ns_to_ktime(dl_se->deadline);
+ act = ns_to_ktime(dl_next_period(dl_se));
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -722,6 +808,39 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
timer->function = dl_task_timer;
}
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadline < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the begin of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+	struct task_struct *p = dl_task_of(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+	/* Only act when the deadline has already passed ... */
+	if (!dl_time_before(dl_se->deadline, rq_clock(rq)))
+		return;
+
+	/* ... and the next period has not started yet. */
+	if (!dl_time_before(rq_clock(rq), dl_next_period(dl_se)))
+		return;
+
+	/* Boosted entities are exempt; so is a timer that did not start. */
+	if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+		return;
+
+	dl_se->dl_throttled = 1;
+	if (dl_se->runtime > 0)
+		dl_se->runtime = 0;
+}
+
static
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
@@ -755,6 +874,9 @@ static void update_curr_dl(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1037,6 +1159,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
}
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl))
+ dl_check_constrained_dl(&p->dl);
+
+ /*
* If p is throttled, we do nothing. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
@@ -1102,7 +1233,8 @@ static void yield_task_dl(struct rq *rq)
static int find_later_rq(struct task_struct *task);
static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
struct task_struct *curr;
struct rq *rq;
@@ -1620,7 +1752,9 @@ retry:
next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
clear_average_bw(&next_task->dl, &rq->dl);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, later_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
add_average_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
next_task->on_rq = TASK_ON_RQ_QUEUED;
@@ -1712,7 +1846,9 @@ static void pull_dl_task(struct rq *this_rq)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
clear_average_bw(&p->dl, &src_rq->dl);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
add_average_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c8c4272c61d8..ed8e6bb4531b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -636,6 +636,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
+ /* eas */
+ /* select_idle_sibling() */
+ P(se.statistics.nr_wakeups_sis_attempts);
+ P(se.statistics.nr_wakeups_sis_idle);
+ P(se.statistics.nr_wakeups_sis_cache_affine);
+ P(se.statistics.nr_wakeups_sis_suff_cap);
+ P(se.statistics.nr_wakeups_sis_idle_cpu);
+ P(se.statistics.nr_wakeups_sis_count);
+ /* select_energy_cpu_brute() */
+ P(se.statistics.nr_wakeups_secb_attempts);
+ P(se.statistics.nr_wakeups_secb_sync);
+ P(se.statistics.nr_wakeups_secb_idle_bt);
+ P(se.statistics.nr_wakeups_secb_insuff_cap);
+ P(se.statistics.nr_wakeups_secb_no_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_count);
+ /* find_best_target() */
+ P(se.statistics.nr_wakeups_fbt_attempts);
+ P(se.statistics.nr_wakeups_fbt_no_cpu);
+ P(se.statistics.nr_wakeups_fbt_no_sd);
+ P(se.statistics.nr_wakeups_fbt_pref_idle);
+ P(se.statistics.nr_wakeups_fbt_count);
+ /* cas */
+ /* select_task_rq_fair() */
+ P(se.statistics.nr_wakeups_cas_attempts);
+ P(se.statistics.nr_wakeups_cas_count);
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
__P(load_avg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 099a1b93bebf..23e37b0674df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -35,6 +35,8 @@
#include "sched.h"
#include <trace/events/sched.h>
#include "tune.h"
+#include "walt.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -50,9 +52,7 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-unsigned int sysctl_sched_is_big_little = 0;
unsigned int sysctl_sched_sync_hint_enable = 1;
-unsigned int sysctl_sched_initial_task_util = 0;
unsigned int sysctl_sched_cstate_aware = 1;
/*
@@ -119,6 +119,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -294,19 +300,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beg of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the beginning of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beg of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * Update tmp_alone_branch to point to the new beginning
+ * of the branch.
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -664,7 +710,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
@@ -688,20 +734,115 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are initialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are initialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = sched_freq() ?
- sysctl_sched_initial_task_util :
- scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * In previous Android versions, we used to have:
+ * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ * However, that functionality has been moved to enqueue.
+ * It is unclear if we should restore this in enqueue.
+ */
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-#else
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, the simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ /*
+ * If we wish to restore tuning via setting initial util,
+ * this is where we should do it.
+ */
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ return;
+ }
+ }
+
+ attach_entity_cfs_rq(se);
+}
+
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -1425,7 +1566,8 @@ balance:
* Call select_idle_sibling to maybe find a better one.
*/
if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
assign:
assigned = true;
@@ -2410,28 +2552,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
-
- return tg_weight;
-}
+ load = scale_load_down(cfs_rq->load.weight);
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2450,6 +2586,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -2468,16 +2605,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2486,8 +2627,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -2499,6 +2641,7 @@ u32 sched_get_wake_up_idle(struct task_struct *p)
return !!enabled;
}
+EXPORT_SYMBOL(sched_get_wake_up_idle);
int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
{
@@ -2511,6 +2654,7 @@ int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
return 0;
}
+EXPORT_SYMBOL(sched_set_wake_up_idle);
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -3790,25 +3934,262 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, then it's up to
+ * date and ready to go to new CPU/cfs_rq. But we have difficulty in
+ * getting what current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task being less decayed, but giving
+ * the wakee more load does not sound bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of group cfs_rq is null, the load of the
+ * sched_entity will also be null so we can skip the formula
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and updated with current load */
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equal to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares */
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Propagate a group entity's pending load/util change to its parent cfs_rq */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/*
@@ -3828,23 +4209,43 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the caller do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+ removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3859,68 +4260,89 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq == &rq_of(cfs_rq)->cfs)
trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
return decayed || removed;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int cpu = cpu_of(rq_of(cfs_rq));
+ int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
- if (entity_is_task(se))
- trace_sched_load_avg_task(task_of(se), &se->avg);
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -3928,34 +4350,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3984,24 +4392,37 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -4038,7 +4459,16 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1){}
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
@@ -4187,9 +4617,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -4262,6 +4693,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
@@ -4297,7 +4738,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
/*
@@ -4352,7 +4793,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -4468,8 +4909,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -5372,29 +5813,13 @@ static inline void hrtick_update(struct rq *rq)
#endif
#ifdef CONFIG_SMP
+static bool __cpu_overutilized(int cpu, int delta);
static bool cpu_overutilized(int cpu);
-static inline unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu);
#else
-#define boosted_cpu_util(cpu) cpu_util(cpu)
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
#endif
-#if defined(CONFIG_SMP) && defined(CONFIG_CPU_FREQ_GOV_SCHED)
-static void update_capacity_of(int cpu)
-{
- unsigned long req_cap;
-
- if (!sched_freq())
- return;
-
- /* Convert scale-invariant capacity to cpu. */
- req_cap = boosted_cpu_util(cpu);
- req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
- set_cfs_cpu_capacity(cpu, true, req_cap);
-}
-#else
-#define update_capacity_of(X) do {} while(0)
-#endif /* SMP and CPU_FREQ_GOV_SCHED */
-
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5407,9 +5832,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
int task_new = flags & ENQUEUE_WAKEUP_NEW;
- int task_wakeup = flags & ENQUEUE_WAKEUP;
#endif
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -5421,7 +5853,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -5438,8 +5870,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se) {
@@ -5474,19 +5906,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->rd->overutilized = true;
trace_sched_overutilized(true);
}
-
- }
-
- if (!se) {
- /*
- * We want to potentially trigger a freq switch
- * request only for tasks that are waking up; this is
- * because we get here also during load balancing, but
- * in these cases it seems wise to trigger as single
- * request after load balancing is done.
- */
- if (task_new || task_wakeup)
- update_capacity_of(cpu_of(rq));
}
#endif /* CONFIG_SMP */
@@ -5544,8 +5963,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se) {
@@ -5564,23 +5983,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
schedtune_dequeue_task(p, cpu_of(rq));
- if (!se) {
- /*
- * We want to potentially trigger a freq switch
- * request only for tasks that are going to sleep;
- * this is because we get here also during load
- * balancing, but in these cases it seems wise to
- * trigger as single request after load balancing is
- * done.
- */
- if (task_sleep) {
- if (rq->cfs.nr_running)
- update_capacity_of(cpu_of(rq));
- else if (sched_freq())
- set_cfs_cpu_capacity(cpu_of(rq), false, 0);
- }
- }
-
#endif /* CONFIG_SMP */
hrtick_update(rq);
@@ -6000,6 +6402,7 @@ struct energy_env {
int util_delta;
int src_cpu;
int dst_cpu;
+ int trg_cpu;
int energy;
int payoff;
struct task_struct *task;
@@ -6016,11 +6419,14 @@ struct energy_env {
} cap;
};
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
/*
* __cpu_norm_util() returns the cpu util relative to a specific capacity,
- * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
- * energy calculations. Using the scale-invariant util returned by
- * cpu_util() and approximating scale-invariant util by:
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
+ * energy calculations.
+ *
+ * Since util is a scale-invariant utilization defined as:
*
* util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
*
@@ -6030,34 +6436,32 @@ struct energy_env {
*
* norm_util = running_time/time ~ util/capacity
*/
-static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
{
- int util = __cpu_util(cpu, delta);
-
if (util >= capacity)
return SCHED_CAPACITY_SCALE;
return (util << SCHED_CAPACITY_SHIFT)/capacity;
}
-static int calc_util_delta(struct energy_env *eenv, int cpu)
-{
- if (cpu == eenv->src_cpu)
- return -eenv->util_delta;
- if (cpu == eenv->dst_cpu)
- return eenv->util_delta;
- return 0;
-}
-
-static
-unsigned long group_max_util(struct energy_env *eenv)
+static unsigned long group_max_util(struct energy_env *eenv)
{
- int i, delta;
unsigned long max_util = 0;
+ unsigned long util;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+ util = cpu_util_wake(cpu, eenv->task);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->trg_cpu))
+ util += eenv->util_delta;
- for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
- delta = calc_util_delta(eenv, i);
- max_util = max(max_util, __cpu_util(i, delta));
+ max_util = max(max_util, util);
}
return max_util;
@@ -6065,49 +6469,63 @@ unsigned long group_max_util(struct energy_env *eenv)
/*
* group_norm_util() returns the approximated group util relative to it's
- * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
- * energy calculations. Since task executions may or may not overlap in time in
- * the group the true normalized util is between max(cpu_norm_util(i)) and
- * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
- * latter is used as the estimate as it leads to a more pessimistic energy
+ * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
+ * in energy calculations.
+ *
+ * Since task executions may or may not overlap in time in the group the true
+ * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
+ * when iterating over all CPUs in the group.
+ * The latter estimate is used as it leads to a more pessimistic energy
* estimate (more busy).
*/
static unsigned
long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
{
- int i, delta;
- unsigned long util_sum = 0;
unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long util, util_sum = 0;
+ int cpu;
- for_each_cpu(i, sched_group_cpus(sg)) {
- delta = calc_util_delta(eenv, i);
- util_sum += __cpu_norm_util(i, capacity, delta);
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ util = cpu_util_wake(cpu, eenv->task);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->trg_cpu))
+ util += eenv->util_delta;
+
+ util_sum += __cpu_norm_util(util, capacity);
}
- if (util_sum > SCHED_CAPACITY_SCALE)
- return SCHED_CAPACITY_SCALE;
- return util_sum;
+ return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
}
static int find_new_capacity(struct energy_env *eenv,
const struct sched_group_energy * const sge)
{
- int idx;
+ int idx, max_idx = sge->nr_cap_states - 1;
unsigned long util = group_max_util(eenv);
+ /* default is max_cap if we don't find a match */
+ eenv->cap_idx = max_idx;
+
for (idx = 0; idx < sge->nr_cap_states; idx++) {
- if (sge->cap_states[idx].cap >= util)
+ if (sge->cap_states[idx].cap >= util) {
+ eenv->cap_idx = idx;
break;
+ }
}
- eenv->cap_idx = idx;
-
- return idx;
+ return eenv->cap_idx;
}
-static int group_idle_state(struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
{
int i, state = INT_MAX;
+ int src_in_grp, dst_in_grp;
+ long grp_util = 0;
/* Find the shallowest idle state in the sched group. */
for_each_cpu(i, sched_group_cpus(sg))
@@ -6116,6 +6534,53 @@ static int group_idle_state(struct sched_group *sg)
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
+ src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ if (src_in_grp == dst_in_grp) {
+ /* src_cpu and dst_cpu are either both in this group or both
+ * outside it, so migration should leave its idle state unchanged.
+ */
+ goto end;
+ }
+
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
+ */
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ grp_util += cpu_util_wake(i, eenv->task);
+ if (unlikely(i == eenv->trg_cpu))
+ grp_util += eenv->util_delta;
+ }
+
+ if (grp_util <=
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+ /* after moving, this group is at most partly
+ * occupied, so it should have some idle time.
+ */
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
+ int new_state = grp_util * max_idle_state_idx;
+ if (grp_util <= 0)
+ /* group will have no util, use lowest state (index nr_idle_states - 1) */
+ new_state = max_idle_state_idx + 1;
+ else {
+ /* for a partially idle group, linearly map util to idle
+ * states, excluding the lowest one. This does not
+ * correspond to the state we expect to enter in
+ * reality, but is an indication of what might happen.
+ */
+ new_state = min(max_idle_state_idx, (int)
+ (new_state / sg->sgc->max_capacity));
+ new_state = max_idle_state_idx - new_state;
+ }
+ state = new_state;
+ } else {
+ /* After moving, the group will be fully occupied
+ * so assume it will not be idle at all.
+ */
+ state = 0;
+ }
+end:
return state;
}
@@ -6131,39 +6596,43 @@ static int group_idle_state(struct sched_group *sg)
*/
static int sched_group_energy(struct energy_env *eenv)
{
- struct sched_domain *sd;
- int cpu, total_energy = 0;
struct cpumask visit_cpus;
- struct sched_group *sg;
+ u64 total_energy = 0;
+ int cpu_count;
WARN_ON(!eenv->sg_top->sge);
cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+ /* If a cpu is hotplugged in while we are in this function,
+ * it does not appear in the existing visit_cpus mask
+ * which came from the sched_group pointer of the
+ * sched_domain pointed at by sd_ea for either the prev
+ * or next cpu and was dereferenced in __energy_diff.
+ * Since we will dereference sd_scs later as we iterate
+ * through the CPUs we expect to visit, new CPUs can
+ * be present which are not in the visit_cpus mask.
+ * Guard this with cpu_count.
+ */
+ cpu_count = cpumask_weight(&visit_cpus);
while (!cpumask_empty(&visit_cpus)) {
struct sched_group *sg_shared_cap = NULL;
-
- cpu = cpumask_first(&visit_cpus);
+ int cpu = cpumask_first(&visit_cpus);
+ struct sched_domain *sd;
/*
* Is the group utilization affected by cpus outside this
* sched_group?
+ * This sd may have groups with cpus which were not present
+ * when we took visit_cpus.
*/
sd = rcu_dereference(per_cpu(sd_scs, cpu));
- if (!sd)
- /*
- * We most probably raced with hotplug; returning a
- * wrong energy estimation is better than entering an
- * infinite loop.
- */
- return -EINVAL;
-
- if (sd->parent)
+ if (sd && sd->parent)
sg_shared_cap = sd->parent->groups;
for_each_domain(cpu, sd) {
- sg = sd->groups;
+ struct sched_group *sg = sd->groups;
/* Has this sched_domain already been visited? */
if (sd->child && group_first_cpu(sg) != cpu)
@@ -6183,43 +6652,69 @@ static int sched_group_energy(struct energy_env *eenv)
if (sg->group_weight == 1) {
/* Remove capacity of src CPU (before task move) */
- if (eenv->util_delta == 0 &&
+ if (eenv->trg_cpu == eenv->src_cpu &&
cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
eenv->cap.delta -= eenv->cap.before;
}
/* Add capacity of dst CPU (after task move) */
- if (eenv->util_delta != 0 &&
+ if (eenv->trg_cpu == eenv->dst_cpu &&
cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
eenv->cap.delta += eenv->cap.after;
}
}
- idle_idx = group_idle_state(sg);
+ idle_idx = group_idle_state(eenv, sg);
group_util = group_norm_util(eenv, sg);
- sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
- >> SCHED_CAPACITY_SHIFT;
+
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
- * sg->sge->idle_states[idle_idx].power)
- >> SCHED_CAPACITY_SHIFT;
+ * sg->sge->idle_states[idle_idx].power);
total_energy += sg_busy_energy + sg_idle_energy;
- if (!sd->child)
+ if (!sd->child) {
+ /*
+ * cpu_count here is the number of
+ * cpus we expect to visit in this
+ * calculation. If we race against
+ * hotplug, we can have extra cpus
+ * added to the groups we are
+ * iterating which do not appear in
+ * the visit_cpus mask. In that case
+ * we are not able to calculate energy
+ * without restarting so we will bail
+ * out and use prev_cpu this time.
+ */
+ if (!cpu_count)
+ return -EINVAL;
cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+ cpu_count--;
+ }
if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
goto next_cpu;
} while (sg = sg->next, sg != sd->groups);
}
+
+ /*
+ * If we raced with hotplug and got a NULL sd pointer,
+ * returning a wrong energy estimation is better than
+ * entering an infinite loop.
+ * Specifically: if a cpu is unplugged after we took
+ * the visit_cpus mask, it no longer has an sd_scs
+ * pointer, so when we dereference it, we get NULL.
+ */
+ if (cpumask_test_cpu(cpu, &visit_cpus))
+ return -EINVAL;
next_cpu:
cpumask_clear_cpu(cpu, &visit_cpus);
continue;
}
- eenv->energy = total_energy;
+ eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
return 0;
}
@@ -6228,6 +6723,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
+static inline unsigned long task_util(struct task_struct *p);
+
/*
* energy_diff(): Estimate the energy impact of changing the utilization
* distribution. eenv specifies the change: utilisation amount, source, and
@@ -6240,13 +6737,16 @@ static inline int __energy_diff(struct energy_env *eenv)
struct sched_domain *sd;
struct sched_group *sg;
int sd_cpu = -1, energy_before = 0, energy_after = 0;
+ int diff, margin;
struct energy_env eenv_before = {
- .util_delta = 0,
+ .util_delta = task_util(eenv->task),
.src_cpu = eenv->src_cpu,
.dst_cpu = eenv->dst_cpu,
+ .trg_cpu = eenv->src_cpu,
.nrg = { 0, 0, 0, 0},
.cap = { 0, 0, 0 },
+ .task = eenv->task,
};
if (eenv->src_cpu == eenv->dst_cpu)
@@ -6282,12 +6782,22 @@ static inline int __energy_diff(struct energy_env *eenv)
eenv->nrg.after = energy_after;
eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
eenv->payoff = 0;
-
+#ifndef CONFIG_SCHED_TUNE
trace_sched_energy_diff(eenv->task,
eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
eenv->cap.before, eenv->cap.after, eenv->cap.delta,
eenv->nrg.delta, eenv->payoff);
+#endif
+ /*
+ * Dead-zone margin preventing too many migrations.
+ */
+
+ margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+ diff = eenv->nrg.after - eenv->nrg.before;
+
+ eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
return eenv->nrg.diff;
}
@@ -6296,29 +6806,42 @@ static inline int __energy_diff(struct energy_env *eenv)
struct target_nrg schedtune_target_nrg;
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+extern bool schedtune_initialized;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
/*
* System energy normalization
- * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
* corresponding to the specified energy variation.
*/
static inline int
normalize_energy(int energy_diff)
{
u32 normalized_nrg;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ /* during early setup, we don't know the extents */
+ if (unlikely(!schedtune_initialized))
+ return energy_diff < 0 ? -1 : 1 ;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
#ifdef CONFIG_SCHED_DEBUG
+ {
int max_delta;
/* Check for boundaries */
max_delta = schedtune_target_nrg.max_power;
max_delta -= schedtune_target_nrg.min_power;
WARN_ON(abs(energy_diff) >= max_delta);
+ }
#endif
/* Do scaling using positive numbers to increase the range */
normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
/* Scale by energy magnitude */
- normalized_nrg <<= SCHED_LOAD_SHIFT;
+ normalized_nrg <<= SCHED_CAPACITY_SHIFT;
/* Normalize on max energy for target platform */
normalized_nrg = reciprocal_divide(
@@ -6337,8 +6860,14 @@ energy_diff(struct energy_env *eenv)
__energy_diff(eenv);
/* Return energy diff when boost margin is 0 */
- if (boost == 0)
+ if (boost == 0) {
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ 0, -eenv->nrg.diff);
return eenv->nrg.diff;
+ }
/* Compute normalized energy diff */
nrg_delta = normalize_energy(eenv->nrg.diff);
@@ -6349,6 +6878,12 @@ energy_diff(struct energy_env *eenv)
eenv->cap.delta,
eenv->task);
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
/*
* When SchedTune is enabled, the energy_diff() function will return
* the computed energy payoff value. Since the energy_diff() return
@@ -6375,31 +6910,34 @@ energy_diff(struct energy_env *eenv)
* being client/server, worker/dispatcher, interrupt source or whatever is
* irrelevant, spread criteria is apparent partner count exceeds socket size.
*/
-static int wake_wide(struct task_struct *p)
+static int wake_wide(struct task_struct *p, int sibling_count_hint)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int llc_size = this_cpu_read(sd_llc_size);
+
+ if (sibling_count_hint >= llc_size)
+ return 1;
if (master < slave)
swap(master, slave);
- if (slave < factor || master < slave * factor)
+ if (slave < llc_size || master < slave * llc_size)
return 0;
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
@@ -6459,8 +6997,6 @@ static inline unsigned long task_util(struct task_struct *p)
return p->se.avg.util_avg;
}
-unsigned int capacity_margin = 1280; /* ~20% margin */
-
static inline unsigned long boosted_task_util(struct task_struct *task);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
@@ -6486,18 +7022,20 @@ static inline bool task_fits_max(struct task_struct *p, int cpu)
return __task_fits(p, cpu, 0);
}
-static inline bool task_fits_spare(struct task_struct *p, int cpu)
+static bool __cpu_overutilized(int cpu, int delta)
{
- return __task_fits(p, cpu, cpu_util(cpu));
+ return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
}
static bool cpu_overutilized(int cpu)
{
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ return __cpu_overutilized(cpu, 0);
}
#ifdef CONFIG_SCHED_TUNE
+struct reciprocal_value schedtune_spc_rdiv;
+
static long
schedtune_margin(unsigned long signal, long boost)
{
@@ -6508,29 +7046,16 @@ schedtune_margin(unsigned long signal, long boost)
*
* The Boost (B) value is used to compute a Margin (M) which is
* proportional to the complement of the original Signal (S):
- * M = B * (SCHED_LOAD_SCALE - S), if B is positive
- * M = B * S, if B is negative
+ * M = B * (SCHED_CAPACITY_SCALE - S)
* The obtained M could be used by the caller to "boost" S.
*/
if (boost >= 0) {
- margin = SCHED_LOAD_SCALE - signal;
+ margin = SCHED_CAPACITY_SCALE - signal;
margin *= boost;
} else
margin = -signal * boost;
- /*
- * Fast integer division by constant:
- * Constant : (C) = 100
- * Precision : 0.1% (P) = 0.1
- * Reference : C * 100 / P (R) = 100000
- *
- * Thus:
- * Shift bits : ceil(log(R,2)) (S) = 17
- * Mult const : round(2^S/C) (M) = 1311
- *
- *
- */
- margin *= 1311;
- margin >>= 17;
+
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
if (boost < 0)
margin *= -1;
@@ -6580,10 +7105,10 @@ schedtune_task_margin(struct task_struct *task)
#endif /* CONFIG_SCHED_TUNE */
-static inline unsigned long
+unsigned long
boosted_cpu_util(int cpu)
{
- unsigned long util = cpu_util(cpu);
+ unsigned long util = cpu_util_freq(cpu);
long margin = schedtune_cpu_margin(util, cpu);
trace_sched_boost_cpu(cpu, util, margin);
@@ -6602,19 +7127,25 @@ boosted_task_util(struct task_struct *task)
return util + margin;
}
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *fit_group = NULL, *spare_group = NULL;
- unsigned long min_load = ULONG_MAX, this_load = 0;
- unsigned long fit_capacity = ULONG_MAX;
- unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
+ struct sched_group *most_spare_sg = NULL;
+ unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -6622,7 +7153,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load, spare_capacity;
+ unsigned long load, avg_load, spare_cap, max_spare_cap;
int local_group;
int i;
@@ -6634,8 +7165,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -6646,24 +7181,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
- /*
- * Look for most energy-efficient group that can fit
- * that can fit the task.
- */
- if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
- fit_capacity = capacity_of(i);
- fit_group = group;
- }
+ spare_cap = capacity_spare_wake(i, p);
- /*
- * Look for group which has most spare capacity on a
- * single cpu.
- */
- spare_capacity = capacity_of(i) - cpu_util(i);
- if (spare_capacity > max_spare_capacity) {
- max_spare_capacity = spare_capacity;
- spare_group = group;
- }
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
@@ -6671,28 +7192,51 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (local_group) {
this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_spare = max_spare_cap;
+ } else {
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (fit_group)
- return fit_group;
+ /*
+ * The cross-over point between using spare capacity and least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet; we must first select a rq to compute the initial
+ * utilization.
+ */
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
- if (spare_group)
- return spare_group;
+ if (this_spare > task_util(p) / 2 &&
+ imbalance*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
+skip_spare:
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -6701,9 +7245,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (task_fits_spare(p, i)) {
+ if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -6715,8 +7263,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (idle_cpu(i) &&
- (!idle || idle->exit_latency == min_exit_latency) &&
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -6725,13 +7272,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (shallowest_idle_cpu == -1) {
- /*
- * If we haven't found an idle CPU yet
- * pick a non-idle one that can fit the task as
- * fallback.
- */
- shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -6743,29 +7283,99 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
}
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ }
+
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
+
+ if (wu) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq(), eas_stats.cas_attempts);
+ }
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (wu)
+ schedstat_inc(sd, eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq(), eas_stats.cas_count);
+ }
+
+ return new_cpu;
}
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
struct sched_group *sg;
- int i = task_cpu(p);
- int best_idle = -1;
- int best_idle_cstate = -1;
- int best_idle_capacity = INT_MAX;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ unsigned long best_idle_capacity = ULONG_MAX;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
+ schedstat_inc(this_rq(), eas_stats.sis_attempts);
if (!sysctl_sched_cstate_aware) {
- if (idle_cpu(target))
+ if (idle_cpu(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
+ schedstat_inc(this_rq(), eas_stats.sis_idle);
return target;
+ }
/*
* If the previous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
+ schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
+ return prev;
+ }
}
if (!(current->flags & PF_WAKE_UP_IDLE) &&
@@ -6779,24 +7389,30 @@ static int select_idle_sibling(struct task_struct *p, int target)
for_each_lower_domain(sd) {
sg = sd->groups;
do {
+ int i;
if (!cpumask_intersects(sched_group_cpus(sg),
tsk_cpus_allowed(p)))
goto next;
if (sysctl_sched_cstate_aware) {
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- struct rq *rq = cpu_rq(i);
- int idle_idx = idle_get_state_idx(rq);
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
unsigned long new_usage = boosted_task_util(p);
unsigned long capacity_orig = capacity_orig_of(i);
+
if (new_usage > capacity_orig || !idle_cpu(i))
goto next;
- if (i == target && new_usage <= capacity_curr_of(target))
+ if (i == target && new_usage <= capacity_curr_of(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
+ schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
+ schedstat_inc(sd, eas_stats.sis_suff_cap);
return target;
+ }
- if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
- best_idle = i;
+ if (idle_idx < best_idle_cstate &&
+ capacity_orig <= best_idle_capacity) {
+ best_idle_cpu = i;
best_idle_cstate = idle_idx;
best_idle_capacity = capacity_orig;
}
@@ -6809,231 +7425,462 @@ static int select_idle_sibling(struct task_struct *p, int target)
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
+ schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
+ schedstat_inc(sd, eas_stats.sis_idle_cpu);
goto done;
}
next:
sg = sg->next;
} while (sg != sd->groups);
}
- if (best_idle > 0)
- target = best_idle;
+
+ if (best_idle_cpu >= 0)
+ target = best_idle_cpu;
done:
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
+ schedstat_inc(this_rq(), eas_stats.sis_count);
+
return target;
}
-static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed. check_for_migration() looks for a better CPU for
+ * rq->curr. In that case we should return cpu util with the contributions of
+ * the currently running task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
{
- int iter_cpu;
- int target_cpu = -1;
- int target_util = 0;
- int backup_capacity = 0;
- int best_idle_cpu = -1;
- int best_idle_cstate = INT_MAX;
- int backup_cpu = -1;
- unsigned long task_util_boosted, new_util;
-
- task_util_boosted = boosted_task_util(p);
- for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
- int cur_capacity;
- struct rq *rq;
- int idle_idx;
-
- /*
- * Iterate from higher cpus for boosted tasks.
- */
- int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
-
- if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
- continue;
+ unsigned long util, capacity;
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- new_util = cpu_util(i) + task_util_boosted;
-
- /*
- * Ensure minimum capacity to grant the required boost.
- * The target CPU can be already at a capacity level higher
- * than the one required to boost the task.
- */
- if (new_util > capacity_orig_of(i))
- continue;
-
- /*
- * Unconditionally favoring tasks that prefer idle cpus to
- * improve latency.
- */
- if (idle_cpu(i) && prefer_idle) {
- if (best_idle_cpu < 0)
- best_idle_cpu = i;
- continue;
- }
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
- cur_capacity = capacity_curr_of(i);
- rq = cpu_rq(i);
- idle_idx = idle_get_state_idx(rq);
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
- if (new_util < cur_capacity) {
- if (cpu_rq(i)->nr_running) {
- if (prefer_idle) {
- /* Find a target cpu with highest
- * utilization.
- */
- if (target_util == 0 ||
- target_util < new_util) {
- target_cpu = i;
- target_util = new_util;
- }
- } else {
- /* Find a target cpu with lowest
- * utilization.
- */
- if (target_util == 0 ||
- target_util > new_util) {
- target_cpu = i;
- target_util = new_util;
- }
- }
- } else if (!prefer_idle) {
- if (best_idle_cpu < 0 ||
- (sysctl_sched_cstate_aware &&
- best_idle_cstate > idle_idx)) {
- best_idle_cstate = idle_idx;
- best_idle_cpu = i;
- }
- }
- } else if (backup_capacity == 0 ||
- backup_capacity > cur_capacity) {
- // Find a backup cpu with least capacity.
- backup_capacity = cur_capacity;
- backup_cpu = i;
- }
- }
+ return (util >= capacity) ? capacity : util;
+}
- if (prefer_idle && best_idle_cpu >= 0)
- target_cpu = best_idle_cpu;
- else if (target_cpu < 0)
- target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+static int start_cpu(bool boosted)
+{
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- return target_cpu;
+ return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
}
-static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle)
{
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
+ unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long best_active_util = ULONG_MAX;
+ int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
- struct sched_group *sg, *sg_target;
- int target_max_cap = INT_MAX;
- int target_cpu = task_cpu(p);
- unsigned long task_util_boosted, new_util;
- int i;
+ struct sched_group *sg;
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
- if (sysctl_sched_sync_hint_enable && sync) {
- int cpu = smp_processor_id();
- cpumask_t search_cpus;
- cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
- if (cpumask_test_cpu(cpu, &search_cpus))
- return cpu;
- }
+ *backup_cpu = -1;
- sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq(), eas_stats.fbt_attempts);
- if (!sd)
- return target;
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
+ if (cpu < 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
+ return -1;
+ }
- sg = sd->groups;
- sg_target = sg;
+ /* Find SD for the start CPU */
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
+ if (!sd) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
+ return -1;
+ }
- if (sysctl_sched_is_big_little) {
+ /* Scan CPUs in all SDs */
+ sg = sd->groups;
+ do {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util;
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- /* Assuming all cpus are the same in group */
- int max_cap_cpu = group_first_cpu(sg);
+ if (!cpu_online(i))
+ continue;
- /*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
- */
- if (capacity_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ if (walt_cpu_high_irqload(i))
+ continue;
- task_util_boosted = boosted_task_util(p);
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
* accounting. However, the blocked utilization may be zero.
*/
- new_util = cpu_util(i) + task_util_boosted;
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
/*
* Ensure minimum capacity to grant the required boost.
* The target CPU can be already at a capacity level higher
* than the one required to boost the task.
*/
- if (new_util > capacity_orig_of(i))
+ new_util = max(min_util, new_util);
+ if (new_util > capacity_orig)
continue;
- if (new_util < capacity_curr_of(i)) {
- target_cpu = i;
- if (cpu_rq(i)->nr_running)
- break;
+ /*
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
+ * improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals try to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP change.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with either a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
+ */
+ if (prefer_idle) {
+
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
+
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
+ }
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
}
- /* cpu has capacity at higher OPP, keep it as fallback */
- if (target_cpu == task_cpu(p))
- target_cpu = i;
+ /*
+ * Enforce EAS mode
+ *
+ * For non latency sensitive tasks, skip CPUs that
+ * will be overutilized by moving the task there.
+ *
+ * The goal here is to remain in EAS mode as long as
+ * possible at least for !prefer_idle tasks.
+ */
+ if ((new_util * capacity_margin) >
+ (capacity_orig * SCHED_CAPACITY_SCALE))
+ continue;
+
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid waking up deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one such CPU is available, it sets the
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favour these CPUs to
+ * improve performance by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will take care to ensure the minimization of energy
+ * consumption without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Select idle CPU with lower cap_orig */
+ if (capacity_orig > best_idle_min_cap_orig)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ best_idle_min_cap_orig = capacity_orig;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at a higher OPP.
+ *
+ * Thus, this case keeps track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ /* Favor CPUs with smaller capacity */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - new_util) < target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - new_util;
+ target_capacity = capacity_orig;
+ target_cpu = i;
}
- } else {
- /*
- * Find a cpu with sufficient capacity
- */
+
+ } while (sg = sg->next, sg != sd->groups);
+
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we were not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policies priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ * b) ACTIVE CPU where task fits and has the bigger maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq(), eas_stats.fbt_count);
+
+ return target_cpu;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+ struct sched_domain *sd;
+ int target_cpu = prev_cpu, tmp_target, tmp_backup;
+ bool boosted, prefer_idle;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
+ schedstat_inc(this_rq(), eas_stats.secb_attempts);
+
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
+ schedstat_inc(this_rq(), eas_stats.secb_sync);
+ return cpu;
+ }
+ }
+
+ rcu_read_lock();
#ifdef CONFIG_CGROUP_SCHEDTUNE
- bool boosted = schedtune_task_boost(p) > 0;
- bool prefer_idle = schedtune_prefer_idle(p) > 0;
+ boosted = schedtune_task_boost(p) > 0;
+ prefer_idle = schedtune_prefer_idle(p) > 0;
#else
- bool boosted = 0;
- bool prefer_idle = 0;
+ boosted = get_sysctl_sched_cfs_boost() > 0;
+ prefer_idle = 0;
#endif
- int tmp_target = find_best_target(p, boosted, prefer_idle);
- if (tmp_target >= 0) {
- target_cpu = tmp_target;
- if ((boosted || prefer_idle) && idle_cpu(target_cpu))
- return target_cpu;
+
+ sync_entity_load_avg(&p->se);
+
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ /* Find a cpu with sufficient capacity */
+ tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
+
+ if (!sd)
+ goto unlock;
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
+ goto unlock;
}
}
- if (target_cpu != task_cpu(p)) {
+ if (target_cpu != prev_cpu) {
+ int delta = 0;
struct energy_env eenv = {
- .util_delta = task_util(p),
- .src_cpu = task_cpu(p),
- .dst_cpu = target_cpu,
- .task = p,
+ .util_delta = task_util(p),
+ .src_cpu = prev_cpu,
+ .dst_cpu = target_cpu,
+ .task = p,
+ .trg_cpu = target_cpu,
};
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
+ p->state == TASK_WAKING)
+ delta = task_util(p);
+#endif
/* Not enough spare capacity on previous cpu */
- if (cpu_overutilized(task_cpu(p)))
- return target_cpu;
+ if (__cpu_overutilized(prev_cpu, delta)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
+ schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
+ goto unlock;
+ }
+
+ if (energy_diff(&eenv) >= 0) {
+ /* No energy saving for target_cpu, try backup */
+ target_cpu = tmp_backup;
+ eenv.dst_cpu = target_cpu;
+ eenv.trg_cpu = target_cpu;
+ if (tmp_backup < 0 ||
+ tmp_backup == prev_cpu ||
+ energy_diff(&eenv) >= 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+ }
- if (energy_diff(&eenv) >= 0)
- return task_cpu(p);
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ goto unlock;
}
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
+ schedstat_inc(this_rq(), eas_stats.secb_count);
+
+unlock:
+ rcu_read_unlock();
+
return target_cpu;
}
@@ -7050,7 +7897,8 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+ int sibling_count_hint)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -7062,10 +7910,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return select_best_cpu(p, prev_cpu, 0, sync);
#endif
- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
- energy_aware();
+ if (sd_flag & SD_BALANCE_WAKE) {
+ record_wakee(p);
+ want_affine = !wake_wide(p, sibling_count_hint) &&
+ !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
+ }
+
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+ return select_energy_cpu_brute(p, prev_cpu, sync);
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -7090,49 +7943,25 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
- if (!sd) {
- if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
- else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
-
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
+ if (!sd) {
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
@@ -8083,10 +8912,6 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
- /*
- * We want to potentially raise target_cpu's OPP.
- */
- update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
@@ -8108,11 +8933,6 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
- /*
- * We want to potentially raise env.dst_cpu's OPP.
- */
- update_capacity_of(env->dst_cpu);
-
raw_spin_unlock(&env->dst_rq->lock);
}
@@ -8135,8 +8955,13 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+ true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8196,7 +9021,7 @@ static inline void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8434,13 +9259,14 @@ skip_unlock: __attribute__ ((unused));
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, max_capacity;
+ unsigned long capacity, max_capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -8454,6 +9280,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity = 0;
max_capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -8486,6 +9313,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
max_capacity = max(capacity, max_capacity);
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
@@ -8503,6 +9331,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
if (!cpu_isolated(cpumask_first(cpus))) {
capacity += sgc->capacity;
max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
}
group = group->next;
} while (group != child->groups);
@@ -8510,6 +9339,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
sdg->sgc->capacity = capacity;
sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -8632,6 +9462,38 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing data
+ * - used by the nohz balance, but we want it available here
+ * so that we can see which CPUs have no tick.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+ /* only called from update_sg_lb_stats when irqs are disabled */
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+ /* rate limit updates to once-per-jiffy at most */
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8663,6 +9525,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (cpu_isolated(i))
continue;
+ /* if we are entering idle and there are CPUs with
+ * their tick stopped, do an update for them
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -8791,15 +9659,21 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
/*
- * Candiate sg has no more than one task per cpu and has higher
- * per-cpu capacity. No reason to pull tasks to less capable cpus.
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
group_smaller_cpu_capacity(sds->local, sg))
return false;
}
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -8850,6 +9724,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+#define lb_sd_parent(sd) \
+ (sd->parent && sd->parent->groups != sd->parent->groups->next)
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -8935,7 +9812,7 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- if (!env->sd->parent) {
+ if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
@@ -9228,8 +10105,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
@@ -9461,6 +10341,7 @@ static int need_active_balance(struct lb_env *env)
if (energy_aware() &&
(capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
env->src_rq->cfs.h_nr_running == 1 &&
cpu_overutilized(env->src_cpu) &&
!cpu_overutilized(env->dst_cpu)) {
@@ -9524,7 +10405,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved = 0, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group = NULL;
struct rq *busiest = NULL;
unsigned long flags;
@@ -9597,6 +10478,7 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
/* The world might have changed. Validate assumptions */
if (busiest->nr_running <= 1) {
@@ -9610,11 +10492,6 @@ more_balance:
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
- /*
- * We want to potentially lower env.src_cpu's OPP.
- */
- if (cur_ld_moved)
- update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
@@ -9864,7 +10741,6 @@ static int idle_balance(struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
- long removed_util=0;
if (cpu_isolated(this_cpu))
return 0;
@@ -9891,17 +10767,6 @@ static int idle_balance(struct rq *this_rq)
raw_spin_unlock(&this_rq->lock);
- /*
- * If removed_util_avg is !0 we most probably migrated some task away
- * from this_cpu. In this case we might be willing to trigger an OPP
- * update, but we want to do so if we don't find anybody else to pull
- * here (we will trigger an OPP update with the pulled task's enqueue
- * anyway).
- *
- * Record removed_util before calling update_blocked_averages, and use
- * it below (before returning) to see if an OPP update is required.
- */
- removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
@@ -9969,12 +10834,6 @@ out:
if (pulled_task) {
idle_exit_fair(this_rq);
this_rq->idle_stamp = 0;
- } else if (removed_util) {
- /*
- * No task pulled and someone has been migrated away.
- * Good case to trigger an OPP update.
- */
- update_capacity_of(this_cpu);
}
return pulled_task;
@@ -9994,7 +10853,7 @@ static int active_load_balance_cpu_stop(void *data)
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd = NULL;
struct task_struct *p = NULL;
- struct task_struct *push_task;
+ struct task_struct *push_task = NULL;
int push_task_detached = 0;
struct lb_env env = {
.sd = sd,
@@ -10054,14 +10913,11 @@ static int active_load_balance_cpu_stop(void *data)
if (likely(sd)) {
env.sd = sd;
schedstat_inc(sd, alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
schedstat_inc(sd, alb_pushed);
- /*
- * We want to potentially lower env.src_cpu's OPP.
- */
- update_capacity_of(env.src_cpu);
moved = true;
} else {
schedstat_inc(sd, alb_failed);
@@ -10114,11 +10970,6 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
#ifdef CONFIG_SCHED_HMP
static inline int find_new_hmp_ilb(int type)
@@ -10545,6 +11396,10 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
(!energy_aware() || cpu_overutilized(cpu)))
return true;
+ /* Do idle load balance if there is a misfit task */
+ if (energy_aware())
+ return rq->misfit_task;
+
return (rq->nr_running >= 2);
}
@@ -10585,7 +11440,7 @@ static inline bool nohz_kick_needed(struct rq *rq, int *type)
#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd && !energy_aware()) {
+ if (sd) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
@@ -10718,31 +11573,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -10755,8 +11596,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -10808,6 +11648,61 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintains depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -10822,8 +11717,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
- /* Catch up with the cfs_rq and remove our load when we leave */
- detach_entity_load_avg(cfs_rq, se);
+ detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
@@ -10831,16 +11725,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
- /* Synchronize task with its cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -10894,12 +11779,23 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -10912,6 +11808,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -10931,8 +11840,9 @@ void free_fair_sched_group(struct task_group *tg)
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -10947,6 +11857,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -10960,6 +11872,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ raw_spin_unlock_irq(&rq->lock);
}
return 1;
@@ -11056,8 +11972,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -11134,7 +12052,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
#ifdef CONFIG_SCHED_HMP
.inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index ae6876e62c0f..ea066ab8376b 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -1526,6 +1526,10 @@ unsigned int cpu_temp(int cpu)
return 0;
}
+/*
+ * kfree() may wakeup kswapd. So this function should NOT be called
+ * with any CPU's rq->lock acquired.
+ */
void free_task_load_ptrs(struct task_struct *p)
{
kfree(p->ravg.curr_window_cpu);
@@ -2608,7 +2612,8 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
p->cpu_cycles = cur_cycles;
- trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
+ trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles,
+ rq->cc.time, p);
}
static int
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 36c6634236fb..d562efb04775 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,8 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..f8e8d68ed3fd 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -201,8 +201,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +212,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 29345ed74069..05d635c2beec 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,9 +5,13 @@
#include "sched.h"
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
#include <trace/events/sched.h>
+#include <linux/hrtimer.h>
+
+#include "tune.h"
int sched_rr_timeslice = RR_TIMESLICE;
@@ -65,10 +69,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -88,13 +88,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -988,6 +981,70 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#define RT_SCHEDTUNE_INTERVAL 50000000ULL
+
+static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
+{
+ struct sched_rt_entity *rt_se = container_of(timer,
+ struct sched_rt_entity,
+ schedtune_timer);
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Nothing to do if:
+ * - task has switched runqueues
+ * - task isn't RT anymore
+ */
+ if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
+ goto out;
+
+ /*
+ * If task got enqueued back during callback time, it means we raced
+ * with the enqueue on another cpu, that's Ok, just do nothing as
+ * enqueue path would have tried to cancel us and we shouldn't run.
+ * Also check the schedtune_enqueued flag as class-switch on a
+ * sleeping task may have already canceled the timer and done dq
+ */
+ if (p->on_rq || !rt_se->schedtune_enqueued)
+ goto out;
+
+ /*
+ * RT task is no longer active, cancel boost
+ */
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+out:
+ raw_spin_unlock(&rq->lock);
+
+ /*
+ * This can free the task_struct if no more references.
+ */
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
+{
+ struct hrtimer *timer = &rt_se->schedtune_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = rt_schedtune_timer;
+ rt_se->schedtune_enqueued = false;
+}
+
+static void start_schedtune_timer(struct sched_rt_entity *rt_se)
+{
+ struct hrtimer *timer = &rt_se->schedtune_timer;
+
+ hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
+ HRTIMER_MODE_REL_PINNED);
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -1005,6 +1062,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1390,6 +1450,33 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (!schedtune_task_boost(p))
+ return;
+
+ /*
+ * If schedtune timer is active, that means a boost was already
+ * done, just cancel the timer so that deboost doesn't happen.
+ * Otherwise, increase the boost. If an enqueued timer was
+ * cancelled, put the task reference.
+ */
+ if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
+ put_task_struct(p);
+
+ /*
+ * schedtune_enqueued can be true in the following situation:
+ * enqueue_task_rt grabs rq lock before timer fires
+ * or before its callback acquires rq lock
+ * schedtune_enqueued can be false if timer callback is running
+ * and timer just released rq lock, or if the timer finished
+ * running and canceling the boost
+ */
+ if (rt_se->schedtune_enqueued)
+ return;
+
+ rt_se->schedtune_enqueued = true;
+ schedtune_enqueue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1401,6 +1488,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dec_hmp_sched_stats_rt(rq, p);
dequeue_pushable_task(rq, p);
+
+ if (!rt_se->schedtune_enqueued)
+ return;
+
+ if (flags == DEQUEUE_SLEEP) {
+ get_task_struct(p);
+ start_schedtune_timer(rt_se);
+ return;
+ }
+
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
}
/*
@@ -1456,11 +1556,57 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
}
#endif
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softint, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softints.
+ */
+bool
+task_may_not_preempt(struct task_struct *task, int cpu)
+{
+ __u32 softirqs = per_cpu(active_softirqs, cpu) |
+ __IRQ_STAT(cpu, __softirq_pending);
+ struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+
+ return ((softirqs & LONG_SOFTIRQ_MASK) &&
+ (task == cpu_ksoftirqd ||
+ task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+}
+
+/*
+ * Perform a schedtune dequeue and cancelation of boost timers if needed.
+ * Should be called only with the rq->lock held.
+ */
+static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ BUG_ON(!raw_spin_is_locked(&rq->lock));
+
+ if (!rt_se->schedtune_enqueued)
+ return;
+
+ /*
+ * In case of class change, cancel any active timers. If an enqueued
+ * timer was cancelled, put the task ref.
+ */
+ if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
+ put_task_struct(p);
+
+ /* schedtune_enqueued is true, deboost it */
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, task_cpu(p));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+}
+
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
struct task_struct *curr;
struct rq *rq;
+ bool may_not_preempt;
#ifdef CONFIG_SCHED_HMP
return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
@@ -1476,7 +1622,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr = READ_ONCE(rq->curr); /* unlocked access */
/*
- * If the current task on @p's runqueue is an RT task, then
+ * If the current task on @p's runqueue is a softirq task,
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -1497,22 +1653,40 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/
- if (curr && unlikely(rt_task(curr)) &&
+ may_not_preempt = task_may_not_preempt(curr, cpu);
+ if (may_not_preempt ||
+ (unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ curr->prio <= p->prio))) {
int target = find_lowest_rq(p);
/*
- * Don't bother moving it if the destination CPU is
- * not running a lower priority task.
+ * If cpu is non-preemptible, prefer remote cpu
+ * even if it's running a higher-prio task.
+ * Otherwise: Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
*/
if (target != -1 &&
- p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ (may_not_preempt ||
+ p->prio < cpu_rq(target)->rt.highest_prio.curr))
cpu = target;
}
rcu_read_unlock();
out:
+ /*
+ * If previous CPU was different, make sure to cancel any active
+ * schedtune timers and deboost.
+ */
+ if (task_cpu(p) != cpu) {
+ unsigned long fl;
+ struct rq *prq = task_rq(p);
+
+ raw_spin_lock_irqsave(&prq->lock, fl);
+ schedtune_dequeue_rt(prq, p);
+ raw_spin_unlock_irqrestore(&prq->lock, fl);
+ }
+
return cpu;
}
@@ -1573,41 +1747,6 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-#if defined(CONFIG_SMP) && defined(CONFIG_CPU_FREQ_GOV_SCHED)
-static void sched_rt_update_capacity_req(struct rq *rq)
-{
- u64 total, used, age_stamp, avg;
- s64 delta;
-
- if (!sched_freq())
- return;
-
- sched_avg_update(rq);
- /*
- * Since we're reading these variables without serialization make sure
- * we read them once before doing sanity checks on them.
- */
- age_stamp = READ_ONCE(rq->age_stamp);
- avg = READ_ONCE(rq->rt_avg);
- delta = rq_clock(rq) - age_stamp;
-
- if (unlikely(delta < 0))
- delta = 0;
-
- total = sched_avg_period() + delta;
-
- used = div_u64(avg, total);
- if (unlikely(used > SCHED_CAPACITY_SCALE))
- used = SCHED_CAPACITY_SCALE;
-
- set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
-}
-#else
-static inline void sched_rt_update_capacity_req(struct rq *rq)
-{ }
-
-#endif
-
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
@@ -1676,17 +1815,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
- if (!rt_rq->rt_queued) {
- /*
- * The next task to be picked on this rq will have a lower
- * priority than rt tasks so we can spend some time to update
- * the capacity used by rt tasks based on the last activity.
- * This value will be the used as an estimation of the next
- * activity.
- */
- sched_rt_update_capacity_req(rq);
+ if (!rt_rq->rt_queued)
return NULL;
- }
put_prev_task(rq, prev);
@@ -1785,6 +1915,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
* the best one based on our affinity and topology.
*/
+retry:
for_each_sched_cluster(cluster) {
if (boost_on_big && cluster->capacity != max_possible_capacity)
continue;
@@ -1792,6 +1923,15 @@ static int find_lowest_rq_hmp(struct task_struct *task)
cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
cpumask_andnot(&candidate_mask, &candidate_mask,
cpu_isolated_mask);
+ /*
+ * When placement boost is active, if there is no eligible CPU
+ * in the highest capacity cluster, we fallback to the other
+ * clusters. So clear the CPUs of the traversed cluster from
+ * the lowest_mask.
+ */
+ if (unlikely(boost_on_big))
+ cpumask_andnot(lowest_mask, lowest_mask,
+ &cluster->cpus);
if (cpumask_empty(&candidate_mask))
continue;
@@ -1831,6 +1971,11 @@ static int find_lowest_rq_hmp(struct task_struct *task)
break;
}
+ if (unlikely(boost_on_big && best_cpu == -1)) {
+ boost_on_big = 0;
+ goto retry;
+ }
+
return best_cpu;
}
#endif /* CONFIG_SCHED_HMP */
@@ -2064,7 +2209,9 @@ retry:
next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, lowest_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
activate_task(lowest_rq, next_task, 0);
next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
@@ -2087,160 +2234,172 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tasks must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, rto_loop_start is atomically switched from zero to
+ * one, and only the CPU that makes that switch will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rto_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * rto_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
-static int rto_next_cpu(struct rq *rq)
+static int rto_next_cpu(struct root_domain *rd)
{
- int prev_cpu = rq->rt.push_cpu;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1, so
+ * rto_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called while rto_cpu is a valid cpu, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
+
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
+}
+
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline void rto_start_unlock(atomic_t *v)
+{
+ atomic_set_release(v, 0);
+}
static void tell_cpu_to_push(struct rq *rq)
{
- int cpu;
-
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+ int cpu = -1;
- /* When here, there's no IPI going around */
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid cpu
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq->rd);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
+
+ if (cpu >= 0) {
+ /* Make sure the rd does not get freed while pushing */
+ sched_get_rd(rq->rd);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
+ }
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct root_domain *rd =
+ container_of(work, struct root_domain, rto_push_work);
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
-
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
-
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
+ rq = this_rq();
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ raw_spin_lock(&rd->rto_lock);
- cpu = find_next_push_cpu(src_rq);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rd);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ raw_spin_unlock(&rd->rto_lock);
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0) {
+ sched_put_rd(rd);
return;
-
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
+ }
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
@@ -2250,8 +2409,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -2260,6 +2420,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
@@ -2320,7 +2485,9 @@ static void pull_rt_task(struct rq *this_rq)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(this_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
/*
@@ -2382,6 +2549,13 @@ static void rq_offline_rt(struct rq *rq)
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
/*
+ * On class switch from rt, always cancel active schedtune timers,
+ * this handles the cases where we switch class for a task that is
+ * already rt-dequeued but has a running timer.
+ */
+ schedtune_dequeue_rt(rq, p);
+
+ /*
* If there are other RT tasks then we will reschedule
* and the scheduling of the other RT tasks will handle
* the balancing. But if we are the last RT task
@@ -2500,9 +2674,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
update_curr_rt(rq);
- if (rq->rt.rt_nr_running)
- sched_rt_update_capacity_req(rq);
-
watchdog(rq, p);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b88f647ea935..b6cd12998f16 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -340,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
extern struct task_group *css_tg(struct cgroup_subsys_state *css);
#else /* CONFIG_CGROUP_SCHED */
@@ -465,6 +473,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -523,7 +532,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
@@ -544,12 +553,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
- int push_flags;
- int push_cpu;
- struct irq_work push_work;
- raw_spinlock_t push_lock;
-#endif
#endif /* CONFIG_SMP */
int rt_queued;
@@ -642,6 +645,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -651,10 +667,18 @@ struct root_domain {
/* Maximum cpu capacity in the system. */
struct max_cpu_capacity max_cpu_capacity;
+
+ /* First cpu with maximum and minimum original capacity */
+ int max_cap_orig_cpu, min_cap_orig_cpu;
};
extern struct root_domain def_root_domain;
+extern void sched_get_rd(struct root_domain *rd);
+extern void sched_put_rd(struct root_domain *rd);
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
#endif /* CONFIG_SMP */
/*
@@ -708,6 +732,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -789,6 +814,19 @@ struct rq {
int curr_top;
#endif
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+ u64 window_start;
+ u64 curr_runnable_sum;
+ u64 prev_runnable_sum;
+ u64 nt_curr_runnable_sum;
+ u64 nt_prev_runnable_sum;
+ u64 cur_irqload;
+ u64 avg_irqload;
+ u64 irqload_ts;
+ u64 cum_window_demand;
+#endif /* CONFIG_SCHED_WALT */
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
#endif
@@ -827,6 +865,9 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
+#ifdef CONFIG_SMP
+ struct eas_stats eas_stats;
+#endif
#endif
#ifdef CONFIG_SMP
@@ -997,6 +1038,7 @@ struct sched_group_capacity {
*/
unsigned long capacity;
unsigned long max_capacity; /* Max per-cpu capacity in group */
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -1217,7 +1259,7 @@ static inline int cpu_min_power_cost(int cpu)
return cpu_rq(cpu)->cluster->min_power_cost;
}
-static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period)
+static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period)
{
return div64_u64(cycles, period);
}
@@ -1232,6 +1274,11 @@ static inline bool is_max_capacity_cpu(int cpu)
return cpu_max_possible_capacity(cpu) == max_possible_capacity;
}
+static inline bool is_min_capacity_cpu(int cpu)
+{
+ return cpu_max_possible_capacity(cpu) == min_max_possible_capacity;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -1729,6 +1776,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
@@ -1758,7 +1806,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ p->cpu = cpu;
+#else
task_thread_info(p)->cpu = cpu;
+#endif
p->wake_cpu = cpu;
#endif
}
@@ -2011,7 +2063,8 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
+ int subling_count_hint);
void (*migrate_task_rq)(struct task_struct *p);
void (*task_waking) (struct task_struct *task);
@@ -2044,8 +2097,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group)(struct task_struct *p, int type);
#endif
#ifdef CONFIG_SCHED_HMP
void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
@@ -2150,6 +2206,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se);
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
@@ -2158,6 +2215,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
static inline void __add_nr_running(struct rq *rq, unsigned count)
{
@@ -2316,7 +2374,7 @@ static inline unsigned long capacity_orig_of(int cpu)
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int walt_ravg_window;
-extern unsigned int walt_disabled;
+extern bool walt_disabled;
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
@@ -2349,6 +2407,12 @@ static inline unsigned long __cpu_util(int cpu, int delta)
unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg,
+ walt_ravg_window >> SCHED_LOAD_SHIFT);
+#endif
+
delta += util;
if (delta < 0)
return 0;
@@ -2361,60 +2425,19 @@ static inline unsigned long cpu_util(int cpu)
return __cpu_util(cpu, 0);
}
-#endif
-
-#ifdef CONFIG_CPU_FREQ_GOV_SCHED
-#define capacity_max SCHED_CAPACITY_SCALE
-extern unsigned int capacity_margin;
-extern struct static_key __sched_freq;
-
-static inline bool sched_freq(void)
+static inline unsigned long cpu_util_freq(int cpu)
{
- return static_key_false(&__sched_freq);
-}
-
-DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
-void update_cpu_capacity_request(int cpu, bool request);
-
-static inline void set_cfs_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{
- struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
-
- if (scr->cfs != capacity) {
- scr->cfs = capacity;
- update_cpu_capacity_request(cpu, request);
- }
-}
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
-static inline void set_rt_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{
- if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
- per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
- update_cpu_capacity_request(cpu, request);
- }
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ util = div64_u64(cpu_rq(cpu)->prev_runnable_sum,
+ walt_ravg_window >> SCHED_LOAD_SHIFT);
+#endif
+ return (util >= capacity) ? capacity : util;
}
-static inline void set_dl_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{
- if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
- per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
- update_cpu_capacity_request(cpu, request);
- }
-}
-#else
-#define sched_freq() false
-static inline void set_cfs_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{ }
-static inline void set_rt_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{ }
-static inline void set_dl_cpu_capacity(int cpu, bool request,
- unsigned long capacity)
-{ }
#endif
#ifdef CONFIG_SCHED_HMP
@@ -2671,6 +2694,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+/*
+ * task_may_not_preempt - check whether a task may not be preemptible soon
+ */
+extern bool task_may_not_preempt(struct task_struct *task, int cpu);
+
#else /* CONFIG_SMP */
/*
@@ -2792,3 +2820,66 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+	struct update_util_data *hook =
+		rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+
+	/* No callback installed on this CPU: nothing to notify. */
+	if (!hook)
+		return;
+
+	hook->func(hook, rq_clock(rq), flags);
+}
+
+/* Forward to cpufreq_update_util() only when @rq belongs to the executing CPU. */
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
+{
+	if (cpu_of(rq) != smp_processor_id())
+		return;
+
+	cpufreq_update_util(rq, flags);
+}
+#else
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
+#endif /* CONFIG_CPU_FREQ */
+
+#ifdef CONFIG_SCHED_WALT
+
+/*
+ * True when @p is on @rq's CPU and is either on a runqueue or last went to
+ * sleep at or after the start of @rq's current window.
+ */
+static inline bool
+walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+	if (cpu_of(rq) != task_cpu(p))
+		return false;
+
+	return p->on_rq || p->last_sleep_ts >= rq->window_start;
+}
+
+#endif /* CONFIG_SCHED_WALT */
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant() (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant() (false)
+#endif
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
index ba5a326a9fd8..f03ed685f102 100644
--- a/kernel/sched/sched_avg.c
+++ b/kernel/sched/sched_avg.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, 2015-2017, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012, 2015-2017, 2018 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -119,6 +119,43 @@ void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg,
}
EXPORT_SYMBOL(sched_get_nr_running_avg);
+static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0);
+
+#define BUSY_NR_RUN 3
+#define BUSY_LOAD_FACTOR 10
+
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Stamp @cpu's last_busy_time with @curr_time when either
+ *  - its run count was at least BUSY_NR_RUN (@prev_nr_run) and is now below
+ *    it, or
+ *  - on @dequeue, its CPU-scaled cumulative runnable average times
+ *    BUSY_LOAD_FACTOR exceeds sched_ravg_window.
+ * Does nothing when !hmp_capable() or for a minimum-capacity CPU.
+ */
+static inline void update_last_busy_time(int cpu, bool dequeue,
+				unsigned long prev_nr_run, u64 curr_time)
+{
+	bool mark_busy;
+
+	if (!hmp_capable() || is_min_capacity_cpu(cpu))
+		return;
+
+	/* Run count just dropped from "busy" to below the threshold. */
+	mark_busy = prev_nr_run >= BUSY_NR_RUN &&
+		    per_cpu(nr, cpu) < BUSY_NR_RUN;
+
+	if (dequeue) {
+		u64 scaled = cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg;
+
+		scaled = scale_load_to_cpu(scaled, cpu);
+		if (scaled * BUSY_LOAD_FACTOR > sched_ravg_window)
+			mark_busy = true;
+	}
+
+	if (!mark_busy)
+		return;
+
+	atomic64_set(&per_cpu(last_busy_time, cpu), curr_time);
+}
+#else
+static inline void update_last_busy_time(int cpu, bool dequeue,
+ unsigned long prev_nr_run, u64 curr_time)
+{
+}
+#endif
+
/**
* sched_update_nr_prod
* @cpu: The core id of the nr running driver.
@@ -147,9 +184,16 @@ void sched_update_nr_prod(int cpu, long delta, bool inc)
if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+ update_last_busy_time(cpu, !inc, nr_running, curr_time);
+
per_cpu(nr_prod_sum, cpu) += nr_running * diff;
per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff;
per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}
EXPORT_SYMBOL(sched_update_nr_prod);
+
+/**
+ * sched_get_cpu_last_busy_time
+ * @cpu: CPU whose timestamp is read.
+ *
+ * Return: the value currently held in @cpu's last_busy_time, which is
+ * initialised to 0.
+ */
+u64 sched_get_cpu_last_busy_time(int cpu)
+{
+	atomic64_t *stamp = &per_cpu(last_busy_time, cpu);
+
+	return atomic64_read(stamp);
+}
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..6d74a7c77c8c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -12,6 +12,28 @@
*/
#define SCHEDSTAT_VERSION 15
+#ifdef CONFIG_SMP
+/*
+ * Emit one "eas ..." line to @seq holding the counters of @stats in fixed
+ * order: 6 sis_*, 7 secb_*, 5 fbt_* and 2 cas_* values.
+ */
+static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
+{
+	/* line tag followed by the sis_* counters */
+	seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
+		   stats->sis_attempts,
+		   stats->sis_idle,
+		   stats->sis_cache_affine,
+		   stats->sis_suff_cap,
+		   stats->sis_idle_cpu,
+		   stats->sis_count);
+
+	/* secb_* counters */
+	seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
+		   stats->secb_attempts,
+		   stats->secb_sync,
+		   stats->secb_idle_bt,
+		   stats->secb_insuff_cap,
+		   stats->secb_no_nrg_sav,
+		   stats->secb_nrg_sav,
+		   stats->secb_count);
+
+	/* fbt_* counters */
+	seq_printf(seq, "%llu %llu %llu %llu %llu ",
+		   stats->fbt_attempts,
+		   stats->fbt_no_cpu,
+		   stats->fbt_no_sd,
+		   stats->fbt_pref_idle,
+		   stats->fbt_count);
+
+	/* cas_* counters close the line */
+	seq_printf(seq, "%llu %llu\n",
+		   stats->cas_attempts,
+		   stats->cas_count);
+}
+#endif
+
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
@@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+ show_easstat(seq, &rq->eas_stats);
+
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
+
+ show_easstat(seq, &sd->eas_stats);
}
rcu_read_unlock();
#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 134da1cc8fce..3278c81cefb1 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,8 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
return task_cpu(p); /* stop tasks as never migrate */
}
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index b0c5fe6d1f3b..9c56841227cc 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -12,11 +12,12 @@
#include "tune.h"
#ifdef CONFIG_CGROUP_SCHEDTUNE
-static bool schedtune_initialized = false;
+bool schedtune_initialized = false;
#endif
unsigned int sysctl_sched_cfs_boost __read_mostly;
+extern struct reciprocal_value schedtune_spc_rdiv;
extern struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
@@ -675,6 +676,9 @@ int schedtune_task_boost(struct task_struct *p)
struct schedtune *st;
int task_boost;
+	/* Report no boost until schedtune has been initialized. */
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
/* Get task boost value */
rcu_read_lock();
st = task_schedtune(p);
@@ -689,6 +693,9 @@ int schedtune_prefer_idle(struct task_struct *p)
struct schedtune *st;
int prefer_idle;
+	/* Report prefer_idle as off until schedtune has been initialized. */
+	if (unlikely(!schedtune_initialized))
+		return 0;
+
/* Get prefer_idle value */
rcu_read_lock();
st = task_schedtune(p);
@@ -1121,9 +1128,12 @@ schedtune_init(void)
pr_info("schedtune: configured to support global boosting only\n");
#endif
+ schedtune_spc_rdiv = reciprocal_value(100);
+
return 0;
nodata:
+ pr_warning("schedtune: disabled!\n");
rcu_read_unlock();
return -EINVAL;
}
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 6e053bd9830c..8d25ffbe4fed 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -20,7 +20,6 @@
*/
#include <linux/syscore_ops.h>
-#include <linux/cpufreq.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"
@@ -42,40 +41,17 @@ static __read_mostly unsigned int walt_io_is_busy = 0;
unsigned int sysctl_sched_walt_init_task_load_pct = 15;
-/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
-unsigned int __read_mostly walt_disabled = 0;
-
-static unsigned int max_possible_efficiency = 1024;
-static unsigned int min_possible_efficiency = 1024;
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
/*
- * Maximum possible frequency across all cpus. Task demand and cpu
- * capacity (cpu_power) metrics are scaled in reference to it.
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
*/
-static unsigned int max_possible_freq = 1;
-
-/*
- * Minimum possible max_freq across all cpus. This will be same as
- * max_possible_freq on homogeneous systems and could be different from
- * max_possible_freq on heterogenous systems. min_max_freq is used to derive
- * capacity (cpu_power) of cpus.
- */
-static unsigned int min_max_freq = 1;
-
-static unsigned int max_load_scale_factor = 1024;
-static unsigned int max_possible_capacity = 1024;
-
-/* Mask of all CPUs that have max_possible_capacity */
-static cpumask_t mpc_mask = CPU_MASK_ALL;
-
-/* Window size (in ns) */
-__read_mostly unsigned int walt_ravg_window = 20000000;
-
-/* Min window size (in ns) = 10ms */
-#define MIN_SCHED_RAVG_WINDOW 10000000
-
-/* Max window size (in ns) = 1s */
-#define MAX_SCHED_RAVG_WINDOW 1000000000
+__read_mostly unsigned int walt_ravg_window =
+ (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
static unsigned int sync_cpu;
static ktime_t ktime_last;
@@ -86,11 +62,28 @@ static unsigned int task_load(struct task_struct *p)
return p->ravg.demand;
}
+/*
+ * Apply the signed @delta to rq->cum_window_demand; if the result reads as
+ * negative when viewed as s64, reset it to zero.
+ */
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+	rq->cum_window_demand += delta;
+
+	if (likely((s64)rq->cum_window_demand >= 0))
+		return;
+
+	rq->cum_window_demand = 0;
+}
+
void
walt_inc_cumulative_runnable_avg(struct rq *rq,
struct task_struct *p)
{
rq->cumulative_runnable_avg += p->ravg.demand;
+
+ /*
+ * Add a task's contribution to the cumulative window demand when
+ *
+ * (1) task is enqueued with on_rq = 1 i.e. migration,
+ * prio/cgroup/class change.
+ * (2) task is waking for the first time in this window.
+ */
+ if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+ fixup_cum_window_demand(rq, p->ravg.demand);
}
void
@@ -99,16 +92,28 @@ walt_dec_cumulative_runnable_avg(struct rq *rq,
{
rq->cumulative_runnable_avg -= p->ravg.demand;
BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+ /*
+ * on_rq will be 1 for sleeping tasks. So check if the task
+ * is migrating or dequeuing in RUNNING state to change the
+ * prio/cgroup/class.
+ */
+ if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+ fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
}
static void
fixup_cumulative_runnable_avg(struct rq *rq,
- struct task_struct *p, s64 task_load_delta)
+ struct task_struct *p, u64 new_task_load)
{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+
rq->cumulative_runnable_avg += task_load_delta;
if ((s64)rq->cumulative_runnable_avg < 0)
panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
task_load_delta, task_load(p));
+
+ fixup_cum_window_demand(rq, task_load_delta);
}
u64 walt_ktime_clock(void)
@@ -167,10 +172,28 @@ static int exiting_task(struct task_struct *p)
static int __init set_walt_ravg_window(char *str)
{
+ unsigned int adj_window;
+ bool no_walt = walt_disabled;
+
get_option(&str, &walt_ravg_window);
- walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+ /* Adjust for CONFIG_HZ */
+ adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+ /* Warn if we're a bit too far away from the expected window size */
+ WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+ "tick-adjusted window size %u, original was %u\n", adj_window,
+ walt_ravg_window);
+
+ walt_ravg_window = adj_window;
+
+ walt_disabled = walt_disabled ||
+ (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+ WARN(!no_walt && walt_disabled,
+ "invalid window size, disabling WALT\n");
+
return 0;
}
@@ -194,26 +217,20 @@ update_window_start(struct rq *rq, u64 wallclock)
nr_windows = div64_u64(delta, walt_ravg_window);
rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+ rq->cum_window_demand = rq->cumulative_runnable_avg;
}
+/*
+ * Translate absolute delta time accounted on a CPU
+ * to a scale where 1024 is the capacity of the most
+ * capable CPU running at FMAX
+ */
static u64 scale_exec_time(u64 delta, struct rq *rq)
{
- unsigned int cur_freq = rq->cur_freq;
- int sf;
-
- if (unlikely(cur_freq > max_possible_freq))
- cur_freq = rq->max_possible_freq;
-
- /* round up div64 */
- delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
- max_possible_freq);
-
- sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+ unsigned long capcurr = capacity_curr_of(cpu_of(rq));
- delta *= sf;
- delta >>= 10;
-
- return delta;
+ return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
}
static int cpu_is_waiting_on_io(struct rq *rq)
@@ -590,10 +607,20 @@ static void update_history(struct rq *rq, struct task_struct *p,
* A throttled deadline sched class task gets dequeued without
* changing p->on_rq. Since the dequeue decrements hmp stats
* avoid decrementing it here again.
+ *
+ * When window is rolled over, the cumulative window demand
+ * is reset to the cumulative runnable average (contribution from
+ * the tasks on the runqueue). If the current task is dequeued
+ * already, its demand is not included in the cumulative runnable
+ * average. So add the task demand separately to cumulative window
+ * demand.
*/
- if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
- !p->dl.dl_throttled))
- fixup_cumulative_runnable_avg(rq, p, demand);
+ if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+ if (task_on_rq_queued(p))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+ else if (rq->curr == p)
+ fixup_cum_window_demand(rq, demand);
+ }
p->ravg.demand = demand;
@@ -736,33 +763,6 @@ done:
p->ravg.mark_start = wallclock;
}
-unsigned long __weak arch_get_cpu_efficiency(int cpu)
-{
- return SCHED_LOAD_SCALE;
-}
-
-void walt_init_cpu_efficiency(void)
-{
- int i, efficiency;
- unsigned int max = 0, min = UINT_MAX;
-
- for_each_possible_cpu(i) {
- efficiency = arch_get_cpu_efficiency(i);
- cpu_rq(i)->efficiency = efficiency;
-
- if (efficiency > max)
- max = efficiency;
- if (efficiency < min)
- min = efficiency;
- }
-
- if (max)
- max_possible_efficiency = max;
-
- if (min)
- min_possible_efficiency = min;
-}
-
static void reset_task_stats(struct task_struct *p)
{
u32 sum = 0;
@@ -794,11 +794,11 @@ void walt_set_window_start(struct rq *rq)
int cpu = cpu_of(rq);
struct rq *sync_rq = cpu_rq(sync_cpu);
- if (rq->window_start)
+ if (likely(rq->window_start))
return;
if (cpu == sync_cpu) {
- rq->window_start = walt_ktime_clock();
+ rq->window_start = 1;
} else {
raw_spin_unlock(&rq->lock);
double_rq_lock(rq, sync_rq);
@@ -841,6 +841,17 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+ /*
+ * When a task is migrating during the wakeup, adjust
+ * the task's contribution towards cumulative window
+ * demand.
+ */
+ if (p->state == TASK_WAKING &&
+ p->last_sleep_ts >= src_rq->window_start) {
+ fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+ fixup_cum_window_demand(dest_rq, p->ravg.demand);
+ }
+
if (p->ravg.curr_window) {
src_rq->curr_runnable_sum -= p->ravg.curr_window;
dest_rq->curr_runnable_sum += p->ravg.curr_window;
@@ -867,242 +878,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
double_rq_unlock(src_rq, dest_rq);
}
-/*
- * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
- * least efficient cpu gets capacity of 1024
- */
-static unsigned long capacity_scale_cpu_efficiency(int cpu)
-{
- return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
-}
-
-/*
- * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
- * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
- */
-static unsigned long capacity_scale_cpu_freq(int cpu)
-{
- return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
- * that "most" efficient cpu gets a load_scale_factor of 1
- */
-static unsigned long load_scale_cpu_efficiency(int cpu)
-{
- return DIV_ROUND_UP(1024 * max_possible_efficiency,
- cpu_rq(cpu)->efficiency);
-}
-
-/*
- * Return load_scale_factor of a cpu in reference to cpu with best max_freq
- * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
- * of 1.
- */
-static unsigned long load_scale_cpu_freq(int cpu)
-{
- return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
-}
-
-static int compute_capacity(int cpu)
-{
- int capacity = 1024;
-
- capacity *= capacity_scale_cpu_efficiency(cpu);
- capacity >>= 10;
-
- capacity *= capacity_scale_cpu_freq(cpu);
- capacity >>= 10;
-
- return capacity;
-}
-
-static int compute_load_scale_factor(int cpu)
-{
- int load_scale = 1024;
-
- /*
- * load_scale_factor accounts for the fact that task load
- * is in reference to "best" performing cpu. Task's load will need to be
- * scaled (up) by a factor to determine suitability to be placed on a
- * (little) cpu.
- */
- load_scale *= load_scale_cpu_efficiency(cpu);
- load_scale >>= 10;
-
- load_scale *= load_scale_cpu_freq(cpu);
- load_scale >>= 10;
-
- return load_scale;
-}
-
-static int cpufreq_notifier_policy(struct notifier_block *nb,
- unsigned long val, void *data)
-{
- struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
- int i, update_max = 0;
- u64 highest_mpc = 0, highest_mplsf = 0;
- const struct cpumask *cpus = policy->related_cpus;
- unsigned int orig_min_max_freq = min_max_freq;
- unsigned int orig_max_possible_freq = max_possible_freq;
- /* Initialized to policy->max in case policy->related_cpus is empty! */
- unsigned int orig_max_freq = policy->max;
-
- if (val != CPUFREQ_NOTIFY)
- return 0;
-
- for_each_cpu(i, policy->related_cpus) {
- cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
- policy->related_cpus);
- orig_max_freq = cpu_rq(i)->max_freq;
- cpu_rq(i)->min_freq = policy->min;
- cpu_rq(i)->max_freq = policy->max;
- cpu_rq(i)->cur_freq = policy->cur;
- cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
- }
-
- max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
- if (min_max_freq == 1)
- min_max_freq = UINT_MAX;
- min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
- BUG_ON(!min_max_freq);
- BUG_ON(!policy->max);
-
- /* Changes to policy other than max_freq don't require any updates */
- if (orig_max_freq == policy->max)
- return 0;
-
- /*
- * A changed min_max_freq or max_possible_freq (possible during bootup)
- * needs to trigger re-computation of load_scale_factor and capacity for
- * all possible cpus (even those offline). It also needs to trigger
- * re-computation of nr_big_task count on all online cpus.
- *
- * A changed rq->max_freq otoh needs to trigger re-computation of
- * load_scale_factor and capacity for just the cluster of cpus involved.
- * Since small task definition depends on max_load_scale_factor, a
- * changed load_scale_factor of one cluster could influence
- * classification of tasks in another cluster. Hence a changed
- * rq->max_freq will need to trigger re-computation of nr_big_task
- * count on all online cpus.
- *
- * While it should be sufficient for nr_big_tasks to be
- * re-computed for only online cpus, we have inadequate context
- * information here (in policy notifier) with regard to hotplug-safety
- * context in which notification is issued. As a result, we can't use
- * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
- * fixed up to issue notification always in hotplug-safe context,
- * re-compute nr_big_task for all possible cpus.
- */
-
- if (orig_min_max_freq != min_max_freq ||
- orig_max_possible_freq != max_possible_freq) {
- cpus = cpu_possible_mask;
- update_max = 1;
- }
-
- /*
- * Changed load_scale_factor can trigger reclassification of tasks as
- * big or small. Make this change "atomic" so that tasks are accounted
- * properly due to changed load_scale_factor
- */
- for_each_cpu(i, cpus) {
- struct rq *rq = cpu_rq(i);
-
- rq->capacity = compute_capacity(i);
- rq->load_scale_factor = compute_load_scale_factor(i);
-
- if (update_max) {
- u64 mpc, mplsf;
-
- mpc = div_u64(((u64) rq->capacity) *
- rq->max_possible_freq, rq->max_freq);
- rq->max_possible_capacity = (int) mpc;
-
- mplsf = div_u64(((u64) rq->load_scale_factor) *
- rq->max_possible_freq, rq->max_freq);
-
- if (mpc > highest_mpc) {
- highest_mpc = mpc;
- cpumask_clear(&mpc_mask);
- cpumask_set_cpu(i, &mpc_mask);
- } else if (mpc == highest_mpc) {
- cpumask_set_cpu(i, &mpc_mask);
- }
-
- if (mplsf > highest_mplsf)
- highest_mplsf = mplsf;
- }
- }
-
- if (update_max) {
- max_possible_capacity = highest_mpc;
- max_load_scale_factor = highest_mplsf;
- }
-
- return 0;
-}
-
-static int cpufreq_notifier_trans(struct notifier_block *nb,
- unsigned long val, void *data)
-{
- struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
- unsigned int cpu = freq->cpu, new_freq = freq->new;
- unsigned long flags;
- int i;
-
- if (val != CPUFREQ_POSTCHANGE)
- return 0;
-
- BUG_ON(!new_freq);
-
- if (cpu_rq(cpu)->cur_freq == new_freq)
- return 0;
-
- for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
- struct rq *rq = cpu_rq(i);
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
- walt_ktime_clock(), 0);
- rq->cur_freq = new_freq;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-
- return 0;
-}
-
-static struct notifier_block notifier_policy_block = {
- .notifier_call = cpufreq_notifier_policy
-};
-
-static struct notifier_block notifier_trans_block = {
- .notifier_call = cpufreq_notifier_trans
-};
-
-static int register_sched_callback(void)
-{
- int ret;
-
- ret = cpufreq_register_notifier(&notifier_policy_block,
- CPUFREQ_POLICY_NOTIFIER);
-
- if (!ret)
- ret = cpufreq_register_notifier(&notifier_trans_block,
- CPUFREQ_TRANSITION_NOTIFIER);
-
- return 0;
-}
-
-/*
- * cpufreq callbacks can be registered at core_initcall or later time.
- * Any registration done prior to that is "forgotten" by cpufreq. See
- * initialization of variable init_cpufreq_transition_notifier_list_called
- * for further information.
- */
-core_initcall(register_sched_callback);
-
void walt_init_new_task_load(struct task_struct *p)
{
int i;
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index e181c87a928d..de7edac43674 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -55,8 +55,10 @@ static inline void walt_migrate_sync_cpu(int cpu) { }
static inline void walt_init_cpu_efficiency(void) { }
static inline u64 walt_ktime_clock(void) { return 0; }
+#define walt_cpu_high_irqload(cpu) false
+
#endif /* CONFIG_SCHED_WALT */
-extern unsigned int walt_disabled;
+extern bool walt_disabled;
#endif
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 15a1795bbba1..efd384f3f852 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -457,14 +457,19 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
+/*
+ * __get_seccomp_filter - takes one additional reference on @filter.
+ * @filter is dereferenced unconditionally, so it must not be NULL.
+ */
+void __get_seccomp_filter(struct seccomp_filter *filter)
+{
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&filter->usage);
+}
+
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
if (!orig)
return;
- /* Reference count is bounded by the number of total processes. */
- atomic_inc(&orig->usage);
+ __get_seccomp_filter(orig);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -475,10 +480,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
}
}
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
+static void __put_seccomp_filter(struct seccomp_filter *orig)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
@@ -487,6 +490,12 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ __put_seccomp_filter(tsk->seccomp.filter);
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
@@ -927,13 +936,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
if (!data)
goto out;
- get_seccomp_filter(task);
+ __get_seccomp_filter(filter);
spin_unlock_irq(&task->sighand->siglock);
if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
ret = -EFAULT;
- put_seccomp_filter(task);
+ __put_seccomp_filter(filter);
return ret;
out:
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a972fd..4a548c6a4118 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -72,7 +72,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !force)
+ handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return 1;
return sig_handler_ignored(handler, sig);
@@ -88,13 +88,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, force))
- return 0;
-
/*
- * Tracers may want to know about even ignored signals.
+ * Tracers may want to know about even ignored signal unless it
+ * is SIGKILL which can't be reported anyway but can be ignored
+ * by SIGNAL_UNKILLABLE task.
*/
- return !t->ptrace;
+ if (t->ptrace && sig != SIGKILL)
+ return 0;
+
+ return sig_task_ignored(t, sig, force);
}
/*
@@ -346,7 +348,7 @@ static bool task_participate_group_stop(struct task_struct *task)
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
- sig->flags = SIGNAL_STOP_STOPPED;
+ signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
return true;
}
return false;
@@ -503,7 +505,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
+ bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
@@ -525,6 +528,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
+
+ *resched_timer =
+ (first->flags & SIGQUEUE_PREALLOC) &&
+ (info->si_code == SI_TIMER) &&
+ (info->si_sys_private);
+
__sigqueue_free(first);
} else {
/*
@@ -541,12 +550,12 @@ still_pending:
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
- siginfo_t *info)
+ siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
- collect_signal(sig, pending, info);
+ collect_signal(sig, pending, info, resched_timer);
return sig;
}
@@ -558,15 +567,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
+ bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
- signr = __dequeue_signal(&tsk->pending, mask, info);
+ signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
- mask, info);
+ mask, info, &resched_timer);
/*
* itimer signal ?
*
@@ -611,7 +621,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
- if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
@@ -837,7 +847,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
- signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
@@ -909,9 +919,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL || !t->ptrace)) {
+ (sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e4436f787..d69b77fc7cc1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+/*
+ * active_softirqs -- per cpu, a mask of softirqs that are being handled,
+ * with the expectation that approximate answers are acceptable and therefore
+ * no synchronization.
+ */
+DEFINE_PER_CPU(__u32, active_softirqs);
+
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -227,13 +234,24 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage __visible void __do_softirq(void)
+#define softirq_deferred_for_rt(pending) \
+({ \
+ __u32 deferred = 0; \
+ if (cpupri_check_rt()) { \
+ deferred = pending & LONG_SOFTIRQ_MASK; \
+ pending &= ~LONG_SOFTIRQ_MASK; \
+ } \
+ deferred; \
+})
+
+asmlinkage __visible void __softirq_entry __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
int max_restart = MAX_SOFTIRQ_RESTART;
struct softirq_action *h;
bool in_hardirq;
+ __u32 deferred;
__u32 pending;
int softirq_bit;
@@ -245,14 +263,15 @@ asmlinkage __visible void __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
+ deferred = softirq_deferred_for_rt(pending);
account_irq_enter_time(current);
-
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
restart:
/* Reset the pending bitmask before enabling irqs */
- set_softirq_pending(0);
+ set_softirq_pending(deferred);
+ __this_cpu_write(active_softirqs, pending);
local_irq_enable();
@@ -282,18 +301,21 @@ restart:
pending >>= softirq_bit;
}
+ __this_cpu_write(active_softirqs, 0);
rcu_bh_qs();
local_irq_disable();
pending = local_softirq_pending();
+ deferred = softirq_deferred_for_rt(pending);
+
if (pending) {
if (time_before(jiffies, end) && !need_resched() &&
--max_restart)
goto restart;
-
- wakeup_softirqd();
}
+ if (pending | deferred)
+ wakeup_softirqd();
lockdep_softirq_end(in_hardirq);
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 816999804a16..bc4ca30ddc21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -522,13 +522,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
- .procname = "sched_is_big_little",
- .data = &sysctl_sched_is_big_little,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_sync_hint_enable",
.data = &sysctl_sched_sync_hint_enable,
.maxlen = sizeof(unsigned int),
@@ -536,13 +529,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_initial_task_util",
- .data = &sysctl_sched_initial_task_util,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_cstate_aware",
.data = &sysctl_sched_cstate_aware,
.maxlen = sizeof(unsigned int),
@@ -1420,6 +1406,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = timer_migration_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#ifdef CONFIG_BPF_SYSCALL
@@ -2387,9 +2375,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
+ *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 4a816bab38a2..d7612fcba10a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -255,6 +255,7 @@ static const struct bin_table bin_net_ipv4_conf_vars_table[] = {
{ CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" },
{ CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" },
{ CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
+ { CTL_INT, NET_IPV4_CONF_NF_IPV4_DEFRAG_SKIP, "nf_ipv4_defrag_skip" },
{}
};
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2af5687b83c9..ceec77c652b5 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -569,7 +569,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add(start, base->gettime());
+ start = ktime_add_safe(start, base->gettime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -655,7 +655,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
overrun++;
}
- alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
return overrun;
}
EXPORT_SYMBOL_GPL(alarm_forward);
@@ -843,13 +843,22 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+
+ /*
+ * Rate limit to the tick as a hot fix to prevent DOS. Will be
+ * mopped up later.
+ */
+ if (timr->it.alarm.interval.tv64 &&
+ ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC)
+ timr->it.alarm.interval = ktime_set(0, TICK_NSEC);
+
exp = timespec_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
- exp = ktime_add(now, exp);
+ exp = ktime_add_safe(now, exp);
}
alarm_start(&timr->it.alarm.alarmtimer, exp);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e7c2392666cb..beafdf94b3b5 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -312,7 +312,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns);
*/
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
- ktime_t res = ktime_add(lhs, rhs);
+ ktime_t res = ktime_add_unsafe(lhs, rhs);
/*
* We use KTIME_SEC_MAX here, the maximum timeout which we can
@@ -669,7 +669,9 @@ static void hrtimer_reprogram(struct hrtimer *timer,
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
{
base->expires_next.tv64 = KTIME_MAX;
+ base->hang_detected = 0;
base->hres_active = 0;
+ base->next_timer = NULL;
}
/*
@@ -1116,7 +1118,12 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
cpu_base = raw_cpu_ptr(&hrtimer_bases);
- if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
+ /*
+ * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
+ * clock modifications, so they need to become CLOCK_MONOTONIC to
+ * ensure POSIX compliance.
+ */
+ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
base = hrtimer_clockid_to_base(clock_id);
@@ -1587,6 +1594,7 @@ static void init_hrtimers_cpu(int cpu)
timerqueue_init_head(&cpu_base->clock_base[i].active);
}
+ cpu_base->active_bases = 0;
cpu_base->cpu = cpu;
hrtimer_init_hres(cpu_base);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f2826c35e918..fc7c37ad90a0 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -507,17 +507,22 @@ static struct pid *good_sigevent(sigevent_t * event)
{
struct task_struct *rtn = current->group_leader;
- if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
- (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) ||
- !same_thread_group(rtn, current) ||
- (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL))
+ switch (event->sigev_notify) {
+ case SIGEV_SIGNAL | SIGEV_THREAD_ID:
+ rtn = find_task_by_vpid(event->sigev_notify_thread_id);
+ if (!rtn || !same_thread_group(rtn, current))
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_SIGNAL:
+ case SIGEV_THREAD:
+ if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
+ return NULL;
+ /* FALLTHRU */
+ case SIGEV_NONE:
+ return task_pid(rtn);
+ default:
return NULL;
-
- if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
- ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
- return NULL;
-
- return task_pid(rtn);
+ }
}
void posix_timers_register_clock(const clockid_t clock_id,
@@ -745,8 +750,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
/* interval timer ? */
if (iv.tv64)
cur_setting->it_interval = ktime_to_timespec(iv);
- else if (!hrtimer_active(timer) &&
- (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ else if (!hrtimer_active(timer) && timr->it_sigev_notify != SIGEV_NONE)
return;
now = timer->base->get_time();
@@ -757,7 +761,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
* expiry is > now.
*/
if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING ||
- (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+ timr->it_sigev_notify == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
remaining = __hrtimer_expires_remaining_adjusted(timer, now);
@@ -767,7 +771,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
* A single shot SIGEV_NONE timer must return 0, when
* it is expired !
*/
- if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
+ if (timr->it_sigev_notify != SIGEV_NONE)
cur_setting->it_value.tv_nsec = 1;
} else
cur_setting->it_value = ktime_to_timespec(remaining);
@@ -865,7 +869,7 @@ common_timer_set(struct k_itimer *timr, int flags,
timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
/* SIGEV_NONE timers are not queued ! See common_timer_get */
- if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
+ if (timr->it_sigev_notify == SIGEV_NONE) {
/* Setup correct expiry time for relative timers */
if (mode == HRTIMER_MODE_REL) {
hrtimer_add_expires(timer, timer->base->get_time());
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ec2102104cb8..6579be96e041 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -589,6 +589,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
+static inline bool local_timer_softirq_pending(void)
+{
+ return local_softirq_pending() & BIT(TIMER_SOFTIRQ); /* softirq nr -> mask bit */
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -605,8 +610,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+ * Aside of that check whether the local timer softirq is
+ * pending. If so, it's a bad idea to call get_next_timer_interrupt()
+ * because there is an already expired timer, so it will request
+ * immediate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
@@ -896,6 +911,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fa544f3f560..7902ecbce8ec 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -70,6 +70,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk)
tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
+ while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
+ tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ tk->raw_sec++;
+ }
}
static inline struct timespec64 tk_xtime(struct timekeeper *tk)
@@ -116,6 +120,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is necessary to use in the read paths because, while the
+ * seqlock ensures we don't return a bad value while structures are updated,
+ * it doesn't protect from potential crashes. There is the possibility that
+ * the tkr's clocksource may change between the read reference, and the
+ * clock reference passed to the read function. This can cause crashes if
+ * the wrong clocksource is passed to the wrong read function.
+ * This isn't necessary to use when holding the timekeeper_lock or doing
+ * a read of the fast-timekeeper tkrs (which is protected by its own locking
+ * and update logic).
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
@@ -173,7 +197,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -207,7 +231,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
cycle_t cycle_now, delta;
/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -235,12 +259,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
- tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
- tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
@@ -259,18 +281,19 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Go back from cycles -> shifted ns */
tk->xtime_interval = (u64) interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
- tk->raw_interval =
- ((u64) interval * clock->mult) >> clock->shift;
+ tk->raw_interval = interval * clock->mult;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
- if (shift_change < 0)
+ if (shift_change < 0) {
tk->tkr_mono.xtime_nsec >>= -shift_change;
- else
+ tk->tkr_raw.xtime_nsec >>= -shift_change;
+ } else {
tk->tkr_mono.xtime_nsec <<= shift_change;
+ tk->tkr_raw.xtime_nsec <<= shift_change;
+ }
}
- tk->tkr_raw.xtime_nsec = 0;
tk->tkr_mono.shift = clock->shift;
tk->tkr_raw.shift = clock->shift;
@@ -404,7 +427,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
- tkr->read(tkr->clock),
+ tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -461,6 +484,10 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
return cycles_at_suspend;
}
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -477,13 +504,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
- tkr_dummy.read = dummy_clock_read;
+ cycles_at_suspend = tk_clock_read(tkr);
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- tkr_dummy.read = dummy_clock_read;
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
@@ -595,9 +622,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
- /* Update the monotonic raw base */
- tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
-
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
@@ -607,6 +631,9 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
+
+ /* Update the monotonic raw base */
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
@@ -647,11 +674,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
- s64 nsec;
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -661,10 +686,13 @@ static void timekeeping_forward_now(struct timekeeper *tk)
/* If arch requires, add in get_arch_timeoffset() */
tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
- tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
- timespec64_add_ns(&tk->raw_time, nsec);
+ tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
+
+ tk_normalize_xtime(tk);
}
/**
@@ -1158,19 +1186,18 @@ int timekeeping_notify(struct clocksource *clock)
void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts64;
unsigned long seq;
s64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->raw_sec;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
- ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
- timespec64_add_ns(&ts64, nsecs);
- *ts = ts64;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getrawmonotonic64);
@@ -1294,8 +1321,7 @@ void __init timekeeping_init(void)
tk_setup_internals(tk, clock);
tk_set_xtime(tk, &now);
- tk->raw_time.tv_sec = 0;
- tk->raw_time.tv_nsec = 0;
+ tk->raw_sec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -1434,7 +1460,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
@@ -1775,7 +1801,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
unsigned int *clock_set)
{
cycle_t interval = tk->cycle_interval << shift;
- u64 raw_nsecs;
+ u64 snsec_per_sec;
/* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
@@ -1790,14 +1816,12 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = (u64)tk->raw_interval << shift;
- raw_nsecs += tk->raw_time.tv_nsec;
- if (raw_nsecs >= NSEC_PER_SEC) {
- u64 raw_secs = raw_nsecs;
- raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- tk->raw_time.tv_sec += raw_secs;
+ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
+ snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
+ tk->tkr_raw.xtime_nsec -= snsec_per_sec;
+ tk->raw_sec++;
}
- tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
@@ -1829,7 +1853,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 90a82deece45..903705687b52 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -131,7 +131,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
int ret;
mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
timers_update_migration(false);
mutex_unlock(&mutex);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 048bf074bef9..3c7b7a9bcad1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -190,6 +190,17 @@ config FUNCTION_GRAPH_TRACER
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2963266fb7bf..a0177ae43058 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b6127653a37..b674a7a8d655 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -57,7 +57,8 @@ static struct tracer_flags blk_tracer_flags = {
};
/* Global reference count of probes */
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
@@ -300,11 +301,26 @@ static void blk_trace_free(struct blk_trace *bt)
kfree(bt);
}
+static void get_probe_ref(void)
+{
+ mutex_lock(&blk_probe_mutex);
+ if (++blk_probes_ref == 1)
+ blk_register_tracepoints();
+ mutex_unlock(&blk_probe_mutex);
+}
+
+static void put_probe_ref(void)
+{
+ mutex_lock(&blk_probe_mutex);
+ if (!--blk_probes_ref)
+ blk_unregister_tracepoints();
+ mutex_unlock(&blk_probe_mutex);
+}
+
static void blk_trace_cleanup(struct blk_trace *bt)
{
blk_trace_free(bt);
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
+ put_probe_ref();
}
int blk_trace_remove(struct request_queue *q)
@@ -522,8 +538,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (cmpxchg(&q->blk_trace, NULL, bt))
goto err;
- if (atomic_inc_return(&blk_probes_ref) == 1)
- blk_register_tracepoints();
+ get_probe_ref();
return 0;
err:
@@ -1518,9 +1533,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
- if (atomic_dec_and_test(&blk_probes_ref))
- blk_unregister_tracepoints();
-
+ put_probe_ref();
blk_trace_free(bt);
return 0;
}
@@ -1551,8 +1564,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
if (cmpxchg(&q->blk_trace, NULL, bt))
goto free_bt;
- if (atomic_inc_return(&blk_probes_ref) == 1)
- blk_register_tracepoints();
+ get_probe_ref();
return 0;
free_bt:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 34b2a0d5cf1a..ac758a53fcea 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2667,13 +2667,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are control ops, they still need their
- * per_cpu field freed. Since, function tracing is
+ * If these are dynamic or control ops, they still
+ * need their data freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_CONTROL)
- control_ops_free(ops);
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL))
+ goto free_ops;
+
return 0;
}
@@ -2728,6 +2729,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
schedule_on_each_cpu(ftrace_sync);
+ free_ops:
arch_ftrace_trampoline_free(ops);
if (ops->flags & FTRACE_OPS_FL_CONTROL)
@@ -3535,7 +3537,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
int exclude_mod = 0;
int found = 0;
int ret;
- int clear_filter;
+ int clear_filter = 0;
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
@@ -3843,7 +3845,6 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
func_g.type = filter_parse_regex(glob, strlen(glob),
&func_g.search, &not);
func_g.len = strlen(func_g.search);
- func_g.search = glob;
/* we do not support '!' for function probes */
if (WARN_ON(not))
@@ -4313,9 +4314,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -5905,17 +5903,6 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * Function graph does not allocate the trampoline, but
- * other global_ops do. We need to reset the ALLOC_TRAMP flag
- * if one was used.
- */
- global_ops.trampoline = save_global_trampoline;
- if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
- global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1275175b0946..d9cd6191760b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
*/
size_t ring_buffer_page_len(void *page)
{
- return local_read(&((struct buffer_data_page *)page)->commit)
+ struct buffer_data_page *bpage = page;
+
+ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
+ BUF_PAGE_HDR_SIZE;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 60d246c4eefa..9510d540b48e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1362,7 +1362,7 @@ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
unsigned *map_cmdline_to_pid;
- unsigned *saved_tgids;
+ unsigned *map_cmdline_to_tgid;
unsigned cmdline_num;
int cmdline_idx;
char *saved_cmdlines;
@@ -1396,9 +1396,10 @@ static int allocate_cmdlines_buffer(unsigned int val,
return -ENOMEM;
}
- s->saved_tgids = kmalloc_array(val, sizeof(*s->saved_tgids),
- GFP_KERNEL);
- if (!s->saved_tgids) {
+ s->map_cmdline_to_tgid = kmalloc_array(val,
+ sizeof(*s->map_cmdline_to_tgid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_tgid) {
kfree(s->map_cmdline_to_pid);
kfree(s->saved_cmdlines);
return -ENOMEM;
@@ -1410,8 +1411,8 @@ static int allocate_cmdlines_buffer(unsigned int val,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
- memset(s->saved_tgids, 0,
- val * sizeof(*s->saved_tgids));
+ memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_tgid));
return 0;
}
@@ -1577,14 +1578,17 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
+ preempt_disable();
/*
* It's not the end of the world if we don't get
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!arch_spin_trylock(&trace_cmdline_lock))
+ if (!arch_spin_trylock(&trace_cmdline_lock)) {
+ preempt_enable();
return 0;
+ }
idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
@@ -1607,8 +1611,9 @@ static int trace_save_cmdline(struct task_struct *tsk)
}
set_cmdline(idx, tsk->comm);
- savedcmd->saved_tgids[idx] = tsk->tgid;
+ savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return 1;
}
@@ -1650,19 +1655,29 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
-int trace_find_tgid(int pid)
+static int __find_tgid_locked(int pid)
{
unsigned map;
int tgid;
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
map = savedcmd->map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
- tgid = savedcmd->saved_tgids[map];
+ tgid = savedcmd->map_cmdline_to_tgid[map];
else
tgid = -1;
+ return tgid;
+}
+
+int trace_find_tgid(int pid)
+{
+ int tgid;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ tgid = __find_tgid_locked(pid);
+
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
@@ -3288,11 +3303,17 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->current_trace->print_max)
+ trace_buf = &tr->max_buffer;
+#endif
if (cpu == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(&tr->trace_buffer, cpu);
+ tracing_reset(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -3440,37 +3461,30 @@ static const struct file_operations show_traces_fops = {
.llseek = seq_lseek,
};
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
-
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
+ char *mask_str;
int len;
- mutex_lock(&tracing_cpumask_update_lock);
+ len = snprintf(NULL, 0, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str)
+ return -ENOMEM;
- len = snprintf(mask_str, count, "%*pb\n",
+ len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
if (len >= count) {
count = -EINVAL;
goto out_err;
}
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
out_err:
- mutex_unlock(&tracing_cpumask_update_lock);
+ kfree(mask_str);
return count;
}
@@ -3490,8 +3504,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (err)
goto err_unlock;
- mutex_lock(&tracing_cpumask_update_lock);
-
local_irq_disable();
arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
@@ -3514,8 +3526,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-
- mutex_unlock(&tracing_cpumask_update_lock);
free_cpumask_var(tracing_cpumask_new);
return count;
@@ -3979,10 +3989,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
{
char buf[64];
int r;
+ unsigned int n;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ n = savedcmd->cmdline_num;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ r = scnprintf(buf, sizeof(buf), "%u\n", n);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -3991,7 +4006,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s)
{
kfree(s->saved_cmdlines);
kfree(s->map_cmdline_to_pid);
- kfree(s->saved_tgids);
+ kfree(s->map_cmdline_to_tgid);
kfree(s);
}
@@ -4008,10 +4023,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -4230,33 +4247,61 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf,
char *file_buf;
char *buf;
int len = 0;
- int pid;
int i;
+ int *pids;
+ int n = 0;
- file_buf = kmalloc(savedcmd->cmdline_num*(16+1+16), GFP_KERNEL);
- if (!file_buf)
- return -ENOMEM;
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
- buf = file_buf;
+ pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL);
+ if (!pids) {
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ return -ENOMEM;
+ }
for (i = 0; i < savedcmd->cmdline_num; i++) {
- int tgid;
- int r;
+ int pid;
pid = savedcmd->map_cmdline_to_pid[i];
if (pid == -1 || pid == NO_CMDLINE_MAP)
continue;
- tgid = trace_find_tgid(pid);
- r = sprintf(buf, "%d %d\n", pid, tgid);
+ pids[n] = pid;
+ pids[n+1] = __find_tgid_locked(pid);
+ n += 2;
+ }
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ if (n == 0) {
+ kfree(pids);
+ return 0;
+ }
+
+ /* enough to hold the longest pair of pids + space, newline and nul */
+ len = n * 12;
+ file_buf = kmalloc(len, GFP_KERNEL);
+ if (!file_buf) {
+ kfree(pids);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+ for (i = 0; i < n && len > 0; i += 2) {
+ int r;
+
+ r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]);
buf += r;
- len += r;
+ len -= r;
}
len = simple_read_from_buffer(ubuf, cnt, ppos,
- file_buf, len);
+ file_buf, buf - file_buf);
kfree(file_buf);
+ kfree(pids);
return len;
}
@@ -4808,7 +4853,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracing_is_on() && iter->pos)
+ if (!tracer_tracing_is_on(iter->tr) && iter->pos)
break;
mutex_unlock(&iter->mutex);
@@ -5347,7 +5392,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ if (tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
tracing_reset_online_cpus(&tr->max_buffer);
#endif
@@ -5869,7 +5914,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int entries, size, i;
+ int entries, i;
ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5920,14 +5965,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- /*
- * zero out any left over data, this is going to
- * user land.
- */
- size = ring_buffer_page_len(ref->page);
- if (size < PAGE_SIZE)
- memset(ref->page + size, 0, PAGE_SIZE - size);
-
page = virt_to_page(ref->page);
spd.pages[i] = page;
@@ -6654,6 +6691,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
buf->data = alloc_percpu(struct trace_array_cpu);
if (!buf->data) {
ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
return -ENOMEM;
}
@@ -6677,7 +6715,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot ? size : 1);
if (WARN_ON(ret)) {
ring_buffer_free(tr->trace_buffer.buffer);
+ tr->trace_buffer.buffer = NULL;
free_percpu(tr->trace_buffer.data);
+ tr->trace_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
@@ -6847,6 +6887,7 @@ static int instance_rmdir(const char *name)
}
kfree(tr->topts);
+ free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 26960e49bb8c..1235f9fd9fbd 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2301,6 +2301,7 @@ void trace_event_enum_update(struct trace_enum_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
+ bool first = false;
int last_i;
int i;
@@ -2308,15 +2309,28 @@ void trace_event_enum_update(struct trace_enum_map **map, int len)
list_for_each_entry_safe(call, p, &ftrace_events, list) {
/* events are usually grouped together with systems */
if (!last_system || call->class->system != last_system) {
+ first = true;
last_i = 0;
last_system = call->class->system;
}
+ /*
+ * Since calls are grouped by systems, the likelihood that the
+ * next call in the iteration belongs to the same system as the
+ * previous call is high. As an optimization, we skip searching
+ * for a map[] that matches the call's system if the last call
+ * was from the same system. That's what last_i is for. If the
+ * call has the same system as the previous call, then last_i
+ * will be the index of the first map[] that has a matching
+ * system.
+ */
for (i = last_i; i < len; i++) {
if (call->class->system == map[i]->system) {
/* Save the first system if need be */
- if (!last_i)
+ if (first) {
last_i = i;
+ first = false;
+ }
update_event_printk(call, map[i]);
}
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 6816302542b2..f0e5408499b6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1979,6 +1979,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 96c75b0e9831..a804ee1b3ec6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -8,6 +8,7 @@
*/
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/fs.h>
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index be3222b7d72e..21b162c07e83 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -451,63 +455,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -770,3 +754,100 @@ __init static int init_irqsoff_tracer(void)
return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* CONFIG_IRQSOFF_TRACER || CONFIG_PREEMPT_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 12ea4ea619ee..e9092a0247bf 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -659,30 +659,25 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
- /* an address specified */
- ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
- if (ret) {
- pr_info("Failed to parse address.\n");
- return ret;
- }
- } else {
+
+ /* try to parse an address. if that fails, try to read the
+ * input as a symbol. */
+ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
- pr_info("Failed to parse symbol.\n");
+ pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
if (offset && is_return) {
pr_info("Return probe must be used without offset.\n");
return -EINVAL;
}
+ } else if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
}
argc -= 2; argv += 2;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea77881..ca70d11b8aa7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -272,7 +272,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
goto out_free;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
- goto out;
+ goto out_free;
}
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d58cc4d8f0d1..651aaa5221ec 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -190,6 +190,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 73c018d7df00..a719a4ad2e74 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -70,6 +70,7 @@ enum {
* attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
+ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
/* worker flags */
@@ -167,7 +168,6 @@ struct worker_pool {
/* L: hash of busy workers */
/* see manage_workers() for details on the two manager mutexes */
- struct mutex manager_arb; /* manager arbitration */
struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
@@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
@@ -812,7 +813,7 @@ static bool need_to_create_worker(struct worker_pool *pool)
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = mutex_is_locked(&pool->manager_arb);
+ bool managing = pool->flags & POOL_MANAGER_ACTIVE;
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
@@ -1492,6 +1493,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ WARN_ON_ONCE(!wq);
WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
WARN_ON_ONCE(timer_pending(timer));
@@ -1964,24 +1966,17 @@ static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- /*
- * Anyone who successfully grabs manager_arb wins the arbitration
- * and becomes the manager. mutex_trylock() on pool->manager_arb
- * failure while holding pool->lock reliably indicates that someone
- * else is managing the pool and the worker which failed trylock
- * can proceed to executing work items. This means that anyone
- * grabbing manager_arb is responsible for actually performing
- * manager duties. If manager_arb is grabbed and released without
- * actual management, the pool may stall indefinitely.
- */
- if (!mutex_trylock(&pool->manager_arb))
+ if (pool->flags & POOL_MANAGER_ACTIVE)
return false;
+
+ pool->flags |= POOL_MANAGER_ACTIVE;
pool->manager = worker;
maybe_create_worker(pool);
pool->manager = NULL;
- mutex_unlock(&pool->manager_arb);
+ pool->flags &= ~POOL_MANAGER_ACTIVE;
+ wake_up(&wq_manager_wait);
return true;
}
@@ -3141,7 +3136,6 @@ static int init_worker_pool(struct worker_pool *pool)
setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
- mutex_init(&pool->manager_arb);
mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
@@ -3211,13 +3205,15 @@ static void put_unbound_pool(struct worker_pool *pool)
hash_del(&pool->hash_node);
/*
- * Become the manager and destroy all workers. Grabbing
- * manager_arb prevents @pool's workers from blocking on
- * attach_mutex.
+ * Become the manager and destroy all workers. This prevents
+ * @pool's workers from blocking on attach_mutex. We're the last
+ * manager and @pool gets freed with the flag set.
*/
- mutex_lock(&pool->manager_arb);
-
spin_lock_irq(&pool->lock);
+ wait_event_lock_irq(wq_manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
+ pool->flags |= POOL_MANAGER_ACTIVE;
+
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
@@ -3231,8 +3227,6 @@ static void put_unbound_pool(struct worker_pool *pool)
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
- mutex_unlock(&pool->manager_arb);
-
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
@@ -3669,8 +3663,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
@@ -3856,6 +3854,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4044,13 +4052,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5178,7 +5187,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 45215870ac6c..3fa9c146fccb 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -9,6 +9,7 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/preempt.h>
struct worker_pool;
@@ -59,7 +60,7 @@ struct worker {
*/
static inline struct worker *current_wq_worker(void)
{
- if (current->flags & PF_WQ_WORKER)
+ if (in_task() && (current->flags & PF_WQ_WORKER))
return kthread_data(current);
return NULL;
}