Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/verifier.c | 21
-rw-r--r--  kernel/cgroup.c | 19
-rw-r--r--  kernel/cpu.c | 3
-rw-r--r--  kernel/cpuset.c | 4
-rw-r--r--  kernel/events/core.c | 60
-rw-r--r--  kernel/events/hw_breakpoint.c | 2
-rw-r--r--  kernel/fork.c | 10
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/cpuhotplug.c | 24
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/irq/msi.c | 2
-rw-r--r--  kernel/irq/proc.c | 5
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/kthread.c | 96
-rw-r--r--  kernel/locking/osq_lock.c | 35
-rw-r--r--  kernel/padata.c | 2
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/ptrace.c | 20
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 78
-rw-r--r--  kernel/sched/core_ctl.c | 79
-rw-r--r--  kernel/sched/cpufreq.c | 63
-rw-r--r--  kernel/sched/cpufreq_sched.c | 220
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 770
-rw-r--r--  kernel/sched/cpupri.c | 37
-rw-r--r--  kernel/sched/deadline.c | 3
-rw-r--r--  kernel/sched/debug.c | 26
-rw-r--r--  kernel/sched/fair.c | 1497
-rw-r--r--  kernel/sched/hmp.c | 78
-rw-r--r--  kernel/sched/loadavg.c | 4
-rw-r--r--  kernel/sched/rt.c | 50
-rw-r--r--  kernel/sched/sched.h | 78
-rw-r--r--  kernel/sched/sched_avg.c | 40
-rw-r--r--  kernel/sched/stats.c | 26
-rw-r--r--  kernel/sched/tune.c | 13
-rw-r--r--  kernel/signal.c | 20
-rw-r--r--  kernel/softirq.c | 9
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/alarmtimer.c | 14
-rw-r--r--  kernel/time/hrtimer.c | 49
-rw-r--r--  kernel/time/posix-cpu-timers.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 47
-rw-r--r--  kernel/trace/trace_kprobe.c | 5
44 files changed, 2787 insertions, 745 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cbfba78d3db..85de5094b936 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -313,7 +313,8 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(struct bpf_insn *insn)
+static void print_bpf_insn(const struct verifier_env *env,
+ const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -377,9 +378,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
} else {
verbose("BUG_ld_%02x\n", insn->code);
return;
@@ -1758,7 +1769,7 @@ static int do_check(struct verifier_env *env)
if (log_level) {
verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
+ print_bpf_insn(env, insn);
}
if (class == BPF_ALU || class == BPF_ALU64) {
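As an illustrative aside (not part of the patch): the hunk above rebuilds the
64-bit immediate of a ldimm64 instruction from the imm fields of two
consecutive instructions. A minimal userspace sketch of that reconstruction,
using a simplified stand-in for struct bpf_insn:

#include <stdint.h>
#include <stdio.h>

struct insn { int32_t imm; };	/* only the field relevant here */

int main(void)
{
	/*
	 * ldimm64 is encoded as two consecutive instructions:
	 * insn[0].imm holds the low 32 bits, insn[1].imm the high 32 bits.
	 */
	struct insn pair[2] = { { .imm = (int32_t)0xdeadbeef },
				{ .imm = 0x1234 } };

	uint64_t imm = ((uint64_t)pair[1].imm << 32) | (uint32_t)pair[0].imm;

	printf("r1 = 0x%llx\n", (unsigned long long)imm); /* 0x1234deadbeef */
	return 0;
}

When the immediate is a map pointer (src_reg == BPF_PSEUDO_MAP_FD) and
allow_ptr_leaks is not set, the hunk above prints 0 instead, so the verifier
log no longer leaks the map's kernel address to unprivileged users.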
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25cf44889559..077bb52e2d47 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -717,10 +717,10 @@ static void css_set_move_task(struct task_struct *task,
if (to_cset) {
/*
- * We are synchronized through cgroup_threadgroup_rwsem
- * against PF_EXITING setting such that we can't race
- * against cgroup_exit() changing the css_set to
- * init_css_set and dropping the old one.
+ * We are synchronized through css_set_lock against
+ * PF_EXITING setting such that we can't race against
+ * cgroup_exit() disassociating the task from the
+ * css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -5701,19 +5701,22 @@ void cgroup_exit(struct task_struct *tsk)
int i;
/*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check css_set and cg_list without synchronization.
+ * Avoid potential race with the migrate path.
+ */
+ spin_lock_irq(&css_set_lock);
+ /*
+ * Unlink from @tsk from its css_set.
*/
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
- spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
- spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
+ spin_unlock_irq(&css_set_lock);
+
/* see cgroup_post_fork() for details */
for_each_subsys_which(ss, i, &have_exit_callback)
ss->exit(tsk);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e822cb0e18d5..1a26ef5b7d58 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -361,6 +361,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
+ if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1)
+ return -EBUSY;
+
cpu_hotplug_begin();
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 29c7240172d3..03dbc231a4a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -174,9 +174,9 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
-static inline bool is_cpuset_online(const struct cpuset *cs)
+static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags);
+ return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95c447e658f7..7fee87daac56 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1693,33 +1693,7 @@ static int __perf_remove_from_context(void *info)
return 0;
}
-
-#ifdef CONFIG_SMP
-static void perf_retry_remove(struct perf_event *event,
- struct remove_event *rep)
-{
- int up_ret;
- /*
- * CPU was offline. Bring it online so we can
- * gracefully exit a perf context.
- */
- up_ret = cpu_up(event->cpu);
- if (!up_ret)
- /* Try the remove call once again. */
- cpu_function_call(event->cpu, __perf_remove_from_context,
- rep);
- else
- pr_err("Failed to bring up CPU: %d, ret: %d\n",
- event->cpu, up_ret);
-}
-#else
-static void perf_retry_remove(struct perf_event *event,
- struct remove_event *rep)
-{
-}
-#endif
-
- /*
+/*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
@@ -1754,9 +1728,6 @@ static void __ref perf_remove_from_context(struct perf_event *event,
*/
ret = cpu_function_call(event->cpu, __perf_remove_from_context,
&re);
- if (ret == -ENXIO)
- perf_retry_remove(event, &re);
-
return;
}
@@ -6595,6 +6566,21 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
+static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ * To avoid leaking information to userspace, we must always
+ * reject kernel samples when exclude_kernel is set.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -6642,6 +6628,12 @@ static int __perf_event_overflow(struct perf_event *event,
}
/*
+ * For security, drop the skid kernel samples if necessary.
+ */
+ if (!sample_is_allowed(event, regs))
+ return ret;
+
+ /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -7117,8 +7109,6 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .events_across_hotplug = 1,
};
#ifdef CONFIG_EVENT_TRACING
@@ -7240,8 +7230,6 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .events_across_hotplug = 1,
};
static inline void perf_tp_register(void)
@@ -7529,8 +7517,6 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
-
- .events_across_hotplug = 1,
};
/*
@@ -7612,8 +7598,6 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
-
- .events_across_hotplug = 1,
};
static void perf_pmu_nop_void(struct pmu *pmu)
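For context (illustrative, not part of the patch): the exclude_kernel
attribute that the new sample_is_allowed() check enforces is requested from
userspace when the counter is opened, for example:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;	/* sampling event */
	attr.exclude_kernel = 1;	/* count/sample user mode only */
	attr.exclude_hv = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* self */,
		     -1 /* any cpu */, -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* ... mmap ring buffer / read(fd, ...) as usual ... */
	close(fd);
	return 0;
}

With exclude_kernel set, an overflow whose saved pt_regs show kernel mode
(i.e. skid carried the sample past the user/kernel boundary) is now dropped
by __perf_event_overflow() instead of exposing a kernel sample to this
counter.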
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 7da5b674d16e..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -614,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .events_across_hotplug = 1,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 622571d2a833..246b8a57a32d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -370,7 +370,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_int();
+ tsk->stack_canary = get_random_long();
#endif
/*
@@ -1596,11 +1596,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
recalc_sigpending();
if (signal_pending(current)) {
- spin_unlock(&current->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
+ if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ retval = -ENOMEM;
+ goto bad_fork_cancel_cgroup;
+ }
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1651,6 +1653,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
bad_fork_cancel_cgroup:
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
threadgroup_change_end(current);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9812d9c0d483..e0449956298e 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -810,8 +810,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
if (!desc)
return;
- __irq_do_set_handler(desc, handle, 1, NULL);
desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
irq_put_desc_busunlock(desc, flags);
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6c8e154c7384..4684b7595e63 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -36,10 +36,32 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = &available_cpus;
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ /*
+ * The order of preference for selecting a fallback CPU is
+ *
+ * (1) online and un-isolated CPU from default affinity
+ * (2) online and un-isolated CPU
+ * (3) online CPU
+ */
cpumask_andnot(&available_cpus, cpu_online_mask,
cpu_isolated_mask);
- if (cpumask_empty(affinity))
+ if (cpumask_intersects(&available_cpus, irq_default_affinity))
+ cpumask_and(&available_cpus, &available_cpus,
+ irq_default_affinity);
+ else if (cpumask_empty(&available_cpus))
affinity = cpu_online_mask;
+
+ /*
+ * We are overriding the affinity with all online and
+ * un-isolated CPUs. The irq_set_affinity_locked() call
+ * below notifies the PM QOS affinity listener of this
+ * mask, which results in applying the CPU_DMA_LATENCY
+ * QOS to all the CPUs specified in the mask. But the
+ * low-level irqchip driver sets the affinity of an irq
+ * to only one CPU. So pick only one CPU from the
+ * prepared mask while overriding the user affinity.
+ */
+ affinity = cpumask_of(cpumask_any(affinity));
ret = true;
}
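To make the fallback order above concrete, a standalone sketch (not part of
the patch) that models the cpumask operations with plain bitmasks, under an
assumed scenario of CPUs 0-3 online, CPUs 2-3 isolated and
irq_default_affinity = {0,1}:

#include <stdio.h>

int main(void)
{
	unsigned int online   = 0xf;	/* CPUs 0-3 online */
	unsigned int isolated = 0xc;	/* CPUs 2-3 isolated */
	unsigned int def_aff  = 0x3;	/* irq_default_affinity = {0,1} */
	unsigned int available, cpu;

	available = online & ~isolated;		/* {0,1} */

	if (available & def_aff)		/* preference (1) */
		available &= def_aff;
	else if (!available)			/* preference (3) */
		available = online;
	/* otherwise preference (2): keep online & ~isolated */

	/* like cpumask_of(cpumask_any(...)): pick a single CPU */
	cpu = __builtin_ctz(available);

	printf("fallback affinity = CPU%u\n", cpu);	/* CPU0 */
	return 0;
}

In this scenario the IRQ ends up targeted at CPU0 alone, so the PM QOS
affinity notification described in the comment only applies CPU_DMA_LATENCY
to that single CPU rather than to every online, un-isolated CPU.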
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e5c70dcb7f8e..2c2effdb4437 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1305,8 +1305,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret)
+ if (ret) {
+ irq_release_resources(desc);
goto out_mask;
+ }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index cd6009006510..41b40f310c28 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -268,7 +268,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq;
+ int i, ret, virq = 0;
ret = ops->msi_check(domain, info, dev);
if (ret == 0)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a24c5b909047..b05509af0352 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
goto free_cpumask;
}
+ if (cpumask_subset(new_value, cpu_isolated_mask)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d10ab6b9b5e0..695763516908 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -563,7 +563,7 @@ static void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
-static void wait_for_kprobe_optimizer(void)
+void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 850b255649a2..698b8dec3074 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -604,6 +604,19 @@ repeat:
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/*
+ * Returns true when the work could not be queued at the moment.
+ * It happens when it is already pending in a worker list
+ * or when it is being cancelled.
+ */
+static inline bool queuing_blocked(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ lockdep_assert_held(&worker->lock);
+
+ return !list_empty(&work->node) || work->canceling;
+}
+
/* insert @work before @pos in @worker */
static void insert_kthread_work(struct kthread_worker *worker,
struct kthread_work *work,
@@ -633,7 +646,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
- if (list_empty(&work->node)) {
+ if (!queuing_blocked(worker, work)) {
insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
@@ -694,6 +707,87 @@ retry:
}
EXPORT_SYMBOL_GPL(flush_kthread_work);
+/*
+ * This function removes the work from the worker queue. Also it makes sure
+ * that it won't get queued later via the delayed work's timer.
+ *
+ * The work might still be in use when this function finishes. See the
+ * current_work processed by the worker.
+ *
+ * Return: %true if @work was pending and successfully canceled,
+ * %false if @work was not pending
+ */
+static bool __kthread_cancel_work(struct kthread_work *work,
+ unsigned long *flags)
+{
+ /*
+ * Try to remove the work from a worker list. It might either
+ * be from worker->work_list or from worker->delayed_work_list.
+ */
+ if (!list_empty(&work->node)) {
+ list_del_init(&work->node);
+ return true;
+ }
+
+ return false;
+}
+
+static bool __kthread_cancel_work_sync(struct kthread_work *work)
+{
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+ int ret = false;
+
+ if (!worker)
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ ret = __kthread_cancel_work(work, &flags);
+
+ if (worker->current_work != work)
+ goto out_fast;
+
+ /*
+ * The work is in progress and we need to wait with the lock released.
+ * In the meantime, block any queuing by setting the canceling counter.
+ */
+ work->canceling++;
+ spin_unlock_irqrestore(&worker->lock, flags);
+ flush_kthread_work(work);
+ spin_lock_irqsave(&worker->lock, flags);
+ work->canceling--;
+
+out_fast:
+ spin_unlock_irqrestore(&worker->lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+ return __kthread_cancel_work_sync(work);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
/**
* flush_kthread_worker - flush all current works on a kthread_worker
* @worker: worker to flush
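A minimal usage sketch for the kthread_cancel_work_sync() API added above
(illustrative, not part of the patch; the my_* names are hypothetical). It
uses the pre-rename kthread worker helpers present in this tree
(init_kthread_worker(), queue_kthread_work(), kthread_worker_fn(),
flush_kthread_worker()):

#include <linux/kthread.h>
#include <linux/module.h>

static struct kthread_worker my_worker;
static struct kthread_work my_work;
static struct task_struct *my_worker_task;

static void my_work_fn(struct kthread_work *work)
{
	pr_info("my_work executed\n");
}

static int __init my_driver_init(void)
{
	init_kthread_worker(&my_worker);
	init_kthread_work(&my_work, my_work_fn);

	my_worker_task = kthread_run(kthread_worker_fn, &my_worker, "my_worker");
	if (IS_ERR(my_worker_task))
		return PTR_ERR(my_worker_task);

	queue_kthread_work(&my_worker, &my_work);
	return 0;
}

static void __exit my_driver_exit(void)
{
	/* Remove the work if still queued, or wait for the running instance. */
	kthread_cancel_work_sync(&my_work);
	flush_kthread_worker(&my_worker);
	kthread_stop(my_worker_task);
}

module_init(my_driver_init);
module_exit(my_driver_exit);
MODULE_LICENSE("GPL");

The cancel call maps onto the fast path / canceling-counter split in
__kthread_cancel_work_sync() above: a queued-but-idle work is simply
unlinked, while a running work is flushed with worker->lock released and
re-queuing blocked via work->canceling.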
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a37857ab55..1e6a51cc25c4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,6 +1,7 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
+#include <linux/sched/rt.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ struct task_struct *task = current;
int curr = encode_cpu(smp_processor_id());
int old;
@@ -104,6 +106,32 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * We need to avoid reordering of the link update sequence of osq.
+ * Consider a case in which the optimistic spin queue is
+ * CPU6->CPU2, with CPU6 holding the lock. At this point,
+ * if CPU0 comes in to acquire osq_lock, it will update the tail
+ * count. After the tail count update, if CPU2 starts to unqueue
+ * itself from the optimistic spin queue, it will find the updated
+ * tail count with CPU0 and set CPU2's node->next to NULL in
+ * osq_wait_next(). If the following stores are reordered, then
+ * prev->next (where prev is the CPU2 node) would be updated to
+ * point to the CPU0 node:
+ * node->prev = prev;
+ * WRITE_ONCE(prev->next, node);
+ *
+ * If the next instruction
+ * WRITE_ONCE(next->prev, prev);
+ * in CPU2's path is then committed before CPU0's node->prev = prev,
+ * CPU0's node->prev will point to the CPU6 node. Once CPU0's
+ * node->prev = prev is committed, CPU0's node->prev points back to
+ * the CPU2 node. CPU2's node->next is NULL, so if CPU0 gets into
+ * the unqueue path of osq_lock it will keep spinning in an infinite
+ * loop, as the condition prev->next == node will never be true.
+ */
+ smp_mb();
+
WRITE_ONCE(prev->next, node);
/*
@@ -118,8 +146,13 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
+ * If a task spins on the owner on one CPU after acquiring
+ * osq_lock, while an RT task spins on another CPU to
+ * acquire osq_lock, it will starve the owner from
+ * completing if the owner is to be scheduled on the same
+ * CPU. That would be a livelock.
*/
- if (need_resched())
+ if (need_resched() || rt_task(task))
goto unqueue;
cpu_relax_lowlatency();
diff --git a/kernel/padata.c b/kernel/padata.c
index 401227e3967c..ecc7b3f452c7 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -357,7 +357,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.cbcpu);
+ free_cpumask_var(pd->cpumask.pcpu);
return -ENOMEM;
}
diff --git a/kernel/panic.c b/kernel/panic.c
index 982a52352cfc..679254405510 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -172,7 +172,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a65ba137fd15..567ecc826bc8 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -255,7 +255,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* if reparented.
*/
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->nr_hashed == init_pids)
break;
schedule();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c7e8ed99c953..5e2cd1030702 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,19 +28,25 @@
#include <linux/compat.h>
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
rcu_read_lock();
- child->ptracer_cred = get_cred(__task_cred(new_parent));
+ __ptrace_link(child, new_parent, __task_cred(new_parent));
rcu_read_unlock();
}
@@ -353,7 +359,7 @@ static int ptrace_attach(struct task_struct *task, long request,
flags |= PT_SEIZED;
task->ptrace = flags;
- __ptrace_link(task, current);
+ ptrace_link(task, current);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -420,7 +426,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 308f80ce2e43..a353df46c8e4 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,4 +22,6 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d2b8834dd3b..0071785e698b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2623,9 +2623,9 @@ void wake_up_new_task(struct task_struct *p)
*/
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
-
rq = __task_rq_lock(p);
mark_task_starting(p);
+ post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -3154,9 +3154,10 @@ unsigned long sum_capacity_reqs(unsigned long cfs_cap,
return total += scr->dl;
}
+unsigned long boosted_cpu_util(int cpu);
static void sched_freq_tick_pelt(int cpu)
{
- unsigned long cpu_utilization = capacity_max;
+ unsigned long cpu_utilization = boosted_cpu_util(cpu);
unsigned long capacity_curr = capacity_curr_of(cpu);
struct sched_capacity_reqs *scr;
@@ -6460,9 +6461,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
return -1;
}
@@ -6555,8 +6553,12 @@ static inline bool sched_debug(void)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
+ if (cpumask_weight(sched_domain_span(sd)) == 1) {
+ if (sd->groups->sge)
+ sd->flags &= ~SD_LOAD_BALANCE;
+ else
+ return 1;
+ }
/* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE |
@@ -6564,6 +6566,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN |
SD_SHARE_CAP_STATES)) {
@@ -6595,11 +6598,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN |
SD_SHARE_CAP_STATES);
+ if (parent->groups->sge) {
+ parent->flags &= ~SD_LOAD_BALANCE;
+ return 0;
+ }
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -6680,6 +6688,9 @@ static int init_rootdomain(struct root_domain *rd)
goto free_rto_mask;
init_max_cpu_capacity(&rd->max_cpu_capacity);
+
+ rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
return 0;
free_rto_mask:
@@ -6996,6 +7007,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -7291,11 +7303,19 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- * SD_SHARE_CAP_STATES - describes shared capacity states
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES - describes shared capacity states
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -7305,11 +7325,13 @@ static int sched_domains_curr_level;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN | \
SD_SHARE_CAP_STATES)
static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+ struct sched_domain *child, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
int sd_weight, sd_flags = 0;
@@ -7361,6 +7383,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
+ .child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
@@ -7370,6 +7393,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
@@ -7816,16 +7846,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
+ struct sched_domain *sd = sd_init(tl, child, cpu);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
- sd->child = child;
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
@@ -7859,7 +7886,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
- struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7877,8 +7903,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
}
}
@@ -7914,8 +7938,19 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+ int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
+ if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+ cpu_rq(max_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+ if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+ cpu_rq(min_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
sd = *per_cpu_ptr(d.sd, i);
+
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
@@ -8339,6 +8374,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 0b5f2dea18a1..ce15ae7fe76b 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -39,11 +39,13 @@ struct cluster_data {
cpumask_t cpu_mask;
unsigned int need_cpus;
unsigned int task_thres;
+ unsigned int max_nr;
s64 need_ts;
struct list_head lru;
bool pending;
spinlock_t pending_lock;
bool is_big_cluster;
+ bool enable;
int nrrun;
bool nrrun_changed;
struct task_struct *core_ctl_thread;
@@ -60,6 +62,7 @@ struct cpu_data {
struct cluster_data *cluster;
struct list_head sib;
bool isolated_by_us;
+ unsigned int max_nr;
};
static DEFINE_PER_CPU(struct cpu_data, cpu_state);
@@ -244,6 +247,29 @@ static ssize_t show_is_big_cluster(const struct cluster_data *state, char *buf)
return snprintf(buf, PAGE_SIZE, "%u\n", state->is_big_cluster);
}
+static ssize_t store_enable(struct cluster_data *state,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+ bool bval;
+
+ if (sscanf(buf, "%u\n", &val) != 1)
+ return -EINVAL;
+
+ bval = !!val;
+ if (bval != state->enable) {
+ state->enable = bval;
+ apply_need(state);
+ }
+
+ return count;
+}
+
+static ssize_t show_enable(const struct cluster_data *state, char *buf)
+{
+ return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable);
+}
+
static ssize_t show_need_cpus(const struct cluster_data *state, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus);
@@ -372,6 +398,7 @@ core_ctl_attr_ro(need_cpus);
core_ctl_attr_ro(active_cpus);
core_ctl_attr_ro(global_state);
core_ctl_attr_rw(not_preferred);
+core_ctl_attr_rw(enable);
static struct attribute *default_attrs[] = {
&min_cpus.attr,
@@ -381,6 +408,7 @@ static struct attribute *default_attrs[] = {
&busy_down_thres.attr,
&task_thres.attr,
&is_big_cluster.attr,
+ &enable.attr,
&need_cpus.attr,
&active_cpus.attr,
&global_state.attr,
@@ -429,7 +457,6 @@ static struct kobj_type ktype_core_ctl = {
#define RQ_AVG_TOLERANCE 2
#define RQ_AVG_DEFAULT_MS 20
-#define NR_RUNNING_TOLERANCE 5
static unsigned int rq_avg_period_ms = RQ_AVG_DEFAULT_MS;
static s64 rq_avg_timestamp_ms;
@@ -437,6 +464,7 @@ static s64 rq_avg_timestamp_ms;
static void update_running_avg(bool trigger_update)
{
int avg, iowait_avg, big_avg, old_nrrun;
+ int old_max_nr, max_nr, big_max_nr;
s64 now;
unsigned long flags;
struct cluster_data *cluster;
@@ -450,40 +478,23 @@ static void update_running_avg(bool trigger_update)
return;
}
rq_avg_timestamp_ms = now;
- sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg);
+ sched_get_nr_running_avg(&avg, &iowait_avg, &big_avg,
+ &max_nr, &big_max_nr);
spin_unlock_irqrestore(&state_lock, flags);
- /*
- * Round up to the next integer if the average nr running tasks
- * is within NR_RUNNING_TOLERANCE/100 of the next integer.
- * If normal rounding up is used, it will allow a transient task
- * to trigger online event. By the time core is onlined, the task
- * has finished.
- * Rounding to closest suffers same problem because scheduler
- * might only provide running stats per jiffy, and a transient
- * task could skew the number for one jiffy. If core control
- * samples every 2 jiffies, it will observe 0.5 additional running
- * average which rounds up to 1 task.
- */
- avg = (avg + NR_RUNNING_TOLERANCE) / 100;
- big_avg = (big_avg + NR_RUNNING_TOLERANCE) / 100;
-
for_each_cluster(cluster, index) {
if (!cluster->inited)
continue;
+
old_nrrun = cluster->nrrun;
- /*
- * Big cluster only need to take care of big tasks, but if
- * there are not enough big cores, big tasks need to be run
- * on little as well. Thus for little's runqueue stat, it
- * has to use overall runqueue average, or derive what big
- * tasks would have to be run on little. The latter approach
- * is not easy to get given core control reacts much slower
- * than scheduler, and can't predict scheduler's behavior.
- */
+ old_max_nr = cluster->max_nr;
cluster->nrrun = cluster->is_big_cluster ? big_avg : avg;
- if (cluster->nrrun != old_nrrun) {
+ cluster->max_nr = cluster->is_big_cluster ? big_max_nr : max_nr;
+
+ if (cluster->nrrun != old_nrrun ||
+ cluster->max_nr != old_max_nr) {
+
if (trigger_update)
apply_need(cluster);
else
@@ -493,6 +504,7 @@ static void update_running_avg(bool trigger_update)
return;
}
+#define MAX_NR_THRESHOLD 4
/* adjust needed CPUs based on current runqueue information */
static unsigned int apply_task_need(const struct cluster_data *cluster,
unsigned int new_need)
@@ -503,7 +515,15 @@ static unsigned int apply_task_need(const struct cluster_data *cluster,
/* only unisolate more cores if there are tasks to run */
if (cluster->nrrun > new_need)
- return new_need + 1;
+ new_need = new_need + 1;
+
+ /*
+ * We don't want tasks to be overcrowded in a cluster.
+ * If any CPU had more than MAX_NR_THRESHOLD runnable tasks
+ * in the last window, bring another CPU to help out.
+ */
+ if (cluster->max_nr > MAX_NR_THRESHOLD)
+ new_need = new_need + 1;
return new_need;
}
@@ -549,7 +569,7 @@ static bool eval_need(struct cluster_data *cluster)
spin_lock_irqsave(&state_lock, flags);
- if (cluster->boost) {
+ if (cluster->boost || !cluster->enable) {
need_cpus = cluster->max_cpus;
} else {
cluster->active_cpus = get_active_cpu_count(cluster);
@@ -1046,6 +1066,7 @@ static int cluster_init(const struct cpumask *mask)
cluster->offline_delay_ms = 100;
cluster->task_thres = UINT_MAX;
cluster->nrrun = cluster->num_cpus;
+ cluster->enable = true;
INIT_LIST_HEAD(&cluster->lru);
spin_lock_init(&cluster->pending_lock);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..dbc51442ecbc
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,63 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned int flags))
+{
+ if (WARN_ON(!data || !func))
+ return;
+
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
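A minimal sketch (not part of the patch) of how a governor consumes these
hooks; the my_gov_* names are hypothetical, and the header that declares
struct update_util_data in this backport is assumed to be pulled in via
"sched.h":

#include <linux/cpufreq.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include "sched.h"

struct my_gov_cpu {
	struct update_util_data update_util;	/* must be embedded */
	/* governor-private per-CPU state would go here */
};

static DEFINE_PER_CPU(struct my_gov_cpu, my_gov_cpu);

/* Runs in RCU-sched read-side critical sections; must not sleep. */
static void my_gov_update(struct update_util_data *data, u64 time,
			  unsigned int flags)
{
	struct my_gov_cpu *gc = container_of(data, struct my_gov_cpu,
					     update_util);

	/* ... react to the utilization update for this CPU ... */
	(void)gc;
}

static void my_gov_start(struct cpufreq_policy *policy)
{
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_add_update_util_hook(cpu,
				&per_cpu(my_gov_cpu, cpu).update_util,
				my_gov_update);
}

static void my_gov_stop(struct cpufreq_policy *policy)
{
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	/* per the kernel-doc above: wait out in-flight callbacks */
	synchronize_sched();
}

This is the same pattern cpufreq_schedutil.c below follows with sugov_cpu:
embed the update_util_data and recover the containing structure with
container_of() in the callback.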
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index d751bc2d0d6e..f10d9f7d6d07 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -32,6 +32,12 @@ static struct cpufreq_governor cpufreq_gov_sched;
static DEFINE_PER_CPU(unsigned long, enabled);
DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+struct gov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int up_throttle_nsec;
+ unsigned int down_throttle_nsec;
+};
+
/**
* gov_data - per-policy data internal to the governor
* @up_throttle: next throttling period expiry if increasing OPP
@@ -53,8 +59,8 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
struct gov_data {
ktime_t up_throttle;
ktime_t down_throttle;
- unsigned int up_throttle_nsec;
- unsigned int down_throttle_nsec;
+ struct gov_tunables *tunables;
+ struct list_head tunables_hook;
struct task_struct *task;
struct irq_work irq_work;
unsigned int requested_freq;
@@ -71,8 +77,10 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
- gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
- gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
+ gd->up_throttle = ktime_add_ns(ktime_get(),
+ gd->tunables->up_throttle_nsec);
+ gd->down_throttle = ktime_add_ns(ktime_get(),
+ gd->tunables->down_throttle_nsec);
up_write(&policy->rwsem);
}
@@ -262,12 +270,70 @@ static inline void clear_sched_freq(void)
static_key_slow_dec(&__sched_freq);
}
-static struct attribute_group sched_attr_group_gov_pol;
-static struct attribute_group *get_sysfs_attr(void)
+/* Tunables */
+static struct gov_tunables *global_tunables;
+
+static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
{
- return &sched_attr_group_gov_pol;
+ return container_of(attr_set, struct gov_tunables, attr_set);
}
+static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
+}
+
+static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->up_throttle_nsec = val;
+ return count;
+}
+
+static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
+}
+
+static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->down_throttle_nsec = val;
+ return count;
+}
+
+static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
+static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
+
+static struct attribute *schedfreq_attributes[] = {
+ &up_throttle_nsec.attr,
+ &down_throttle_nsec.attr,
+ NULL
+};
+
+static struct kobj_type tunables_ktype = {
+ .default_attrs = schedfreq_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
{
struct gov_data *gd;
@@ -282,17 +348,39 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
if (!gd)
return -ENOMEM;
- gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
- policy->cpuinfo.transition_latency :
- THROTTLE_UP_NSEC;
- gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
- pr_debug("%s: throttle threshold = %u [ns]\n",
- __func__, gd->up_throttle_nsec);
-
- rc = sysfs_create_group(&policy->kobj, get_sysfs_attr());
- if (rc) {
- pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
- goto err;
+ policy->governor_data = gd;
+
+ if (!global_tunables) {
+ gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
+ if (!gd->tunables)
+ goto free_gd;
+
+ gd->tunables->up_throttle_nsec =
+ policy->cpuinfo.transition_latency ?
+ policy->cpuinfo.transition_latency :
+ THROTTLE_UP_NSEC;
+ gd->tunables->down_throttle_nsec =
+ THROTTLE_DOWN_NSEC;
+
+ rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
+ &tunables_ktype,
+ get_governor_parent_kobj(policy),
+ "%s", cpufreq_gov_sched.name);
+ if (rc)
+ goto free_tunables;
+
+ gov_attr_set_init(&gd->tunables->attr_set,
+ &gd->tunables_hook);
+
+ pr_debug("%s: throttle_threshold = %u [ns]\n",
+ __func__, gd->tunables->up_throttle_nsec);
+
+ if (!have_governor_per_policy())
+ global_tunables = gd->tunables;
+ } else {
+ gd->tunables = global_tunables;
+ gov_attr_set_get(&global_tunables->attr_set,
+ &gd->tunables_hook);
}
policy->governor_data = gd;
@@ -304,7 +392,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
if (IS_ERR_OR_NULL(gd->task)) {
pr_err("%s: failed to create kschedfreq thread\n",
__func__);
- goto err;
+ goto free_tunables;
}
get_task_struct(gd->task);
kthread_bind_mask(gd->task, policy->related_cpus);
@@ -316,7 +404,9 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
return 0;
-err:
+free_tunables:
+ kfree(gd->tunables);
+free_gd:
policy->governor_data = NULL;
kfree(gd);
return -ENOMEM;
@@ -324,6 +414,7 @@ err:
static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
{
+ unsigned int count;
struct gov_data *gd = policy->governor_data;
clear_sched_freq();
@@ -332,7 +423,12 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
put_task_struct(gd->task);
}
- sysfs_remove_group(&policy->kobj, get_sysfs_attr());
+ count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
+ if (!count) {
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+ kfree(gd->tunables);
+ }
policy->governor_data = NULL;
@@ -394,88 +490,6 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy,
return 0;
}
-/* Tunables */
-static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
-{
- return sprintf(buf, "%u\n", gd->up_throttle_nsec);
-}
-
-static ssize_t store_up_throttle_nsec(struct gov_data *gd,
- const char *buf, size_t count)
-{
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- gd->up_throttle_nsec = val;
- return count;
-}
-
-static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
-{
- return sprintf(buf, "%u\n", gd->down_throttle_nsec);
-}
-
-static ssize_t store_down_throttle_nsec(struct gov_data *gd,
- const char *buf, size_t count)
-{
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- gd->down_throttle_nsec = val;
- return count;
-}
-
-/*
- * Create show/store routines
- * - sys: One governor instance for complete SYSTEM
- * - pol: One governor instance per struct cpufreq_policy
- */
-#define show_gov_pol_sys(file_name) \
-static ssize_t show_##file_name##_gov_pol \
-(struct cpufreq_policy *policy, char *buf) \
-{ \
- return show_##file_name(policy->governor_data, buf); \
-}
-
-#define store_gov_pol_sys(file_name) \
-static ssize_t store_##file_name##_gov_pol \
-(struct cpufreq_policy *policy, const char *buf, size_t count) \
-{ \
- return store_##file_name(policy->governor_data, buf, count); \
-}
-
-#define gov_pol_attr_rw(_name) \
- static struct freq_attr _name##_gov_pol = \
- __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
-
-#define show_store_gov_pol_sys(file_name) \
- show_gov_pol_sys(file_name); \
- store_gov_pol_sys(file_name)
-#define tunable_handlers(file_name) \
- show_gov_pol_sys(file_name); \
- store_gov_pol_sys(file_name); \
- gov_pol_attr_rw(file_name)
-
-tunable_handlers(down_throttle_nsec);
-tunable_handlers(up_throttle_nsec);
-
-/* Per policy governor instance */
-static struct attribute *sched_attributes_gov_pol[] = {
- &up_throttle_nsec_gov_pol.attr,
- &down_throttle_nsec_gov_pol.attr,
- NULL,
-};
-
-static struct attribute_group sched_attr_group_gov_pol = {
- .attrs = sched_attributes_gov_pol,
- .name = "sched",
-};
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
static
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..75bfbb336722
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,770 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <trace/events/power.h>
+
+#include "sched.h"
+#include "tune.h"
+
+unsigned long boosted_cpu_util(int cpu);
+
+/* Stub out fast switch routines present on mainline to reduce the backport
+ * overhead. */
+#define cpufreq_driver_fast_switch(x, y) 0
+#define cpufreq_enable_fast_switch(x)
+#define cpufreq_disable_fast_switch(x)
+#define LATENCY_MULTIPLIER (1000)
+#define SUGOV_KTHREAD_PRIORITY 50
+
+struct sugov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int up_rate_limit_us;
+ unsigned int down_rate_limit_us;
+};
+
+struct sugov_policy {
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time;
+ s64 min_rate_limit_ns;
+ s64 up_rate_delay_ns;
+ s64 down_rate_delay_ns;
+ unsigned int next_freq;
+
+ /* The next fields are only needed if fast switch cannot be used. */
+ struct irq_work irq_work;
+ struct kthread_work work;
+ struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
+ bool work_in_progress;
+
+ bool need_freq_update;
+};
+
+struct sugov_cpu {
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+
+ unsigned int cached_raw_freq;
+ unsigned long iowait_boost;
+ unsigned long iowait_boost_max;
+ u64 last_update;
+
+ /* The fields below are only needed when sharing a policy. */
+ unsigned long util;
+ unsigned long max;
+ unsigned int flags;
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+ s64 delta_ns;
+
+ if (sg_policy->work_in_progress)
+ return false;
+
+ if (unlikely(sg_policy->need_freq_update)) {
+ sg_policy->need_freq_update = false;
+ /*
+ * This happens when limits change, so forget the previous
+ * next_freq value and force an update.
+ */
+ sg_policy->next_freq = UINT_MAX;
+ return true;
+ }
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ /* No need to recalculate next freq for min_rate_limit_us at least */
+ return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ s64 delta_ns;
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ if (next_freq > sg_policy->next_freq &&
+ delta_ns < sg_policy->up_rate_delay_ns)
+ return true;
+
+ if (next_freq < sg_policy->next_freq &&
+ delta_ns < sg_policy->down_rate_delay_ns)
+ return true;
+
+ return false;
+}
+
+static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
+ return;
+
+ if (policy->fast_switch_enabled) {
+ if (sg_policy->next_freq == next_freq) {
+ trace_cpu_frequency(policy->cur, smp_processor_id());
+ return;
+ }
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (next_freq == CPUFREQ_ENTRY_INVALID)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+ } else if (sg_policy->next_freq != next_freq) {
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
+ }
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
+ */
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+ unsigned long max)
+{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int freq = arch_scale_freq_invariant() ?
+ policy->cpuinfo.max_freq : policy->cur;
+
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_cpu->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
+}
+
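A worked example of the computation above with made-up numbers (not part of
the patch): assume frequency-invariant utilization, cpuinfo.max_freq of
2000000 kHz, util = 614 and max = 1024 (roughly 60% utilization).

#include <stdio.h>

int main(void)
{
	unsigned long util = 614, max = 1024;
	unsigned int freq = 2000000;		/* kHz */

	/* same arithmetic as get_next_freq(): 1.25 * freq * util / max */
	freq = (freq + (freq >> 2)) * util / max;

	/* 2500000 * 614 / 1024 = 1499023 kHz */
	printf("raw next_freq = %u kHz\n", freq);
	return 0;
}

cpufreq_driver_resolve_freq() then maps this raw value to the lowest
driver-supported frequency at or above it (e.g. a 1.5 GHz OPP), subject to
the policy min/max limits.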
+static inline bool use_pelt(void)
+{
+#ifdef CONFIG_SCHED_WALT
+ return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
+#else
+ return true;
+#endif
+}
+
+static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long max_cap, rt;
+ s64 delta;
+
+ max_cap = arch_scale_cpu_capacity(NULL, cpu);
+
+ sched_avg_update(rq);
+ delta = time - rq->age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+ rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
+ rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
+
+ *util = boosted_cpu_util(cpu);
+ if (likely(use_pelt()))
+ *util = min((*util + rt), max_cap);
+
+ *max = max_cap;
+}
+
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ if (flags & SCHED_CPUFREQ_IOWAIT) {
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU appears to have been idle. */
+ if (delta_ns > TICK_NSEC)
+ sg_cpu->iowait_boost = 0;
+ }
+}
+
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
+ unsigned long *max)
+{
+ unsigned long boost_util = sg_cpu->iowait_boost;
+ unsigned long boost_max = sg_cpu->iowait_boost_max;
+
+ if (!boost_util)
+ return;
+
+ if (*util * boost_max < *max * boost_util) {
+ *util = boost_util;
+ *max = boost_max;
+ }
+ sg_cpu->iowait_boost >>= 1;
+}
+
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned long util, max;
+ unsigned int next_f;
+
+ sugov_set_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ if (!sugov_should_update_freq(sg_policy, time))
+ return;
+
+ if (flags & SCHED_CPUFREQ_DL) {
+ next_f = policy->cpuinfo.max_freq;
+ } else {
+ sugov_get_util(&util, &max, time);
+ sugov_iowait_boost(sg_cpu, &util, &max);
+ next_f = get_next_freq(sg_cpu, util, max);
+ }
+ sugov_update_commit(sg_policy, time, next_f);
+}
+
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
+ unsigned long util, unsigned long max,
+ unsigned int flags)
+{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int max_f = policy->cpuinfo.max_freq;
+ u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned int j;
+
+ if (flags & SCHED_CPUFREQ_DL)
+ return max_f;
+
+ sugov_iowait_boost(sg_cpu, &util, &max);
+
+ for_each_cpu(j, policy->cpus) {
+ struct sugov_cpu *j_sg_cpu;
+ unsigned long j_util, j_max;
+ s64 delta_ns;
+
+ if (j == smp_processor_id())
+ continue;
+
+ j_sg_cpu = &per_cpu(sugov_cpu, j);
+ /*
+ * If the CPU utilization was last updated before the previous
+ * frequency update and the time elapsed between the last update
+ * of the CPU utilization and the last frequency update is long
+ * enough, don't take the CPU into account as it probably is
+ * idle now (and clear iowait_boost for it).
+ */
+ delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ if (delta_ns > TICK_NSEC) {
+ j_sg_cpu->iowait_boost = 0;
+ continue;
+ }
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
+ return max_f;
+
+ j_util = j_sg_cpu->util;
+ j_max = j_sg_cpu->max;
+ if (j_util * max > j_max * util) {
+ util = j_util;
+ max = j_max;
+ }
+
+ sugov_iowait_boost(j_sg_cpu, &util, &max);
+ }
+
+ return get_next_freq(sg_cpu, util, max);
+}
+
+static void sugov_update_shared(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned long util, max;
+ unsigned int next_f;
+
+ sugov_get_util(&util, &max, time);
+
+ raw_spin_lock(&sg_policy->update_lock);
+
+ sg_cpu->util = util;
+ sg_cpu->max = max;
+ sg_cpu->flags = flags;
+
+ sugov_set_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ if (sugov_should_update_freq(sg_policy, time)) {
+ next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
+ sugov_update_commit(sg_policy, time, next_f);
+ }
+
+ raw_spin_unlock(&sg_policy->update_lock);
+}
+
+static void sugov_work(struct kthread_work *work)
+{
+ struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+
+ mutex_lock(&sg_policy->work_lock);
+ __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+ CPUFREQ_RELATION_L);
+ mutex_unlock(&sg_policy->work_lock);
+
+ sg_policy->work_in_progress = false;
+}
+
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
+
+ /*
+ * For real-time and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't end up doing the same.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of sugov_work(), and until then schedutil
+ * rejects all other frequency scaling requests.
+ *
+ * Though there is a very rare case where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ queue_kthread_work(&sg_policy->worker, &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+static DEFINE_MUTEX(min_rate_lock);
+
+static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
+{
+ mutex_lock(&min_rate_lock);
+ sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+ sg_policy->down_rate_delay_ns);
+ mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
+}
+
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
+}
+
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->up_rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
+
+ return count;
+}
+
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->down_rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
+
+ return count;
+}
+
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
+
+static struct attribute *sugov_attributes[] = {
+ &up_rate_limit_us.attr,
+ &down_rate_limit_us.attr,
+ NULL
+};
+
+static struct kobj_type sugov_tunables_ktype = {
+ .default_attrs = sugov_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+/********************** cpufreq governor interface *********************/
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+static
+#endif
+struct cpufreq_governor cpufreq_gov_schedutil;
+
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+ if (!sg_policy)
+ return NULL;
+
+ sg_policy->policy = policy;
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+ raw_spin_lock_init(&sg_policy->update_lock);
+ return sg_policy;
+}
+
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+ mutex_destroy(&sg_policy->work_lock);
+ kfree(sg_policy);
+}
+
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ init_kthread_work(&sg_policy->work, sugov_work);
+ init_kthread_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ flush_kthread_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+}
+
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+ struct sugov_tunables *tunables;
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (tunables) {
+ gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+ if (!have_governor_per_policy())
+ global_tunables = tunables;
+ }
+ return tunables;
+}
+
+static void sugov_tunables_free(struct sugov_tunables *tunables)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+
+ kfree(tunables);
+}
+
+static int sugov_init(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+ struct sugov_tunables *tunables;
+ unsigned int lat;
+ int ret = 0;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ sg_policy = sugov_policy_alloc(policy);
+ if (!sg_policy)
+ return -ENOMEM;
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
+
+ mutex_lock(&global_tunables_lock);
+
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto stop_kthread;
+ }
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+ goto out;
+ }
+
+ tunables = sugov_tunables_alloc(sg_policy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto stop_kthread;
+ }
+
+ tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
+ tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat) {
+ tunables->up_rate_limit_us *= lat;
+ tunables->down_rate_limit_us *= lat;
+ }
+
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = tunables;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ cpufreq_gov_schedutil.name);
+ if (ret)
+ goto fail;
+
+ out:
+ mutex_unlock(&global_tunables_lock);
+
+ cpufreq_enable_fast_switch(policy);
+ return 0;
+
+ fail:
+ policy->governor_data = NULL;
+ sugov_tunables_free(tunables);
+
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
+free_sg_policy:
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_policy_free(sg_policy);
+ pr_err("initialization failed (error %d)\n", ret);
+ return ret;
+}
+
+static int sugov_exit(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ unsigned int count;
+
+ cpufreq_disable_fast_switch(policy);
+
+ mutex_lock(&global_tunables_lock);
+
+ count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count)
+ sugov_tunables_free(tunables);
+
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_kthread_stop(sg_policy);
+ sugov_policy_free(sg_policy);
+
+ return 0;
+}
+
+static int sugov_start(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ sg_policy->up_rate_delay_ns =
+ sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->down_rate_delay_ns =
+ sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ sg_policy->last_freq_update_time = 0;
+ sg_policy->next_freq = UINT_MAX;
+ sg_policy->work_in_progress = false;
+ sg_policy->need_freq_update = false;
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+ sg_cpu->sg_policy = sg_policy;
+ if (policy_is_shared(policy)) {
+ sg_cpu->util = 0;
+ sg_cpu->max = 0;
+ sg_cpu->flags = SCHED_CPUFREQ_DL;
+ sg_cpu->last_update = 0;
+ sg_cpu->cached_raw_freq = 0;
+ sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ sugov_update_shared);
+ } else {
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ sugov_update_single);
+ }
+ }
+ return 0;
+}
+
+static int sugov_stop(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ cpufreq_remove_update_util_hook(cpu);
+
+ synchronize_sched();
+
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+
+ return 0;
+}
+
+static int sugov_limits(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+
+ if (!policy->fast_switch_enabled) {
+ mutex_lock(&sg_policy->work_lock);
+ cpufreq_policy_apply_limits(policy);
+ mutex_unlock(&sg_policy->work_lock);
+ }
+
+ sg_policy->need_freq_update = true;
+
+ return 0;
+}
+
+static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
+ unsigned int event)
+{
+ switch (event) {
+ case CPUFREQ_GOV_POLICY_INIT:
+ return sugov_init(policy);
+ case CPUFREQ_GOV_POLICY_EXIT:
+ return sugov_exit(policy);
+ case CPUFREQ_GOV_START:
+ return sugov_start(policy);
+ case CPUFREQ_GOV_STOP:
+ return sugov_stop(policy);
+ case CPUFREQ_GOV_LIMITS:
+ return sugov_limits(policy);
+ default:
+ BUG();
+ }
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+static
+#endif
+struct cpufreq_governor cpufreq_gov_schedutil = {
+ .name = "schedutil",
+ .governor = cpufreq_schedutil_cb,
+ .owner = THIS_MODULE,
+};
+
+static int __init sugov_register(void)
+{
+ return cpufreq_register_governor(&cpufreq_gov_schedutil);
+}
+fs_initcall(sugov_register);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..1d00cf8c00fa 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -27,6 +27,8 @@
* of the License.
*/
+#include "sched.h"
+
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -51,6 +53,27 @@ static int convert_prio(int prio)
}
/**
+ * drop_nopreempt_cpus - remove CPUs from the mask that are likely
+ * non-preemptible
+ * @lowest_mask: mask with selected CPUs (non-NULL)
+ */
+static void
+drop_nopreempt_cpus(struct cpumask *lowest_mask)
+{
+ unsigned int cpu = cpumask_first(lowest_mask);
+
+ while (cpu < nr_cpu_ids) {
+ /* unlocked access */
+ struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr);
+
+ if (task_may_not_preempt(task, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+
+ cpu = cpumask_next(cpu, lowest_mask);
+ }
+}
+
+/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
@@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
{
int idx = 0;
int task_pri = convert_prio(p->prio);
+ bool drop_nopreempts = task_pri <= MAX_RT_PRIO;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+retry:
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
int skip = 0;
@@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
+ if (drop_nopreempts)
+ drop_nopreempt_cpus(lowest_mask);
/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
@@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
return 1;
}
-
+ /*
+ * If no CPU was found after dropping the non-preemptible ones,
+ * retry without dropping them so we can still find the lowest
+ * priority target and avoid priority inversion.
+ */
+ if (drop_nopreempts) {
+ drop_nopreempts = false;
+ goto retry;
+ }
return 0;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3d55ec89c400..a105e97ab6bf 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -755,6 +755,9 @@ static void update_curr_dl(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c8c4272c61d8..ed8e6bb4531b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -636,6 +636,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
+ /* eas */
+ /* select_idle_sibling() */
+ P(se.statistics.nr_wakeups_sis_attempts);
+ P(se.statistics.nr_wakeups_sis_idle);
+ P(se.statistics.nr_wakeups_sis_cache_affine);
+ P(se.statistics.nr_wakeups_sis_suff_cap);
+ P(se.statistics.nr_wakeups_sis_idle_cpu);
+ P(se.statistics.nr_wakeups_sis_count);
+ /* select_energy_cpu_brute() */
+ P(se.statistics.nr_wakeups_secb_attempts);
+ P(se.statistics.nr_wakeups_secb_sync);
+ P(se.statistics.nr_wakeups_secb_idle_bt);
+ P(se.statistics.nr_wakeups_secb_insuff_cap);
+ P(se.statistics.nr_wakeups_secb_no_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_count);
+ /* find_best_target() */
+ P(se.statistics.nr_wakeups_fbt_attempts);
+ P(se.statistics.nr_wakeups_fbt_no_cpu);
+ P(se.statistics.nr_wakeups_fbt_no_sd);
+ P(se.statistics.nr_wakeups_fbt_pref_idle);
+ P(se.statistics.nr_wakeups_fbt_count);
+ /* cas */
+ /* select_task_rq_fair() */
+ P(se.statistics.nr_wakeups_cas_attempts);
+ P(se.statistics.nr_wakeups_cas_count);
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
__P(load_avg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4d96380b35e8..422438d43d90 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -50,7 +50,6 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-unsigned int sysctl_sched_is_big_little = 0;
unsigned int sysctl_sched_sync_hint_enable = 1;
unsigned int sysctl_sched_initial_task_util = 0;
unsigned int sysctl_sched_cstate_aware = 1;
@@ -119,6 +118,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
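+ *
+ * For example, with the default margin of 1280 below, a CPU is only
+ * considered to have enough capacity while util < capacity * 1024 / 1280,
+ * i.e. while utilization stays under roughly 80% of the CPU's capacity.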
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -294,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beginning of the tree.
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reached the beginning of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to point to the new beginning
+ * of the branch.
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -664,7 +709,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
@@ -688,20 +733,115 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are initialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are initialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = sched_freq() ?
- sysctl_sched_initial_task_util :
- scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * In previous Android versions, we used to have:
+ * sa->util_avg = sched_freq() ?
+ * sysctl_sched_initial_task_util :
+ * scale_load_down(SCHED_LOAD_SCALE);
+ * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ * However, that functionality has been moved to enqueue.
+ * It is unclear if we should restore this in enqueue.
+ */
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, the simplest series from the beginning would look like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ /*
+ * If we wish to restore tuning via setting initial util,
+ * this is where we should do it.
+ */
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ return;
+ }
+ }
+
+ attach_entity_cfs_rq(se);
+}
+
#else
void init_entity_runnable_average(struct sched_entity *se)
{
}
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -1425,7 +1565,8 @@ balance:
* Call select_idle_sibling to maybe find a better one.
*/
if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
assign:
assigned = true;
@@ -2410,28 +2551,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
+ load = scale_load_down(cfs_rq->load.weight);
- return tg_weight;
-}
-
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2450,6 +2585,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -2468,16 +2604,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2486,8 +2626,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3240,7 +3381,8 @@ retry:
sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED;
}
} else if (stats.best_cpu >= 0) {
- if (stats.best_cpu != task_cpu(p) &&
+ if (stats.best_sibling_cpu >= 0 &&
+ stats.best_cpu != task_cpu(p) &&
stats.min_cost == stats.best_sibling_cpu_cost) {
stats.best_cpu = stats.best_sibling_cpu;
sbc_flag |= SBC_FLAG_BEST_SIBLING;
@@ -3536,6 +3678,16 @@ kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
static DEFINE_RAW_SPINLOCK(migration_lock);
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
/*
* Check if currently running task should be migrated to a better cpu.
*
@@ -3553,7 +3705,7 @@ void check_for_migration(struct rq *rq, struct task_struct *p)
raw_spin_lock(&migration_lock);
new_cpu = select_best_cpu(p, cpu, reason, 0);
- if (new_cpu != cpu) {
+ if (do_migration(reason, new_cpu, cpu)) {
active_balance = kick_active_balance(rq, p, new_cpu);
if (active_balance)
mark_reserved(new_cpu);
@@ -3779,25 +3931,262 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
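+ *
+ * For example, with a previously propagated contribution of 6400, the
+ * global tg->load_avg is only touched again once the local average has
+ * drifted by more than 6400/64 = 100 (unless the update is forced).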
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, so that it is
+ * up to date and ready to go to the new CPU/cfs_rq. But we have
+ * difficulty in getting what the current time is, so simply throw away
+ * the out-of-date time. This will result in the wakee task being less
+ * decayed, but giving the wakee more load does not sound like a bad
+ * thing.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of the group cfs_rq is zero, the load of the
+ * sched_entity will also be zero, so we can skip the formula.
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and is updated with the current load */
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equal to tg->shares will have
+ * a load less than or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at the parent level can't have a load higher than
+ * scale_load_down(tg->shares), and the sum of the sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* Scale gcfs_rq's load into tg's shares */
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that it should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/*
@@ -3817,23 +4206,43 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+ removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3848,68 +4257,89 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq == &rq_of(cfs_rq)->cfs)
trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
return decayed || removed;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int cpu = cpu_of(rq_of(cfs_rq));
+ int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
+
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
- if (entity_is_task(se))
- trace_sched_load_avg_task(task_of(se), &se->avg);
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -3917,34 +4347,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3973,13 +4389,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* Newly created task or never used group entity should not be removed
@@ -3988,9 +4416,7 @@ void remove_entity_load_avg(struct sched_entity *se)
if (se->avg.last_update_time == 0)
return;
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -4027,7 +4453,16 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1) {}
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
@@ -4176,9 +4611,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -4251,6 +4687,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
@@ -4286,7 +4732,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
/*
@@ -4341,7 +4787,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -4457,8 +4903,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -5102,6 +5548,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
+ /* Synchronize hierarchical throttle counter: */
+ if (unlikely(!cfs_rq->throttle_uptodate)) {
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_rq *pcfs_rq;
+ struct task_group *tg;
+
+ cfs_rq->throttle_uptodate = 1;
+
+ /* Get closest up-to-date node, because leaves go first: */
+ for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+ pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+ if (pcfs_rq->throttle_uptodate)
+ break;
+ }
+ if (tg) {
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ }
+ }
+
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -5342,7 +5808,7 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static bool cpu_overutilized(int cpu);
-static inline unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu);
#else
#define boosted_cpu_util(cpu) cpu_util(cpu)
#endif
@@ -5379,6 +5845,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int task_wakeup = flags & ENQUEUE_WAKEUP;
#endif
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -5407,8 +5881,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se) {
@@ -5492,15 +5966,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && parent_entity(se))
- set_next_buddy(parent_entity(se));
-
- /* avoid re-evaluating load for this entity */
- se = parent_entity(se);
+ if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+ set_next_buddy(se);
break;
}
flags |= DEQUEUE_SLEEP;
@@ -5514,8 +5987,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se) {
@@ -6121,15 +6594,7 @@ static int sched_group_energy(struct energy_env *eenv)
*/
sd = rcu_dereference(per_cpu(sd_scs, cpu));
- if (!sd)
- /*
- * We most probably raced with hotplug; returning a
- * wrong energy estimation is better than entering an
- * infinite loop.
- */
- return -EINVAL;
-
- if (sd->parent)
+ if (sd && sd->parent)
sg_shared_cap = sd->parent->groups;
for_each_domain(cpu, sd) {
@@ -6184,6 +6649,14 @@ static int sched_group_energy(struct energy_env *eenv)
} while (sg = sg->next, sg != sd->groups);
}
+
+ /*
+ * If we raced with hotplug and got an sd NULL pointer,
+ * returning a wrong energy estimate is better than
+ * entering an infinite loop.
+ */
+ if (cpumask_test_cpu(cpu, &visit_cpus))
+ return -EINVAL;
next_cpu:
cpumask_clear_cpu(cpu, &visit_cpus);
continue;
@@ -6210,6 +6683,7 @@ static inline int __energy_diff(struct energy_env *eenv)
struct sched_domain *sd;
struct sched_group *sg;
int sd_cpu = -1, energy_before = 0, energy_after = 0;
+ int diff, margin;
struct energy_env eenv_before = {
.util_delta = 0,
@@ -6252,12 +6726,22 @@ static inline int __energy_diff(struct energy_env *eenv)
eenv->nrg.after = energy_after;
eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
eenv->payoff = 0;
-
+#ifndef CONFIG_SCHED_TUNE
trace_sched_energy_diff(eenv->task,
eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
eenv->cap.before, eenv->cap.after, eenv->cap.delta,
eenv->nrg.delta, eenv->payoff);
+#endif
+ /*
+ * Dead-zone margin preventing too many migrations.
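+ * For example, with nrg.before == 640 the margin below is 640 >> 6 = 10,
+ * so any energy delta smaller than 10 in absolute value is reported as
+ * no difference at all.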
+ */
+
+ margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+ diff = eenv->nrg.after - eenv->nrg.before;
+
+ eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
return eenv->nrg.diff;
}
@@ -6265,30 +6749,37 @@ static inline int __energy_diff(struct energy_env *eenv)
#ifdef CONFIG_SCHED_TUNE
struct target_nrg schedtune_target_nrg;
-
+extern bool schedtune_initialized;
/*
* System energy normalization
- * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
* corresponding to the specified energy variation.
*/
static inline int
normalize_energy(int energy_diff)
{
u32 normalized_nrg;
+
+ /* during early setup, we don't know the extents */
+ if (unlikely(!schedtune_initialized))
+ return energy_diff < 0 ? -1 : 1;
+
#ifdef CONFIG_SCHED_DEBUG
+ {
int max_delta;
/* Check for boundaries */
max_delta = schedtune_target_nrg.max_power;
max_delta -= schedtune_target_nrg.min_power;
WARN_ON(abs(energy_diff) >= max_delta);
+ }
#endif
/* Do scaling using positive numbers to increase the range */
normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
/* Scale by energy magnitude */
- normalized_nrg <<= SCHED_LOAD_SHIFT;
+ normalized_nrg <<= SCHED_CAPACITY_SHIFT;
/* Normalize on max energy for target platform */
normalized_nrg = reciprocal_divide(
@@ -6319,6 +6810,12 @@ energy_diff(struct energy_env *eenv)
eenv->cap.delta,
eenv->task);
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
/*
* When SchedTune is enabled, the energy_diff() function will return
* the computed energy payoff value. Since the energy_diff() return
@@ -6358,18 +6855,18 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
@@ -6429,8 +6926,6 @@ static inline unsigned long task_util(struct task_struct *p)
return p->se.avg.util_avg;
}
-unsigned int capacity_margin = 1280; /* ~20% margin */
-
static inline unsigned long boosted_task_util(struct task_struct *task);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
@@ -6456,11 +6951,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu)
return __task_fits(p, cpu, 0);
}
-static inline bool task_fits_spare(struct task_struct *p, int cpu)
-{
- return __task_fits(p, cpu, cpu_util(cpu));
-}
-
static bool cpu_overutilized(int cpu)
{
return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
@@ -6468,6 +6958,8 @@ static bool cpu_overutilized(int cpu)
#ifdef CONFIG_SCHED_TUNE
+struct reciprocal_value schedtune_spc_rdiv;
+
static long
schedtune_margin(unsigned long signal, long boost)
{
@@ -6478,29 +6970,16 @@ schedtune_margin(unsigned long signal, long boost)
*
* The Boost (B) value is used to compute a Margin (M) which is
* proportional to the complement of the original Signal (S):
- * M = B * (SCHED_LOAD_SCALE - S), if B is positive
- * M = B * S, if B is negative
+ * M = B * (SCHED_CAPACITY_SCALE - S)
* The obtained M could be used by the caller to "boost" S.
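+ *
+ * For example, a boost of 10 (i.e. 10%) applied to a signal of 200
+ * yields a margin of roughly 10 * (1024 - 200) / 100 = 82; the
+ * division by 100 is performed below via schedtune_spc_rdiv.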
*/
if (boost >= 0) {
- margin = SCHED_LOAD_SCALE - signal;
+ margin = SCHED_CAPACITY_SCALE - signal;
margin *= boost;
} else
margin = -signal * boost;
- /*
- * Fast integer division by constant:
- * Constant : (C) = 100
- * Precision : 0.1% (P) = 0.1
- * Reference : C * 100 / P (R) = 100000
- *
- * Thus:
- * Shift bits : ceil(log(R,2)) (S) = 17
- * Mult const : round(2^S/C) (M) = 1311
- *
- *
- */
- margin *= 1311;
- margin >>= 17;
+
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
if (boost < 0)
margin *= -1;
@@ -6550,7 +7029,7 @@ schedtune_task_margin(struct task_struct *task)
#endif /* CONFIG_SCHED_TUNE */
-static inline unsigned long
+unsigned long
boosted_cpu_util(int cpu)
{
unsigned long util = cpu_util(cpu);
@@ -6572,6 +7051,13 @@ boosted_task_util(struct task_struct *task)
return util + margin;
}
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -6581,10 +7067,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *fit_group = NULL, *spare_group = NULL;
+ struct sched_group *most_spare_sg = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
- unsigned long fit_capacity = ULONG_MAX;
- unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -6592,7 +7077,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load, spare_capacity;
+ unsigned long load, avg_load, spare_cap, max_spare_cap;
int local_group;
int i;
@@ -6604,8 +7089,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -6616,24 +7105,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
- /*
- * Look for most energy-efficient group that can fit
- * that can fit the task.
- */
- if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
- fit_capacity = capacity_of(i);
- fit_group = group;
- }
+ spare_cap = capacity_spare_wake(i, p);
- /*
- * Look for group which has most spare capacity on a
- * single cpu.
- */
- spare_capacity = capacity_of(i) - cpu_util(i);
- if (spare_capacity > max_spare_capacity) {
- max_spare_capacity = spare_capacity;
- spare_group = group;
- }
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
@@ -6641,17 +7116,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (local_group) {
this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_spare = max_spare_cap;
+ } else {
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (fit_group)
- return fit_group;
-
- if (spare_group)
- return spare_group;
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ */
+ if (this_spare > task_util(p) / 2 &&
+ imbalance*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -6671,9 +7161,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (task_fits_spare(p, i)) {
+ if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -6685,8 +7179,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (idle_cpu(i) &&
- (!idle || idle->exit_latency == min_exit_latency) &&
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -6695,13 +7188,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (shallowest_idle_cpu == -1) {
- /*
- * If we haven't found an idle CPU yet
- * pick a non-idle one that can fit the task as
- * fallback.
- */
- shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -6718,24 +7204,32 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
struct sched_group *sg;
- int i = task_cpu(p);
- int best_idle = -1;
- int best_idle_cstate = -1;
- int best_idle_capacity = INT_MAX;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ unsigned long best_idle_capacity = ULONG_MAX;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
+ schedstat_inc(this_rq(), eas_stats.sis_attempts);
if (!sysctl_sched_cstate_aware) {
- if (idle_cpu(target))
+ if (idle_cpu(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
+ schedstat_inc(this_rq(), eas_stats.sis_idle);
return target;
+ }
/*
* If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
+ schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
+ return prev;
+ }
}
if (!(current->flags & PF_WAKE_UP_IDLE) &&
@@ -6749,24 +7243,30 @@ static int select_idle_sibling(struct task_struct *p, int target)
for_each_lower_domain(sd) {
sg = sd->groups;
do {
+ int i;
if (!cpumask_intersects(sched_group_cpus(sg),
tsk_cpus_allowed(p)))
goto next;
if (sysctl_sched_cstate_aware) {
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- struct rq *rq = cpu_rq(i);
- int idle_idx = idle_get_state_idx(rq);
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
unsigned long new_usage = boosted_task_util(p);
unsigned long capacity_orig = capacity_orig_of(i);
+
if (new_usage > capacity_orig || !idle_cpu(i))
goto next;
- if (i == target && new_usage <= capacity_curr_of(target))
+ if (i == target && new_usage <= capacity_curr_of(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
+ schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
+ schedstat_inc(sd, eas_stats.sis_suff_cap);
return target;
+ }
- if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
- best_idle = i;
+ if (idle_idx < best_idle_cstate &&
+ capacity_orig <= best_idle_capacity) {
+ best_idle_cpu = i;
best_idle_cstate = idle_idx;
best_idle_capacity = capacity_orig;
}
@@ -6779,231 +7279,283 @@ static int select_idle_sibling(struct task_struct *p, int target)
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
+ schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
+ schedstat_inc(sd, eas_stats.sis_idle_cpu);
goto done;
}
next:
sg = sg->next;
} while (sg != sd->groups);
}
- if (best_idle > 0)
- target = best_idle;
+
+ if (best_idle_cpu >= 0)
+ target = best_idle_cpu;
done:
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
+ schedstat_inc(this_rq(), eas_stats.sis_count);
+
return target;
}
-static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
{
- int iter_cpu;
- int target_cpu = -1;
- int target_util = 0;
- int backup_capacity = 0;
- int best_idle_cpu = -1;
- int best_idle_cstate = INT_MAX;
- int backup_cpu = -1;
- unsigned long task_util_boosted, new_util;
-
- task_util_boosted = boosted_task_util(p);
- for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
- int cur_capacity;
- struct rq *rq;
- int idle_idx;
-
- /*
- * Iterate from higher cpus for boosted tasks.
- */
- int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+ unsigned long util, capacity;
- if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
- continue;
-
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- new_util = cpu_util(i) + task_util_boosted;
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
- /*
- * Ensure minimum capacity to grant the required boost.
- * The target CPU can be already at a capacity level higher
- * than the one required to boost the task.
- */
- if (new_util > capacity_orig_of(i))
- continue;
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
- /*
- * Unconditionally favoring tasks that prefer idle cpus to
- * improve latency.
- */
- if (idle_cpu(i) && prefer_idle) {
- if (best_idle_cpu < 0)
- best_idle_cpu = i;
- continue;
- }
+ return (util >= capacity) ? capacity : util;
+}
- cur_capacity = capacity_curr_of(i);
- rq = cpu_rq(i);
- idle_idx = idle_get_state_idx(rq);
+static int start_cpu(bool boosted)
+{
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- if (new_util < cur_capacity) {
- if (cpu_rq(i)->nr_running) {
- if (prefer_idle) {
- /* Find a target cpu with highest
- * utilization.
- */
- if (target_util == 0 ||
- target_util < new_util) {
- target_cpu = i;
- target_util = new_util;
- }
- } else {
- /* Find a target cpu with lowest
- * utilization.
- */
- if (target_util == 0 ||
- target_util > new_util) {
- target_cpu = i;
- target_util = new_util;
- }
- }
- } else if (!prefer_idle) {
- if (best_idle_cpu < 0 ||
- (sysctl_sched_cstate_aware &&
- best_idle_cstate > idle_idx)) {
- best_idle_cstate = idle_idx;
- best_idle_cpu = i;
- }
- }
- } else if (backup_capacity == 0 ||
- backup_capacity > cur_capacity) {
- // Find a backup cpu with least capacity.
- backup_capacity = cur_capacity;
- backup_cpu = i;
- }
- }
+ RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
- if (prefer_idle && best_idle_cpu >= 0)
- target_cpu = best_idle_cpu;
- else if (target_cpu < 0)
- target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
-
- return target_cpu;
+ return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
}
-static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
{
+ int target_cpu = -1;
+ unsigned long target_util = prefer_idle ? ULONG_MAX : 0;
+ unsigned long backup_capacity = ULONG_MAX;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ int backup_cpu = -1;
+ unsigned long min_util = boosted_task_util(p);
struct sched_domain *sd;
- struct sched_group *sg, *sg_target;
- int target_max_cap = INT_MAX;
- int target_cpu = task_cpu(p);
- unsigned long task_util_boosted, new_util;
- int i;
+ struct sched_group *sg;
+ int cpu = start_cpu(boosted);
- if (sysctl_sched_sync_hint_enable && sync) {
- int cpu = smp_processor_id();
- cpumask_t search_cpus;
- cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
- if (cpumask_test_cpu(cpu, &search_cpus))
- return cpu;
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq(), eas_stats.fbt_attempts);
+
+ if (cpu < 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
+ return target_cpu;
}
- sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
- if (!sd)
- return target;
+ if (!sd) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
+ return target_cpu;
+ }
sg = sd->groups;
- sg_target = sg;
- if (sysctl_sched_is_big_little) {
+ do {
+ int i;
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- /* Assuming all cpus are the same in group */
- int max_cap_cpu = group_first_cpu(sg);
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ unsigned long cur_capacity, new_util, wake_util;
+ unsigned long min_wake_util = ULONG_MAX;
- /*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
- */
- if (capacity_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ if (!cpu_online(i))
+ continue;
- task_util_boosted = boosted_task_util(p);
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
* accounting. However, the blocked utilization may be zero.
*/
- new_util = cpu_util(i) + task_util_boosted;
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
/*
* Ensure minimum capacity to grant the required boost.
* The target CPU can be already at a capacity level higher
* than the one required to boost the task.
*/
+ new_util = max(min_util, new_util);
+
if (new_util > capacity_orig_of(i))
continue;
- if (new_util < capacity_curr_of(i)) {
- target_cpu = i;
- if (cpu_rq(i)->nr_running)
- break;
+ /*
+ * Unconditionally favor tasks that prefer idle cpus, to
+ * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
+ return i;
+ }
+
+ cur_capacity = capacity_curr_of(i);
+
+ if (new_util < cur_capacity) {
+ if (cpu_rq(i)->nr_running) {
+ /*
+ * Find the target cpu with the lowest utilization if
+ * prefer_idle is set, or with the highest utilization
+ * otherwise.
+ */
+ if (prefer_idle) {
+ /* Favor the CPU that last ran the task */
+ if (new_util > target_util ||
+ wake_util > min_wake_util)
+ continue;
+ min_wake_util = wake_util;
+ target_util = new_util;
+ target_cpu = i;
+ } else if (target_util < new_util) {
+ target_util = new_util;
+ target_cpu = i;
+ }
+ } else if (!prefer_idle) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ if (best_idle_cpu < 0 ||
+ (sysctl_sched_cstate_aware &&
+ best_idle_cstate > idle_idx)) {
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ }
+ }
+ } else if (backup_capacity > cur_capacity) {
+ /* Find a backup cpu with least capacity. */
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
}
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ if (target_cpu < 0)
+ target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+ if (target_cpu >= 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq(), eas_stats.fbt_count);
+ }
+
+ return target_cpu;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
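+ /*
+ * capacity_margin is a SCHED_CAPACITY_SCALE-relative factor (1024 ==
+ * no headroom), so this rejects wake_affine whenever the task would
+ * not fit on the smaller of the two CPUs with that margin applied.
+ */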
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+ struct sched_domain *sd;
+ int target_cpu = prev_cpu, tmp_target;
+ bool boosted, prefer_idle;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
+ schedstat_inc(this_rq(), eas_stats.secb_attempts);
- /* cpu has capacity at higher OPP, keep it as fallback */
- if (target_cpu == task_cpu(p))
- target_cpu = i;
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
+ schedstat_inc(this_rq(), eas_stats.secb_sync);
+ return cpu;
}
- } else {
- /*
- * Find a cpu with sufficient capacity
- */
+ }
+
+ rcu_read_lock();
#ifdef CONFIG_CGROUP_SCHEDTUNE
- bool boosted = schedtune_task_boost(p) > 0;
- bool prefer_idle = schedtune_prefer_idle(p) > 0;
+ boosted = schedtune_task_boost(p) > 0;
+ prefer_idle = schedtune_prefer_idle(p) > 0;
#else
- bool boosted = 0;
- bool prefer_idle = 0;
+ boosted = get_sysctl_sched_cfs_boost() > 0;
+ prefer_idle = 0;
#endif
- int tmp_target = find_best_target(p, boosted, prefer_idle);
- if (tmp_target >= 0) {
- target_cpu = tmp_target;
- if ((boosted || prefer_idle) && idle_cpu(target_cpu))
- return target_cpu;
+
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ /* Find a cpu with sufficient capacity */
+ tmp_target = find_best_target(p, boosted, prefer_idle);
+
+ if (!sd)
+ goto unlock;
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
+ goto unlock;
}
}
- if (target_cpu != task_cpu(p)) {
+ if (target_cpu != prev_cpu) {
struct energy_env eenv = {
- .util_delta = task_util(p),
- .src_cpu = task_cpu(p),
- .dst_cpu = target_cpu,
- .task = p,
+ .util_delta = task_util(p),
+ .src_cpu = prev_cpu,
+ .dst_cpu = target_cpu,
+ .task = p,
};
/* Not enough spare capacity on previous cpu */
- if (cpu_overutilized(task_cpu(p)))
- return target_cpu;
+ if (cpu_overutilized(prev_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
+ schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
+ goto unlock;
+ }
- if (energy_diff(&eenv) >= 0)
- return task_cpu(p);
+ if (energy_diff(&eenv) >= 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ goto unlock;
}
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
+ schedstat_inc(this_rq(), eas_stats.secb_count);
+
+unlock:
+ rcu_read_unlock();
+
return target_cpu;
}
@@ -7032,10 +7584,19 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return select_best_cpu(p, prev_cpu, 0, sync);
#endif
- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
- energy_aware();
+ if (sd_flag & SD_BALANCE_WAKE) {
+ /*
+ * Call wake_cap() unconditionally, as it causes task and cpu
+ * utilization to be synced, which energy-aware wakeups rely on.
+ */
+ int _wake_cap = wake_cap(p, cpu, prev_cpu);
+ want_affine = !wake_wide(p) && !_wake_cap
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ }
+
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+ return select_energy_cpu_brute(p, prev_cpu, sync);
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -7060,49 +7621,65 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
- if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
- else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
+ } else {
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
+ if (wu) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq(), eas_stats.cas_attempts);
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
+ while (sd) {
+ struct sched_group *group;
+ int weight;
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
+ if (wu)
+ schedstat_inc(sd, eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
}
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq(), eas_stats.cas_count);
}
- /* while loop will break here if sd == NULL */
}
rcu_read_unlock();
@@ -8105,8 +8682,13 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+ true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8166,7 +8748,7 @@ static inline void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8404,13 +8986,14 @@ skip_unlock: __attribute__ ((unused));
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, max_capacity;
+ unsigned long capacity, max_capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -8424,6 +9007,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity = 0;
max_capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -8456,6 +9040,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
max_capacity = max(capacity, max_capacity);
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
@@ -8473,6 +9058,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
if (!cpu_isolated(cpumask_first(cpus))) {
capacity += sgc->capacity;
max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
}
group = group->next;
} while (group != child->groups);
@@ -8480,6 +9066,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
sdg->sgc->capacity = capacity;
sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -8761,15 +9348,21 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
/*
- * Candiate sg has no more than one task per cpu and has higher
- * per-cpu capacity. No reason to pull tasks to less capable cpus.
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
group_smaller_cpu_capacity(sds->local, sg))
return false;
}
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -8820,6 +9413,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+#define lb_sd_parent(sd) \
+ (sd->parent && sd->parent->groups != sd->parent->groups->next)
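+/* True when @sd has a parent domain that spans more than one group. */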
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -8905,7 +9501,7 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- if (!env->sd->parent) {
+ if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
@@ -9494,7 +10090,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved = 0, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group = NULL;
struct rq *busiest = NULL;
unsigned long flags;
@@ -10778,6 +11374,61 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -10792,8 +11443,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
- /* Catch up with the cfs_rq and remove our load when we leave */
- detach_entity_load_avg(cfs_rq, se);
+ detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
@@ -10801,16 +11451,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
- /* Synchronize task with its cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -10864,6 +11505,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
@@ -10901,8 +11545,9 @@ void free_fair_sched_group(struct task_group *tg)
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -10917,6 +11562,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -10930,6 +11577,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ raw_spin_unlock_irq(&rq->lock);
}
return 1;
@@ -11026,8 +11677,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index df47c26ab6d2..ae6876e62c0f 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -1602,7 +1602,7 @@ unsigned int nr_eligible_big_tasks(int cpu)
int nr_big = rq->hmp_stats.nr_big_tasks;
int nr = rq->nr_running;
- if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
+ if (!is_max_capacity_cpu(cpu))
return nr_big;
return nr;
@@ -2521,10 +2521,42 @@ static inline u32 predict_and_update_buckets(struct rq *rq,
return pred_demand;
}
-static void update_task_cpu_cycles(struct task_struct *p, int cpu)
+#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC)
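+/* Reuse the cached cluster cycle count unless it is older than 2us. */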
+
+/*
+ * Assumes rq_lock is held and wallclock was recorded in the same critical
+ * section as this function's invocation.
+ */
+static inline u64 read_cycle_counter(int cpu, u64 wallclock)
+{
+ struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+ u64 delta;
+
+ if (unlikely(!cluster))
+ return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+ /*
+ * Why don't we need locking here? Let's say that delta is negative
+ * because some other CPU happened to update last_cc_update with a
+ * more recent timestamp. We simply read the counter again in that case
+ * with no harmful side effects. This can happen if there is an FIQ
+ * between when we read the wallclock and when we use it here.
+ */
+ delta = wallclock - atomic64_read(&cluster->last_cc_update);
+ if (delta > THRESH_CC_UPDATE) {
+ atomic64_set(&cluster->cycles,
+ cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu));
+ atomic64_set(&cluster->last_cc_update, wallclock);
+ }
+
+ return atomic64_read(&cluster->cycles);
+}
+
+static void update_task_cpu_cycles(struct task_struct *p, int cpu,
+ u64 wallclock)
{
if (use_cycle_counter)
- p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+ p->cpu_cycles = read_cycle_counter(cpu, wallclock);
}
static void
@@ -2542,7 +2574,7 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
return;
}
- cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+ cur_cycles = read_cycle_counter(cpu, wallclock);
/*
* If current task is idle task and irqtime == 0 CPU was
@@ -2579,7 +2611,8 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
}
-static int account_busy_for_task_demand(struct task_struct *p, int event)
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
{
/*
* No need to bother updating task demand for exiting tasks
@@ -2598,6 +2631,17 @@ static int account_busy_for_task_demand(struct task_struct *p, int event)
(event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
return 0;
+ /*
+ * TASK_UPDATE can be called on a sleeping task, when it is moved
+ * between related groups.
+ */
+ if (event == TASK_UPDATE) {
+ if (rq->curr == p)
+ return 1;
+
+ return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
+ }
+
return 1;
}
@@ -2738,7 +2782,7 @@ static u64 update_task_demand(struct task_struct *p, struct rq *rq,
u64 runtime;
new_window = mark_start < window_start;
- if (!account_busy_for_task_demand(p, event)) {
+ if (!account_busy_for_task_demand(rq, p, event)) {
if (new_window)
/*
* If the time accounted isn't being accounted as
@@ -2822,7 +2866,7 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
update_window_start(rq, wallclock);
if (!p->ravg.mark_start) {
- update_task_cpu_cycles(p, cpu_of(rq));
+ update_task_cpu_cycles(p, cpu_of(rq), wallclock);
goto done;
}
@@ -2890,7 +2934,7 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
if (is_idle_task(curr)) {
/* We're here without rq->lock held, IRQ disabled */
raw_spin_lock(&rq->lock);
- update_task_cpu_cycles(curr, cpu);
+ update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
raw_spin_unlock(&rq->lock);
}
}
@@ -2935,7 +2979,7 @@ void mark_task_starting(struct task_struct *p)
p->ravg.mark_start = p->last_wake_ts = wallclock;
p->last_cpu_selected_ts = wallclock;
p->last_switch_out_ts = 0;
- update_task_cpu_cycles(p, cpu_of(rq));
+ update_task_cpu_cycles(p, cpu_of(rq), wallclock);
}
void set_window_start(struct rq *rq)
@@ -3548,7 +3592,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
update_task_ravg(p, task_rq(p), TASK_MIGRATE,
wallclock, 0);
- update_task_cpu_cycles(p, new_cpu);
+ update_task_cpu_cycles(p, new_cpu, wallclock);
new_task = is_new_task(p);
/* Protected by rq_lock */
@@ -4303,8 +4347,20 @@ void note_task_waking(struct task_struct *p, u64 wallclock)
{
u64 sleep_time = wallclock - p->last_switch_out_ts;
- p->last_wake_ts = wallclock;
+ /*
+ * When a short-burst, short-sleeping task goes for a long
+ * sleep, the task's avg_sleep_time gets boosted. It will not
+ * come below the short_sleep threshold for a long time, which
+ * results in incorrect packing. The idea behind tracking
+ * avg_sleep_time is to detect whether a task is short sleeping
+ * or not, so limit the sleep time to twice the short sleep
+ * threshold. For regular long-sleeping tasks, the avg_sleep_time
+ * would be higher than the threshold, and packing happens correctly.
+ */
+ sleep_time = min_t(u64, sleep_time, 2 * sysctl_sched_short_sleep);
update_avg(&p->ravg.avg_sleep_time, sleep_time);
+
+ p->last_wake_ts = wallclock;
}
#ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..f8e8d68ed3fd 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -201,8 +201,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +212,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 29345ed74069..c03d51a017bf 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,6 +5,7 @@
#include "sched.h"
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
#include <trace/events/sched.h>
@@ -1005,6 +1006,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1456,11 +1460,30 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
}
#endif
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softint, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softints.
+ */
+bool
+task_may_not_preempt(struct task_struct *task, int cpu)
+{
+ __u32 softirqs = per_cpu(active_softirqs, cpu) |
+ __IRQ_STAT(cpu, __softirq_pending);
+ struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+
+ return ((softirqs & LONG_SOFTIRQ_MASK) &&
+ (task == cpu_ksoftirqd ||
+ task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+}
+
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ bool may_not_preempt;
#ifdef CONFIG_SCHED_HMP
return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
@@ -1476,7 +1499,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr = READ_ONCE(rq->curr); /* unlocked access */
/*
- * If the current task on @p's runqueue is an RT task, then
+ * If the current task on @p's runqueue is a softirq task,
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -1497,17 +1530,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/
- if (curr && unlikely(rt_task(curr)) &&
+ may_not_preempt = task_may_not_preempt(curr, cpu);
+ if (may_not_preempt ||
+ (unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ curr->prio <= p->prio))) {
int target = find_lowest_rq(p);
/*
- * Don't bother moving it if the destination CPU is
- * not running a lower priority task.
+ * If this cpu is non-preemptible, prefer a remote cpu
+ * even if it is running a higher-prio task.
+ * Otherwise, don't bother moving it if the
+ * destination CPU is not running a lower priority task.
*/
if (target != -1 &&
- p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ (may_not_preempt ||
+ p->prio < cpu_rq(target)->rt.highest_prio.curr))
cpu = target;
}
rcu_read_unlock();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 276a2387f06f..67b7da81f8a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -397,6 +397,8 @@ struct sched_cluster {
unsigned int static_cluster_pwr_cost;
int notifier_sent;
bool wake_up_idle;
+ atomic64_t last_cc_update;
+ atomic64_t cycles;
};
extern unsigned long all_cluster_ids[];
@@ -463,6 +465,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -509,7 +512,7 @@ struct cfs_rq {
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
- int throttled, throttle_count;
+ int throttled, throttle_count, throttle_uptodate;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -649,6 +652,9 @@ struct root_domain {
/* Maximum cpu capacity in the system. */
struct max_cpu_capacity max_cpu_capacity;
+
+ /* First cpu with maximum and minimum original capacity */
+ int max_cap_orig_cpu, min_cap_orig_cpu;
};
extern struct root_domain def_root_domain;
@@ -706,6 +712,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -825,6 +832,9 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
+#ifdef CONFIG_SMP
+ struct eas_stats eas_stats;
+#endif
#endif
#ifdef CONFIG_SMP
@@ -995,6 +1005,7 @@ struct sched_group_capacity {
*/
unsigned long capacity;
unsigned long max_capacity; /* Max per-cpu capacity in group */
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -1225,6 +1236,11 @@ static inline bool hmp_capable(void)
return max_possible_capacity != min_max_possible_capacity;
}
+static inline bool is_max_capacity_cpu(int cpu)
+{
+ return cpu_max_possible_capacity(cpu) == max_possible_capacity;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -1601,6 +1617,8 @@ static inline unsigned int nr_eligible_big_tasks(int cpu)
return 0;
}
+static inline bool is_max_capacity_cpu(int cpu) { return true; }
+
static inline int pct_task_load(struct task_struct *p) { return 0; }
static inline int cpu_capacity(int cpu)
@@ -2149,6 +2167,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
static inline void __add_nr_running(struct rq *rq, unsigned count)
{
@@ -2662,6 +2681,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+/*
+ * task_may_not_preempt - check whether a task may not be preemptible soon
+ */
+extern bool task_may_not_preempt(struct task_struct *task, int cpu);
+
#else /* CONFIG_SMP */
/*
@@ -2783,3 +2807,55 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, rq_clock(rq), flags);
+}
+
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
+{
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_update_util(rq, flags);
+}
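+
+/*
+ * Example caller (see update_curr_rt() in kernel/sched/rt.c):
+ *
+ *	cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+ */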
+#else
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
+#endif /* CONFIG_CPU_FREQ */
+
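+/*
+ * An architecture that provides arch_scale_freq_capacity() is assumed to
+ * deliver frequency-invariant utilization unless it explicitly overrides
+ * arch_scale_freq_invariant() as well.
+ */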
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant() (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant() (false)
+#endif
diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c
index 29d8a26a78ed..ba5a326a9fd8 100644
--- a/kernel/sched/sched_avg.c
+++ b/kernel/sched/sched_avg.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012, 2015-2016, The Linux Foundation. All rights reserved.
+/* Copyright (c) 2012, 2015-2017, The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
@@ -26,11 +26,13 @@ static DEFINE_PER_CPU(u64, nr_prod_sum);
static DEFINE_PER_CPU(u64, last_time);
static DEFINE_PER_CPU(u64, nr_big_prod_sum);
static DEFINE_PER_CPU(u64, nr);
+static DEFINE_PER_CPU(u64, nr_max);
static DEFINE_PER_CPU(unsigned long, iowait_prod_sum);
static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
static s64 last_get_time;
+#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
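+/* Round-up division, e.g. DIV64_U64_ROUNDUP(5, 2) == 3. */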
/**
* sched_get_nr_running_avg
* @return: Average nr_running, iowait and nr_big_tasks value since last poll.
@@ -40,7 +42,8 @@ static s64 last_get_time;
* Obtains the average nr_running value since the last poll.
* This function may not be called concurrently with itself
*/
-void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg)
+void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg,
+ unsigned int *max_nr, unsigned int *big_max_nr)
{
int cpu;
u64 curr_time = sched_clock();
@@ -50,6 +53,8 @@ void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg)
*avg = 0;
*iowait_avg = 0;
*big_avg = 0;
+ *max_nr = 0;
+ *big_max_nr = 0;
if (!diff)
return;
@@ -78,17 +83,35 @@ void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg)
per_cpu(nr_big_prod_sum, cpu) = 0;
per_cpu(iowait_prod_sum, cpu) = 0;
+ if (*max_nr < per_cpu(nr_max, cpu))
+ *max_nr = per_cpu(nr_max, cpu);
+
+ if (is_max_capacity_cpu(cpu)) {
+ if (*big_max_nr < per_cpu(nr_max, cpu))
+ *big_max_nr = per_cpu(nr_max, cpu);
+ }
+
+ per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}
diff = curr_time - last_get_time;
last_get_time = curr_time;
- *avg = (int)div64_u64(tmp_avg * 100, diff);
- *big_avg = (int)div64_u64(tmp_big_avg * 100, diff);
- *iowait_avg = (int)div64_u64(tmp_iowait * 100, diff);
-
- trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg);
+ /*
+ * Any task running on the BIG cluster, and BIG tasks running on the
+ * little cluster, contribute to big_avg. Small or medium tasks can
+ * also run on the BIG cluster when the co-location and scheduler
+ * boost features are activated. We don't want these tasks to
+ * downmigrate to the little cluster when BIG CPUs are available but
+ * isolated. Round up the average values so that core_ctl aggressively
+ * unisolates BIG CPUs.
+ */
+ *avg = (int)DIV64_U64_ROUNDUP(tmp_avg, diff);
+ *big_avg = (int)DIV64_U64_ROUNDUP(tmp_big_avg, diff);
+ *iowait_avg = (int)DIV64_U64_ROUNDUP(tmp_iowait, diff);
+
+ trace_sched_get_nr_running_avg(*avg, *big_avg, *iowait_avg,
+ *max_nr, *big_max_nr);
BUG_ON(*avg < 0 || *big_avg < 0 || *iowait_avg < 0);
pr_debug("%s - avg:%d big_avg:%d iowait_avg:%d\n",
@@ -121,6 +144,9 @@ void sched_update_nr_prod(int cpu, long delta, bool inc)
BUG_ON((s64)per_cpu(nr, cpu) < 0);
+ if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
+ per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
+
per_cpu(nr_prod_sum, cpu) += nr_running * diff;
per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff;
per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff;
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..6d74a7c77c8c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -12,6 +12,28 @@
*/
#define SCHEDSTAT_VERSION 15
+#ifdef CONFIG_SMP
+static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
+{
+ /* eas-specific runqueue stats */
+ seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
+ stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
+ stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
+ stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
+ stats->secb_insuff_cap, stats->secb_no_nrg_sav,
+ stats->secb_nrg_sav, stats->secb_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu ",
+ stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
+ stats->fbt_pref_idle, stats->fbt_count);
+
+ seq_printf(seq, "%llu %llu\n",
+ stats->cas_attempts, stats->cas_count);
+}
+#endif
+
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
@@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+ show_easstat(seq, &rq->eas_stats);
+
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
+
+ show_easstat(seq, &sd->eas_stats);
}
rcu_read_unlock();
#endif
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index b0c5fe6d1f3b..a71e94cecdb6 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -12,11 +12,12 @@
#include "tune.h"
#ifdef CONFIG_CGROUP_SCHEDTUNE
-static bool schedtune_initialized = false;
+bool schedtune_initialized = false;
#endif
unsigned int sysctl_sched_cfs_boost __read_mostly;
+extern struct reciprocal_value schedtune_spc_rdiv;
extern struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
@@ -675,6 +676,9 @@ int schedtune_task_boost(struct task_struct *p)
struct schedtune *st;
int task_boost;
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
/* Get task boost value */
rcu_read_lock();
st = task_schedtune(p);
@@ -689,6 +693,9 @@ int schedtune_prefer_idle(struct task_struct *p)
struct schedtune *st;
int prefer_idle;
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
/* Get prefer_idle value */
rcu_read_lock();
st = task_schedtune(p);
@@ -822,6 +829,7 @@ schedtune_boostgroup_init(struct schedtune *st)
bg = &per_cpu(cpu_boost_groups, cpu);
bg->group[st->idx].boost = 0;
bg->group[st->idx].tasks = 0;
+ raw_spin_lock_init(&bg->lock);
}
return 0;
@@ -1121,9 +1129,12 @@ schedtune_init(void)
pr_info("schedtune: configured to support global boosting only\n");
#endif
+ schedtune_spc_rdiv = reciprocal_value(100);
+
return 0;
nodata:
+ pr_warning("schedtune: disabled!\n");
rcu_read_unlock();
return -EINVAL;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a972fd..b92a047ddc82 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -503,7 +503,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
+ bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
@@ -525,6 +526,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
+
+ *resched_timer =
+ (first->flags & SIGQUEUE_PREALLOC) &&
+ (info->si_code == SI_TIMER) &&
+ (info->si_sys_private);
+
__sigqueue_free(first);
} else {
/*
@@ -541,12 +548,12 @@ still_pending:
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
- siginfo_t *info)
+ siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
- collect_signal(sig, pending, info);
+ collect_signal(sig, pending, info, resched_timer);
return sig;
}
@@ -558,15 +565,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
+ bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
- signr = __dequeue_signal(&tsk->pending, mask, info);
+ signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
- mask, info);
+ mask, info, &resched_timer);
/*
* itimer signal ?
*
@@ -611,7 +619,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
- if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e4436f787..39ffd41594ce 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+/*
+ * active_softirqs -- per cpu, a mask of softirqs that are being handled,
+ * with the expectation that approximate answers are acceptable and
+ * therefore no synchronization is used.
+ */
+DEFINE_PER_CPU(__u32, active_softirqs);
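+/* Written by __do_softirq() below; read lock-free by task_may_not_preempt(). */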
+
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -253,6 +260,7 @@ asmlinkage __visible void __do_softirq(void)
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
+ __this_cpu_write(active_softirqs, pending);
local_irq_enable();
@@ -282,6 +290,7 @@ restart:
pending >>= softirq_bit;
}
+ __this_cpu_write(active_softirqs, 0);
rcu_bh_qs();
local_irq_disable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 816999804a16..f27d2ba78d14 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -522,13 +522,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
- .procname = "sched_is_big_little",
- .data = &sysctl_sched_is_big_little,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_sync_hint_enable",
.data = &sysctl_sched_sync_hint_enable,
.maxlen = sizeof(unsigned int),
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2af5687b83c9..1a4de0022cc5 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -569,7 +569,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add(start, base->gettime());
+ start = ktime_add_safe(start, base->gettime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -655,7 +655,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
overrun++;
}
- alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
return overrun;
}
EXPORT_SYMBOL_GPL(alarm_forward);
@@ -843,13 +843,21 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+
+ /*
+ * Rate limit to the tick as a hot fix to prevent DOS. Will be
+ * mopped up later.
+ */
+ if (ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC)
+ timr->it.alarm.interval = ktime_set(0, TICK_NSEC);
+
exp = timespec_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
- exp = ktime_add(now, exp);
+ exp = ktime_add_safe(now, exp);
}
alarm_start(&timr->it.alarm.alarmtimer, exp);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 01a49614e942..e7c2392666cb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -49,7 +49,6 @@
#include <linux/sched/deadline.h>
#include <linux/timer.h>
#include <linux/freezer.h>
-#include <linux/delay.h>
#include <asm/uaccess.h>
@@ -1593,42 +1592,22 @@ static void init_hrtimers_cpu(int cpu)
}
#if defined(CONFIG_HOTPLUG_CPU)
-static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base,
- struct hrtimer_cpu_base *new_base,
- unsigned int i,
- bool wait,
+static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+ struct hrtimer_clock_base *new_base,
bool remove_pinned)
{
struct hrtimer *timer;
struct timerqueue_node *node;
struct timerqueue_head pinned;
int is_pinned;
- struct hrtimer_clock_base *old_c_base = &old_base->clock_base[i];
- struct hrtimer_clock_base *new_c_base = &new_base->clock_base[i];
+ bool is_hotplug = !cpu_online(old_base->cpu_base->cpu);
timerqueue_init_head(&pinned);
- while ((node = timerqueue_getnext(&old_c_base->active))) {
+ while ((node = timerqueue_getnext(&old_base->active))) {
timer = container_of(node, struct hrtimer, node);
- if (wait) {
- /* Ensure timers are done running before continuing */
- while (hrtimer_callback_running(timer)) {
- raw_spin_unlock(&old_base->lock);
- raw_spin_unlock(&new_base->lock);
- cpu_relax();
- /*
- * cpu_relax may just be a barrier. Grant the
- * run_hrtimer_list code some time to obtain the
- * spinlock.
- */
- udelay(2);
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock,
- SINGLE_DEPTH_NESTING);
- }
- } else {
+ if (is_hotplug)
BUG_ON(hrtimer_callback_running(timer));
- }
debug_deactivate(timer);
/*
@@ -1636,7 +1615,7 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base,
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_c_base, HRTIMER_STATE_ENQUEUED, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
is_pinned = timer->state & HRTIMER_STATE_PINNED;
if (!remove_pinned && is_pinned) {
@@ -1644,7 +1623,7 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base,
continue;
}
- timer->base = new_c_base;
+ timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
* reprogram the event device in case the timer
@@ -1653,7 +1632,7 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_c_base);
+ enqueue_hrtimer(timer, new_base);
}
/* Re-queue pinned timers for non-hotplug usecase */
@@ -1661,11 +1640,11 @@ static void migrate_hrtimer_list(struct hrtimer_cpu_base *old_base,
timer = container_of(node, struct hrtimer, node);
timerqueue_del(&pinned, &timer->node);
- enqueue_hrtimer(timer, old_c_base);
+ enqueue_hrtimer(timer, old_base);
}
}
-static void __migrate_hrtimers(int scpu, bool wait, bool remove_pinned)
+static void __migrate_hrtimers(int scpu, bool remove_pinned)
{
struct hrtimer_cpu_base *old_base, *new_base;
unsigned long flags;
@@ -1682,8 +1661,8 @@ static void __migrate_hrtimers(int scpu, bool wait, bool remove_pinned)
raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- migrate_hrtimer_list(old_base, new_base, i, wait,
- remove_pinned);
+ migrate_hrtimer_list(&old_base->clock_base[i],
+ &new_base->clock_base[i], remove_pinned);
}
raw_spin_unlock(&old_base->lock);
@@ -1699,12 +1678,12 @@ static void migrate_hrtimers(int scpu)
BUG_ON(cpu_online(scpu));
tick_cancel_sched_timer(scpu);
- __migrate_hrtimers(scpu, false, true);
+ __migrate_hrtimers(scpu, true);
}
void hrtimer_quiesce_cpu(void *cpup)
{
- __migrate_hrtimers(*(int *)cpup, true, false);
+ __migrate_hrtimers(*(int *)cpup, false);
}
#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 80016b329d94..051544aec37c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1250,7 +1250,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
cputime_t *newval, cputime_t *oldval)
{
- unsigned long long now;
+ unsigned long long now = 0;
WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED);
cpu_timer_sample_group(clock_idx, tsk, &now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fa544f3f560..738f3467d169 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -116,6 +116,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is necessary to use in the read paths because, while the
+ * seqlock ensures we don't return a bad value while structures are updated,
+ * it doesn't protect from potential crashes. There is the possibility that
+ * the tkr's clocksource may change between the read reference and the
+ * clock reference passed to the read function. This can cause crashes if
+ * the wrong clocksource is passed to the wrong read function.
+ * This isn't necessary to use when holding the timekeeper_lock or doing
+ * a read of the fast-timekeeper tkrs (which is protected by its own locking
+ * and update logic).
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
@@ -173,7 +193,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -207,7 +227,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
cycle_t cycle_now, delta;
/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -235,12 +255,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
- tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
- tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
@@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
- tkr->read(tkr->clock),
+ tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -461,6 +479,10 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
return cycles_at_suspend;
}
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
- tkr_dummy.read = dummy_clock_read;
+ cycles_at_suspend = tk_clock_read(tkr);
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- tkr_dummy.read = dummy_clock_read;
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
@@ -647,11 +669,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -1434,7 +1455,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
@@ -1829,7 +1850,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
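
All of the timekeeping.c hunks funnel hot-path clocksource reads through the new tk_clock_read() helper, and the suspend path now swaps in a whole dummy clocksource instead of overriding a cached read() pointer. The point is that the clocksource pointer is sampled exactly once per read, so the read() method and the argument passed to it can never belong to two different clocksources. A self-contained user-space sketch of that pattern (the struct and field names mirror the kernel's, C11 atomics stand in for READ_ONCE(); this is an illustration, not kernel code):

#include <stdatomic.h>

typedef unsigned long long cycle_t;

struct clocksource {
	cycle_t (*read)(struct clocksource *cs);
};

/* Old scheme: the clock pointer and a cached copy of clock->read lived
 * side by side, so a concurrent clocksource switch could briefly leave
 * them describing different clocksources:
 *
 *	now = tkr->read(tkr->clock);	-- read() from A, clock from B
 */
struct tk_read_base {
	_Atomic(struct clocksource *) clock;	/* only the pointer is kept */
};

/* New scheme: one snapshot of the pointer, then both the method and
 * its argument come from that same snapshot. */
static inline cycle_t tk_clock_read(struct tk_read_base *tkr)
{
	struct clocksource *clock =
		atomic_load_explicit(&tkr->clock, memory_order_relaxed);

	return clock->read(clock);
}

The dummy_clock added for halt_fast_timekeeper() fits the same model: instead of patching a cached read() pointer, suspend installs a minimal clocksource whose read() returns the cycle count captured at suspend time.
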
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index c9956440d0e6..12ea4ea619ee 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1471,6 +1471,11 @@ static __init int kprobe_trace_self_tests_init(void)
end:
release_all_trace_kprobes();
+ /*
+ * Wait for the optimizer work to finish. Otherwise it might fiddle
+ * with probes in already freed __init text.
+ */
+ wait_for_kprobe_optimizer();
if (warn)
pr_cont("NG: Some tests are failed. Please check them.\n");
else
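
For the trace_kprobe.c hunk: kprobe optimization runs from a delayed work item, so the self-test must flush that work before its __init text (and the probes planted in it) is freed; wait_for_kprobe_optimizer() provides that barrier. A generic, hypothetical sketch of the same "flush deferred work before freeing what it touches" ordering (the example_* names are invented; flush_delayed_work(), INIT_DELAYED_WORK() and kzalloc()/kfree() are the stock kernel APIs):

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static void *example_buf;			/* referenced by the work item */
static struct delayed_work example_work;

static void example_work_fn(struct work_struct *work)
{
	/* ... reads and writes example_buf ... */
}

static int example_setup(void)
{
	example_buf = kzalloc(64, GFP_KERNEL);
	if (!example_buf)
		return -ENOMEM;

	INIT_DELAYED_WORK(&example_work, example_work_fn);
	schedule_delayed_work(&example_work, HZ);
	return 0;
}

static void example_teardown(void)
{
	/* Run the work now if it is still pending and wait for it to
	 * finish -- the same ordering wait_for_kprobe_optimizer()
	 * enforces before the self-test's __init text is discarded. */
	flush_delayed_work(&example_work);

	kfree(example_buf);
	example_buf = NULL;
}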