summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorZhiqiang Tu <ztu@codeaurora.org>2017-08-24 15:30:08 +0800
committerZhiqiang Tu <ztu@codeaurora.org>2017-08-24 16:39:56 +0800
commit9df1d44946b2278f14ee1ba90d3f3f281791e6bc (patch)
treed510fb0532267008691b65cca07ebc6324c2222e /kernel
parent1ebddfae1dd64f54b4da86a81eab1616ef8f2d83 (diff)
parentcb1c821645fd08ff03c7ed4d62a6e4ca0a6a6ad5 (diff)
Merge branch 'msm-4.4' into dev/msm-4.4-8996au
Conflicts: drivers/iommu/arm-smmu.c drivers/media/platform/msm/ais/fd/msm_fd_dev.c drivers/media/platform/msm/camera_v2/fd/msm_fd_dev.c drivers/soc/qcom/glink.c include/uapi/linux/msm_ipa.h Change-Id: Id007a850fa2df09f08c413ffcd447a6532fad83c Signed-off-by: Zhiqiang Tu <ztu@codeaurora.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/verifier.c26
-rw-r--r--kernel/cgroup.c19
-rw-r--r--kernel/cpu.c22
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/events/core.c13
-rw-r--r--kernel/extable.c2
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/irq/cpuhotplug.c24
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/proc.c5
-rw-r--r--kernel/kthread.c96
-rw-r--r--kernel/locking/osq_lock.c35
-rw-r--r--kernel/locking/rwsem-xadd.c35
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/ptrace.c20
-rw-r--r--kernel/resource.c13
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/core.c114
-rw-r--r--kernel/sched/cpufreq.c63
-rw-r--r--kernel/sched/cpufreq_sched.c220
-rw-r--r--kernel/sched/cpufreq_schedutil.c788
-rw-r--r--kernel/sched/cpupri.c37
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/debug.c26
-rw-r--r--kernel/sched/fair.c1711
-rw-r--r--kernel/sched/loadavg.c4
-rw-r--r--kernel/sched/rt.c65
-rw-r--r--kernel/sched/sched.h67
-rw-r--r--kernel/sched/stats.c26
-rw-r--r--kernel/sched/tune.c13
-rw-r--r--kernel/sched/walt.c8
-rw-r--r--kernel/sched/walt.h2
-rw-r--r--kernel/signal.c20
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sysctl.c10
-rw-r--r--kernel/time/alarmtimer.c15
-rw-r--r--kernel/time/tick-sched.c12
-rw-r--r--kernel/time/timekeeping.c47
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/trace.c1
-rw-r--r--kernel/trace/trace_kprobe.c21
41 files changed, 2965 insertions, 646 deletions
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cbfba78d3db..c97bce6a0e0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -313,7 +313,8 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(struct bpf_insn *insn)
+static void print_bpf_insn(const struct verifier_env *env,
+ const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -377,9 +378,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
} else {
verbose("BUG_ld_%02x\n", insn->code);
return;
@@ -754,6 +765,11 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1758,7 +1774,7 @@ static int do_check(struct verifier_env *env)
if (log_level) {
verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
+ print_bpf_insn(env, insn);
}
if (class == BPF_ALU || class == BPF_ALU64) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 25cf44889559..077bb52e2d47 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -717,10 +717,10 @@ static void css_set_move_task(struct task_struct *task,
if (to_cset) {
/*
- * We are synchronized through cgroup_threadgroup_rwsem
- * against PF_EXITING setting such that we can't race
- * against cgroup_exit() changing the css_set to
- * init_css_set and dropping the old one.
+ * We are synchronized through css_set_lock against
+ * PF_EXITING setting such that we can't race against
+ * cgroup_exit() disassociating the task from the
+ * css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -5701,19 +5701,22 @@ void cgroup_exit(struct task_struct *tsk)
int i;
/*
- * Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check css_set and cg_list without synchronization.
+ * Avoid potential race with the migrate path.
+ */
+ spin_lock_irq(&css_set_lock);
+ /*
+ * Unlink from @tsk from its css_set.
*/
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
- spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
- spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
+ spin_unlock_irq(&css_set_lock);
+
/* see cgroup_post_fork() for details */
for_each_subsys_which(ss, i, &have_exit_callback)
ss->exit(tsk);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e822cb0e18d5..1d6b0a209bc0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -361,6 +361,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
if (!cpu_online(cpu))
return -EINVAL;
+ if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1)
+ return -EBUSY;
+
cpu_hotplug_begin();
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
@@ -372,6 +375,21 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
goto out_release;
}
+ /*
+ * By now we've cleared cpu_active_mask, wait for all preempt-disabled
+ * and RCU users of this state to go away such that all new such users
+ * will observe it.
+ *
+ * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+ * not imply sync_sched(), so wait for both.
+ *
+ * Do sync before park smpboot threads to take care the rcu boost case.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
+
smpboot_park_threads(cpu);
/*
@@ -505,8 +523,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- pr_warn("%s: attempt to bring up CPU %u failed\n",
- __func__, cpu);
+ pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n",
+ __func__, cpu);
goto out_notify;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 29c7240172d3..03dbc231a4a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -174,9 +174,9 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
-static inline bool is_cpuset_online(const struct cpuset *cs)
+static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags);
+ return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 95c447e658f7..87f5d841f796 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3437,22 +3437,27 @@ u64 perf_event_read_local(struct perf_event *event)
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
+ event_cpu = READ_ONCE(event->oncpu);
+
if (event->state == PERF_EVENT_STATE_ACTIVE &&
- !cpu_isolated(event->oncpu)) {
+ !cpu_isolated(event_cpu)) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
+
+ if ((unsigned int)event_cpu >= nr_cpu_ids)
+ return 0;
if (!event->attr.exclude_idle ||
- !per_cpu(is_idle, event->oncpu)) {
- smp_call_function_single(event->oncpu,
+ !per_cpu(is_idle, event_cpu)) {
+ smp_call_function_single(event_cpu,
__perf_event_read, &data, 1);
ret = data.ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..4f06fc34313f 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,7 +66,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index 2845c5bdc8e3..fef4df444f47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -370,7 +370,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_int();
+ tsk->stack_canary = get_random_long();
#endif
/*
@@ -832,8 +832,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mm = get_task_mm(task);
if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode) &&
- !capable(CAP_SYS_RESOURCE)) {
+ !ptrace_may_access(task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 6c8e154c7384..4684b7595e63 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -36,10 +36,32 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = &available_cpus;
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ /*
+ * The order of preference for selecting a fallback CPU is
+ *
+ * (1) online and un-isolated CPU from default affinity
+ * (2) online and un-isolated CPU
+ * (3) online CPU
+ */
cpumask_andnot(&available_cpus, cpu_online_mask,
cpu_isolated_mask);
- if (cpumask_empty(affinity))
+ if (cpumask_intersects(&available_cpus, irq_default_affinity))
+ cpumask_and(&available_cpus, &available_cpus,
+ irq_default_affinity);
+ else if (cpumask_empty(&available_cpus))
affinity = cpu_online_mask;
+
+ /*
+ * We are overriding the affinity with all online and
+ * un-isolated cpus. irq_set_affinity_locked() call
+ * below notify this mask to PM QOS affinity listener.
+ * That results in applying the CPU_DMA_LATENCY QOS
+ * to all the CPUs specified in the mask. But the low
+ * level irqchip driver sets the affinity of an irq
+ * to only one CPU. So pick only one CPU from the
+ * prepared mask while overriding the user affinity.
+ */
+ affinity = cpumask_of(cpumask_any(affinity));
ret = true;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e5c70dcb7f8e..2c2effdb4437 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1305,8 +1305,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret)
+ if (ret) {
+ irq_release_resources(desc);
goto out_mask;
+ }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a24c5b909047..b05509af0352 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
goto free_cpumask;
}
+ if (cpumask_subset(new_value, cpu_isolated_mask)) {
+ err = -EINVAL;
+ goto free_cpumask;
+ }
+
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 850b255649a2..698b8dec3074 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -604,6 +604,19 @@ repeat:
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/*
+ * Returns true when the work could not be queued at the moment.
+ * It happens when it is already pending in a worker list
+ * or when it is being cancelled.
+ */
+static inline bool queuing_blocked(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ lockdep_assert_held(&worker->lock);
+
+ return !list_empty(&work->node) || work->canceling;
+}
+
/* insert @work before @pos in @worker */
static void insert_kthread_work(struct kthread_worker *worker,
struct kthread_work *work,
@@ -633,7 +646,7 @@ bool queue_kthread_work(struct kthread_worker *worker,
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
- if (list_empty(&work->node)) {
+ if (!queuing_blocked(worker, work)) {
insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
@@ -694,6 +707,87 @@ retry:
}
EXPORT_SYMBOL_GPL(flush_kthread_work);
+/*
+ * This function removes the work from the worker queue. Also it makes sure
+ * that it won't get queued later via the delayed work's timer.
+ *
+ * The work might still be in use when this function finishes. See the
+ * current_work proceed by the worker.
+ *
+ * Return: %true if @work was pending and successfully canceled,
+ * %false if @work was not pending
+ */
+static bool __kthread_cancel_work(struct kthread_work *work,
+ unsigned long *flags)
+{
+ /*
+ * Try to remove the work from a worker list. It might either
+ * be from worker->work_list or from worker->delayed_work_list.
+ */
+ if (!list_empty(&work->node)) {
+ list_del_init(&work->node);
+ return true;
+ }
+
+ return false;
+}
+
+static bool __kthread_cancel_work_sync(struct kthread_work *work)
+{
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+ int ret = false;
+
+ if (!worker)
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ ret = __kthread_cancel_work(work, &flags);
+
+ if (worker->current_work != work)
+ goto out_fast;
+
+ /*
+ * The work is in progress and we need to wait with the lock released.
+ * In the meantime, block any queuing by setting the canceling counter.
+ */
+ work->canceling++;
+ spin_unlock_irqrestore(&worker->lock, flags);
+ flush_kthread_work(work);
+ spin_lock_irqsave(&worker->lock, flags);
+ work->canceling--;
+
+out_fast:
+ spin_unlock_irqrestore(&worker->lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+ return __kthread_cancel_work_sync(work);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
/**
* flush_kthread_worker - flush all current works on a kthread_worker
* @worker: worker to flush
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 05a37857ab55..1e6a51cc25c4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -1,6 +1,7 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
+#include <linux/sched/rt.h>
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ struct task_struct *task = current;
int curr = encode_cpu(smp_processor_id());
int old;
@@ -104,6 +106,32 @@ bool osq_lock(struct optimistic_spin_queue *lock)
prev = decode_cpu(old);
node->prev = prev;
+
+ /*
+ * We need to avoid reordering of link updation sequence of osq.
+ * A case in which the status of optimistic spin queue is
+ * CPU6->CPU2 in which CPU6 has acquired the lock. At this point
+ * if CPU0 comes in to acquire osq_lock, it will update the tail
+ * count. After tail count update if CPU2 starts to unqueue itself
+ * from optimistic spin queue, it will find updated tail count with
+ * CPU0 and update CPU2 node->next to NULL in osq_wait_next(). If
+ * reordering of following stores happen then prev->next where prev
+ * being CPU2 would be updated to point to CPU0 node:
+ * node->prev = prev;
+ * WRITE_ONCE(prev->next, node);
+ *
+ * At this point if next instruction
+ * WRITE_ONCE(next->prev, prev);
+ * in CPU2 path is committed before the update of CPU0 node->prev =
+ * prev then CPU0 node->prev will point to CPU6 node. At this point
+ * if CPU0 path's node->prev = prev is committed resulting in change
+ * of CPU0 prev back to CPU2 node. CPU2 node->next is NULL, so if
+ * CPU0 gets into unqueue path of osq_lock it will keep spinning
+ * in infinite loop as condition prev->next == node will never be
+ * true.
+ */
+ smp_mb();
+
WRITE_ONCE(prev->next, node);
/*
@@ -118,8 +146,13 @@ bool osq_lock(struct optimistic_spin_queue *lock)
while (!READ_ONCE(node->locked)) {
/*
* If we need to reschedule bail... so we can block.
+ * If a task spins on owner on a CPU after acquiring
+ * osq_lock while a RT task spins on another CPU to
+ * acquire osq_lock, it will starve the owner from
+ * completing if owner is to be scheduled on the same CPU.
+ * It will be a live lock.
*/
- if (need_resched())
+ if (need_resched() || rt_task(task))
goto unqueue;
cpu_relax_lowlatency();
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a4d4de05b2d1..75c950ede9c7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -511,6 +511,41 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
unsigned long flags;
/*
+ * If a spinner is present, there is a chance that the load of
+ * rwsem_has_spinner() in rwsem_wake() can be reordered with
+ * respect to decrement of rwsem count in __up_write() leading
+ * to wakeup being missed.
+ *
+ * spinning writer up_write caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * spin_lock(wait_lock)
+ * sem->count=0xFFFFFFFF00000001
+ * +0xFFFFFFFF00000000
+ * count=sem->count
+ * MB
+ * sem->count=0xFFFFFFFE00000001
+ * -0xFFFFFFFF00000001
+ * RMB
+ * spin_trylock(wait_lock)
+ * return
+ * rwsem_try_write_lock(count)
+ * spin_unlock(wait_lock)
+ * schedule()
+ *
+ * Reordering of atomic_long_sub_return_release() in __up_write()
+ * and rwsem_has_spinner() in rwsem_wake() can cause missing of
+ * wakeup in up_write() context. In spinning writer, sem->count
+ * and local variable count is 0XFFFFFFFE00000001. It would result
+ * in rwsem_try_write_lock() failing to acquire rwsem and spinning
+ * writer going to sleep in rwsem_down_write_failed().
+ *
+ * The smp_rmb() here is to make sure that the spinner state is
+ * consulted after sem->count is updated in up_write context.
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/panic.c b/kernel/panic.c
index 982a52352cfc..679254405510 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -172,7 +172,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c7e8ed99c953..5e2cd1030702 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,19 +28,25 @@
#include <linux/compat.h>
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
rcu_read_lock();
- child->ptracer_cred = get_cred(__task_cred(new_parent));
+ __ptrace_link(child, new_parent, __task_cred(new_parent));
rcu_read_unlock();
}
@@ -353,7 +359,7 @@ static int ptrace_attach(struct task_struct *task, long request,
flags |= PT_SEIZED;
task->ptrace = flags;
- __ptrace_link(task, current);
+ ptrace_link(task, current);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -420,7 +426,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/resource.c b/kernel/resource.c
index 4c9835c09dcd..c09d484f7b5f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v)
{
struct resource *root = m->private;
struct resource *r = v, *p;
+ unsigned long long start, end;
int width = root->end < 0x10000 ? 4 : 8;
int depth;
for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
if (p->parent == root)
break;
+
+ if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+ start = r->start;
+ end = r->end;
+ } else {
+ start = end = 0;
+ }
+
seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
depth * 2, "",
- width, (unsigned long long) r->start,
- width, (unsigned long long) r->end,
+ width, start,
+ width, end,
r->name ? r->name : "<BAD>");
return 0;
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 308f80ce2e43..a353df46c8e4 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,4 +22,6 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d2b8834dd3b..4ecca604e64b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2623,9 +2623,9 @@ void wake_up_new_task(struct task_struct *p)
*/
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
-
rq = __task_rq_lock(p);
mark_task_starting(p);
+ post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -3154,9 +3154,10 @@ unsigned long sum_capacity_reqs(unsigned long cfs_cap,
return total += scr->dl;
}
+unsigned long boosted_cpu_util(int cpu);
static void sched_freq_tick_pelt(int cpu)
{
- unsigned long cpu_utilization = capacity_max;
+ unsigned long cpu_utilization = boosted_cpu_util(cpu);
unsigned long capacity_curr = capacity_curr_of(cpu);
struct sched_capacity_reqs *scr;
@@ -6460,9 +6461,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
if (!(sd->flags & SD_LOAD_BALANCE)) {
printk("does not load-balance\n");
- if (sd->parent)
- printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
- " has parent");
return -1;
}
@@ -6555,8 +6553,12 @@ static inline bool sched_debug(void)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpumask_weight(sched_domain_span(sd)) == 1)
- return 1;
+ if (cpumask_weight(sched_domain_span(sd)) == 1) {
+ if (sd->groups->sge)
+ sd->flags &= ~SD_LOAD_BALANCE;
+ else
+ return 1;
+ }
/* Following flags need at least 2 groups */
if (sd->flags & (SD_LOAD_BALANCE |
@@ -6564,6 +6566,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN |
SD_SHARE_CAP_STATES)) {
@@ -6595,11 +6598,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
+ SD_ASYM_CPUCAPACITY |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN |
SD_SHARE_CAP_STATES);
+ if (parent->groups->sge) {
+ parent->flags &= ~SD_LOAD_BALANCE;
+ return 0;
+ }
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -6680,6 +6688,9 @@ static int init_rootdomain(struct root_domain *rd)
goto free_rto_mask;
init_max_cpu_capacity(&rd->max_cpu_capacity);
+
+ rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
return 0;
free_rto_mask:
@@ -6913,6 +6924,9 @@ enum s_alloc {
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
*
+ * Only CPUs that can arrive at this group should be considered to continue
+ * balancing.
+ *
* Asymmetric node setups can result in situations where the domain tree is of
* unequal depth, make sure to skip domains that already cover the entire
* range.
@@ -6924,18 +6938,31 @@ enum s_alloc {
*/
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ const struct cpumask *sg_span = sched_group_cpus(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, sched_group_mask(sg));
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
}
/*
@@ -6996,6 +7023,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -7291,11 +7319,19 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
- * SD_SHARE_CAP_STATES - describes shared capacity states
+ * These flags are purely descriptive of the topology and do not prescribe
+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
+ * function:
+ *
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
+ * SD_SHARE_CAP_STATES - describes shared capacity states
+ *
+ * Odd one out, which beside describing the topology has a quirk also
+ * prescribes the desired behaviour that goes along with it:
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -7305,11 +7341,13 @@ static int sched_domains_curr_level;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
+ SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN | \
SD_SHARE_CAP_STATES)
static struct sched_domain *
-sd_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl,
+ struct sched_domain *child, int cpu)
{
struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
int sd_weight, sd_flags = 0;
@@ -7361,6 +7399,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
.smt_gain = 0,
.max_newidle_lb_cost = 0,
.next_decay_max_lb_cost = jiffies,
+ .child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
@@ -7370,6 +7409,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
+ if (sd->flags & SD_ASYM_CPUCAPACITY) {
+ struct sched_domain *t = sd;
+
+ for_each_lower_domain(t)
+ t->flags |= SD_BALANCE_WAKE;
+ }
+
if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
@@ -7816,16 +7862,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu);
- if (!sd)
- return child;
+ struct sched_domain *sd = sd_init(tl, child, cpu);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
- sd->child = child;
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
@@ -7859,7 +7902,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
- struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7877,8 +7919,6 @@ static int build_sched_domains(const struct cpumask *cpu_map,
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
- if (cpumask_equal(cpu_map, sched_domain_span(sd)))
- break;
}
}
@@ -7914,8 +7954,19 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- rq = cpu_rq(i);
+ int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+ int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
+ if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+ cpu_rq(max_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+ if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+ cpu_rq(min_cpu)->cpu_capacity_orig))
+ WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
sd = *per_cpu_ptr(d.sd, i);
+
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
@@ -8339,6 +8390,7 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
/*
* How much cpu bandwidth does root_task_group get?
*
@@ -9180,11 +9232,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -9566,6 +9627,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100644
index 000000000000..dbc51442ecbc
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,63 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned int flags))
+{
+ if (WARN_ON(!data || !func))
+ return;
+
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index d751bc2d0d6e..f10d9f7d6d07 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -32,6 +32,12 @@ static struct cpufreq_governor cpufreq_gov_sched;
static DEFINE_PER_CPU(unsigned long, enabled);
DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+struct gov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int up_throttle_nsec;
+ unsigned int down_throttle_nsec;
+};
+
/**
* gov_data - per-policy data internal to the governor
* @up_throttle: next throttling period expiry if increasing OPP
@@ -53,8 +59,8 @@ DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
struct gov_data {
ktime_t up_throttle;
ktime_t down_throttle;
- unsigned int up_throttle_nsec;
- unsigned int down_throttle_nsec;
+ struct gov_tunables *tunables;
+ struct list_head tunables_hook;
struct task_struct *task;
struct irq_work irq_work;
unsigned int requested_freq;
@@ -71,8 +77,10 @@ static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
__cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
- gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
- gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
+ gd->up_throttle = ktime_add_ns(ktime_get(),
+ gd->tunables->up_throttle_nsec);
+ gd->down_throttle = ktime_add_ns(ktime_get(),
+ gd->tunables->down_throttle_nsec);
up_write(&policy->rwsem);
}
@@ -262,12 +270,70 @@ static inline void clear_sched_freq(void)
static_key_slow_dec(&__sched_freq);
}
-static struct attribute_group sched_attr_group_gov_pol;
-static struct attribute_group *get_sysfs_attr(void)
+/* Tunables */
+static struct gov_tunables *global_tunables;
+
+static inline struct gov_tunables *to_tunables(struct gov_attr_set *attr_set)
{
- return &sched_attr_group_gov_pol;
+ return container_of(attr_set, struct gov_tunables, attr_set);
}
+static ssize_t up_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_throttle_nsec);
+}
+
+static ssize_t up_throttle_nsec_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->up_throttle_nsec = val;
+ return count;
+}
+
+static ssize_t down_throttle_nsec_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->down_throttle_nsec);
+}
+
+static ssize_t down_throttle_nsec_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct gov_tunables *tunables = to_tunables(attr_set);
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ tunables->down_throttle_nsec = val;
+ return count;
+}
+
+static struct governor_attr up_throttle_nsec = __ATTR_RW(up_throttle_nsec);
+static struct governor_attr down_throttle_nsec = __ATTR_RW(down_throttle_nsec);
+
+static struct attribute *schedfreq_attributes[] = {
+ &up_throttle_nsec.attr,
+ &down_throttle_nsec.attr,
+ NULL
+};
+
+static struct kobj_type tunables_ktype = {
+ .default_attrs = schedfreq_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
{
struct gov_data *gd;
@@ -282,17 +348,39 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
if (!gd)
return -ENOMEM;
- gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
- policy->cpuinfo.transition_latency :
- THROTTLE_UP_NSEC;
- gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
- pr_debug("%s: throttle threshold = %u [ns]\n",
- __func__, gd->up_throttle_nsec);
-
- rc = sysfs_create_group(&policy->kobj, get_sysfs_attr());
- if (rc) {
- pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
- goto err;
+ policy->governor_data = gd;
+
+ if (!global_tunables) {
+ gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
+ if (!gd->tunables)
+ goto free_gd;
+
+ gd->tunables->up_throttle_nsec =
+ policy->cpuinfo.transition_latency ?
+ policy->cpuinfo.transition_latency :
+ THROTTLE_UP_NSEC;
+ gd->tunables->down_throttle_nsec =
+ THROTTLE_DOWN_NSEC;
+
+ rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
+ &tunables_ktype,
+ get_governor_parent_kobj(policy),
+ "%s", cpufreq_gov_sched.name);
+ if (rc)
+ goto free_tunables;
+
+ gov_attr_set_init(&gd->tunables->attr_set,
+ &gd->tunables_hook);
+
+ pr_debug("%s: throttle_threshold = %u [ns]\n",
+ __func__, gd->tunables->up_throttle_nsec);
+
+ if (!have_governor_per_policy())
+ global_tunables = gd->tunables;
+ } else {
+ gd->tunables = global_tunables;
+ gov_attr_set_get(&global_tunables->attr_set,
+ &gd->tunables_hook);
}
policy->governor_data = gd;
@@ -304,7 +392,7 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
if (IS_ERR_OR_NULL(gd->task)) {
pr_err("%s: failed to create kschedfreq thread\n",
__func__);
- goto err;
+ goto free_tunables;
}
get_task_struct(gd->task);
kthread_bind_mask(gd->task, policy->related_cpus);
@@ -316,7 +404,9 @@ static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
return 0;
-err:
+free_tunables:
+ kfree(gd->tunables);
+free_gd:
policy->governor_data = NULL;
kfree(gd);
return -ENOMEM;
@@ -324,6 +414,7 @@ err:
static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
{
+ unsigned int count;
struct gov_data *gd = policy->governor_data;
clear_sched_freq();
@@ -332,7 +423,12 @@ static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
put_task_struct(gd->task);
}
- sysfs_remove_group(&policy->kobj, get_sysfs_attr());
+ count = gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook);
+ if (!count) {
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+ kfree(gd->tunables);
+ }
policy->governor_data = NULL;
@@ -394,88 +490,6 @@ static int cpufreq_sched_setup(struct cpufreq_policy *policy,
return 0;
}
-/* Tunables */
-static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
-{
- return sprintf(buf, "%u\n", gd->up_throttle_nsec);
-}
-
-static ssize_t store_up_throttle_nsec(struct gov_data *gd,
- const char *buf, size_t count)
-{
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- gd->up_throttle_nsec = val;
- return count;
-}
-
-static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
-{
- return sprintf(buf, "%u\n", gd->down_throttle_nsec);
-}
-
-static ssize_t store_down_throttle_nsec(struct gov_data *gd,
- const char *buf, size_t count)
-{
- int ret;
- long unsigned int val;
-
- ret = kstrtoul(buf, 0, &val);
- if (ret < 0)
- return ret;
- gd->down_throttle_nsec = val;
- return count;
-}
-
-/*
- * Create show/store routines
- * - sys: One governor instance for complete SYSTEM
- * - pol: One governor instance per struct cpufreq_policy
- */
-#define show_gov_pol_sys(file_name) \
-static ssize_t show_##file_name##_gov_pol \
-(struct cpufreq_policy *policy, char *buf) \
-{ \
- return show_##file_name(policy->governor_data, buf); \
-}
-
-#define store_gov_pol_sys(file_name) \
-static ssize_t store_##file_name##_gov_pol \
-(struct cpufreq_policy *policy, const char *buf, size_t count) \
-{ \
- return store_##file_name(policy->governor_data, buf, count); \
-}
-
-#define gov_pol_attr_rw(_name) \
- static struct freq_attr _name##_gov_pol = \
- __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
-
-#define show_store_gov_pol_sys(file_name) \
- show_gov_pol_sys(file_name); \
- store_gov_pol_sys(file_name)
-#define tunable_handlers(file_name) \
- show_gov_pol_sys(file_name); \
- store_gov_pol_sys(file_name); \
- gov_pol_attr_rw(file_name)
-
-tunable_handlers(down_throttle_nsec);
-tunable_handlers(up_throttle_nsec);
-
-/* Per policy governor instance */
-static struct attribute *sched_attributes_gov_pol[] = {
- &up_throttle_nsec_gov_pol.attr,
- &down_throttle_nsec_gov_pol.attr,
- NULL,
-};
-
-static struct attribute_group sched_attr_group_gov_pol = {
- .attrs = sched_attributes_gov_pol,
- .name = "sched",
-};
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
static
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..e12309c1b07b
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,788 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpufreq.h>
+#include <linux/kthread.h>
+#include <linux/slab.h>
+#include <trace/events/power.h>
+
+#include "sched.h"
+#include "tune.h"
+
+unsigned long boosted_cpu_util(int cpu);
+
+/* Stub out fast switch routines present on mainline to reduce the backport
+ * overhead. */
+#define cpufreq_driver_fast_switch(x, y) 0
+#define cpufreq_enable_fast_switch(x)
+#define cpufreq_disable_fast_switch(x)
+#define LATENCY_MULTIPLIER (1000)
+#define SUGOV_KTHREAD_PRIORITY 50
+
+struct sugov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int up_rate_limit_us;
+ unsigned int down_rate_limit_us;
+};
+
+struct sugov_policy {
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time;
+ s64 min_rate_limit_ns;
+ s64 up_rate_delay_ns;
+ s64 down_rate_delay_ns;
+ unsigned int next_freq;
+ unsigned int cached_raw_freq;
+
+ /* The next fields are only needed if fast switch cannot be used. */
+ struct irq_work irq_work;
+ struct kthread_work work;
+ struct mutex work_lock;
+ struct kthread_worker worker;
+ struct task_struct *thread;
+ bool work_in_progress;
+
+ bool need_freq_update;
+};
+
+struct sugov_cpu {
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+
+ unsigned long iowait_boost;
+ unsigned long iowait_boost_max;
+ u64 last_update;
+
+ /* The fields below are only needed when sharing a policy. */
+ unsigned long util;
+ unsigned long max;
+ unsigned int flags;
+
+ /* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned long saved_idle_calls;
+#endif
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+ s64 delta_ns;
+
+ if (sg_policy->work_in_progress)
+ return false;
+
+ if (unlikely(sg_policy->need_freq_update)) {
+ sg_policy->need_freq_update = false;
+ /*
+ * This happens when limits change, so forget the previous
+ * next_freq value and force an update.
+ */
+ sg_policy->next_freq = UINT_MAX;
+ return true;
+ }
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ /* No need to recalculate next freq for min_rate_limit_us at least */
+ return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ s64 delta_ns;
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+
+ if (next_freq > sg_policy->next_freq &&
+ delta_ns < sg_policy->up_rate_delay_ns)
+ return true;
+
+ if (next_freq < sg_policy->next_freq &&
+ delta_ns < sg_policy->down_rate_delay_ns)
+ return true;
+
+ return false;
+}
+
+static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+
+ if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
+ return;
+
+ if (sg_policy->next_freq == next_freq)
+ return;
+
+ sg_policy->next_freq = next_freq;
+ sg_policy->last_freq_update_time = time;
+
+ if (policy->fast_switch_enabled) {
+ next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+ if (next_freq == CPUFREQ_ENTRY_INVALID)
+ return;
+
+ policy->cur = next_freq;
+ trace_cpu_frequency(next_freq, smp_processor_id());
+ } else {
+ sg_policy->work_in_progress = true;
+ irq_work_queue(&sg_policy->irq_work);
+ }
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @sg_policy: schedutil policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
+ */
+static unsigned int get_next_freq(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int freq = arch_scale_freq_invariant() ?
+ policy->cpuinfo.max_freq : policy->cur;
+
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_policy->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
+}
+
+static inline bool use_pelt(void)
+{
+#ifdef CONFIG_SCHED_WALT
+ return (!sysctl_sched_use_walt_cpu_util || walt_disabled);
+#else
+ return true;
+#endif
+}
+
+static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long max_cap, rt;
+ s64 delta;
+
+ max_cap = arch_scale_cpu_capacity(NULL, cpu);
+
+ sched_avg_update(rq);
+ delta = time - rq->age_stamp;
+ if (unlikely(delta < 0))
+ delta = 0;
+ rt = div64_u64(rq->rt_avg, sched_avg_period() + delta);
+ rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT;
+
+ *util = boosted_cpu_util(cpu);
+ if (likely(use_pelt()))
+ *util = min((*util + rt), max_cap);
+
+ *max = max_cap;
+}
+
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned int flags)
+{
+ if (flags & SCHED_CPUFREQ_IOWAIT) {
+ sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+ } else if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU apprears to have been idle. */
+ if (delta_ns > TICK_NSEC)
+ sg_cpu->iowait_boost = 0;
+ }
+}
+
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
+ unsigned long *max)
+{
+ unsigned long boost_util = sg_cpu->iowait_boost;
+ unsigned long boost_max = sg_cpu->iowait_boost_max;
+
+ if (!boost_util)
+ return;
+
+ if (*util * boost_max < *max * boost_util) {
+ *util = boost_util;
+ *max = boost_max;
+ }
+ sg_cpu->iowait_boost >>= 1;
+}
+
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+ unsigned long idle_calls = tick_nohz_get_idle_calls();
+ bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+ sg_cpu->saved_idle_calls = idle_calls;
+ return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned long util, max;
+ unsigned int next_f;
+ bool busy;
+
+ sugov_set_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ if (!sugov_should_update_freq(sg_policy, time))
+ return;
+
+ busy = sugov_cpu_is_busy(sg_cpu);
+
+ if (flags & SCHED_CPUFREQ_DL) {
+ next_f = policy->cpuinfo.max_freq;
+ } else {
+ sugov_get_util(&util, &max, time);
+ sugov_iowait_boost(sg_cpu, &util, &max);
+ next_f = get_next_freq(sg_policy, util, max);
+ /*
+ * Do not reduce the frequency if the CPU has not been idle
+ * recently, as the reduction is likely to be premature then.
+ */
+ if (busy && next_f < sg_policy->next_freq)
+ next_f = sg_policy->next_freq;
+ }
+ sugov_update_commit(sg_policy, time, next_f);
+}
+
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
+{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned long util = 0, max = 1;
+ unsigned int j;
+
+ for_each_cpu(j, policy->cpus) {
+ struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
+ unsigned long j_util, j_max;
+ s64 delta_ns;
+
+ /*
+ * If the CPU utilization was last updated before the previous
+ * frequency update and the time elapsed between the last update
+ * of the CPU utilization and the last frequency update is long
+ * enough, don't take the CPU into account as it probably is
+ * idle now (and clear iowait_boost for it).
+ */
+ delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ if (delta_ns > TICK_NSEC) {
+ j_sg_cpu->iowait_boost = 0;
+ continue;
+ }
+ if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
+ return policy->cpuinfo.max_freq;
+
+ j_util = j_sg_cpu->util;
+ j_max = j_sg_cpu->max;
+ if (j_util * max > j_max * util) {
+ util = j_util;
+ max = j_max;
+ }
+
+ sugov_iowait_boost(j_sg_cpu, &util, &max);
+ }
+
+ return get_next_freq(sg_policy, util, max);
+}
+
+static void sugov_update_shared(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned long util, max;
+ unsigned int next_f;
+
+ sugov_get_util(&util, &max, time);
+
+ raw_spin_lock(&sg_policy->update_lock);
+
+ sg_cpu->util = util;
+ sg_cpu->max = max;
+ sg_cpu->flags = flags;
+
+ sugov_set_iowait_boost(sg_cpu, time, flags);
+ sg_cpu->last_update = time;
+
+ if (sugov_should_update_freq(sg_policy, time)) {
+ if (flags & SCHED_CPUFREQ_DL)
+ next_f = sg_policy->policy->cpuinfo.max_freq;
+ else
+ next_f = sugov_next_freq_shared(sg_cpu);
+
+ sugov_update_commit(sg_policy, time, next_f);
+ }
+
+ raw_spin_unlock(&sg_policy->update_lock);
+}
+
+static void sugov_work(struct kthread_work *work)
+{
+ struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+
+ mutex_lock(&sg_policy->work_lock);
+ __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+ CPUFREQ_RELATION_L);
+ mutex_unlock(&sg_policy->work_lock);
+
+ sg_policy->work_in_progress = false;
+}
+
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
+
+ /*
+ * For RT and deadline tasks, the schedutil governor shoots the
+ * frequency to maximum. Special care must be taken to ensure that this
+ * kthread doesn't result in the same behavior.
+ *
+ * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+ * updated only at the end of the sugov_work() function and before that
+ * the schedutil governor rejects all other frequency scaling requests.
+ *
+ * There is a very rare case though, where the RT thread yields right
+ * after the work_in_progress flag is cleared. The effects of that are
+ * neglected for now.
+ */
+ queue_kthread_work(&sg_policy->worker, &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+static DEFINE_MUTEX(min_rate_lock);
+
+static void update_min_rate_limit_us(struct sugov_policy *sg_policy)
+{
+ mutex_lock(&min_rate_lock);
+ sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+ sg_policy->down_rate_delay_ns);
+ mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
+}
+
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
+}
+
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->up_rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
+
+ return count;
+}
+
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->down_rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+ sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ }
+
+ return count;
+}
+
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
+
+static struct attribute *sugov_attributes[] = {
+ &up_rate_limit_us.attr,
+ &down_rate_limit_us.attr,
+ NULL
+};
+
+static struct kobj_type sugov_tunables_ktype = {
+ .default_attrs = sugov_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+/********************** cpufreq governor interface *********************/
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+static
+#endif
+struct cpufreq_governor cpufreq_gov_schedutil;
+
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+ if (!sg_policy)
+ return NULL;
+
+ sg_policy->policy = policy;
+ raw_spin_lock_init(&sg_policy->update_lock);
+ return sg_policy;
+}
+
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+ kfree(sg_policy);
+}
+
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+ struct task_struct *thread;
+ struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+ struct cpufreq_policy *policy = sg_policy->policy;
+ int ret;
+
+ /* kthread only required for slow path */
+ if (policy->fast_switch_enabled)
+ return 0;
+
+ init_kthread_work(&sg_policy->work, sugov_work);
+ init_kthread_worker(&sg_policy->worker);
+ thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+ "sugov:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR(thread)) {
+ pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+ return PTR_ERR(thread);
+ }
+
+ ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+ if (ret) {
+ kthread_stop(thread);
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ return ret;
+ }
+
+ sg_policy->thread = thread;
+ kthread_bind_mask(thread, policy->related_cpus);
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ mutex_init(&sg_policy->work_lock);
+
+ wake_up_process(thread);
+
+ return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+ /* kthread only required for slow path */
+ if (sg_policy->policy->fast_switch_enabled)
+ return;
+
+ flush_kthread_worker(&sg_policy->worker);
+ kthread_stop(sg_policy->thread);
+ mutex_destroy(&sg_policy->work_lock);
+}
+
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+ struct sugov_tunables *tunables;
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (tunables) {
+ gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+ if (!have_governor_per_policy())
+ global_tunables = tunables;
+ }
+ return tunables;
+}
+
+static void sugov_tunables_free(struct sugov_tunables *tunables)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+
+ kfree(tunables);
+}
+
+static int sugov_init(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+ struct sugov_tunables *tunables;
+ unsigned int lat;
+ int ret = 0;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ cpufreq_enable_fast_switch(policy);
+
+ sg_policy = sugov_policy_alloc(policy);
+ if (!sg_policy) {
+ ret = -ENOMEM;
+ goto disable_fast_switch;
+ }
+
+ ret = sugov_kthread_create(sg_policy);
+ if (ret)
+ goto free_sg_policy;
+
+ mutex_lock(&global_tunables_lock);
+
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto stop_kthread;
+ }
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+ goto out;
+ }
+
+ tunables = sugov_tunables_alloc(sg_policy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto stop_kthread;
+ }
+
+ tunables->up_rate_limit_us = LATENCY_MULTIPLIER;
+ tunables->down_rate_limit_us = LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat) {
+ tunables->up_rate_limit_us *= lat;
+ tunables->down_rate_limit_us *= lat;
+ }
+
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = tunables;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ cpufreq_gov_schedutil.name);
+ if (ret)
+ goto fail;
+
+out:
+ mutex_unlock(&global_tunables_lock);
+ return 0;
+
+fail:
+ policy->governor_data = NULL;
+ sugov_tunables_free(tunables);
+
+stop_kthread:
+ sugov_kthread_stop(sg_policy);
+
+free_sg_policy:
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+ cpufreq_disable_fast_switch(policy);
+
+ pr_err("initialization failed (error %d)\n", ret);
+ return ret;
+}
+
+static int sugov_exit(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ unsigned int count;
+
+ mutex_lock(&global_tunables_lock);
+
+ count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count)
+ sugov_tunables_free(tunables);
+
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_kthread_stop(sg_policy);
+ sugov_policy_free(sg_policy);
+
+ cpufreq_disable_fast_switch(policy);
+ return 0;
+}
+
+static int sugov_start(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ sg_policy->up_rate_delay_ns =
+ sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->down_rate_delay_ns =
+ sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+ update_min_rate_limit_us(sg_policy);
+ sg_policy->last_freq_update_time = 0;
+ sg_policy->next_freq = UINT_MAX;
+ sg_policy->work_in_progress = false;
+ sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->sg_policy = sg_policy;
+ sg_cpu->flags = SCHED_CPUFREQ_DL;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ policy_is_shared(policy) ?
+ sugov_update_shared :
+ sugov_update_single);
+ }
+ return 0;
+}
+
+static int sugov_stop(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ cpufreq_remove_update_util_hook(cpu);
+
+ synchronize_sched();
+
+ if (!policy->fast_switch_enabled) {
+ irq_work_sync(&sg_policy->irq_work);
+ kthread_cancel_work_sync(&sg_policy->work);
+ }
+ return 0;
+}
+
+static int sugov_limits(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+
+ if (!policy->fast_switch_enabled) {
+ mutex_lock(&sg_policy->work_lock);
+ cpufreq_policy_apply_limits(policy);
+ mutex_unlock(&sg_policy->work_lock);
+ }
+
+ sg_policy->need_freq_update = true;
+
+ return 0;
+}
+
+static int cpufreq_schedutil_cb(struct cpufreq_policy *policy,
+ unsigned int event)
+{
+ switch(event) {
+ case CPUFREQ_GOV_POLICY_INIT:
+ return sugov_init(policy);
+ case CPUFREQ_GOV_POLICY_EXIT:
+ return sugov_exit(policy);
+ case CPUFREQ_GOV_START:
+ return sugov_start(policy);
+ case CPUFREQ_GOV_STOP:
+ return sugov_stop(policy);
+ case CPUFREQ_GOV_LIMITS:
+ return sugov_limits(policy);
+ default:
+ BUG();
+ }
+}
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+static
+#endif
+struct cpufreq_governor cpufreq_gov_schedutil = {
+ .name = "schedutil",
+ .governor = cpufreq_schedutil_cb,
+ .owner = THIS_MODULE,
+};
+
+static int __init sugov_register(void)
+{
+ return cpufreq_register_governor(&cpufreq_gov_schedutil);
+}
+fs_initcall(sugov_register);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..1d00cf8c00fa 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -27,6 +27,8 @@
* of the License.
*/
+#include "sched.h"
+
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -51,6 +53,27 @@ static int convert_prio(int prio)
}
/**
+ * drop_nopreempt_cpus - remove a cpu from the mask if it is likely
+ * non-preemptible
+ * @lowest_mask: mask with selected CPUs (non-NULL)
+ */
+static void
+drop_nopreempt_cpus(struct cpumask *lowest_mask)
+{
+ unsigned int cpu = cpumask_first(lowest_mask);
+
+ while (cpu < nr_cpu_ids) {
+ /* unlocked access */
+ struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr);
+
+ if (task_may_not_preempt(task, cpu))
+ cpumask_clear_cpu(cpu, lowest_mask);
+
+ cpu = cpumask_next(cpu, lowest_mask);
+ }
+}
+
+/**
* cpupri_find - find the best (lowest-pri) CPU in the system
* @cp: The cpupri context
* @p: The task
@@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
{
int idx = 0;
int task_pri = convert_prio(p->prio);
+ bool drop_nopreempts = task_pri <= MAX_RT_PRIO;
BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
+retry:
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
int skip = 0;
@@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
-
+ if (drop_nopreempts)
+ drop_nopreempt_cpus(lowest_mask);
/*
* We have to ensure that we have at least one bit
* still set in the array, since the map could have
@@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
return 1;
}
-
+ /*
+ * If we can't find any non-preemptible cpu's, retry so we can
+ * find the lowest priority target and avoid priority inversion.
+ */
+ if (drop_nopreempts) {
+ drop_nopreempts = false;
+ goto retry;
+ }
return 0;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3d55ec89c400..a105e97ab6bf 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -755,6 +755,9 @@ static void update_curr_dl(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c8c4272c61d8..ed8e6bb4531b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -636,6 +636,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
+ /* eas */
+ /* select_idle_sibling() */
+ P(se.statistics.nr_wakeups_sis_attempts);
+ P(se.statistics.nr_wakeups_sis_idle);
+ P(se.statistics.nr_wakeups_sis_cache_affine);
+ P(se.statistics.nr_wakeups_sis_suff_cap);
+ P(se.statistics.nr_wakeups_sis_idle_cpu);
+ P(se.statistics.nr_wakeups_sis_count);
+ /* select_energy_cpu_brute() */
+ P(se.statistics.nr_wakeups_secb_attempts);
+ P(se.statistics.nr_wakeups_secb_sync);
+ P(se.statistics.nr_wakeups_secb_idle_bt);
+ P(se.statistics.nr_wakeups_secb_insuff_cap);
+ P(se.statistics.nr_wakeups_secb_no_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_count);
+ /* find_best_target() */
+ P(se.statistics.nr_wakeups_fbt_attempts);
+ P(se.statistics.nr_wakeups_fbt_no_cpu);
+ P(se.statistics.nr_wakeups_fbt_no_sd);
+ P(se.statistics.nr_wakeups_fbt_pref_idle);
+ P(se.statistics.nr_wakeups_fbt_count);
+ /* cas */
+ /* select_task_rq_fair() */
+ P(se.statistics.nr_wakeups_cas_attempts);
+ P(se.statistics.nr_wakeups_cas_count);
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
__P(load_avg);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 099a1b93bebf..853064319b0d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -35,6 +35,8 @@
#include "sched.h"
#include <trace/events/sched.h>
#include "tune.h"
+#include "walt.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -50,7 +52,6 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
-unsigned int sysctl_sched_is_big_little = 0;
unsigned int sysctl_sched_sync_hint_enable = 1;
unsigned int sysctl_sched_initial_task_util = 0;
unsigned int sysctl_sched_cstate_aware = 1;
@@ -119,6 +120,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ */
+unsigned int capacity_margin = 1280; /* ~20% */
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -294,19 +301,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
/*
* Ensure we either appear before our parent (if already
* enqueued) or force our parent to appear after us when it is
- * enqueued. The fact that we always enqueue bottom-up
- * reduces this to two cases.
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases and a special case for the root
+ * cfs_rq. Furthermore, it also means that we will always reset
+ * tmp_alone_branch either when the branch is connected
+ * to a tree or when we reach the beg of the tree
*/
if (cfs_rq->tg->parent &&
- cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
- list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
- } else {
+ cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+ /*
+ * If parent is already on the list, we add the child
+ * just before. Thanks to circular linked property of
+ * the list, this means to put the child at the tail
+ * of the list that starts by parent.
+ */
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+ /*
+ * The branch is now connected to its tree so we can
+ * reset tmp_alone_branch to the beginning of the
+ * list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else if (!cfs_rq->tg->parent) {
+ /*
+ * cfs rq without parent should be put
+ * at the tail of the list.
+ */
list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
- &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ &rq->leaf_cfs_rq_list);
+ /*
+ * We have reach the beg of a tree so we can reset
+ * tmp_alone_branch to the beginning of the list.
+ */
+ rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+ } else {
+ /*
+ * The parent has not already been added so we want to
+ * make sure that it will be put after us.
+ * tmp_alone_branch points to the beg of the branch
+ * where we will add parent.
+ */
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ rq->tmp_alone_branch);
+ /*
+ * update tmp_alone_branch to points to the new beg
+ * of the branch
+ */
+ rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
}
cfs_rq->on_list = 1;
@@ -664,7 +711,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
/*
@@ -688,20 +735,115 @@ void init_entity_runnable_average(struct sched_entity *se)
* will definitely be update (after enqueue).
*/
sa->period_contrib = 1023;
- sa->load_avg = scale_load_down(se->load.weight);
+ /*
+ * Tasks are intialized with full load to be seen as heavy tasks until
+ * they get a chance to stabilize to their real load level.
+ * Group entities are intialized with zero load to reflect the fact that
+ * nothing has been attached to the task group yet.
+ */
+ if (entity_is_task(se))
+ sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = sched_freq() ?
- sysctl_sched_initial_task_util :
- scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * In previous Android versions, we used to have:
+ * sa->util_avg = sched_freq() ?
+ * sysctl_sched_initial_task_util :
+ * scale_load_down(SCHED_LOAD_SCALE);
+ * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ * However, that functionality has been moved to enqueue.
+ * It is unclear if we should restore this in enqueue.
+ */
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static void attach_entity_cfs_rq(struct sched_entity *se);
+
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, a simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ } else {
+ sa->util_avg = cap;
+ }
+ /*
+ * If we wish to restore tuning via setting initial util,
+ * this is where we should do it.
+ */
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
+ return;
+ }
+ }
+
+ attach_entity_cfs_rq(se);
+}
+
#else
void init_entity_runnable_average(struct sched_entity *se)
{
}
-#endif
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -1425,7 +1567,8 @@ balance:
* Call select_idle_sibling to maybe find a better one.
*/
if (!cur)
- env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
+ env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
+ env->dst_cpu);
assign:
assigned = true;
@@ -2410,28 +2553,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
-
- return tg_weight;
-}
+ load = scale_load_down(cfs_rq->load.weight);
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2450,6 +2587,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -2468,16 +2606,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2486,8 +2628,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3790,25 +3933,262 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return decayed;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
/*
- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
- * and effective_load (which is not done because it is too costly).
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/**
+ * update_tg_load_avg - update the tg's load avg
+ * @cfs_rq: the cfs_rq whose avg changed
+ * @force: update regardless of how small the difference
+ *
+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
+ * However, because tg->load_avg is a global value there are performance
+ * considerations.
+ *
+ * In order to avoid having to look at the other cfs_rq's, we use a
+ * differential update where we store the last value we propagated. This in
+ * turn allows skipping updates if the differential is 'small'.
+ *
+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
+ * done) and effective_load() (which is not done because it is too costly).
*/
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ /*
+ * No need to update load_avg for root_task_group as it is not used.
+ */
+ if (cfs_rq->tg == &root_task_group)
+ return;
+
if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, then its up to
+ * date and ready to go to new CPU/cfs_rq. But we have difficulty in
+ * getting what current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task is less decayed, but giving
+ * the wakee more load sounds not bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's utilization */
+ se->avg.util_avg = gcfs_rq->avg.util_avg;
+ se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq utilization */
+ add_positive(&cfs_rq->avg.util_avg, delta);
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+ long delta, load = gcfs_rq->avg.load_avg;
+
+ /*
+ * If the load of group cfs_rq is null, the load of the
+ * sched_entity will also be null so we can skip the formula
+ */
+ if (load) {
+ long tg_load;
+
+ /* Get tg's load and ensure tg_load > 0 */
+ tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+ /* Ensure tg_load >= load and updated with current load*/
+ tg_load -= gcfs_rq->tg_load_avg_contrib;
+ tg_load += load;
+
+ /*
+ * We need to compute a correction term in the case that the
+ * task group is consuming more CPU than a task of equal
+ * weight. A task with a weight equals to tg->shares will have
+ * a load less or equal to scale_load_down(tg->shares).
+ * Similarly, the sched_entities that represent the task group
+ * at parent level, can't have a load higher than
+ * scale_load_down(tg->shares). And the Sum of sched_entities'
+ * load must be <= scale_load_down(tg->shares).
+ */
+ if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+ /* scale gcfs_rq's load into tg's shares*/
+ load *= scale_load_down(gcfs_rq->tg->shares);
+ load /= tg_load;
+ }
+ }
+
+ delta = load - se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_entity's load */
+ se->avg.load_avg = load;
+ se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq load */
+ add_positive(&cfs_rq->avg.load_avg, delta);
+ cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * If the sched_entity is already enqueued, we also have to update the
+ * runnable load avg.
+ */
+ if (se->on_rq) {
+ /* Update parent cfs_rq runnable_load_avg */
+ add_positive(&cfs_rq->runnable_load_avg, delta);
+ cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+ }
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+ if (!cfs_rq->propagate_avg)
+ return 0;
+
+ cfs_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ if (entity_is_task(se))
+ return 0;
+
+ if (!test_and_clear_tg_cfs_propagate(se))
+ return 0;
+
+ cfs_rq = cfs_rq_of(se);
+
+ set_tg_cfs_propagate(cfs_rq);
+
+ update_tg_cfs_util(cfs_rq, se);
+ update_tg_cfs_load(cfs_rq, se);
+
+ return 1;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
+
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+ return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}
+
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
/*
@@ -3828,23 +4208,43 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sub_positive(&sa->load_avg, r);
sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
removed = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+ removed_util = 1;
+ set_tg_cfs_propagate(cfs_rq);
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3859,68 +4259,89 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
if (cfs_rq == &rq_of(cfs_rq)->cfs)
trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
return decayed || removed;
}
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG 0x1
+#define SKIP_AGE_LOAD 0x2
+
/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
int cpu = cpu_of(rq_of(cfs_rq));
+ int decayed;
+ void *ptr = NULL;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+ __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
+ }
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed |= propagate_entity_load_avg(se);
+
+ if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
- if (entity_is_task(se))
- trace_sched_load_avg_task(task_of(se), &se->avg);
+ if (entity_is_task(se)) {
+#ifdef CONFIG_SCHED_WALT
+ ptr = (void *)&(task_of(se)->ravg);
+#endif
+ trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
+ }
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (!sched_feat(ATTACH_AGE_LOAD))
- goto skip_aging;
-
- /*
- * If we got migrated (either between CPUs or between cgroups) we'll
- * have aged the average right before clearing @last_update_time.
- */
- if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
-
- /*
- * XXX: we could have just aged the entire load away if we've been
- * absent from the fair class for too long.
- */
- }
-
-skip_aging:
se->avg.last_update_time = cfs_rq->avg.last_update_time;
cfs_rq->avg.load_avg += se->avg.load_avg;
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ set_tg_cfs_propagate(cfs_rq);
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -3928,34 +4349,20 @@ static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- u64 now = cfs_rq_clock_task(cfs_rq);
- int migrated, decayed;
-
- migrated = !sa->last_update_time;
- if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
- }
-
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
- if (migrated)
+ if (!sa->last_update_time) {
attach_entity_load_avg(cfs_rq, se);
-
- if (decayed || migrated)
update_tg_load_avg(cfs_rq, 0);
+ }
}
/* Remove the runnable load generated by se from cfs_rq's runnable load average */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- update_load_avg(se, 1);
-
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
cfs_rq->runnable_load_sum =
@@ -3984,13 +4391,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
#endif
/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
+/*
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
/*
* Newly created task or never used group entity should not be removed
@@ -3999,9 +4418,7 @@ void remove_entity_load_avg(struct sched_entity *se)
if (se->avg.last_update_time == 0)
return;
- last_update_time = cfs_rq_last_update_time(cfs_rq);
-
- __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
@@ -4038,7 +4455,16 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
+#define UPDATE_TG 0x0
+#define SKIP_AGE_LOAD 0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1){}
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
@@ -4187,9 +4613,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -4262,6 +4689,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Substract its load from the cfs_rq->runnable_avg.
+ * - Substract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
+ update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
@@ -4297,7 +4734,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
return_cfs_rq_runtime(cfs_rq);
update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
/*
@@ -4352,7 +4789,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_load_avg(se, 1);
+ update_load_avg(se, UPDATE_TG);
}
update_stats_curr_start(cfs_rq, se);
@@ -4468,8 +4905,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_load_avg(curr, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(curr, UPDATE_TG);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -5373,7 +5810,7 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static bool cpu_overutilized(int cpu);
-static inline unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu);
#else
#define boosted_cpu_util(cpu) cpu_util(cpu)
#endif
@@ -5410,6 +5847,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int task_wakeup = flags & ENQUEUE_WAKEUP;
#endif
+ /*
+ * If in_iowait is set, the code below may not trigger any cpufreq
+ * utilization updates, so do it here explicitly with the IOWAIT flag
+ * passed.
+ */
+ if (p->in_iowait)
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -5438,8 +5883,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se) {
@@ -5544,8 +5989,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, 1);
- update_cfs_shares(cfs_rq);
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
}
if (!se) {
@@ -6105,9 +6550,11 @@ static int find_new_capacity(struct energy_env *eenv,
return idx;
}
-static int group_idle_state(struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
{
int i, state = INT_MAX;
+ int src_in_grp, dst_in_grp;
+ long grp_util = 0;
/* Find the shallowest idle state in the sched group. */
for_each_cpu(i, sched_group_cpus(sg))
@@ -6116,6 +6563,54 @@ static int group_idle_state(struct sched_group *sg)
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
+ /*
+ * Try to estimate if a deeper idle state is
+ * achievable when we move the task.
+ */
+ for_each_cpu(i, sched_group_cpus(sg))
+ grp_util += cpu_util(i);
+
+ src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ if (src_in_grp == dst_in_grp) {
+ /* both CPUs under consideration are in the same group or not in
+ * either group, migration should leave idle state the same.
+ */
+ goto end;
+ }
+ /* add or remove util as appropriate to indicate what group util
+ * will be (worst case - no concurrent execution) after moving the task
+ */
+ grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta;
+
+ if (grp_util <=
+ ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+ /* after moving, this group is at most partly
+ * occupied, so it should have some idle time.
+ */
+ int max_idle_state_idx = sg->sge->nr_idle_states - 2;
+ int new_state = grp_util * max_idle_state_idx;
+ if (grp_util <= 0)
+ /* group will have no util, use lowest state */
+ new_state = max_idle_state_idx + 1;
+ else {
+ /* for partially idle, linearly map util to idle
+ * states, excluding the lowest one. This does not
+ * correspond to the state we expect to enter in
+ * reality, but an indication of what might happen.
+ */
+ new_state = min(max_idle_state_idx, (int)
+ (new_state / sg->sgc->max_capacity));
+ new_state = max_idle_state_idx - new_state;
+ }
+ state = new_state;
+ } else {
+ /* After moving, the group will be fully occupied
+ * so assume it will not be idle at all.
+ */
+ state = 0;
+ }
+end:
return state;
}
@@ -6151,15 +6646,7 @@ static int sched_group_energy(struct energy_env *eenv)
*/
sd = rcu_dereference(per_cpu(sd_scs, cpu));
- if (!sd)
- /*
- * We most probably raced with hotplug; returning a
- * wrong energy estimation is better than entering an
- * infinite loop.
- */
- return -EINVAL;
-
- if (sd->parent)
+ if (sd && sd->parent)
sg_shared_cap = sd->parent->groups;
for_each_domain(cpu, sd) {
@@ -6196,8 +6683,9 @@ static int sched_group_energy(struct energy_env *eenv)
}
}
- idle_idx = group_idle_state(sg);
+ idle_idx = group_idle_state(eenv, sg);
group_util = group_norm_util(eenv, sg);
+
sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
>> SCHED_CAPACITY_SHIFT;
sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
@@ -6214,6 +6702,14 @@ static int sched_group_energy(struct energy_env *eenv)
} while (sg = sg->next, sg != sd->groups);
}
+
+ /*
+ * If we raced with hotplug and got an sd NULL-pointer;
+ * returning a wrong energy estimation is better than
+ * entering an infinite loop.
+ */
+ if (cpumask_test_cpu(cpu, &visit_cpus))
+ return -EINVAL;
next_cpu:
cpumask_clear_cpu(cpu, &visit_cpus);
continue;
@@ -6240,6 +6736,7 @@ static inline int __energy_diff(struct energy_env *eenv)
struct sched_domain *sd;
struct sched_group *sg;
int sd_cpu = -1, energy_before = 0, energy_after = 0;
+ int diff, margin;
struct energy_env eenv_before = {
.util_delta = 0,
@@ -6282,12 +6779,22 @@ static inline int __energy_diff(struct energy_env *eenv)
eenv->nrg.after = energy_after;
eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
eenv->payoff = 0;
-
+#ifndef CONFIG_SCHED_TUNE
trace_sched_energy_diff(eenv->task,
eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
eenv->cap.before, eenv->cap.after, eenv->cap.delta,
eenv->nrg.delta, eenv->payoff);
+#endif
+ /*
+ * Dead-zone margin preventing too many migrations.
+ */
+
+ margin = eenv->nrg.before >> 6; /* ~1.56% */
+
+ diff = eenv->nrg.after - eenv->nrg.before;
+
+ eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
return eenv->nrg.diff;
}
@@ -6295,30 +6802,37 @@ static inline int __energy_diff(struct energy_env *eenv)
#ifdef CONFIG_SCHED_TUNE
struct target_nrg schedtune_target_nrg;
-
+extern bool schedtune_initialized;
/*
* System energy normalization
- * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
* corresponding to the specified energy variation.
*/
static inline int
normalize_energy(int energy_diff)
{
u32 normalized_nrg;
+
+ /* during early setup, we don't know the extents */
+ if (unlikely(!schedtune_initialized))
+ return energy_diff < 0 ? -1 : 1 ;
+
#ifdef CONFIG_SCHED_DEBUG
+ {
int max_delta;
/* Check for boundaries */
max_delta = schedtune_target_nrg.max_power;
max_delta -= schedtune_target_nrg.min_power;
WARN_ON(abs(energy_diff) >= max_delta);
+ }
#endif
/* Do scaling using positive numbers to increase the range */
normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
/* Scale by energy magnitude */
- normalized_nrg <<= SCHED_LOAD_SHIFT;
+ normalized_nrg <<= SCHED_CAPACITY_SHIFT;
/* Normalize on max energy for target platform */
normalized_nrg = reciprocal_divide(
@@ -6349,6 +6863,12 @@ energy_diff(struct energy_env *eenv)
eenv->cap.delta,
eenv->task);
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
/*
* When SchedTune is enabled, the energy_diff() function will return
* the computed energy payoff value. Since the energy_diff() return
@@ -6388,18 +6908,18 @@ static int wake_wide(struct task_struct *p)
return 1;
}
-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+ int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
- int idx, this_cpu, prev_cpu;
+ int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
- prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
@@ -6459,8 +6979,6 @@ static inline unsigned long task_util(struct task_struct *p)
return p->se.avg.util_avg;
}
-unsigned int capacity_margin = 1280; /* ~20% margin */
-
static inline unsigned long boosted_task_util(struct task_struct *task);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
@@ -6486,11 +7004,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu)
return __task_fits(p, cpu, 0);
}
-static inline bool task_fits_spare(struct task_struct *p, int cpu)
-{
- return __task_fits(p, cpu, cpu_util(cpu));
-}
-
static bool cpu_overutilized(int cpu)
{
return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
@@ -6498,6 +7011,8 @@ static bool cpu_overutilized(int cpu)
#ifdef CONFIG_SCHED_TUNE
+struct reciprocal_value schedtune_spc_rdiv;
+
static long
schedtune_margin(unsigned long signal, long boost)
{
@@ -6508,29 +7023,16 @@ schedtune_margin(unsigned long signal, long boost)
*
* The Boost (B) value is used to compute a Margin (M) which is
* proportional to the complement of the original Signal (S):
- * M = B * (SCHED_LOAD_SCALE - S), if B is positive
- * M = B * S, if B is negative
+ * M = B * (SCHED_CAPACITY_SCALE - S)
* The obtained M could be used by the caller to "boost" S.
*/
if (boost >= 0) {
- margin = SCHED_LOAD_SCALE - signal;
+ margin = SCHED_CAPACITY_SCALE - signal;
margin *= boost;
} else
margin = -signal * boost;
- /*
- * Fast integer division by constant:
- * Constant : (C) = 100
- * Precision : 0.1% (P) = 0.1
- * Reference : C * 100 / P (R) = 100000
- *
- * Thus:
- * Shift bits : ceil(log(R,2)) (S) = 17
- * Mult const : round(2^S/C) (M) = 1311
- *
- *
- */
- margin *= 1311;
- margin >>= 17;
+
+ margin = reciprocal_divide(margin, schedtune_spc_rdiv);
if (boost < 0)
margin *= -1;
@@ -6580,7 +7082,7 @@ schedtune_task_margin(struct task_struct *task)
#endif /* CONFIG_SCHED_TUNE */
-static inline unsigned long
+unsigned long
boosted_cpu_util(int cpu)
{
unsigned long util = cpu_util(cpu);
@@ -6602,6 +7104,13 @@ boosted_task_util(struct task_struct *task)
return util + margin;
}
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+ return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -6611,10 +7120,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *fit_group = NULL, *spare_group = NULL;
+ struct sched_group *most_spare_sg = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
- unsigned long fit_capacity = ULONG_MAX;
- unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
+ unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -6622,7 +7130,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load, spare_capacity;
+ unsigned long load, avg_load, spare_cap, max_spare_cap;
int local_group;
int i;
@@ -6634,8 +7142,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
+ /*
+ * Tally up the load of all CPUs in the group and find
+ * the group containing the CPU with most spare capacity.
+ */
avg_load = 0;
+ max_spare_cap = 0;
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
@@ -6646,24 +7158,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
- /*
- * Look for most energy-efficient group that can fit
- * that can fit the task.
- */
- if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
- fit_capacity = capacity_of(i);
- fit_group = group;
- }
+ spare_cap = capacity_spare_wake(i, p);
- /*
- * Look for group which has most spare capacity on a
- * single cpu.
- */
- spare_capacity = capacity_of(i) - cpu_util(i);
- if (spare_capacity > max_spare_capacity) {
- max_spare_capacity = spare_capacity;
- spare_group = group;
- }
+ if (spare_cap > max_spare_cap)
+ max_spare_cap = spare_cap;
}
/* Adjust by relative CPU capacity of the group */
@@ -6671,17 +7169,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (local_group) {
this_load = avg_load;
- } else if (avg_load < min_load) {
- min_load = avg_load;
- idlest = group;
+ this_spare = max_spare_cap;
+ } else {
+ if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+
+ if (most_spare < max_spare_cap) {
+ most_spare = max_spare_cap;
+ most_spare_sg = group;
+ }
}
} while (group = group->next, group != sd->groups);
- if (fit_group)
- return fit_group;
-
- if (spare_group)
- return spare_group;
+ /*
+ * The cross-over point between using spare capacity or least load
+ * is too conservative for high utilization tasks on partially
+ * utilized systems if we require spare_capacity > task_util(p),
+ * so we allow for some task stuffing by using
+ * spare_capacity > task_util(p)/2.
+ */
+ if (this_spare > task_util(p) / 2 &&
+ imbalance*this_spare > 100*most_spare)
+ return NULL;
+ else if (most_spare > task_util(p) / 2)
+ return most_spare_sg;
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -6701,9 +7214,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
int shallowest_idle_cpu = -1;
int i;
+ /* Check if we have any choice: */
+ if (group->group_weight == 1)
+ return cpumask_first(sched_group_cpus(group));
+
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (task_fits_spare(p, i)) {
+ if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -6715,8 +7232,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (idle_cpu(i) &&
- (!idle || idle->exit_latency == min_exit_latency) &&
+ } else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -6725,13 +7241,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if (shallowest_idle_cpu == -1) {
- /*
- * If we haven't found an idle CPU yet
- * pick a non-idle one that can fit the task as
- * fallback.
- */
- shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -6748,24 +7257,32 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/*
* Try and locate an idle CPU in the sched_domain.
*/
-static int select_idle_sibling(struct task_struct *p, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
struct sched_group *sg;
- int i = task_cpu(p);
- int best_idle = -1;
- int best_idle_cstate = -1;
- int best_idle_capacity = INT_MAX;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ unsigned long best_idle_capacity = ULONG_MAX;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
+ schedstat_inc(this_rq(), eas_stats.sis_attempts);
if (!sysctl_sched_cstate_aware) {
- if (idle_cpu(target))
+ if (idle_cpu(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
+ schedstat_inc(this_rq(), eas_stats.sis_idle);
return target;
+ }
/*
* If the prevous cpu is cache affine and idle, don't be stupid.
*/
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
+ schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
+ return prev;
+ }
}
if (!(current->flags & PF_WAKE_UP_IDLE) &&
@@ -6779,24 +7296,30 @@ static int select_idle_sibling(struct task_struct *p, int target)
for_each_lower_domain(sd) {
sg = sd->groups;
do {
+ int i;
if (!cpumask_intersects(sched_group_cpus(sg),
tsk_cpus_allowed(p)))
goto next;
if (sysctl_sched_cstate_aware) {
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
- struct rq *rq = cpu_rq(i);
- int idle_idx = idle_get_state_idx(rq);
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
unsigned long new_usage = boosted_task_util(p);
unsigned long capacity_orig = capacity_orig_of(i);
+
if (new_usage > capacity_orig || !idle_cpu(i))
goto next;
- if (i == target && new_usage <= capacity_curr_of(target))
+ if (i == target && new_usage <= capacity_curr_of(target)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
+ schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
+ schedstat_inc(sd, eas_stats.sis_suff_cap);
return target;
+ }
- if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
- best_idle = i;
+ if (idle_idx < best_idle_cstate &&
+ capacity_orig <= best_idle_capacity) {
+ best_idle_cpu = i;
best_idle_cstate = idle_idx;
best_idle_capacity = capacity_orig;
}
@@ -6809,231 +7332,440 @@ static int select_idle_sibling(struct task_struct *p, int target)
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
+ schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
+ schedstat_inc(sd, eas_stats.sis_idle_cpu);
goto done;
}
next:
sg = sg->next;
} while (sg != sd->groups);
}
- if (best_idle > 0)
- target = best_idle;
+
+ if (best_idle_cpu >= 0)
+ target = best_idle_cpu;
done:
+ schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
+ schedstat_inc(this_rq(), eas_stats.sis_count);
+
return target;
}
-static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
{
- int iter_cpu;
- int target_cpu = -1;
- int target_util = 0;
- int backup_capacity = 0;
- int best_idle_cpu = -1;
- int best_idle_cstate = INT_MAX;
- int backup_cpu = -1;
- unsigned long task_util_boosted, new_util;
+ unsigned long util, capacity;
- task_util_boosted = boosted_task_util(p);
- for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
- int cur_capacity;
- struct rq *rq;
- int idle_idx;
-
- /*
- * Iterate from higher cpus for boosted tasks.
- */
- int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
-
- if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
- continue;
-
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- new_util = cpu_util(i) + task_util_boosted;
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+ return cpu_util(cpu);
+#endif
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ return cpu_util(cpu);
- /*
- * Ensure minimum capacity to grant the required boost.
- * The target CPU can be already at a capacity level higher
- * than the one required to boost the task.
- */
- if (new_util > capacity_orig_of(i))
- continue;
+ capacity = capacity_orig_of(cpu);
+ util = max_t(long, cpu_util(cpu) - task_util(p), 0);
- /*
- * Unconditionally favoring tasks that prefer idle cpus to
- * improve latency.
- */
- if (idle_cpu(i) && prefer_idle) {
- if (best_idle_cpu < 0)
- best_idle_cpu = i;
- continue;
- }
+ return (util >= capacity) ? capacity : util;
+}
- cur_capacity = capacity_curr_of(i);
- rq = cpu_rq(i);
- idle_idx = idle_get_state_idx(rq);
-
- if (new_util < cur_capacity) {
- if (cpu_rq(i)->nr_running) {
- if (prefer_idle) {
- /* Find a target cpu with highest
- * utilization.
- */
- if (target_util == 0 ||
- target_util < new_util) {
- target_cpu = i;
- target_util = new_util;
- }
- } else {
- /* Find a target cpu with lowest
- * utilization.
- */
- if (target_util == 0 ||
- target_util > new_util) {
- target_cpu = i;
- target_util = new_util;
- }
- }
- } else if (!prefer_idle) {
- if (best_idle_cpu < 0 ||
- (sysctl_sched_cstate_aware &&
- best_idle_cstate > idle_idx)) {
- best_idle_cstate = idle_idx;
- best_idle_cpu = i;
- }
- }
- } else if (backup_capacity == 0 ||
- backup_capacity > cur_capacity) {
- // Find a backup cpu with least capacity.
- backup_capacity = cur_capacity;
- backup_cpu = i;
- }
- }
+static int start_cpu(bool boosted)
+{
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- if (prefer_idle && best_idle_cpu >= 0)
- target_cpu = best_idle_cpu;
- else if (target_cpu < 0)
- target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+ RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(),
+ "sched RCU must be held");
- return target_cpu;
+ return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
}
-static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+ bool boosted, bool prefer_idle)
{
+ unsigned long best_idle_min_cap_orig = ULONG_MAX;
+ unsigned long min_util = boosted_task_util(p);
+ unsigned long target_capacity = ULONG_MAX;
+ unsigned long min_wake_util = ULONG_MAX;
+ unsigned long target_max_spare_cap = 0;
+ unsigned long target_util = ULONG_MAX;
+ unsigned long best_active_util = ULONG_MAX;
+ int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
- struct sched_group *sg, *sg_target;
- int target_max_cap = INT_MAX;
- int target_cpu = task_cpu(p);
- unsigned long task_util_boosted, new_util;
- int i;
+ struct sched_group *sg;
+ int best_active_cpu = -1;
+ int best_idle_cpu = -1;
+ int target_cpu = -1;
+ int cpu, i;
- if (sysctl_sched_sync_hint_enable && sync) {
- int cpu = smp_processor_id();
- cpumask_t search_cpus;
- cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
- if (cpumask_test_cpu(cpu, &search_cpus))
- return cpu;
- }
+ *backup_cpu = -1;
- sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
+ schedstat_inc(this_rq(), eas_stats.fbt_attempts);
- if (!sd)
- return target;
+ /* Find start CPU based on boost value */
+ cpu = start_cpu(boosted);
+ if (cpu < 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
+ return -1;
+ }
- sg = sd->groups;
- sg_target = sg;
+ /* Find SD for the start CPU */
+ sd = rcu_dereference(per_cpu(sd_ea, cpu));
+ if (!sd) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
+ schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
+ return -1;
+ }
- if (sysctl_sched_is_big_little) {
+ /* Scan CPUs in all SDs */
+ sg = sd->groups;
+ do {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ unsigned long capacity_curr = capacity_curr_of(i);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ unsigned long wake_util, new_util;
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- /* Assuming all cpus are the same in group */
- int max_cap_cpu = group_first_cpu(sg);
+ if (!cpu_online(i))
+ continue;
- /*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
- */
- if (capacity_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ if (walt_cpu_high_irqload(i))
+ continue;
- task_util_boosted = boosted_task_util(p);
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
* accounting. However, the blocked utilization may be zero.
*/
- new_util = cpu_util(i) + task_util_boosted;
+ wake_util = cpu_util_wake(i, p);
+ new_util = wake_util + task_util(p);
/*
* Ensure minimum capacity to grant the required boost.
* The target CPU can be already at a capacity level higher
* than the one required to boost the task.
*/
- if (new_util > capacity_orig_of(i))
+ new_util = max(min_util, new_util);
+ if (new_util > capacity_orig)
continue;
- if (new_util < capacity_curr_of(i)) {
- target_cpu = i;
- if (cpu_rq(i)->nr_running)
- break;
+ /*
+ * Case A) Latency sensitive tasks
+ *
+ * Unconditionally favoring tasks that prefer idle CPU to
+ * improve latency.
+ *
+ * Looking for:
+ * - an idle CPU, whatever its idle_state is, since
+ * the first CPUs we explore are more likely to be
+ * reserved for latency sensitive tasks.
+ * - a non idle CPU where the task fits in its current
+ * capacity and has the maximum spare capacity.
+ * - a non idle CPU with lower contention from other
+ * tasks and running at the lowest possible OPP.
+ *
+ * The last two goals tries to favor a non idle CPU
+ * where the task can run as if it is "almost alone".
+ * A maximum spare capacity CPU is favoured since
+ * the task already fits into that CPU's capacity
+ * without waiting for an OPP chance.
+ *
+ * The following code path is the only one in the CPUs
+ * exploration loop which is always used by
+ * prefer_idle tasks. It exits the loop with wither a
+ * best_active_cpu or a target_cpu which should
+ * represent an optimal choice for latency sensitive
+ * tasks.
+ */
+ if (prefer_idle) {
+
+ /*
+ * Case A.1: IDLE CPU
+ * Return the first IDLE CPU we find.
+ */
+ if (idle_cpu(i)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
+ schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
+
+ trace_sched_find_best_target(p,
+ prefer_idle, min_util,
+ cpu, best_idle_cpu,
+ best_active_cpu, i);
+
+ return i;
+ }
+
+ /*
+ * Case A.2: Target ACTIVE CPU
+ * Favor CPUs with max spare capacity.
+ */
+ if ((capacity_curr > new_util) &&
+ (capacity_orig - new_util > target_max_spare_cap)) {
+ target_max_spare_cap = capacity_orig - new_util;
+ target_cpu = i;
+ continue;
+ }
+ if (target_cpu != -1)
+ continue;
+
+
+ /*
+ * Case A.3: Backup ACTIVE CPU
+ * Favor CPUs with:
+ * - lower utilization due to other tasks
+ * - lower utilization with the task in
+ */
+ if (wake_util > min_wake_util)
+ continue;
+ if (new_util > best_active_util)
+ continue;
+ min_wake_util = wake_util;
+ best_active_util = new_util;
+ best_active_cpu = i;
+ continue;
}
- /* cpu has capacity at higher OPP, keep it as fallback */
- if (target_cpu == task_cpu(p))
- target_cpu = i;
+ /*
+ * Case B) Non latency sensitive tasks on IDLE CPUs.
+ *
+ * Find an optimal backup IDLE CPU for non latency
+ * sensitive tasks.
+ *
+ * Looking for:
+ * - minimizing the capacity_orig,
+ * i.e. preferring LITTLE CPUs
+ * - favoring shallowest idle states
+ * i.e. avoid to wakeup deep-idle CPUs
+ *
+ * The following code path is used by non latency
+ * sensitive tasks if IDLE CPUs are available. If at
+ * least one of such CPUs are available it sets the
+ * best_idle_cpu to the most suitable idle CPU to be
+ * selected.
+ *
+ * If idle CPUs are available, favour these CPUs to
+ * improve performances by spreading tasks.
+ * Indeed, the energy_diff() computed by the caller
+ * will take care to ensure the minimization of energy
+ * consumptions without affecting performance.
+ */
+ if (idle_cpu(i)) {
+ int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+ /* Select idle CPU with lower cap_orig */
+ if (capacity_orig > best_idle_min_cap_orig)
+ continue;
+
+ /*
+ * Skip CPUs in deeper idle state, but only
+ * if they are also less energy efficient.
+ * IOW, prefer a deep IDLE LITTLE CPU vs a
+ * shallow idle big CPU.
+ */
+ if (sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ /* Keep track of best idle CPU */
+ best_idle_min_cap_orig = capacity_orig;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
+ }
+
+ /*
+ * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+ *
+ * Pack tasks in the most energy efficient capacities.
+ *
+ * This task packing strategy prefers more energy
+ * efficient CPUs (i.e. pack on smaller maximum
+ * capacity CPUs) while also trying to spread tasks to
+ * run them all at the lower OPP.
+ *
+ * This assumes for example that it's more energy
+ * efficient to run two tasks on two CPUs at a lower
+ * OPP than packing both on a single CPU but running
+ * that CPU at an higher OPP.
+ *
+ * Thus, this case keep track of the CPU with the
+ * smallest maximum capacity and highest spare maximum
+ * capacity.
+ */
+
+ /* Favor CPUs with smaller capacity */
+ if (capacity_orig > target_capacity)
+ continue;
+
+ /* Favor CPUs with maximum spare capacity */
+ if ((capacity_orig - new_util) < target_max_spare_cap)
+ continue;
+
+ target_max_spare_cap = capacity_orig - new_util;
+ target_capacity = capacity_orig;
+ target_util = new_util;
+ target_cpu = i;
}
- } else {
- /*
- * Find a cpu with sufficient capacity
- */
+
+ } while (sg = sg->next, sg != sd->groups);
+
+ /*
+ * For non latency sensitive tasks, cases B and C in the previous loop,
+ * we pick the best IDLE CPU only if we was not able to find a target
+ * ACTIVE CPU.
+ *
+ * Policies priorities:
+ *
+ * - prefer_idle tasks:
+ *
+ * a) IDLE CPU available, we return immediately
+ * b) ACTIVE CPU where task fits and has the bigger maximum spare
+ * capacity (i.e. target_cpu)
+ * c) ACTIVE CPU with less contention due to other tasks
+ * (i.e. best_active_cpu)
+ *
+ * - NON prefer_idle tasks:
+ *
+ * a) ACTIVE CPU: target_cpu
+ * b) IDLE CPU: best_idle_cpu
+ */
+ if (target_cpu == -1)
+ target_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+ else
+ *backup_cpu = prefer_idle
+ ? best_active_cpu
+ : best_idle_cpu;
+
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ target_cpu);
+
+ schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
+ schedstat_inc(this_rq(), eas_stats.fbt_count);
+
+ return target_cpu;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+ long min_cap, max_cap;
+
+ min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ /* Minimum capacity is close to max, no need to abort wake_affine */
+ if (max_cap - min_cap < max_cap >> 3)
+ return 0;
+
+ /* Bring task utilization in sync with prev_cpu */
+ sync_entity_load_avg(&p->se);
+
+ return min_cap * 1024 < task_util(p) * capacity_margin;
+}
+
+static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
+{
+ struct sched_domain *sd;
+ int target_cpu = prev_cpu, tmp_target, tmp_backup;
+ bool boosted, prefer_idle;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
+ schedstat_inc(this_rq(), eas_stats.secb_attempts);
+
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
+ schedstat_inc(this_rq(), eas_stats.secb_sync);
+ return cpu;
+ }
+ }
+
+ rcu_read_lock();
#ifdef CONFIG_CGROUP_SCHEDTUNE
- bool boosted = schedtune_task_boost(p) > 0;
- bool prefer_idle = schedtune_prefer_idle(p) > 0;
+ boosted = schedtune_task_boost(p) > 0;
+ prefer_idle = schedtune_prefer_idle(p) > 0;
#else
- bool boosted = 0;
- bool prefer_idle = 0;
+ boosted = get_sysctl_sched_cfs_boost() > 0;
+ prefer_idle = 0;
#endif
- int tmp_target = find_best_target(p, boosted, prefer_idle);
- if (tmp_target >= 0) {
- target_cpu = tmp_target;
- if ((boosted || prefer_idle) && idle_cpu(target_cpu))
- return target_cpu;
+
+ sync_entity_load_avg(&p->se);
+
+ sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ /* Find a cpu with sufficient capacity */
+ tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
+
+ if (!sd)
+ goto unlock;
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
+ goto unlock;
}
}
- if (target_cpu != task_cpu(p)) {
+ if (target_cpu != prev_cpu) {
struct energy_env eenv = {
- .util_delta = task_util(p),
- .src_cpu = task_cpu(p),
- .dst_cpu = target_cpu,
- .task = p,
+ .util_delta = task_util(p),
+ .src_cpu = prev_cpu,
+ .dst_cpu = target_cpu,
+ .task = p,
};
/* Not enough spare capacity on previous cpu */
- if (cpu_overutilized(task_cpu(p)))
- return target_cpu;
+ if (cpu_overutilized(prev_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
+ schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
+ goto unlock;
+ }
- if (energy_diff(&eenv) >= 0)
- return task_cpu(p);
+ if (energy_diff(&eenv) >= 0) {
+ /* No energy saving for target_cpu, try backup */
+ target_cpu = tmp_backup;
+ eenv.dst_cpu = target_cpu;
+ if (tmp_backup < 0 || energy_diff(&eenv) >= 0) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+ }
+
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ goto unlock;
}
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
+ schedstat_inc(this_rq(), eas_stats.secb_count);
+
+unlock:
+ rcu_read_unlock();
+
return target_cpu;
}
@@ -7063,9 +7795,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
#endif
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
- cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
- energy_aware();
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
+ && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+
+ if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
+ return select_energy_cpu_brute(p, prev_cpu, sync);
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -7090,49 +7824,65 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
- if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (!sd) {
- if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
- else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
- new_cpu = select_idle_sibling(p, new_cpu);
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
+ } else {
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
+ if (wu) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq(), eas_stats.cas_attempts);
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
+ while (sd) {
+ struct sched_group *group;
+ int weight;
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
+ if (wu)
+ schedstat_inc(sd, eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
}
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq(), eas_stats.cas_count);
}
- /* while loop will break here if sd == NULL */
}
rcu_read_unlock();
@@ -8135,8 +8885,13 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+ true))
update_tg_load_avg(cfs_rq, 0);
+
+ /* Propagate pending load changes to the parent */
+ if (cfs_rq->tg->se[cpu])
+ update_load_avg(cfs_rq->tg->se[cpu], 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8196,7 +8951,7 @@ static inline void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8434,13 +9189,14 @@ skip_unlock: __attribute__ ((unused));
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, max_capacity;
+ unsigned long capacity, max_capacity, min_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -8454,6 +9210,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity = 0;
max_capacity = 0;
+ min_capacity = ULONG_MAX;
if (child->flags & SD_OVERLAP) {
/*
@@ -8486,6 +9243,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
max_capacity = max(capacity, max_capacity);
+ min_capacity = min(capacity, min_capacity);
}
} else {
/*
@@ -8503,6 +9261,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
if (!cpu_isolated(cpumask_first(cpus))) {
capacity += sgc->capacity;
max_capacity = max(sgc->max_capacity, max_capacity);
+ min_capacity = min(sgc->min_capacity, min_capacity);
}
group = group->next;
} while (group != child->groups);
@@ -8510,6 +9269,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
sdg->sgc->capacity = capacity;
sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity = min_capacity;
}
/*
@@ -8632,6 +9392,38 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * idle load balancing data
+ * - used by the nohz balance, but we want it available here
+ * so that we can see which CPUs have no tick.
+ */
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ unsigned long next_balance; /* in jiffy units */
+} nohz ____cacheline_aligned;
+
+static inline void update_cpu_stats_if_tickless(struct rq *rq)
+{
+ /* only called from update_sg_lb_stats when irqs are disabled */
+ if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
+ /* rate limit updates to once-per-jiffie at most */
+ if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
+ return;
+
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
+ raw_spin_unlock(&rq->lock);
+ }
+}
+
+#else
+static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
+#endif
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8663,6 +9455,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (cpu_isolated(i))
continue;
+ /* if we are entering idle and there are CPUs with
+ * their tick stopped, do an update for them
+ */
+ if (env->idle == CPU_NEWLY_IDLE)
+ update_cpu_stats_if_tickless(rq);
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -8791,15 +9589,21 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= busiest->avg_load)
return false;
+ if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+ goto asym_packing;
+
/*
- * Candiate sg has no more than one task per cpu and has higher
- * per-cpu capacity. No reason to pull tasks to less capable cpus.
+ * Candidate sg has no more than one task per CPU and
+ * has higher per-CPU capacity. Migrating tasks to less
+ * capable CPUs may harm throughput. Maximize throughput,
+ * power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
group_smaller_cpu_capacity(sds->local, sg))
return false;
}
+asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
@@ -8850,6 +9654,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
}
#endif /* CONFIG_NUMA_BALANCING */
+#define lb_sd_parent(sd) \
+ (sd->parent && sd->parent->groups != sd->parent->groups->next)
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -8935,7 +9742,7 @@ next_group:
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
- if (!env->sd->parent) {
+ if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
@@ -9524,7 +10331,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *continue_balancing)
{
int ld_moved = 0, cur_ld_moved, active_balance = 0;
- struct sched_domain *sd_parent = sd->parent;
+ struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
struct sched_group *group = NULL;
struct rq *busiest = NULL;
unsigned long flags;
@@ -10114,11 +10921,6 @@ static inline int on_null_domain(struct rq *rq)
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
#ifdef CONFIG_SCHED_HMP
static inline int find_new_hmp_ilb(int type)
@@ -10545,6 +11347,10 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
(!energy_aware() || cpu_overutilized(cpu)))
return true;
+ /* Do idle load balance if there have misfit task */
+ if (energy_aware() && rq->misfit_task)
+ return 1;
+
return (rq->nr_running >= 2);
}
@@ -10808,6 +11614,61 @@ static inline bool vruntime_normalized(struct task_struct *p)
return false;
}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+
+ /* Start to propagate at parent */
+ se = se->parent;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
+ update_load_avg(se, UPDATE_TG);
+ }
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ update_load_avg(se, 0);
+ detach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+
+ /* Synchronize entity with its cfs_rq */
+ update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+ attach_entity_load_avg(cfs_rq, se);
+ update_tg_load_avg(cfs_rq, false);
+ propagate_entity_cfs_rq(se);
+}
+
static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
@@ -10822,8 +11683,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
- /* Catch up with the cfs_rq and remove our load when we leave */
- detach_entity_load_avg(cfs_rq, se);
+ detach_entity_cfs_rq(se);
}
static void attach_task_cfs_rq(struct task_struct *p)
@@ -10831,16 +11691,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /*
- * Since the real-depth could have been changed (only FAIR
- * class maintain depth value), reset depth properly.
- */
- se->depth = se->parent ? se->parent->depth + 1 : 0;
-#endif
-
- /* Synchronize task with its cfs_rq */
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_cfs_rq(se);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -10894,6 +11745,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->propagate_avg = 0;
+#endif
atomic_long_set(&cfs_rq->removed_load_avg, 0);
atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
@@ -10931,8 +11785,9 @@ void free_fair_sched_group(struct task_group *tg)
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
- struct cfs_rq *cfs_rq;
struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -10947,6 +11802,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_bandwidth(tg_cfs_bandwidth(tg));
for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -10960,6 +11817,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ raw_spin_unlock_irq(&rq->lock);
}
return 1;
@@ -11056,8 +11917,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index b0b93fd33af9..f8e8d68ed3fd 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -201,8 +201,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +212,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 29345ed74069..ee095f4e7230 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,6 +5,7 @@
#include "sched.h"
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/irq_work.h>
#include <trace/events/sched.h>
@@ -1005,6 +1006,9 @@ static void update_curr_rt(struct rq *rq)
if (unlikely((s64)delta_exec <= 0))
return;
+ /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -1456,11 +1460,30 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
}
#endif
+/*
+ * Return whether the task on the given cpu is currently non-preemptible
+ * while handling a potentially long softint, or if the task is likely
+ * to block preemptions soon because it is a ksoftirq thread that is
+ * handling slow softints.
+ */
+bool
+task_may_not_preempt(struct task_struct *task, int cpu)
+{
+ __u32 softirqs = per_cpu(active_softirqs, cpu) |
+ __IRQ_STAT(cpu, __softirq_pending);
+ struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
+
+ return ((softirqs & LONG_SOFTIRQ_MASK) &&
+ (task == cpu_ksoftirqd ||
+ task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+}
+
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
struct task_struct *curr;
struct rq *rq;
+ bool may_not_preempt;
#ifdef CONFIG_SCHED_HMP
return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
@@ -1476,7 +1499,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
curr = READ_ONCE(rq->curr); /* unlocked access */
/*
- * If the current task on @p's runqueue is an RT task, then
+ * If the current task on @p's runqueue is a softirq task,
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
+ * it may run without preemption for a time that is
+ * ill-suited for a waiting RT task. Therefore, try to
+ * wake this RT task on another runqueue.
+ *
+ * Also, if the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -1497,17 +1530,22 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
* This test is optimistic, if we get it wrong the load-balancer
* will have to sort it out.
*/
- if (curr && unlikely(rt_task(curr)) &&
+ may_not_preempt = task_may_not_preempt(curr, cpu);
+ if (may_not_preempt ||
+ (unlikely(rt_task(curr)) &&
(curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio)) {
+ curr->prio <= p->prio))) {
int target = find_lowest_rq(p);
/*
- * Don't bother moving it if the destination CPU is
- * not running a lower priority task.
+ * If cpu is non-preemptible, prefer remote cpu
+ * even if it's running a higher-prio task.
+ * Otherwise: Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
*/
if (target != -1 &&
- p->prio < cpu_rq(target)->rt.highest_prio.curr)
+ (may_not_preempt ||
+ p->prio < cpu_rq(target)->rt.highest_prio.curr))
cpu = target;
}
rcu_read_unlock();
@@ -1785,6 +1823,7 @@ static int find_lowest_rq_hmp(struct task_struct *task)
* the best one based on our affinity and topology.
*/
+retry:
for_each_sched_cluster(cluster) {
if (boost_on_big && cluster->capacity != max_possible_capacity)
continue;
@@ -1792,6 +1831,15 @@ static int find_lowest_rq_hmp(struct task_struct *task)
cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
cpumask_andnot(&candidate_mask, &candidate_mask,
cpu_isolated_mask);
+ /*
+ * When placement boost is active, if there is no eligible CPU
+ * in the highest capacity cluster, we fallback to the other
+ * clusters. So clear the CPUs of the traversed cluster from
+ * the lowest_mask.
+ */
+ if (unlikely(boost_on_big))
+ cpumask_andnot(lowest_mask, lowest_mask,
+ &cluster->cpus);
if (cpumask_empty(&candidate_mask))
continue;
@@ -1831,6 +1879,11 @@ static int find_lowest_rq_hmp(struct task_struct *task)
break;
}
+ if (unlikely(boost_on_big && best_cpu == -1)) {
+ boost_on_big = 0;
+ goto retry;
+ }
+
return best_cpu;
}
#endif /* CONFIG_SCHED_HMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b88f647ea935..67b7da81f8a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -465,6 +465,7 @@ struct cfs_rq {
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
+ unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
@@ -651,6 +652,9 @@ struct root_domain {
/* Maximum cpu capacity in the system. */
struct max_cpu_capacity max_cpu_capacity;
+
+ /* First cpu with maximum and minimum original capacity */
+ int max_cap_orig_cpu, min_cap_orig_cpu;
};
extern struct root_domain def_root_domain;
@@ -708,6 +712,7 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
+ struct list_head *tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -827,6 +832,9 @@ struct rq {
/* try_to_wake_up() stats */
unsigned int ttwu_count;
unsigned int ttwu_local;
+#ifdef CONFIG_SMP
+ struct eas_stats eas_stats;
+#endif
#endif
#ifdef CONFIG_SMP
@@ -997,6 +1005,7 @@ struct sched_group_capacity {
*/
unsigned long capacity;
unsigned long max_capacity; /* Max per-cpu capacity in group */
+ unsigned long min_capacity; /* Min per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -2158,6 +2167,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
static inline void __add_nr_running(struct rq *rq, unsigned count)
{
@@ -2671,6 +2681,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+/*
+ * task_may_not_preempt - check whether a task may not be preemptible soon
+ */
+extern bool task_may_not_preempt(struct task_struct *task, int cpu);
+
#else /* CONFIG_SMP */
/*
@@ -2792,3 +2807,55 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, rq_clock(rq), flags);
+}
+
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
+{
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_update_util(rq, flags);
+}
+#else
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
+#endif /* CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant() (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant() (false)
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f0c33e..6d74a7c77c8c 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -12,6 +12,28 @@
*/
#define SCHEDSTAT_VERSION 15
+#ifdef CONFIG_SMP
+static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
+{
+ /* eas-specific runqueue stats */
+ seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
+ stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
+ stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
+ stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
+ stats->secb_insuff_cap, stats->secb_no_nrg_sav,
+ stats->secb_nrg_sav, stats->secb_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu ",
+ stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
+ stats->fbt_pref_idle, stats->fbt_count);
+
+ seq_printf(seq, "%llu %llu\n",
+ stats->cas_attempts, stats->cas_count);
+}
+#endif
+
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
@@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+ show_easstat(seq, &rq->eas_stats);
+
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
+
+ show_easstat(seq, &sd->eas_stats);
}
rcu_read_unlock();
#endif
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index b0c5fe6d1f3b..a71e94cecdb6 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -12,11 +12,12 @@
#include "tune.h"
#ifdef CONFIG_CGROUP_SCHEDTUNE
-static bool schedtune_initialized = false;
+bool schedtune_initialized = false;
#endif
unsigned int sysctl_sched_cfs_boost __read_mostly;
+extern struct reciprocal_value schedtune_spc_rdiv;
extern struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
@@ -675,6 +676,9 @@ int schedtune_task_boost(struct task_struct *p)
struct schedtune *st;
int task_boost;
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
/* Get task boost value */
rcu_read_lock();
st = task_schedtune(p);
@@ -689,6 +693,9 @@ int schedtune_prefer_idle(struct task_struct *p)
struct schedtune *st;
int prefer_idle;
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
/* Get prefer_idle value */
rcu_read_lock();
st = task_schedtune(p);
@@ -822,6 +829,7 @@ schedtune_boostgroup_init(struct schedtune *st)
bg = &per_cpu(cpu_boost_groups, cpu);
bg->group[st->idx].boost = 0;
bg->group[st->idx].tasks = 0;
+ raw_spin_lock_init(&bg->lock);
}
return 0;
@@ -1121,9 +1129,12 @@ schedtune_init(void)
pr_info("schedtune: configured to support global boosting only\n");
#endif
+ schedtune_spc_rdiv = reciprocal_value(100);
+
return 0;
nodata:
+ pr_warning("schedtune: disabled!\n");
rcu_read_unlock();
return -EINVAL;
}
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 6e053bd9830c..92c3aae8e056 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -72,7 +72,15 @@ static cpumask_t mpc_mask = CPU_MASK_ALL;
__read_mostly unsigned int walt_ravg_window = 20000000;
/* Min window size (in ns) = 10ms */
+#ifdef CONFIG_HZ_300
+/*
+ * Tick interval becomes to 3333333 due to
+ * rounding error when HZ=300.
+ */
+#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
+#else
#define MIN_SCHED_RAVG_WINDOW 10000000
+#endif
/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index e181c87a928d..f56c4da16d0b 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -55,6 +55,8 @@ static inline void walt_migrate_sync_cpu(int cpu) { }
static inline void walt_init_cpu_efficiency(void) { }
static inline u64 walt_ktime_clock(void) { return 0; }
+#define walt_cpu_high_irqload(cpu) false
+
#endif /* CONFIG_SCHED_WALT */
extern unsigned int walt_disabled;
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a972fd..b92a047ddc82 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -503,7 +503,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
+ bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
@@ -525,6 +526,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
+
+ *resched_timer =
+ (first->flags & SIGQUEUE_PREALLOC) &&
+ (info->si_code == SI_TIMER) &&
+ (info->si_sys_private);
+
__sigqueue_free(first);
} else {
/*
@@ -541,12 +548,12 @@ still_pending:
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
- siginfo_t *info)
+ siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
- collect_signal(sig, pending, info);
+ collect_signal(sig, pending, info, resched_timer);
return sig;
}
@@ -558,15 +565,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
+ bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
- signr = __dequeue_signal(&tsk->pending, mask, info);
+ signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
- mask, info);
+ mask, info, &resched_timer);
/*
* itimer signal ?
*
@@ -611,7 +619,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
- if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 479e4436f787..39ffd41594ce 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+/*
+ * active_softirqs -- per cpu, a mask of softirqs that are being handled,
+ * with the expectation that approximate answers are acceptable and therefore
+ * no synchronization.
+ */
+DEFINE_PER_CPU(__u32, active_softirqs);
+
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -253,6 +260,7 @@ asmlinkage __visible void __do_softirq(void)
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
+ __this_cpu_write(active_softirqs, pending);
local_irq_enable();
@@ -282,6 +290,7 @@ restart:
pending >>= softirq_bit;
}
+ __this_cpu_write(active_softirqs, 0);
rcu_bh_qs();
local_irq_disable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 816999804a16..8576e6385d63 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -522,13 +522,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
- .procname = "sched_is_big_little",
- .data = &sysctl_sched_is_big_little,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_sync_hint_enable",
.data = &sysctl_sched_sync_hint_enable,
.maxlen = sizeof(unsigned int),
@@ -2387,9 +2380,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
+ *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2af5687b83c9..ceec77c652b5 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -569,7 +569,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add(start, base->gettime());
+ start = ktime_add_safe(start, base->gettime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -655,7 +655,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
overrun++;
}
- alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
return overrun;
}
EXPORT_SYMBOL_GPL(alarm_forward);
@@ -843,13 +843,22 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+
+ /*
+ * Rate limit to the tick as a hot fix to prevent DOS. Will be
+ * mopped up later.
+ */
+ if (timr->it.alarm.interval.tv64 &&
+ ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC)
+ timr->it.alarm.interval = ktime_set(0, TICK_NSEC);
+
exp = timespec_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
- exp = ktime_add(now, exp);
+ exp = ktime_add_safe(now, exp);
}
alarm_start(&timr->it.alarm.alarmtimer, exp);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index ec2102104cb8..333f627a3a3b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -896,6 +896,18 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fa544f3f560..738f3467d169 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -116,6 +116,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is necessary to use in the read paths because, while the
+ * seqlock ensures we don't return a bad value while structures are updated,
+ * it doesn't protect from potential crashes. There is the possibility that
+ * the tkr's clocksource may change between the read reference, and the
+ * clock reference passed to the read function. This can cause crashes if
+ * the wrong clocksource is passed to the wrong read function.
+ * This isn't necessary to use when holding the timekeeper_lock or doing
+ * a read of the fast-timekeeper tkrs (which is protected by its own locking
+ * and update logic).
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
@@ -173,7 +193,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -207,7 +227,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
cycle_t cycle_now, delta;
/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -235,12 +255,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
- tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
- tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
@@ -404,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
- tkr->read(tkr->clock),
+ tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -461,6 +479,10 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
return cycles_at_suspend;
}
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -477,13 +499,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
- tkr_dummy.read = dummy_clock_read;
+ cycles_at_suspend = tk_clock_read(tkr);
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- tkr_dummy.read = dummy_clock_read;
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
@@ -647,11 +669,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -1434,7 +1455,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
@@ -1829,7 +1850,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 34b2a0d5cf1a..eba904bae48c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3535,7 +3535,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
int exclude_mod = 0;
int found = 0;
int ret;
- int clear_filter;
+ int clear_filter = 0;
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 60d246c4eefa..e3d61aab6bf7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6847,6 +6847,7 @@ static int instance_rmdir(const char *name)
}
kfree(tr->topts);
+ free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 12ea4ea619ee..e9092a0247bf 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -659,30 +659,25 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
- /* an address specified */
- ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
- if (ret) {
- pr_info("Failed to parse address.\n");
- return ret;
- }
- } else {
+
+ /* try to parse an address. if that fails, try to read the
+ * input as a symbol. */
+ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
- pr_info("Failed to parse symbol.\n");
+ pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
if (offset && is_return) {
pr_info("Return probe must be used without offset.\n");
return -EINVAL;
}
+ } else if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
}
argc -= 2; argv += 2;