Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile                 1
-rw-r--r--  kernel/sched/core.c                 126
-rw-r--r--  kernel/sched/core_ctl.c               4
-rw-r--r--  kernel/sched/cpufreq_schedutil.c    154
-rw-r--r--  kernel/sched/cpupri.c                 2
-rw-r--r--  kernel/sched/cputime.c                6
-rw-r--r--  kernel/sched/deadline.c               2
-rw-r--r--  kernel/sched/debug.c                  3
-rw-r--r--  kernel/sched/energy.c                13
-rw-r--r--  kernel/sched/fair.c                1070
-rw-r--r--  kernel/sched/features.h              26
-rw-r--r--  kernel/sched/hmp.c                   40
-rw-r--r--  kernel/sched/rt.c                   343
-rw-r--r--  kernel/sched/sched.h                 69
-rw-r--r--  kernel/sched/stop_task.c              3
-rw-r--r--  kernel/sched/tune.c                 116
-rw-r--r--  kernel/sched/wait.c                   8
-rw-r--r--  kernel/sched/walt.c                   7
-rw-r--r--  kernel/sched/walt.h                   2
19 files changed, 1322 insertions, 673 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7dde1b9918e4..ea301717538f 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -19,6 +19,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o sched_avg.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f6f8bb2f0d95..d28060bc74fe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -78,6 +78,7 @@
#include <linux/irq.h>
#include <linux/sched/core_ctl.h>
#include <linux/cpufreq_times.h>
+#include <linux/prefetch.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -97,6 +98,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include "walt.h"
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
@@ -1084,6 +1086,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+ if (!(p->flags & PF_KTHREAD))
+ return false;
+
+ if (p->nr_cpus_allowed != 1)
+ return false;
+
+ return true;
+}
+
+/*
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
+ * __set_cpus_allowed_ptr() and select_fallback_rq().
+ */
+static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ return false;
+
+ if (is_per_cpu_kthread(p))
+ return cpu_online(cpu);
+
+ return cpu_active(cpu);
+}
+
/*
* This is how migration works:
*
@@ -1141,16 +1170,10 @@ struct migration_arg {
*/
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
- int src_cpu;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
-
/* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ if (!is_cpu_allowed(p, dest_cpu))
return rq;
- src_cpu = cpu_of(rq);
rq = move_queued_task(rq, p, dest_cpu);
return rq;
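
The new is_cpu_allowed() helper folds the two earlier checks into one rule: the destination must be in the task's affinity mask, and it must be active, except for bound per-CPU kthreads, which may also run on CPUs that are online but not (yet) active. A minimal standalone model of that rule, in plain C with illustrative names (not kernel code):

    #include <stdbool.h>

    /* Simplified stand-ins for the task fields consulted by is_cpu_allowed(). */
    struct task_model {
        bool kthread;             /* PF_KTHREAD set                      */
        int  nr_cpus_allowed;     /* p->nr_cpus_allowed                  */
        bool in_affinity_mask;    /* cpumask_test_cpu(cpu, cpus_allowed) */
    };

    static bool cpu_allowed(const struct task_model *t,
                            bool cpu_online, bool cpu_active)
    {
        if (!t->in_affinity_mask)
            return false;                 /* affinity changed (again)          */
        if (t->kthread && t->nr_cpus_allowed == 1)
            return cpu_online;            /* per-CPU kthread: online is enough */
        return cpu_active;                /* everyone else needs an active CPU */
    }
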
@@ -1364,6 +1387,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.nr_migrations++;
perf_event_task_migrate(p);
+ walt_fixup_busy_time(p, new_cpu);
fixup_busy_time(p, new_cpu);
}
@@ -1648,9 +1672,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
- if (!cpu_online(dest_cpu))
- continue;
- if (!cpu_active(dest_cpu))
+ if (!is_cpu_allowed(p, dest_cpu))
continue;
if (cpu_isolated(dest_cpu)) {
if (allow_iso)
@@ -1989,6 +2011,9 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */
@@ -2129,9 +2154,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
raw_spin_lock(&rq->lock);
old_load = task_load(p);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
raw_spin_unlock(&rq->lock);
rcu_read_lock();
@@ -2225,6 +2254,12 @@ static void try_to_wake_up_local(struct task_struct *p)
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
+
+ wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
note_task_waking(p, wallclock);
}
@@ -2357,6 +2392,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
INIT_LIST_HEAD(&p->se.group_node);
+ walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -2641,6 +2677,7 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
add_new_task_to_grp(p);
+ walt_init_new_task_load(p);
raw_spin_lock_irqsave(&p->pi_lock, flags);
p->state = TASK_RUNNING;
@@ -2659,6 +2696,7 @@ void wake_up_new_task(struct task_struct *p)
#endif
rq = __task_rq_lock(p);
mark_task_starting(p);
+ walt_mark_task_starting(p);
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
@@ -3129,6 +3167,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -3162,6 +3217,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
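
prefetch_curr_exec_start() issues prefetches for the current sched_entity and its exec_start field just before update_curr() dereferences them, hiding the cache misses seen in task_sched_runtime(). The same idiom in a standalone sketch built on the GCC/Clang builtin (illustrative struct, not the kernel layout):

    struct entity {
        unsigned long long exec_start;
        /* ... other fields touched by the update path ... */
    };

    static inline void prefetch_entity(const struct entity *e)
    {
        /* Start pulling the cache lines in early; they are used shortly after. */
        __builtin_prefetch(e);
        __builtin_prefetch(&e->exec_start);
    }
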
@@ -3189,13 +3245,18 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
old_load = task_load(curr);
+ walt_set_window_start(rq);
set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
calc_global_load_tick(rq);
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+
+ cpufreq_update_util(rq, 0);
early_notif = early_detection_notify(rq, wallclock);
raw_spin_unlock(&rq->lock);
@@ -3554,6 +3615,9 @@ static void __sched notrace __schedule(bool preempt)
update_rq_clock(rq);
next = pick_next_task(rq, prev);
+ wallclock = walt_ktime_clock();
+ walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+ walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->clock_skip_update = 0;
@@ -3564,6 +3628,7 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
+ cpufreq_update_util(rq, 0);
if (!is_idle_task(prev) && !prev->on_rq)
update_avg_burst(prev);
@@ -3582,6 +3647,7 @@ static void __sched notrace __schedule(bool preempt)
cpu = cpu_of(rq);
} else {
update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
+ cpufreq_update_util(rq, 0);
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3819,7 +3885,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(prio)) {
struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
queue_flag |= ENQUEUE_REPLENISH;
} else
@@ -4892,6 +4959,9 @@ again:
retval = -EINVAL;
}
+ if (!retval && !(p->flags & PF_KTHREAD))
+ cpumask_and(&p->cpus_requested, in_mask, cpu_possible_mask);
+
out_free_new_mask:
free_cpumask_var(new_mask);
out_free_cpus_allowed:
@@ -4991,14 +5061,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
size_t retlen = min_t(size_t, len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
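
The getaffinity change zero-allocates the temporary mask and copies cpumask_bits(mask) rather than the cpumask_var_t wrapper, so only initialized bitmap bytes ever reach userspace, even when the user buffer is shorter than the kernel mask. A simplified userspace analogue of the copy-out (hypothetical helper; memcpy stands in for copy_to_user):

    #include <string.h>

    /* Copy at most user_len bytes of a zero-initialized bitmap to the caller;
     * returns the number of bytes copied, mirroring the retlen return value.
     */
    static size_t copy_mask_out(void *user_buf, size_t user_len,
                                const unsigned long *bits, size_t mask_bytes)
    {
        size_t retlen = user_len < mask_bytes ? user_len : mask_bytes;

        memcpy(user_buf, bits, retlen);   /* never reads past the real bitmap */
        return retlen;
    }
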
@@ -5913,12 +5983,6 @@ int sched_isolate_cpu(int cpu)
cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
- /* We cannot isolate ALL cpus in the system */
- if (cpumask_weight(&avail_cpus) == 1) {
- ret_code = -EINVAL;
- goto out;
- }
-
if (!cpu_online(cpu)) {
ret_code = -EINVAL;
goto out;
@@ -5927,6 +5991,13 @@ int sched_isolate_cpu(int cpu)
if (++cpu_isolation_vote[cpu] > 1)
goto out;
+ /* We cannot isolate ALL cpus in the system */
+ if (cpumask_weight(&avail_cpus) == 1) {
+ --cpu_isolation_vote[cpu];
+ ret_code = -EINVAL;
+ goto out;
+ }
+
/*
* There is a race between watchdog being enabled by hotplug and
* core isolation disabling the watchdog. When a CPU is hotplugged in
@@ -5950,7 +6021,9 @@ int sched_isolate_cpu(int cpu)
smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
+ irq_lock_sparse();
stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
+ irq_unlock_sparse();
calc_load_migrate(rq);
update_max_interval();
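
Moving the "cannot isolate all CPUs" check below the vote increment keeps repeat requests for an already-isolated CPU cheap (the early exit on a vote count above one), while a rejected first request now undoes the vote it just took. A small model of that counting, with illustrative names:

    /* Returns 0 on success, -1 when this CPU is the last one available.
     * 'votes' and 'avail_cpus' stand in for cpu_isolation_vote[] and the
     * weight of the online-and-not-isolated mask.
     */
    static int isolate_vote(int *votes, int avail_cpus)
    {
        if (++(*votes) > 1)
            return 0;          /* already isolated: just record another vote */

        if (avail_cpus == 1) {
            --(*votes);        /* reject, and roll back the vote we took     */
            return -1;
        }
        return 0;              /* proceed with the actual isolation work     */
    }
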
@@ -6309,6 +6382,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_UP_PREPARE:
raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_set_window_start(rq);
set_window_start(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
rq->calc_load_update = calc_load_update;
@@ -6330,6 +6404,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_migrate_sync_cpu(cpu);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -8315,6 +8390,7 @@ void __init sched_init_smp(void)
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
BUG();
+ cpumask_copy(&current->cpus_requested, cpu_possible_mask);
sched_init_granularity();
free_cpumask_var(non_isolated_cpus);
@@ -8524,6 +8600,11 @@ void __init sched_init(void)
}
#endif
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+ rq->cur_irqload = 0;
+ rq->avg_irqload = 0;
+ rq->irqload_ts = 0;
+#endif
INIT_LIST_HEAD(&rq->cfs_tasks);
@@ -9252,8 +9333,9 @@ int sched_rr_handler(struct ctl_table *table, int write,
/* make sure that internally we keep jiffies */
/* also, writing zero resets timeslice to default */
if (!ret && write) {
- sched_rr_timeslice = sched_rr_timeslice <= 0 ?
- RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
@@ -9316,7 +9398,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
* This is called before wake_up_new_task(), therefore we really only
* have to set its group bits, all the other stuff does not apply.
*/
-static void cpu_cgroup_fork(struct task_struct *task, void *private)
+static void cpu_cgroup_fork(struct task_struct *task)
{
unsigned long flags;
struct rq *rq;
diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c
index 70cd0649ac9b..2f060a570061 100644
--- a/kernel/sched/core_ctl.c
+++ b/kernel/sched/core_ctl.c
@@ -22,6 +22,7 @@
#include <linux/sched/rt.h>
#include <trace/events/sched.h>
+#include "sched.h"
#define MAX_CPUS_PER_CLUSTER 4
#define MAX_CLUSTERS 2
@@ -575,7 +576,8 @@ static bool eval_need(struct cluster_data *cluster)
cluster->active_cpus = get_active_cpu_count(cluster);
thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0;
list_for_each_entry(c, &cluster->lru, sib) {
- if (c->busy >= cluster->busy_up_thres[thres_idx])
+ if (c->busy >= cluster->busy_up_thres[thres_idx] ||
+ sched_cpu_high_irqload(c->cpu))
c->is_busy = true;
else if (c->busy < cluster->busy_down_thres[thres_idx])
c->is_busy = false;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6c84b4d28914..869a125ebb87 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -33,6 +33,7 @@ struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int up_rate_limit_us;
unsigned int down_rate_limit_us;
+ bool iowait_boost_enable;
};
struct sugov_policy {
@@ -81,6 +82,7 @@ struct sugov_cpu {
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+static DEFINE_PER_CPU(struct sugov_tunables *, cached_tunables);
/************************ Governor internals ***********************/
@@ -88,16 +90,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
- if (sg_policy->work_in_progress)
- return false;
-
if (unlikely(sg_policy->need_freq_update)) {
- sg_policy->need_freq_update = false;
- /*
- * This happens when limits change, so forget the previous
- * next_freq value and force an update.
- */
- sg_policy->next_freq = UINT_MAX;
return true;
}
@@ -149,7 +142,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
policy->cur = next_freq;
trace_cpu_frequency(next_freq, smp_processor_id());
- } else {
+ } else if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
@@ -186,8 +179,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
+
+ sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
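
get_next_freq() keeps the usual schedutil heuristic, next_freq = 1.25 * base_freq * util / max (computed as freq + freq/4), and now consults need_freq_update rather than the UINT_MAX sentinel to decide whether the cached result may be reused. The arithmetic as a standalone sketch:

    /* schedutil-style raw target: 1.25 * base_freq * util / max, where
     * base_freq is cpuinfo.max_freq on frequency-invariant systems.
     */
    static unsigned int raw_target_freq(unsigned int base_freq,
                                        unsigned long util, unsigned long max)
    {
        return (base_freq + (base_freq >> 2)) * util / max;
    }

    /* Example: base 2000000 kHz, util 512, max 1024
     *   -> (2000000 + 500000) * 512 / 1024 = 1250000 kHz, before resolving
     *      against the driver's frequency table.
     */
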
@@ -228,6 +223,20 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+
+ if (!sg_policy->tunables->iowait_boost_enable)
+ return;
+
+ if (sg_cpu->iowait_boost) {
+ s64 delta_ns = time - sg_cpu->last_update;
+
+ /* Clear iowait_boost if the CPU appears to have been idle. */
+ if (delta_ns > TICK_NSEC) {
+ sg_cpu->iowait_boost = 0;
+ sg_cpu->iowait_boost_pending = false;
+ }
+ }
if (flags & SCHED_CPUFREQ_IOWAIT) {
if (sg_cpu->iowait_boost_pending)
return;
@@ -241,14 +250,6 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
} else {
sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
}
- } else if (sg_cpu->iowait_boost) {
- s64 delta_ns = time - sg_cpu->last_update;
-
- /* Clear iowait_boost if the CPU apprears to have been idle. */
- if (delta_ns > TICK_NSEC) {
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_pending = false;
- }
}
}
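
Reordering the boost logic makes the decay check unconditional: a stale boost is dropped first whenever the CPU looks to have been idle for more than a tick, and only then can a SCHED_CPUFREQ_IOWAIT request (re)arm the boost, all gated by the new iowait_boost_enable tunable. A compact model of that state machine (simplified; the pending/doubling behaviour is omitted):

    #define TICK_NS 4000000ULL      /* illustrative tick length */

    struct boost_state {
        unsigned int boost;         /* current boost, 0 = none  */
        unsigned long long last;    /* time of previous update  */
    };

    static void iowait_boost_update(struct boost_state *s, unsigned long long now,
                                    int iowait_flag, int enabled,
                                    unsigned int policy_min)
    {
        if (!enabled)
            return;
        if (s->boost && now - s->last > TICK_NS)
            s->boost = 0;           /* CPU was idle: clear the stale boost     */
        if (iowait_flag && !s->boost)
            s->boost = policy_min;  /* arm a fresh boost at the policy minimum */
    }
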
@@ -305,6 +306,13 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
sugov_set_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
+ /*
+ * For slow-switch systems, single policy requests can't run at the
+ * moment if update is in progress, unless we acquire update_lock.
+ */
+ if (sg_policy->work_in_progress)
+ return;
+
if (!sugov_should_update_freq(sg_policy, time))
return;
@@ -320,7 +328,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq) {
+ if (busy && next_f < sg_policy->next_freq &&
+ sg_policy->next_freq != UINT_MAX) {
next_f = sg_policy->next_freq;
/* Reset cached freq as next_freq has changed */
@@ -360,7 +369,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
- if (j_util * max > j_max * util) {
+ if (j_util * max >= j_max * util) {
util = j_util;
max = j_max;
}
@@ -405,13 +414,27 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+ unsigned int freq;
+ unsigned long flags;
+
+ /*
+ * Hold sg_policy->update_lock briefly to handle the following race:
+ * if sg_policy->next_freq is read here and then updated by
+ * sugov_update_shared() just before work_in_progress is set to false
+ * here, we may miss queueing the new update.
+ *
+ * Note: If a work was queued after the update_lock is released,
+ * sugov_work will just be called again by kthread_work code; and the
+ * request will be processed before the sugov thread sleeps.
+ */
+ raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
+ freq = sg_policy->next_freq;
+ sg_policy->work_in_progress = false;
+ raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
mutex_lock(&sg_policy->work_lock);
- __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
- CPUFREQ_RELATION_L);
+ __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&sg_policy->work_lock);
-
- sg_policy->work_in_progress = false;
}
static void sugov_irq_work(struct irq_work *irq_work)
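
sugov_work() now snapshots next_freq and clears work_in_progress while holding update_lock, then drops the lock before the slow __cpufreq_driver_target() call; an update racing in just after the snapshot therefore re-queues the work instead of being lost. The same pattern in a pthread sketch (hypothetical names; apply_frequency() stands in for the driver call):

    #include <pthread.h>

    struct gov {
        pthread_mutex_t update_lock;
        unsigned int next_freq;
        int work_in_progress;
    };

    static void apply_frequency(unsigned int freq)
    {
        (void)freq;                   /* stand-in for __cpufreq_driver_target() */
    }

    static void freq_worker(struct gov *g)
    {
        unsigned int freq;

        pthread_mutex_lock(&g->update_lock);
        freq = g->next_freq;          /* snapshot the latest request          */
        g->work_in_progress = 0;      /* let a newer request queue more work  */
        pthread_mutex_unlock(&g->update_lock);

        apply_frequency(freq);        /* slow path runs without the lock held */
    }
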
@@ -510,12 +533,36 @@ static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
return count;
}
+static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set,
+ char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->iowait_boost_enable);
+}
+
+static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set,
+ const char *buf, size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ bool enable;
+
+ if (kstrtobool(buf, &enable))
+ return -EINVAL;
+
+ tunables->iowait_boost_enable = enable;
+
+ return count;
+}
+
static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
+static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable);
static struct attribute *sugov_attributes[] = {
&up_rate_limit_us.attr,
&down_rate_limit_us.attr,
+ &iowait_boost_enable.attr,
NULL
};
@@ -610,6 +657,29 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic
return tunables;
}
+static void sugov_tunables_save(struct cpufreq_policy *policy,
+ struct sugov_tunables *tunables)
+{
+ int cpu;
+ struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
+
+ if (!have_governor_per_policy())
+ return;
+
+ if (!cached) {
+ cached = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (!cached) {
+ pr_warn("Couldn't allocate tunables for caching\n");
+ return;
+ }
+ for_each_cpu(cpu, policy->related_cpus)
+ per_cpu(cached_tunables, cpu) = cached;
+ }
+
+ cached->up_rate_limit_us = tunables->up_rate_limit_us;
+ cached->down_rate_limit_us = tunables->down_rate_limit_us;
+}
+
static void sugov_tunables_free(struct sugov_tunables *tunables)
{
if (!have_governor_per_policy())
@@ -618,6 +688,25 @@ static void sugov_tunables_free(struct sugov_tunables *tunables)
kfree(tunables);
}
+static void sugov_tunables_restore(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
+
+ if (!cached)
+ return;
+
+ tunables->up_rate_limit_us = cached->up_rate_limit_us;
+ tunables->down_rate_limit_us = cached->down_rate_limit_us;
+ sg_policy->up_rate_delay_ns =
+ tunables->up_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->down_rate_delay_ns =
+ tunables->down_rate_limit_us * NSEC_PER_USEC;
+ sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+ sg_policy->down_rate_delay_ns);
+}
+
static int sugov_init(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
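
sugov_tunables_save()/sugov_tunables_restore() add a small per-policy cache: the user-set rate limits are stashed in cached_tunables when the governor exits and copied back (and re-derived into the nanosecond delays) the next time it starts on the same policy. The pattern, reduced to its essentials with illustrative types:

    struct rate_limits { unsigned int up_us, down_us; };

    static struct rate_limits saved;     /* stand-in for the per-CPU cache  */
    static int have_saved;

    static void save_limits(const struct rate_limits *cur)
    {
        saved = *cur;                    /* remember the user's settings    */
        have_saved = 1;
    }

    static void restore_limits(struct rate_limits *cur)
    {
        if (have_saved)
            *cur = saved;                /* reapply on the next governor start */
    }
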
@@ -675,9 +764,13 @@ static int sugov_init(struct cpufreq_policy *policy)
}
}
+ tunables->iowait_boost_enable = policy->iowait_boost_enable;
+
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
+ sugov_tunables_restore(policy);
+
ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
get_governor_parent_kobj(policy), "%s",
cpufreq_gov_schedutil.name);
@@ -717,8 +810,10 @@ static int sugov_exit(struct cpufreq_policy *policy)
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
- if (!count)
+ if (!count) {
+ sugov_tunables_save(policy, tunables);
sugov_tunables_free(tunables);
+ }
mutex_unlock(&global_tunables_lock);
@@ -740,7 +835,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
update_min_rate_limit_us(sg_policy);
sg_policy->last_freq_update_time = 0;
- sg_policy->next_freq = UINT_MAX;
+ sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
@@ -752,6 +847,11 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_DL;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ }
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 14225d5d8617..867cb7877511 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -133,6 +133,8 @@ retry:
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_andnot(lowest_mask, lowest_mask,
+ cpu_isolated_mask);
if (drop_nopreempts)
drop_nopreempt_cpus(lowest_mask);
/*
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e6ec68c15aa3..cf6729cb46dd 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -6,6 +6,7 @@
#include <linux/context_tracking.h>
#include <linux/cpufreq_times.h>
#include "sched.h"
+#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -79,9 +80,10 @@ void irqtime_account_irq(struct task_struct *curr)
irq_time_write_end();
- if (account)
+ if (account) {
+ walt_account_irqtime(cpu, curr, delta, wallclock);
sched_account_irqtime(cpu, curr, delta, wallclock);
- else if (curr != this_cpu_ksoftirqd())
+ } else if (curr != this_cpu_ksoftirqd())
sched_account_irqstart(cpu, curr, wallclock);
local_irq_restore(flags);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 188c8388a63f..d40995e9cf5f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1038,6 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
@@ -1053,6 +1054,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ed8e6bb4531b..5c8e6e37fce7 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -104,7 +104,8 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+ return group_path;
}
#endif
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
index 50d183b1e156..770624996f9f 100644
--- a/kernel/sched/energy.c
+++ b/kernel/sched/energy.c
@@ -91,11 +91,17 @@ void init_sched_energy_costs(void)
sge = kcalloc(1, sizeof(struct sched_group_energy),
GFP_NOWAIT);
+ if (!sge)
+ goto out;
nstates = (prop->length / sizeof(u32)) / 2;
cap_states = kcalloc(nstates,
sizeof(struct capacity_state),
GFP_NOWAIT);
+ if (!cap_states) {
+ kfree(sge);
+ goto out;
+ }
for (i = 0, val = prop->value; i < nstates; i++) {
cap_states[i].cap = be32_to_cpup(val++);
@@ -108,6 +114,8 @@ void init_sched_energy_costs(void)
prop = of_find_property(cp, "idle-cost-data", NULL);
if (!prop || !prop->value) {
pr_warn("No idle-cost data, skipping sched_energy init\n");
+ kfree(sge);
+ kfree(cap_states);
goto out;
}
@@ -115,6 +123,11 @@ void init_sched_energy_costs(void)
idle_states = kcalloc(nstates,
sizeof(struct idle_state),
GFP_NOWAIT);
+ if (!idle_states) {
+ kfree(sge);
+ kfree(cap_states);
+ goto out;
+ }
for (i = 0, val = prop->value; i < nstates; i++)
idle_states[i].power = be32_to_cpup(val++);
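
The init_sched_energy_costs() changes add the missing NULL checks and free whatever was already allocated before bailing out, so a failed GFP_NOWAIT allocation no longer leaks the earlier buffers or dereferences NULL. The same cleanup-on-failure pattern in a standalone sketch:

    #include <stdlib.h>

    /* Allocate two dependent tables; on any failure release what we already hold. */
    static int alloc_energy_tables(int nstates, unsigned int **caps,
                                   unsigned int **idles)
    {
        *caps = calloc(nstates, sizeof(**caps));
        if (!*caps)
            return -1;

        *idles = calloc(nstates, sizeof(**idles));
        if (!*idles) {
            free(*caps);                 /* undo the first allocation */
            *caps = NULL;
            return -1;
        }
        return 0;
    }
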
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df2e6dd2c665..43c3d2684f64 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -55,6 +55,12 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL;
unsigned int sysctl_sched_sync_hint_enable = 1;
unsigned int sysctl_sched_cstate_aware = 1;
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -2504,7 +2510,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -3001,6 +3007,8 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
int i;
struct cpumask search_cpus;
+ extern int num_clusters;
+
while (!bitmap_empty(env->backup_list, num_clusters)) {
next = next_candidate(env->backup_list, 0, num_clusters);
__clear_bit(next->id, env->backup_list);
@@ -3024,6 +3032,8 @@ next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env,
{
struct sched_cluster *next = NULL;
+ extern int num_clusters;
+
__clear_bit(cluster->id, env->candidate_list);
if (env->rtg && preferred_cluster(cluster, env->p))
@@ -3680,68 +3690,6 @@ static inline int migration_needed(struct task_struct *p, int cpu)
return 0;
}
-static inline int
-kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
-{
- unsigned long flags;
- int rc = 0;
-
- /* Invoke active balance to force migrate currently running task */
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (!rq->active_balance) {
- rq->active_balance = 1;
- rq->push_cpu = new_cpu;
- get_task_struct(p);
- rq->push_task = p;
- rc = 1;
- }
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-
- return rc;
-}
-
-static DEFINE_RAW_SPINLOCK(migration_lock);
-
-static bool do_migration(int reason, int new_cpu, int cpu)
-{
- if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
- && same_cluster(new_cpu, cpu))
- return false;
-
- /* Inter cluster high irqload migrations are OK */
- return new_cpu != cpu;
-}
-
-/*
- * Check if currently running task should be migrated to a better cpu.
- *
- * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
- */
-void check_for_migration(struct rq *rq, struct task_struct *p)
-{
- int cpu = cpu_of(rq), new_cpu;
- int active_balance = 0, reason;
-
- reason = migration_needed(p, cpu);
- if (!reason)
- return;
-
- raw_spin_lock(&migration_lock);
- new_cpu = select_best_cpu(p, cpu, reason, 0);
-
- if (do_migration(reason, new_cpu, cpu)) {
- active_balance = kick_active_balance(rq, p, new_cpu);
- if (active_balance)
- mark_reserved(new_cpu);
- }
-
- raw_spin_unlock(&migration_lock);
-
- if (active_balance)
- stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
- &rq->active_balance_work);
-}
-
#ifdef CONFIG_CFS_BANDWIDTH
static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq)
@@ -4175,6 +4123,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
return 1;
}
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+ /*
+ * If sched_entity still have not zero load or utilization, we have to
+ * decay it:
+ */
+ if (se->avg.load_avg || se->avg.util_avg)
+ return false;
+
+ /*
+ * If there is a pending propagation, we have to update the load and
+ * the utilization of the sched_entity:
+ */
+ if (gcfs_rq->propagate_avg)
+ return false;
+
+ /*
+ * Otherwise, the load and the utilization of the sched_entity is
+ * already zero and there is no pending propagation, so it will be a
+ * waste of time to try to decay it:
+ */
+ return true;
+}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
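
skip_blocked_update() lets the blocked-load update path skip a group entity whose averages have fully decayed and whose child cfs_rq has nothing left to propagate, since decaying zero is a no-op. The decision reduces to the following model (illustrative struct mirroring the fields used above):

    #include <stdbool.h>

    struct gse_model {
        unsigned long load_avg, util_avg;   /* se->avg.load_avg / util_avg */
        bool propagate_avg;                 /* pending child propagation   */
    };

    static bool can_skip_blocked_update(const struct gse_model *se)
    {
        if (se->load_avg || se->util_avg)
            return false;       /* still decaying, must be updated             */
        if (se->propagate_avg)
            return false;       /* a propagation is pending, must be updated   */
        return true;            /* nothing to do: decaying zero is wasted work */
    }
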
@@ -4292,6 +4270,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
*/
#define UPDATE_TG 0x1
#define SKIP_AGE_LOAD 0x2
+#define SKIP_CPUFREQ 0x4
/* Update task and its cfs_rq load average */
static inline void update_load_avg(struct sched_entity *se, int flags)
@@ -4312,7 +4291,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
cfs_rq->curr == se, NULL);
}
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ));
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
@@ -4488,6 +4467,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
#define UPDATE_TG 0x0
#define SKIP_AGE_LOAD 0x0
+#define SKIP_CPUFREQ 0x0
static inline void update_load_avg(struct sched_entity *se, int not_used1){}
static inline void
@@ -4710,6 +4690,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ int update_flags;
+
/*
* Update run-time statistics of the 'current'.
*/
@@ -4723,7 +4705,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
- update_load_avg(se, UPDATE_TG);
+ update_flags = UPDATE_TG;
+
+ if (flags & DEQUEUE_IDLE)
+ update_flags |= SKIP_CPUFREQ;
+
+ update_load_avg(se, update_flags);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
@@ -5011,14 +4998,10 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*/
void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
- u64 now;
-
if (cfs_b->quota == RUNTIME_INF)
return;
- now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -5040,7 +5023,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -5057,61 +5040,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if ((s64)(expires - cfs_rq->runtime_expires) > 0)
- cfs_rq->runtime_expires = expires;
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced. It is valid to compare
- * cfs_b->runtime_expires without any locks since we only care about
- * exact equality, so a partial write will still work.
- */
-
- if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
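
With the expiration machinery gone, per-cfs_rq runtime accounting reduces to: subtract the executed delta, and once the local pool is exhausted try to pull another slice from the global bandwidth pool, throttling if nothing is left. A simplified model of that flow:

    /* Returns 1 if the queue may keep running, 0 if it must be throttled.
     * 'global' models cfs_b->runtime and 'slice' the per-refill grant.
     */
    static int account_runtime(long long *local, long long *global,
                               long long delta, long long slice)
    {
        *local -= delta;
        if (*local > 0)
            return 1;

        if (*global > 0) {                      /* assign_cfs_rq_runtime() analogue */
            long long grant = slice - *local;   /* top back up to one full slice    */
            if (grant > *global)
                grant = *global;
            *global -= grant;
            *local  += grant;
        }
        return *local > 0;
    }
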
@@ -5345,8 +5284,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cpu_temp(cpu_of(rq)));
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -5367,7 +5305,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -5392,7 +5329,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -5420,8 +5357,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -5434,8 +5369,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
cfs_b->distribute_running = 1;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock(&cfs_b->lock);
cfs_b->distribute_running = 0;
@@ -5475,7 +5409,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -5483,7 +5417,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -5512,8 +5446,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -5545,7 +5478,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock(&cfs_b->lock);
@@ -5562,7 +5494,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -5571,11 +5502,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock(&cfs_b->lock);
- if (expires == cfs_b->runtime_expires)
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
@@ -5673,20 +5603,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
- new = (old * 147) / 128; /* ~115% */
- new = min(new, max_cfs_quota_period);
-
- cfs_b->period = ns_to_ktime(new);
-
- /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
- cfs_b->quota *= new;
- cfs_b->quota = div64_u64(cfs_b->quota, old);
-
- pr_warn_ratelimited(
- "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
- smp_processor_id(),
- div_u64(new, NSEC_PER_USEC),
- div_u64(cfs_b->quota, NSEC_PER_USEC));
+ /*
+ * Grow period by a factor of 2 to avoid losing precision.
+ * Precision loss in the quota/period ratio can cause __cfs_schedulable
+ * to fail.
+ */
+ new = old * 2;
+ if (new < max_cfs_quota_period) {
+ cfs_b->period = ns_to_ktime(new);
+ cfs_b->quota *= 2;
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ } else {
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(old, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+ }
/* reset count so we don't come right back in here */
count = 0;
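
Doubling both period and quota keeps the quota/period ratio exact (no division, hence no precision loss that could make __cfs_schedulable() fail), and the growth simply stops once the doubled period would reach max_cfs_quota_period. Numerically:

    /* Example: period 100 us, quota 50 us (50%) becomes 200 us / 100 us,
     * still exactly 50%, whereas the old *147/128 scaling could round the
     * ratio slightly. Returns 1 if the period was grown, 0 otherwise.
     */
    static int grow_period(unsigned long long *period_ns,
                           unsigned long long *quota_ns,
                           unsigned long long max_period_ns)
    {
        unsigned long long new_period = *period_ns * 2;

        if (new_period >= max_period_ns)
            return 0;             /* cannot grow further, keep warning instead */
        *period_ns = new_period;
        *quota_ns *= 2;
        return 1;
    }
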
@@ -5894,6 +5832,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
#ifdef CONFIG_SMP
int task_new = flags & ENQUEUE_WAKEUP_NEW;
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
#endif
/*
@@ -5919,6 +5876,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
flags = ENQUEUE_WAKEUP;
@@ -5927,6 +5885,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
inc_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
@@ -5942,27 +5901,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
-
- /*
- * Update SchedTune accounting.
- *
- * We do it before updating the CPU capacity to ensure the
- * boost value of the current task is accounted for in the
- * selection of the OPP.
- *
- * We do it also in the case where we enqueue a throttled task;
- * we could argue that a throttled task should not boost a CPU,
- * however:
- * a) properly implementing CPU boosting considering throttled
- * tasks will increase a lot the complexity of the solution
- * b) it's not easy to quantify the benefits introduced by
- * such a more complex solution.
- * Thus, for the time being we go for the simple solution and boost
- * also for throttled RQs.
- */
- schedtune_enqueue_task(p, cpu_of(rq));
-
if (energy_aware() && !se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_new && !rq->rd->overutilized &&
cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
@@ -5987,6 +5927,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ if (task_sleep && rq->nr_running == 1)
+ flags |= DEQUEUE_IDLE;
+
+#ifdef CONFIG_SMP
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+#endif
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -6000,6 +5954,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
/* Don't dequeue parent if it has other entities besides us */
@@ -6018,14 +5973,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
for_each_sched_entity(se) {
+ int update_flags;
+
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
dec_cfs_rq_hmp_stats(cfs_rq, p, 1);
if (cfs_rq_throttled(cfs_rq))
break;
- update_load_avg(se, UPDATE_TG);
+ update_flags = UPDATE_TG;
+
+ if (flags & DEQUEUE_IDLE)
+ update_flags |= SKIP_CPUFREQ;
+
+ update_load_avg(se, update_flags);
update_cfs_shares(se);
}
@@ -6035,16 +5998,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
-
- /*
- * Update SchedTune accounting
- *
- * We do it before updating the CPU capacity to ensure the
- * boost value of the current task is accounted for in the
- * selection of the OPP.
- */
- schedtune_dequeue_task(p, cpu_of(rq));
-
+ if (energy_aware() && !se)
+ walt_dec_cumulative_runnable_avg(rq, p);
#endif /* CONFIG_SMP */
hrtick_update(rq);
@@ -6457,28 +6412,79 @@ unsigned long capacity_curr_of(int cpu)
>> SCHED_CAPACITY_SHIFT;
}
+/*
+ * CPU candidates.
+ *
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently we support only two possible candidates: the task's previous CPU
+ * and another candidate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
+ */
+#define EAS_CPU_PRV 0
+#define EAS_CPU_NXT 1
+#define EAS_CPU_BKP 2
+#define EAS_CPU_CNT 3
+
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and min freq scaling.
+ */
+unsigned long capacity_min_of(int cpu)
+{
+ if (!sched_feat(MIN_CAPACITY_CAPPING))
+ return 0;
+ return arch_scale_cpu_capacity(NULL, cpu) *
+ arch_scale_min_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+/*
+ * energy_diff - supports the computation of the estimated energy impact in
+ * moving a "task"'s "util_delta" between different CPU candidates.
+ */
struct energy_env {
- struct sched_group *sg_top;
- struct sched_group *sg_cap;
- int cap_idx;
+ /* Utilization to move */
+ struct task_struct *p;
int util_delta;
- int src_cpu;
- int dst_cpu;
- int trg_cpu;
- int energy;
- int payoff;
- struct task_struct *task;
- struct {
- int before;
- int after;
- int delta;
- int diff;
- } nrg;
+
+ /* Mask of CPUs candidates to evaluate */
+ cpumask_t cpus_mask;
+
+ /* CPU candidates to evaluate */
struct {
- int before;
- int after;
- int delta;
- } cap;
+
+ /* CPU ID, must be in cpus_mask */
+ int cpu_id;
+
+ /*
+ * Index (into sched_group_energy::cap_states) of the OPP the
+ * CPU needs to run at if the task is placed on it.
+ * This includes both the active and blocked load, due to
+ * other tasks on this CPU, as well as the task's own
+ * utilization.
+ */
+ int cap_idx;
+ int cap;
+
+ /* Estimated system energy */
+ unsigned int energy;
+
+ /* Estimated energy variation wrt EAS_CPU_PRV */
+ int nrg_delta;
+
+ } cpu[EAS_CPU_CNT];
+
+ /*
+ * Index (into energy_env::cpu) of the most energy-efficient CPU for
+ * the specified energy_env::task
+ */
+ int next_idx;
+
+ /* Support data */
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ struct sched_group *sg;
};
static int cpu_util_wake(int cpu, struct task_struct *p);
@@ -6506,24 +6512,33 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
return (util << SCHED_CAPACITY_SHIFT)/capacity;
}
-static unsigned long group_max_util(struct energy_env *eenv)
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
{
unsigned long max_util = 0;
unsigned long util;
int cpu;
for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
- util = cpu_util_wake(cpu, eenv->task);
+ util = cpu_util_wake(cpu, eenv->p);
/*
* If we are looking at the target CPU specified by the eenv,
* then we should add the (estimated) utilization of the task
* assuming we will wake it up on that CPU.
*/
- if (unlikely(cpu == eenv->trg_cpu))
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
util += eenv->util_delta;
max_util = max(max_util, util);
+
+ /*
+ * Take into account any minimum frequency imposed
+ * elsewhere which limits the energy states available.
+ * If the MIN_CAPACITY_CAPPING feature is not enabled,
+ * capacity_min_of() will return 0 (not capped).
+ */
+ max_util = max(max_util, capacity_min_of(cpu));
+
}
return max_util;
@@ -6541,21 +6556,21 @@ static unsigned long group_max_util(struct energy_env *eenv)
* estimate (more busy).
*/
static unsigned
-long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
{
- unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long capacity = eenv->cpu[cpu_idx].cap;
unsigned long util, util_sum = 0;
int cpu;
- for_each_cpu(cpu, sched_group_cpus(sg)) {
- util = cpu_util_wake(cpu, eenv->task);
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
+ util = cpu_util_wake(cpu, eenv->p);
/*
* If we are looking at the target CPU specified by the eenv,
* then we should add the (estimated) utilization of the task
* assuming we will wake it up on that CPU.
*/
- if (unlikely(cpu == eenv->trg_cpu))
+ if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
util += eenv->util_delta;
util_sum += __cpu_norm_util(util, capacity);
@@ -6564,27 +6579,31 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
}
-static int find_new_capacity(struct energy_env *eenv,
- const struct sched_group_energy * const sge)
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
{
+ const struct sched_group_energy *sge = eenv->sg->sge;
int idx, max_idx = sge->nr_cap_states - 1;
- unsigned long util = group_max_util(eenv);
+ unsigned long util = group_max_util(eenv, cpu_idx);
/* default is max_cap if we don't find a match */
- eenv->cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap_idx = max_idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
for (idx = 0; idx < sge->nr_cap_states; idx++) {
if (sge->cap_states[idx].cap >= util) {
- eenv->cap_idx = idx;
+ /* Keep track of SG's capacity */
+ eenv->cpu[cpu_idx].cap_idx = idx;
+ eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
break;
}
}
- return eenv->cap_idx;
+ return eenv->cpu[cpu_idx].cap_idx;
}
-static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
{
+ struct sched_group *sg = eenv->sg;
int i, state = INT_MAX;
int src_in_grp, dst_in_grp;
long grp_util = 0;
@@ -6596,8 +6615,10 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
state++;
- src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
- dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+ src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+ sched_group_cpus(sg));
+ dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+ sched_group_cpus(sg));
if (src_in_grp == dst_in_grp) {
/* both CPUs under consideration are in the same group or not in
* either group, migration should leave idle state the same.
@@ -6610,8 +6631,8 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
* achievable when we move the task.
*/
for_each_cpu(i, sched_group_cpus(sg)) {
- grp_util += cpu_util_wake(i, eenv->task);
- if (unlikely(i == eenv->trg_cpu))
+ grp_util += cpu_util_wake(i, eenv->p);
+ if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
grp_util += eenv->util_delta;
}
@@ -6647,19 +6668,65 @@ end:
}
/*
- * sched_group_energy(): Computes the absolute energy consumption of cpus
- * belonging to the sched_group including shared resources shared only by
- * members of the group. Iterates over all cpus in the hierarchy below the
- * sched_group starting from the bottom working it's way up before going to
- * the next cpu until all cpus are covered at all levels. The current
- * implementation is likely to gather the same util statistics multiple times.
- * This can probably be done in a faster but more complex way.
- * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
+ *
+ * NOTE: in the following computations for busy_energy and idle_energy we do
+ * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors.
+ * The required scaling will be performed just one time, by the calling
+ * functions, once we have accumulated the contributions for all the SGs.
*/
-static int sched_group_energy(struct energy_env *eenv)
+static void calc_sg_energy(struct energy_env *eenv)
+{
+ struct sched_group *sg = eenv->sg;
+ int busy_energy, idle_energy;
+ unsigned int busy_power;
+ unsigned int idle_power;
+ unsigned long sg_util;
+ int cap_idx, idle_idx;
+ int total_energy = 0;
+ int cpu_idx;
+
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+
+
+ if (eenv->cpu[cpu_idx].cpu_id == -1)
+ continue;
+ /* Compute ACTIVE energy */
+ cap_idx = find_new_capacity(eenv, cpu_idx);
+ busy_power = sg->sge->cap_states[cap_idx].power;
+ /*
+ * in order to calculate cpu_norm_util, we need to know which
+ * capacity level the group will be at, so calculate that first
+ */
+ sg_util = group_norm_util(eenv, cpu_idx);
+
+ busy_energy = sg_util * busy_power;
+
+ /* Compute IDLE energy */
+ idle_idx = group_idle_state(eenv, cpu_idx);
+ idle_power = sg->sge->idle_states[idle_idx].power;
+
+ idle_energy = SCHED_CAPACITY_SCALE - sg_util;
+ idle_energy *= idle_power;
+
+ total_energy = busy_energy + idle_energy;
+ eenv->cpu[cpu_idx].energy += total_energy;
+ }
+}
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ * which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
{
struct cpumask visit_cpus;
- u64 total_energy = 0;
int cpu_count;
WARN_ON(!eenv->sg_top->sge);
@@ -6701,41 +6768,18 @@ static int sched_group_energy(struct energy_env *eenv)
break;
do {
- unsigned long group_util;
- int sg_busy_energy, sg_idle_energy;
- int cap_idx, idle_idx;
-
+ eenv->sg_cap = sg;
if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
eenv->sg_cap = sg_shared_cap;
- else
- eenv->sg_cap = sg;
-
- cap_idx = find_new_capacity(eenv, sg->sge);
-
- if (sg->group_weight == 1) {
- /* Remove capacity of src CPU (before task move) */
- if (eenv->trg_cpu == eenv->src_cpu &&
- cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
- eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta -= eenv->cap.before;
- }
- /* Add capacity of dst CPU (after task move) */
- if (eenv->trg_cpu == eenv->dst_cpu &&
- cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
- eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
- eenv->cap.delta += eenv->cap.after;
- }
- }
-
- idle_idx = group_idle_state(eenv, sg);
- group_util = group_norm_util(eenv, sg);
-
- sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
- sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
- * sg->sge->idle_states[idle_idx].power);
- total_energy += sg_busy_energy + sg_idle_energy;
+ /*
+ * Compute the energy for all the candidate
+ * CPUs in the current visited SG.
+ */
+ eenv->sg = sg;
+ calc_sg_energy(eenv);
+ /* remove CPUs we have just visited */
if (!sd->child) {
/*
* cpu_count here is the number of
@@ -6776,7 +6820,6 @@ next_cpu:
continue;
}
- eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
return 0;
}
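
For each candidate, calc_sg_energy() picks the capacity state that covers the group's maximum utilization and then accumulates unscaled busy and idle contributions, busy = sg_util * cap_states[idx].power and idle = (SCHED_CAPACITY_SCALE - sg_util) * idle_states[idx].power; the single >> SCHED_CAPACITY_SHIFT happens later, once all groups are summed. A worked example with made-up numbers:

    #define SCHED_CAPACITY_SCALE 1024UL

    /* Example: sg_util 256, busy power 600, idle power 20:
     *   busy  = 256 * 600          = 153600
     *   idle  = (1024 - 256) * 20  =  15360
     *   total = 168960, i.e. 165 after the final >> SCHED_CAPACITY_SHIFT.
     */
    static unsigned long sg_energy(unsigned long sg_util,
                                   unsigned long busy_power,
                                   unsigned long idle_power)
    {
        unsigned long busy = sg_util * busy_power;
        unsigned long idle = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;

        return busy + idle;    /* unscaled; caller shifts by SCHED_CAPACITY_SHIFT */
    }
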
@@ -6785,180 +6828,100 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
-static inline unsigned long task_util(struct task_struct *p);
-
/*
- * energy_diff(): Estimate the energy impact of changing the utilization
- * distribution. eenv specifies the change: utilisation amount, source, and
- * destination cpu. Source or destination cpu may be -1 in which case the
- * utilization is removed from or added to the system (e.g. task wake-up). If
- * both are specified, the utilization is migrated.
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
+ * possible CPU candidates (the previous CPU and a different target CPU).
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the first CPU saving energy.
+ * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations.
+ * A value greater than zero means that the first energy-efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
*/
-static inline int __energy_diff(struct energy_env *eenv)
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
- int sd_cpu = -1, energy_before = 0, energy_after = 0;
- int diff, margin;
-
- struct energy_env eenv_before = {
- .util_delta = task_util(eenv->task),
- .src_cpu = eenv->src_cpu,
- .dst_cpu = eenv->dst_cpu,
- .trg_cpu = eenv->src_cpu,
- .nrg = { 0, 0, 0, 0},
- .cap = { 0, 0, 0 },
- .task = eenv->task,
- };
+ int sd_cpu = -1;
+ int cpu_idx;
+ int margin;
- if (eenv->src_cpu == eenv->dst_cpu)
- return 0;
-
- sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
-
if (!sd)
- return 0; /* Error */
+ return EAS_CPU_PRV;
- sg = sd->groups;
+ cpumask_clear(&eenv->cpus_mask);
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ int cpu = eenv->cpu[cpu_idx].cpu_id;
- do {
- if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
- eenv_before.sg_top = eenv->sg_top = sg;
+ if (cpu < 0)
+ continue;
+ cpumask_set_cpu(cpu, &eenv->cpus_mask);
+ }
- if (sched_group_energy(&eenv_before))
- return 0; /* Invalid result abort */
- energy_before += eenv_before.energy;
+ sg = sd->groups;
+ do {
+ /* Skip SGs which do not contain a candidate CPU */
+ if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
+ continue;
- /* Keep track of SRC cpu (before) capacity */
- eenv->cap.before = eenv_before.cap.before;
- eenv->cap.delta = eenv_before.cap.delta;
+ eenv->sg_top = sg;
+ /* energy is unscaled to reduce rounding errors */
+ if (compute_energy(eenv) == -EINVAL)
+ return EAS_CPU_PRV;
- if (sched_group_energy(eenv))
- return 0; /* Invalid result abort */
- energy_after += eenv->energy;
- }
} while (sg = sg->next, sg != sd->groups);
- eenv->nrg.before = energy_before;
- eenv->nrg.after = energy_after;
- eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
- eenv->payoff = 0;
-#ifndef CONFIG_SCHED_TUNE
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
-#endif
+ /* Scale energy before comparisons */
+ for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx)
+ eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT;
+
/*
- * Dead-zone margin preventing too many migrations.
+ * Compute the dead-zone margin used to prevent too many task
+ * migrations with negligible energy savings.
+ * An energy saving is considered meaningful if it reduces the energy
+ * consumption of the EAS_CPU_PRV CPU candidate by at least ~1.56%.
*/
+ margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
- margin = eenv->nrg.before >> 6; /* ~1.56% */
-
- diff = eenv->nrg.after - eenv->nrg.before;
-
- eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
-
- return eenv->nrg.diff;
-}
-
-#ifdef CONFIG_SCHED_TUNE
-
-struct target_nrg schedtune_target_nrg;
-
-#ifdef CONFIG_CGROUP_SCHEDTUNE
-extern bool schedtune_initialized;
-#endif /* CONFIG_CGROUP_SCHEDTUNE */
-
-/*
- * System energy normalization
- * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
- * corresponding to the specified energy variation.
- */
-static inline int
-normalize_energy(int energy_diff)
-{
- u32 normalized_nrg;
-
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- /* during early setup, we don't know the extents */
- if (unlikely(!schedtune_initialized))
- return energy_diff < 0 ? -1 : 1 ;
-#endif /* CONFIG_CGROUP_SCHEDTUNE */
-
-#ifdef CONFIG_SCHED_DEBUG
- {
- int max_delta;
-
- /* Check for boundaries */
- max_delta = schedtune_target_nrg.max_power;
- max_delta -= schedtune_target_nrg.min_power;
- WARN_ON(abs(energy_diff) >= max_delta);
- }
-#endif
-
- /* Do scaling using positive numbers to increase the range */
- normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
-
- /* Scale by energy magnitude */
- normalized_nrg <<= SCHED_CAPACITY_SHIFT;
-
- /* Normalize on max energy for target platform */
- normalized_nrg = reciprocal_divide(
- normalized_nrg, schedtune_target_nrg.rdiv);
-
- return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
-}
-
-static inline int
-energy_diff(struct energy_env *eenv)
-{
- int boost = schedtune_task_boost(eenv->task);
- int nrg_delta;
-
- /* Conpute "absolute" energy diff */
- __energy_diff(eenv);
-
- /* Return energy diff when boost margin is 0 */
- if (boost == 0) {
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- 0, -eenv->nrg.diff);
- return eenv->nrg.diff;
- }
-
- /* Compute normalized energy diff */
- nrg_delta = normalize_energy(eenv->nrg.diff);
- eenv->nrg.delta = nrg_delta;
-
- eenv->payoff = schedtune_accept_deltas(
- eenv->nrg.delta,
- eenv->cap.delta,
- eenv->task);
-
- trace_sched_energy_diff(eenv->task,
- eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
- eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
- eenv->cap.before, eenv->cap.after, eenv->cap.delta,
- eenv->nrg.delta, eenv->payoff);
+ /*
+ * By default the EAS_CPU_PRV CPU is considered the most energy
+ * efficient, with a 0 energy variation.
+ */
+ eenv->next_idx = EAS_CPU_PRV;
/*
- * When SchedTune is enabled, the energy_diff() function will return
- * the computed energy payoff value. Since the energy_diff() return
- * value is expected to be negative by its callers, this evaluation
- * function return a negative value each time the evaluation return a
- * positive payoff, which is the condition for the acceptance of
- * a scheduling decision
+ * Compare the other CPU candidates to find a CPU which can be
+ * more energy efficient than EAS_CPU_PRV.
*/
- return -eenv->payoff;
+ for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+ /* Skip candidate CPUs which are not valid */
+ if (eenv->cpu[cpu_idx].cpu_id < 0)
+ continue;
+ /* Compute energy delta wrt EAS_CPU_PRV */
+ eenv->cpu[cpu_idx].nrg_delta =
+ eenv->cpu[cpu_idx].energy -
+ eenv->cpu[EAS_CPU_PRV].energy;
+ /* filter energy variations within the dead-zone margin */
+ if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+ eenv->cpu[cpu_idx].nrg_delta = 0;
+ /* update the schedule candidate with min(nrg_delta) */
+ if (eenv->cpu[cpu_idx].nrg_delta <
+ eenv->cpu[eenv->next_idx].nrg_delta) {
+ eenv->next_idx = cpu_idx;
+ if (sched_feat(FBT_STRICT_ORDER))
+ break;
+ }
+ }
+
+ return eenv->next_idx;
}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff(eenv) __energy_diff(eenv)
-#endif
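
To make the candidate comparison above concrete, here is a worked example with hypothetical, already-scaled energy figures (an illustrative sketch, not part of the patch):

	/*
	 * cpu[EAS_CPU_PRV].energy = 1024  ->  margin = 1024 >> 6 = 16 (~1.56%)
	 * cpu[EAS_CPU_NXT].energy = 1000  ->  nrg_delta = -24, |delta| >= margin, kept
	 * cpu[EAS_CPU_BKP].energy = 1015  ->  nrg_delta =  -9, |delta| <  margin, zeroed
	 *
	 * next_idx ends up as EAS_CPU_NXT; with FBT_STRICT_ORDER enabled the
	 * scan stops at the first candidate that beats EAS_CPU_PRV.
	 */
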
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
@@ -7054,12 +7017,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return 1;
}
-static inline unsigned long task_util(struct task_struct *p)
-{
- return p->se.avg.util_avg;
-}
-
-static inline unsigned long boosted_task_util(struct task_struct *task);
+static inline unsigned long boosted_task_util(struct task_struct *p);
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
{
@@ -7136,16 +7094,16 @@ schedtune_cpu_margin(unsigned long util, int cpu)
}
static inline long
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
- int boost = schedtune_task_boost(task);
+ int boost = schedtune_task_boost(p);
unsigned long util;
long margin;
if (boost == 0)
return 0;
- util = task_util(task);
+ util = task_util(p);
margin = schedtune_margin(util, boost);
return margin;
@@ -7160,7 +7118,7 @@ schedtune_cpu_margin(unsigned long util, int cpu)
}
static inline int
-schedtune_task_margin(struct task_struct *task)
+schedtune_task_margin(struct task_struct *p)
{
return 0;
}
@@ -7179,12 +7137,12 @@ boosted_cpu_util(int cpu)
}
static inline unsigned long
-boosted_task_util(struct task_struct *task)
+boosted_task_util(struct task_struct *p)
{
- unsigned long util = task_util(task);
- long margin = schedtune_task_margin(task);
+ unsigned long util = task_util(p);
+ long margin = schedtune_task_margin(p);
- trace_sched_boost_task(task, util, margin);
+ trace_sched_boost_task(p, util, margin);
return util + margin;
}
@@ -7554,6 +7512,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
unsigned long min_wake_util = ULONG_MAX;
unsigned long target_max_spare_cap = 0;
unsigned long best_active_util = ULONG_MAX;
+ unsigned long target_idle_max_spare_cap = 0;
int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
struct sched_group *sg;
@@ -7589,7 +7548,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
unsigned long capacity_curr = capacity_curr_of(i);
unsigned long capacity_orig = capacity_orig_of(i);
- unsigned long wake_util, new_util;
+ unsigned long wake_util, new_util, min_capped_util;
if (!cpu_online(i))
continue;
@@ -7611,6 +7570,16 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
* than the one required to boost the task.
*/
new_util = max(min_util, new_util);
+
+ /*
+ * Include minimum capacity constraint:
+ * new_util contains the required utilization including
+ * boost. min_capped_util also takes into account a
+ * minimum capacity cap imposed on the CPU by external
+ * actors.
+ */
+ min_capped_util = max(new_util, capacity_min_of(i));
+
if (new_util > capacity_orig)
continue;
@@ -7733,6 +7702,12 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
/* Select idle CPU with lower cap_orig */
if (capacity_orig > best_idle_min_cap_orig)
continue;
+ /* Favor CPUs that won't end up running at a
+ * high OPP.
+ */
+ if ((capacity_orig - min_capped_util) <
+ target_idle_max_spare_cap)
+ continue;
/*
* Skip CPUs in deeper idle state, but only
@@ -7746,6 +7721,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
/* Keep track of best idle CPU */
best_idle_min_cap_orig = capacity_orig;
+ target_idle_max_spare_cap = capacity_orig -
+ min_capped_util;
best_idle_cstate = idle_idx;
best_idle_cpu = i;
continue;
@@ -7776,10 +7753,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
continue;
/* Favor CPUs with maximum spare capacity */
- if ((capacity_orig - new_util) < target_max_spare_cap)
+ if ((capacity_orig - min_capped_util) <
+ target_max_spare_cap)
continue;
- target_max_spare_cap = capacity_orig - new_util;
+ target_max_spare_cap = capacity_orig - min_capped_util;
target_capacity = capacity_orig;
target_cpu = i;
}
@@ -7851,9 +7829,11 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
{
- struct sched_domain *sd;
- int target_cpu = prev_cpu, tmp_target, tmp_backup;
bool boosted, prefer_idle;
+ struct sched_domain *sd;
+ int target_cpu;
+ int backup_cpu;
+ int next_cpu;
schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
schedstat_inc(this_rq(), eas_stats.secb_attempts);
@@ -7868,7 +7848,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
}
}
- rcu_read_lock();
#ifdef CONFIG_CGROUP_SCHEDTUNE
boosted = schedtune_task_boost(p) > 0;
prefer_idle = schedtune_prefer_idle(p) > 0;
@@ -7877,31 +7856,49 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
prefer_idle = 0;
#endif
- sync_entity_load_avg(&p->se);
+ rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+ if (!sd) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
+
+ sync_entity_load_avg(&p->se);
+
/* Find a cpu with sufficient capacity */
- tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
+ next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle);
+ if (next_cpu == -1) {
+ target_cpu = prev_cpu;
+ goto unlock;
+ }
- if (!sd)
+ /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */
+ if ((boosted || prefer_idle) && idle_cpu(next_cpu)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
+ schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
+ target_cpu = next_cpu;
goto unlock;
- if (tmp_target >= 0) {
- target_cpu = tmp_target;
- if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
- schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
- schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
- goto unlock;
- }
}
- if (target_cpu != prev_cpu) {
+ target_cpu = prev_cpu;
+ if (next_cpu != prev_cpu) {
int delta = 0;
struct energy_env eenv = {
+ .p = p,
.util_delta = task_util(p),
- .src_cpu = prev_cpu,
- .dst_cpu = target_cpu,
- .task = p,
- .trg_cpu = target_cpu,
+ /* Task's previous CPU candidate */
+ .cpu[EAS_CPU_PRV] = {
+ .cpu_id = prev_cpu,
+ },
+ /* Main alternative CPU candidate */
+ .cpu[EAS_CPU_NXT] = {
+ .cpu_id = next_cpu,
+ },
+ /* Backup alternative CPU candidate */
+ .cpu[EAS_CPU_BKP] = {
+ .cpu_id = backup_cpu,
+ },
};
@@ -7914,26 +7911,21 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
if (__cpu_overutilized(prev_cpu, delta)) {
schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
+ target_cpu = next_cpu;
goto unlock;
}
- if (energy_diff(&eenv) >= 0) {
- /* No energy saving for target_cpu, try backup */
- target_cpu = tmp_backup;
- eenv.dst_cpu = target_cpu;
- eenv.trg_cpu = target_cpu;
- if (tmp_backup < 0 ||
- tmp_backup == prev_cpu ||
- energy_diff(&eenv) >= 0) {
- schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
- schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
- target_cpu = prev_cpu;
- goto unlock;
- }
+ /* Check if EAS_CPU_NXT is a more energy efficient CPU */
+ if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
+ goto unlock;
}
- schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
- schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
+ schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
+ schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
+ target_cpu = prev_cpu;
goto unlock;
}
@@ -8898,12 +8890,30 @@ redo:
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ /*
+ * Depending on the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
- if ((load / 2) > env->imbalance)
+ /*
+ * p is not the running task when we get here, so if p is one of
+ * only two tasks on the src CPU's runqueue (the other being the
+ * running one), it is the only task that can be balanced.
+ * Only skip tasks whose load exceeds 2*imbalance when there are
+ * other tasks that can be balanced or big tasks are being ignored.
+ */
+ if (((cpu_rq(env->src_cpu)->nr_running > 2) ||
+ (env->flags & LBF_IGNORE_BIG_TASKS)) &&
+ ((load / 2) > env->imbalance))
goto next;
detach_task(p, env);
@@ -9013,6 +9023,8 @@ static void update_blocked_averages(int cpu)
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct sched_entity *se;
+
/* throttled entities do not contribute to load */
if (throttled_hierarchy(cfs_rq))
continue;
@@ -9021,9 +9033,10 @@ static void update_blocked_averages(int cpu)
true))
update_tg_load_avg(cfs_rq, 0);
- /* Propagate pending load changes to the parent */
- if (cfs_rq->tg->se[cpu])
- update_load_avg(cfs_rq->tg->se[cpu], 0);
+ /* Propagate pending load changes to the parent, if any: */
+ se = cfs_rq->tg->se[cpu];
+ if (se && !skip_blocked_update(se))
+ update_load_avg(se, 0);
}
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -9292,6 +9305,9 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
@@ -10326,6 +10342,17 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
+ /*
+ * For ASYM_CPUCAPACITY domains, don't pick a cpu that could
+ * eventually lead to active balancing from higher to lower capacity.
+ * Higher per-cpu capacity is considered better than balancing
+ * average load.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ capacity_of(env->dst_cpu) < capacity &&
+ rq->nr_running == 1)
+ continue;
+
wl = weighted_cpuload(i);
/*
@@ -10393,8 +10420,10 @@ static int need_active_balance(struct lb_env *env)
* It's worth migrating the task if the src_cpu's capacity is reduced
* because of other sched_class or IRQs if more capacity stays
* available on dst_cpu.
+ * Avoid pulling the CFS task if it is the only task running.
*/
if ((env->idle != CPU_NOT_IDLE) &&
+ (env->src_rq->nr_running > 1) &&
(env->src_rq->cfs.h_nr_running == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@ -10536,7 +10565,6 @@ redo:
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -10550,6 +10578,12 @@ more_balance:
}
/*
+ * Set loop_max when rq's lock is taken to prevent a race.
+ */
+ env.loop_max = min(sysctl_sched_nr_migrate,
+ busiest->nr_running);
+
+ /*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
@@ -10627,7 +10661,24 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus)) {
+ /*
+ * dst_cpu is not a valid busiest cpu in the following
+ * check since load cannot be pulled from dst_cpu to be
+ * put on dst_cpu.
+ */
+ cpumask_clear_cpu(env.dst_cpu, cpus);
+ /*
+ * Go back to "redo" iff the load-balance cpumask
+ * contains other potential busiest cpus for the
+ * current sched domain.
+ */
+ if (cpumask_intersects(cpus, sched_domain_span(env.sd))) {
+ /*
+ * Now that the check has passed, reenable
+ * dst_cpu so that load can be calculated on
+ * it in the redo path.
+ */
+ cpumask_set_cpu(env.dst_cpu, cpus);
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
@@ -11606,6 +11657,92 @@ static void rq_offline_fair(struct rq *rq)
unthrottle_offline_cfs_rqs(rq);
}
+static inline int
+kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ unsigned long flags;
+ int rc = 0;
+
+ /* Invoke active balance to force migrate currently running task */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (!rq->active_balance) {
+ rq->active_balance = 1;
+ rq->push_cpu = new_cpu;
+ get_task_struct(p);
+ rq->push_task = p;
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return rc;
+}
+
+#ifdef CONFIG_SCHED_HMP
+static DEFINE_RAW_SPINLOCK(migration_lock);
+
+static bool do_migration(int reason, int new_cpu, int cpu)
+{
+ if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION)
+ && same_cluster(new_cpu, cpu))
+ return false;
+
+ /* Inter cluster high irqload migrations are OK */
+ return new_cpu != cpu;
+}
+
+/*
+ * Check if currently running task should be migrated to a better cpu.
+ *
+ * Todo: Effect this via changes to nohz_balancer_kick() and load balance?
+ */
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int cpu = cpu_of(rq), new_cpu;
+ int active_balance = 0, reason;
+
+ reason = migration_needed(p, cpu);
+ if (!reason)
+ return;
+
+ raw_spin_lock(&migration_lock);
+ new_cpu = select_best_cpu(p, cpu, reason, 0);
+
+ if (do_migration(reason, new_cpu, cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ mark_reserved(new_cpu);
+ }
+
+ raw_spin_unlock(&migration_lock);
+
+ if (active_balance)
+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq,
+ &rq->active_balance_work);
+}
+#else
+void check_for_migration(struct rq *rq, struct task_struct *p)
+{
+ int new_cpu;
+ int active_balance;
+ int cpu = task_cpu(p);
+
+ if (rq->misfit_task) {
+ if (rq->curr->state != TASK_RUNNING ||
+ rq->curr->nr_cpus_allowed == 1)
+ return;
+
+ new_cpu = select_energy_cpu_brute(p, cpu, 0);
+ if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
+ active_balance = kick_active_balance(rq, p, new_cpu);
+ if (active_balance)
+ stop_one_cpu_nowait(cpu,
+ active_load_balance_cpu_stop,
+ rq, &rq->active_balance_work);
+ }
+ }
+}
+#endif
+
#endif /* CONFIG_SMP */
/*
@@ -11714,7 +11851,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_class == &fair_sched_class))
return true;
return false;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index c30c48fde7e6..c3e301589515 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -78,3 +78,29 @@ SCHED_FEAT(ENERGY_AWARE, true)
#else
SCHED_FEAT(ENERGY_AWARE, false)
#endif
+
+/*
+ * Minimum capacity capping. Keep track of the minimum capacity factor
+ * when the minimum frequency available to a policy is modified.
+ * If enabled, this can be used to inform the scheduler about capacity
+ * restrictions.
+ */
+SCHED_FEAT(MIN_CAPACITY_CAPPING, true)
+
+/*
+ * Enforce the priority of candidates selected by find_best_target()
+ * ON: If the target CPU saves any energy, use that.
+ * OFF: Use whichever of target or backup saves most.
+ */
+SCHED_FEAT(FBT_STRICT_ORDER, true)
+
+/*
+ * Apply schedtune boost hold to tasks of all sched classes.
+ * If enabled, schedtune will hold the boost applied to a CPU
+ * for 50ms regardless of task activation; if the task is
+ * still running 50ms later, the boost hold expires and the
+ * schedtune boost will be dropped as soon as the task stops.
+ * If disabled, this behaviour will only apply to tasks of the
+ * RT class.
+ */
+SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false)
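
These features are consumed through the usual sched_feat() test; a minimal, illustrative sketch of how a caller might gate on one of them (hypothetical call site, the actual users live in fair.c and tune.c and are not reproduced here):

	/* Illustration only, assuming a local min_cap variable at the call site. */
	if (sched_feat(MIN_CAPACITY_CAPPING))
		min_cap = arch_scale_min_freq_capacity(sd, cpu);
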
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 5337ac7fcba1..649d6a437a13 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -764,13 +764,16 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
-/* Min window size (in ns) = 10ms */
-#define MIN_SCHED_RAVG_WINDOW 10000000
+/* Min window size (in ns) = 20ms */
+#define MIN_SCHED_RAVG_WINDOW ((20000000 / TICK_NSEC) * TICK_NSEC)
/* Max window size (in ns) = 1s */
-#define MAX_SCHED_RAVG_WINDOW 1000000000
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
-/* Window size (in ns) */
+/*
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
+ */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
/* Maximum allowed threshold before freq aggregation must be enabled */
@@ -1616,17 +1619,20 @@ static inline int exiting_task(struct task_struct *p)
static int __init set_sched_ravg_window(char *str)
{
+ unsigned int adj_window;
unsigned int window_size;
get_option(&str, &window_size);
- if (window_size < MIN_SCHED_RAVG_WINDOW ||
- window_size > MAX_SCHED_RAVG_WINDOW) {
- WARN_ON(1);
- return -EINVAL;
- }
+ /* Adjust for CONFIG_HZ */
+ adj_window = (window_size / TICK_NSEC) * TICK_NSEC;
+
+ /* Warn if we're a bit too far away from the expected window size */
+ WARN(adj_window < window_size - NSEC_PER_MSEC,
+ "tick-adjusted window size %u, original was %u\n", adj_window,
+ window_size);
- sched_ravg_window = window_size;
+ sched_ravg_window = adj_window;
return 0;
}
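
A worked example of the CONFIG_HZ adjustment above (hypothetical HZ value):

	/*
	 * With CONFIG_HZ = 300, TICK_NSEC ~= 3,333,333 ns, so a requested
	 * window of 20,000,000 ns becomes:
	 *
	 *   adj_window = (20000000 / 3333333) * 3333333 = 19,999,998 ns
	 *
	 * The WARN() only fires when rounding down costs more than 1 ms.
	 */
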
@@ -3217,6 +3223,13 @@ void sched_get_cpus_busy(struct sched_load *busy,
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
0);
+ /*
+ * Ensure that we don't report load for 'cpu' again via the
+ * cpufreq_update_util path in the window that started at
+ * rq->window_start
+ */
+ rq->load_reported_window = rq->window_start;
+
account_load_subtractions(rq);
load[i] = rq->prev_runnable_sum;
nload[i] = rq->nt_prev_runnable_sum;
@@ -3649,6 +3662,13 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
migrate_top_tasks(p, src_rq, dest_rq);
+ if (!same_freq_domain(new_cpu, task_cpu(p))) {
+ cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG |
+ SCHED_CPUFREQ_WALT);
+ cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG |
+ SCHED_CPUFREQ_WALT);
+ }
+
if (p == src_rq->ed_task) {
src_rq->ed_task = NULL;
if (!dest_rq->ed_task)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index af21389466b8..9d7f6998edd5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -12,8 +12,10 @@
#include <linux/hrtimer.h>
#include "tune.h"
+#include "walt.h"
int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -1437,6 +1439,25 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
}
/*
+ * Keep track of whether each cpu has an RT task that will
+ * soon schedule on that core. The problem this is intended
+ * to address is that we want to avoid entering a non-preemptible
+ * softirq handler if we are about to schedule a real-time
+ * task on that core. Ideally, we could just check whether
+ * the RT runqueue on that core had a runnable task, but the
+ * window between choosing to schedule a real-time task
+ * on a core and actually enqueueing it on that run-queue
+ * is large enough to lose races at an unacceptably high rate.
+ *
+ * This variable attempts to reduce that window by indicating
+ * when we have decided to schedule an RT task on a core
+ * but not yet enqueued it.
+ * This variable is a heuristic only: it is not guaranteed
+ * to be correct and may be updated without synchronization.
+ */
+DEFINE_PER_CPU(bool, incoming_rt_task);
+
+/*
* Adding/removing a task to/from a priority array:
*/
static void
@@ -1444,14 +1465,20 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
+#ifdef CONFIG_SMP
+ schedtune_enqueue_task(p, cpu_of(rq));
+#endif
+
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags);
+ walt_inc_cumulative_runnable_avg(rq, p);
inc_hmp_sched_stats_rt(rq, p);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+ *per_cpu_ptr(&incoming_rt_task, cpu_of(rq)) = false;
if (!schedtune_task_boost(p))
return;
@@ -1485,8 +1512,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
+#ifdef CONFIG_SMP
+ schedtune_dequeue_task(p, cpu_of(rq));
+#endif
+
update_curr_rt(rq);
dequeue_rt_entity(rt_se, flags);
+ walt_dec_cumulative_runnable_avg(rq, p);
dec_hmp_sched_stats_rt(rq, p);
dequeue_pushable_task(rq, p);
@@ -1539,8 +1571,19 @@ static void yield_task_rt(struct rq *rq)
requeue_task_rt(rq, rq->curr, 0);
}
+/*
+ * Return whether the given cpu has (or will shortly have) an RT task
+ * ready to run. NB: This is a heuristic and is subject to races.
+ */
+bool
+cpu_has_rt_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ return rq->rt.rt_nr_running > 0 || per_cpu(incoming_rt_task, cpu);
+}
+
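
A minimal sketch of the intended consumer of this hint (hypothetical caller shown for illustration; the softirq-side check is not part of this hunk):

	/*
	 * Hypothetical use of cpu_has_rt_task(): a softirq path could defer
	 * long-running work when an RT task runs, or is about to run, here.
	 */
	static bool should_defer_long_softirq(int cpu)
	{
		/* Heuristic and racy by design, see incoming_rt_task above. */
		return cpu_has_rt_task(cpu);
	}
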
#ifdef CONFIG_SMP
-static int find_lowest_rq(struct task_struct *task);
+static int find_lowest_rq(struct task_struct *task, int sync);
#ifdef CONFIG_SCHED_HMP
static int
@@ -1549,7 +1592,7 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
int target;
rcu_read_lock();
- target = find_lowest_rq(p);
+ target = find_lowest_rq(p, 0);
if (target != -1)
cpu = target;
rcu_read_unlock();
@@ -1561,8 +1604,10 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags)
/*
* Return whether the task on the given cpu is currently non-preemptible
* while handling a potentially long softint, or if the task is likely
- * to block preemptions soon because it is a ksoftirq thread that is
- * handling slow softints.
+ * to block preemptions soon because (a) it is a ksoftirq thread that is
+ * handling slow softints, (b) it is idle and therefore likely to start
+ * processing the irqs immediately, or (c) the cpu is currently handling
+ * hard irqs and will soon move on to the softirq handler.
*/
bool
task_may_not_preempt(struct task_struct *task, int cpu)
@@ -1572,8 +1617,9 @@ task_may_not_preempt(struct task_struct *task, int cpu)
struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu);
return ((softirqs & LONG_SOFTIRQ_MASK) &&
- (task == cpu_ksoftirqd ||
- task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
+ (task == cpu_ksoftirqd || is_idle_task(task) ||
+ (task_thread_info(task)->preempt_count
+ & (HARDIRQ_MASK | SOFTIRQ_MASK))));
}
/*
@@ -1606,9 +1652,11 @@ static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
int sibling_count_hint)
{
- struct task_struct *curr;
+ struct task_struct *curr, *tgt_task;
struct rq *rq;
bool may_not_preempt;
+ int target;
+ int sync = flags & WF_SYNC;
#ifdef CONFIG_SCHED_HMP
return select_task_rq_rt_hmp(p, cpu, sd_flag, flags);
@@ -1623,58 +1671,28 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
- /*
- * If the current task on @p's runqueue is a softirq task,
- * it may run without preemption for a time that is
- * ill-suited for a waiting RT task. Therefore, try to
- * wake this RT task on another runqueue.
- *
- * Also, if the current task on @p's runqueue is an RT task, then
- * it may run without preemption for a time that is
- * ill-suited for a waiting RT task. Therefore, try to
- * wake this RT task on another runqueue.
- *
- * Also, if the current task on @p's runqueue is an RT task, then
- * try to see if we can wake this RT task up on another
- * runqueue. Otherwise simply start this RT task
- * on its current runqueue.
- *
- * We want to avoid overloading runqueues. If the woken
- * task is a higher priority, then it will stay on this CPU
- * and the lower prio task should be moved to another CPU.
- * Even though this will probably make the lower prio task
- * lose its cache, we do not want to bounce a higher task
- * around just because it gave up its CPU, perhaps for a
- * lock?
- *
- * For equal prio tasks, we just let the scheduler sort it out.
- *
- * Otherwise, just let it ride on the affined RQ and the
- * post-schedule router will push the preempted task away
- *
- * This test is optimistic, if we get it wrong the load-balancer
- * will have to sort it out.
- */
may_not_preempt = task_may_not_preempt(curr, cpu);
- if (may_not_preempt ||
- (unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 ||
- curr->prio <= p->prio))) {
- int target = find_lowest_rq(p);
+ target = find_lowest_rq(p, sync);
- /*
- * If cpu is non-preemptible, prefer remote cpu
- * even if it's running a higher-prio task.
- * Otherwise: Don't bother moving it if the
- * destination CPU is not running a lower priority task.
- */
- if (target != -1 &&
- (may_not_preempt ||
- p->prio < cpu_rq(target)->rt.highest_prio.curr))
- cpu = target;
+ /*
+ * Check once whether we lost a race with the other core's irq handler.
+ * This does not happen frequently, but rechecking avoids delaying
+ * the execution of the RT task in those cases.
+ */
+ if (target != -1) {
+ tgt_task = READ_ONCE(cpu_rq(target)->curr);
+ if (task_may_not_preempt(tgt_task, target))
+ target = find_lowest_rq(p, sync);
}
+ /*
+ * Possible race. Don't bother moving it if the
+ * destination CPU is not running a lower priority task.
+ */
+ if (target != -1 &&
+ (may_not_preempt || p->prio < cpu_rq(target)->rt.highest_prio.curr))
+ cpu = target;
+ *per_cpu_ptr(&incoming_rt_task, cpu) = true;
rcu_read_unlock();
-
out:
/*
* If previous CPU was different, make sure to cancel any active
@@ -1718,7 +1736,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
requeue_task_rt(rq, p, 1);
resched_curr(rq);
}
-
#endif /* CONFIG_SMP */
/*
@@ -1982,12 +1999,108 @@ retry:
}
#endif /* CONFIG_SCHED_HMP */
-static int find_lowest_rq(struct task_struct *task)
+static int find_best_rt_target(struct task_struct *task, int cpu,
+ struct cpumask *lowest_mask,
+ bool boosted, bool prefer_idle)
+{
+ int iter_cpu;
+ int target_cpu = -1;
+ int boosted_cpu = -1;
+ int backup_cpu = -1;
+ int boosted_orig_capacity = capacity_orig_of(0);
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ unsigned long target_util = 0;
+ unsigned long new_util;
+ /* We want to elect the best one based on task class,
+ * idleness, and utilization.
+ */
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ /*
+ * Iterate from higher cpus for boosted tasks.
+ */
+ int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(task)))
+ continue;
+
+ new_util = cpu_util(i) + task_util(task);
+
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ /*
+ * Unconditionally favoring tasks that prefer idle cpus to
+ * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle
+ && cpumask_test_cpu(i, lowest_mask) && best_idle_cpu < 0) {
+ best_idle_cpu = i;
+ continue;
+ }
+
+ if (cpumask_test_cpu(i, lowest_mask)) {
+ /* Bias cpu selection towards cpu with higher original
+ * capacity if task is boosted.
+ * Assumption: Higher cpus are exclusively alloted for
+ * boosted tasks.
+ */
+ if (boosted && boosted_cpu < 0
+ && boosted_orig_capacity < capacity_orig_of(i)) {
+ boosted_cpu = i;
+ boosted_orig_capacity = capacity_orig_of(i);
+ }
+ cur_capacity = capacity_curr_of(i);
+ if (new_util < cur_capacity && cpu_rq(i)->nr_running) {
+ if(!boosted) {
+ /* Find a target cpu with highest utilization.*/
+ if (target_util < new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else {
+ if (target_util == 0 || target_util > new_util) {
+ /* Find a target cpu with lowest utilization.*/
+ target_cpu = i;
+ target_util = new_util;
+ }
+ }
+ } else if (backup_capacity == 0 || backup_capacity < cur_capacity) {
+ /* Select a backup CPU with highest capacity.*/
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+ }
+
+ if (boosted && boosted_cpu >=0 && boosted_cpu > best_idle_cpu)
+ target_cpu = boosted_cpu;
+ else if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+
+ if (target_cpu < 0) {
+ if (backup_cpu >= 0)
+ return backup_cpu;
+
+ /* Select current cpu if it is present in the mask.*/
+ if (cpumask_test_cpu(cpu, lowest_mask))
+ return cpu;
+
+ /* Pick a random cpu from lowest_mask */
+ target_cpu = cpumask_any(lowest_mask);
+ if (target_cpu < nr_cpu_ids)
+ return target_cpu;
+ return -1;
+ }
+ return target_cpu;
+}
+
+static int find_lowest_rq(struct task_struct *task, int sync)
{
struct sched_domain *sd;
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+ bool boosted, prefer_idle;
#ifdef CONFIG_SCHED_HMP
return find_lowest_rq_hmp(task);
@@ -2000,64 +2113,88 @@ static int find_lowest_rq(struct task_struct *task)
if (task->nr_cpus_allowed == 1)
return -1; /* No other targets possible */
+ /* Constructing cpumask of lowest priorities */
if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
return -1; /* No targets found */
- /*
- * At this point we have built a mask of cpus representing the
- * lowest priority tasks in the system. Now we want to elect
- * the best one based on our affinity and topology.
- *
- * We prioritize the last cpu that the task executed on since
- * it is most likely cache-hot in that location.
+ /* Return current cpu if WF_SYNC hint is set and present in
+ * lowest_mask. Improves data locality.
*/
- if (cpumask_test_cpu(cpu, lowest_mask))
- return cpu;
+ if (sysctl_sched_sync_hint_enable && sync) {
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(task), lowest_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
/*
- * Otherwise, we consult the sched_domains span maps to figure
- * out which cpu is logically closest to our hot cache data.
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system.
*/
- if (!cpumask_test_cpu(this_cpu, lowest_mask))
- this_cpu = -1; /* Skip this_cpu opt if not among lowest */
-
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- if (sd->flags & SD_WAKE_AFFINE) {
- int best_cpu;
- /*
- * "this_cpu" is cheaper to preempt than a
- * remote processor.
- */
- if (this_cpu != -1 &&
- cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
- rcu_read_unlock();
- return this_cpu;
- }
+ boosted = schedtune_task_boost(task) > 0;
+ prefer_idle = schedtune_prefer_idle(task) > 0;
+ if (boosted || prefer_idle) {
+ return find_best_rt_target(task, cpu, lowest_mask, boosted, prefer_idle);
+ } else {
+ /* Now we want to elect the best one based on our affinity
+ * and topology.
+ * We prioritize the last cpu that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+ struct task_struct* curr;
+ if (!cpumask_test_cpu(this_cpu, lowest_mask))
+ this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
+ /*
+ * "this_cpu" is cheaper to preempt than a
+ * remote processor.
+ */
+ if (this_cpu != -1 &&
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ curr = cpu_rq(this_cpu)->curr;
+ /* Ensure that boosted/prefer_idle
+ * tasks are not preempted even if they
+ * have low priority. */
+ if (!curr || (schedtune_task_boost(curr) == 0
+ && schedtune_prefer_idle(curr) == 0)) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+ }
- best_cpu = cpumask_first_and(lowest_mask,
- sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids) {
- rcu_read_unlock();
- return best_cpu;
+ best_cpu = cpumask_first_and(lowest_mask,
+ sched_domain_span(sd));
+ if (best_cpu < nr_cpu_ids) {
+ curr = cpu_rq(best_cpu)->curr;
+ /* Ensure that boosted/prefer_idle
+ * tasks are not preempted even if they
+ * have low priority. */
+ if (!curr || (schedtune_task_boost(curr) == 0
+ && schedtune_prefer_idle(curr) == 0)) {
+ rcu_read_unlock();
+ return best_cpu;
+ }
+ }
}
}
- }
- rcu_read_unlock();
+ rcu_read_unlock();
- /*
- * And finally, if there were no matches within the domains
- * just give the caller *something* to work with from the compatible
- * locations.
- */
- if (this_cpu != -1)
- return this_cpu;
+ /* And finally, if there were no matches within the domains just
+ * give the caller *something* to work with from the compatible
+ * locations.
+ */
+ if (this_cpu != -1)
+ return this_cpu;
- cpu = cpumask_any(lowest_mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- return -1;
+ cpu = cpumask_any(lowest_mask);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ return -1;
+ }
}
/* Will lock the rq it finds */
@@ -2068,7 +2205,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
int cpu;
for (tries = 0; tries < RT_MAX_TRIES; tries++) {
- cpu = find_lowest_rq(task);
+ cpu = find_lowest_rq(task, 0);
if ((cpu == -1) || (cpu == rq->cpu))
break;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 07a3cd3c6fbc..fa4d0ab014b1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -33,8 +33,10 @@ extern long calc_load_fold_active(struct rq *this_rq);
#ifdef CONFIG_SMP
extern void update_cpu_load_active(struct rq *this_rq);
+extern void check_for_migration(struct rq *rq, struct task_struct *p);
#else
static inline void update_cpu_load_active(struct rq *this_rq) { }
+static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
#endif
/*
@@ -226,9 +228,8 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota, runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int idle, period_active;
+ short idle, period_active;
struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
@@ -430,7 +431,6 @@ struct related_thread_group {
};
extern struct list_head cluster_head;
-extern int num_clusters;
extern struct sched_cluster *sched_cluster[NR_CPUS];
struct cpu_cycle {
@@ -441,6 +441,7 @@ struct cpu_cycle {
#define for_each_sched_cluster(cluster) \
list_for_each_entry_rcu(cluster, &cluster_head, list)
+extern unsigned int sched_disable_window_stats;
#endif /* CONFIG_SCHED_HMP */
/* CFS-related fields in a runqueue */
@@ -511,6 +512,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+#ifdef CONFIG_SCHED_WALT
+ u64 cumulative_runnable_avg;
+#endif
+
#ifdef CONFIG_CFS_BANDWIDTH
#ifdef CONFIG_SCHED_HMP
@@ -518,7 +523,6 @@ struct cfs_rq {
#endif
int runtime_enabled;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock, throttled_clock_task;
@@ -793,6 +797,7 @@ struct rq {
int cstate, wakeup_latency, wakeup_energy;
u64 window_start;
+ u64 load_reported_window;
unsigned long hmp_flags;
u64 cur_irqload;
@@ -818,6 +823,7 @@ struct rq {
#endif
#ifdef CONFIG_SCHED_WALT
+ unsigned int cur_freq;
u64 cumulative_runnable_avg;
u64 window_start;
u64 curr_runnable_sum;
@@ -1466,7 +1472,6 @@ static inline bool is_short_burst_task(struct task_struct *p)
p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
}
-extern void check_for_migration(struct rq *rq, struct task_struct *p);
extern void pre_big_task_count_change(const struct cpumask *cpus);
extern void post_big_task_count_change(const struct cpumask *cpus);
extern void set_hmp_defaults(void);
@@ -1726,7 +1731,6 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu)
return 1;
}
-static inline void check_for_migration(struct rq *rq, struct task_struct *p) { }
static inline void pre_big_task_count_change(void) { }
static inline void post_big_task_count_change(void) { }
static inline void set_hmp_defaults(void) { }
@@ -1853,7 +1857,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
@@ -2026,6 +2030,7 @@ static const u32 prio_to_wmult[40] = {
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2352,6 +2357,26 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifndef arch_scale_max_freq_capacity
+static __always_inline
+unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
+#ifndef arch_scale_min_freq_capacity
+static __always_inline
+unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ /*
+ * Multiplied by any capacity value, this scale factor yields
+ * 0, which represents an uncapped state.
+ */
+ return 0;
+}
+#endif
+
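
A worked example of how the max-frequency hook combines with update_cpu_capacity() earlier in this patch (hypothetical platform figures):

	/*
	 * With cpu_capacity_orig = 1024 and arch_scale_max_freq_capacity()
	 * reporting 768 (a 75% cap), update_cpu_capacity() computes:
	 *
	 *   capacity = (1024 * 768) >> SCHED_CAPACITY_SHIFT = 768
	 *
	 * The default above returns SCHED_CAPACITY_SCALE, i.e. no capping.
	 */
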
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -2378,6 +2403,19 @@ extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int walt_ravg_window;
extern bool walt_disabled;
+static inline unsigned long task_util(struct task_struct *p)
+{
+
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
+ return (demand << 10) / walt_ravg_window;
+ }
+#endif
+ return p->se.avg.util_avg;
+}
+
+
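
A worked example of the WALT branch above (hypothetical window and demand values):

	/*
	 * With walt_ravg_window = 20000000 ns and p->ravg.demand = 5000000 ns:
	 *
	 *   task_util(p) = (5000000 << 10) / 20000000 = 256
	 *
	 * i.e. one quarter of SCHED_CAPACITY_SCALE, directly comparable with
	 * the PELT-based p->se.avg.util_avg fallback.
	 */
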
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
@@ -2852,7 +2890,22 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
struct update_util_data *data;
- data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+#ifdef CONFIG_SCHED_HMP
+ /*
+ * Only allow WALT update sites. Skip if load was already reported
+ * for this window, unless this is an inter-cluster migration.
+ */
+ if (!(flags & SCHED_CPUFREQ_WALT))
+ return;
+ if (!sched_disable_window_stats &&
+ (rq->load_reported_window == rq->window_start) &&
+ !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
+ return;
+ rq->load_reported_window = rq->window_start;
+#endif
+
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+ cpu_of(rq)));
if (data)
data->func(data, rq_clock(rq), flags);
}
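
An illustrative call sequence for the HMP-specific filtering above (hypothetical, window stats enabled, all within one WALT window):

	/*
	 * cpufreq_update_util(rq, SCHED_CPUFREQ_WALT);
	 *	-> reported; load_reported_window is set to window_start
	 * cpufreq_update_util(rq, SCHED_CPUFREQ_WALT);
	 *	-> same window, no migration flag: skipped
	 * cpufreq_update_util(rq, SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_INTERCLUSTER_MIG);
	 *	-> inter-cluster migration: reported even within the window
	 * cpufreq_update_util(rq, 0);
	 *	-> not a WALT update site: ignored under CONFIG_SCHED_HMP
	 */
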
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 3278c81cefb1..0fa11d86599e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,4 +1,5 @@
#include "sched.h"
+#include "walt.h"
/*
* stop-task scheduling class.
@@ -78,6 +79,7 @@ static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ walt_inc_cumulative_runnable_avg(rq, p);
inc_hmp_sched_stats_stop(rq, p);
}
@@ -85,6 +87,7 @@ static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ walt_dec_cumulative_runnable_avg(rq, p);
dec_hmp_sched_stats_stop(rq, p);
}
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index b84d13750604..728553403c2b 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -17,8 +17,11 @@ bool schedtune_initialized = false;
unsigned int sysctl_sched_cfs_boost __read_mostly;
+/* We hold schedtune boost in effect for at least this long */
+#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL
+
extern struct reciprocal_value schedtune_spc_rdiv;
-extern struct target_nrg schedtune_target_nrg;
+struct target_nrg schedtune_target_nrg;
/* Performance Boost region (B) threshold params */
static int perf_boost_idx;
@@ -240,7 +243,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta,
* implementation especially for the computation of the per-CPU boost
* value
*/
-#define BOOSTGROUPS_COUNT 5
+#define BOOSTGROUPS_COUNT 6
/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
@@ -260,11 +263,14 @@ struct boost_groups {
/* Maximum boost value for all RUNNABLE tasks on a CPU */
bool idle;
int boost_max;
+ u64 boost_ts;
struct {
/* The boost for tasks on that boost group */
int boost;
/* Count of RUNNABLE tasks on that boost group */
unsigned tasks;
+ /* Timestamp of boost activation */
+ u64 ts;
} group[BOOSTGROUPS_COUNT];
/* CPU's boost group locking */
raw_spinlock_t lock;
@@ -388,32 +394,52 @@ static inline void init_sched_boost(struct schedtune *st) { }
#endif /* CONFIG_SCHED_HMP */
+static inline bool schedtune_boost_timeout(u64 now, u64 ts)
+{
+ return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS);
+}
+
+static inline bool
+schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now)
+{
+ if (bg->group[idx].tasks)
+ return true;
+
+ return !schedtune_boost_timeout(now, bg->group[idx].ts);
+}
+
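
A worked example of the hold semantics introduced here (hypothetical timestamps):

	/*
	 * A boosted task enqueued at t = 0 sets group[idx].ts = 0. If it is
	 * dequeued at t = 10 ms, the group still counts as active (tasks == 0
	 * but the hold has not timed out) until t = SCHEDTUNE_BOOST_HOLD_NS
	 * (50 ms). After that, schedtune_boost_timeout() returns true and the
	 * next schedtune_cpu_update() drops the group from boost_max.
	 */
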
static void
-schedtune_cpu_update(int cpu)
+schedtune_cpu_update(int cpu, u64 now)
{
struct boost_groups *bg;
- int boost_max;
+ u64 boost_ts = now;
+ int boost_max = INT_MIN;
int idx;
bg = &per_cpu(cpu_boost_groups, cpu);
- /* The root boost group is always active */
- boost_max = bg->group[0].boost;
- for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+ for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) {
/*
* A boost group affects a CPU only if it has
- * RUNNABLE tasks on that CPU
+ * RUNNABLE tasks on that CPU or a boost hold
+ * is still in effect from a previous task.
*/
- if (bg->group[idx].tasks == 0)
+ if (!schedtune_boost_group_active(idx, bg, now))
+ continue;
+
+ /* this boost group is active */
+ if (boost_max > bg->group[idx].boost)
continue;
- boost_max = max(boost_max, bg->group[idx].boost);
+ boost_max = bg->group[idx].boost;
+ boost_ts = bg->group[idx].ts;
}
- /* Ensures boost_max is non-negative when all cgroup boost values
- * are neagtive. Avoids under-accounting of cpu capacity which may cause
- * task stacking and frequency spikes.*/
- boost_max = max(boost_max, 0);
+
+ /* If there are no active boost groups on the CPU, set no boost */
+ if (boost_max == INT_MIN)
+ boost_max = 0;
bg->boost_max = boost_max;
+ bg->boost_ts = boost_ts;
}
static int
@@ -423,6 +449,7 @@ schedtune_boostgroup_update(int idx, int boost)
int cur_boost_max;
int old_boost;
int cpu;
+ u64 now;
/* Update per CPU boost groups */
for_each_possible_cpu(cpu) {
@@ -439,16 +466,22 @@ schedtune_boostgroup_update(int idx, int boost)
/* Update the boost value of this boost group */
bg->group[idx].boost = boost;
- /* Check if this update increase current max */
- if (boost > cur_boost_max && bg->group[idx].tasks) {
+ now = sched_clock_cpu(cpu);
+ /*
+ * Check if this update increases the current max.
+ */
+ if (boost > cur_boost_max &&
+ schedtune_boost_group_active(idx, bg, now)) {
bg->boost_max = boost;
+ bg->boost_ts = bg->group[idx].ts;
+
trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
continue;
}
/* Check if this update has decreased current max */
if (cur_boost_max == old_boost && old_boost > boost) {
- schedtune_cpu_update(cpu);
+ schedtune_cpu_update(cpu, now);
trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
continue;
}
@@ -462,21 +495,38 @@ schedtune_boostgroup_update(int idx, int boost)
#define ENQUEUE_TASK 1
#define DEQUEUE_TASK -1
+static inline bool
+schedtune_update_timestamp(struct task_struct *p)
+{
+ if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL))
+ return true;
+
+ return task_has_rt_policy(p);
+}
+
static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
int tasks = bg->group[idx].tasks + task_count;
+ u64 now;
/* Update boosted tasks count while avoiding to make it negative */
bg->group[idx].tasks = max(0, tasks);
+ /* Update timeout on enqueue */
+ if (task_count > 0) {
+ now = sched_clock_cpu(cpu);
+ if (schedtune_update_timestamp(p))
+ bg->group[idx].ts = now;
+
+ /* Boost group activation on that RQ */
+ if (bg->group[idx].tasks == 1)
+ schedtune_cpu_update(cpu, now);
+ }
trace_sched_tune_tasks_update(p, cpu, tasks, idx,
- bg->group[idx].boost, bg->boost_max);
-
- /* Boost group activation or deactivation on that RQ */
- if (tasks == 1 || tasks == 0)
- schedtune_cpu_update(cpu);
+ bg->group[idx].boost, bg->boost_max,
+ bg->group[idx].ts);
}
/*
@@ -529,6 +579,7 @@ int schedtune_can_attach(struct cgroup_taskset *tset)
int src_bg; /* Source boost group index */
int dst_bg; /* Destination boost group index */
int tasks;
+ u64 now;
if (!unlikely(schedtune_initialized))
return 0;
@@ -574,18 +625,19 @@ int schedtune_can_attach(struct cgroup_taskset *tset)
* current boost group.
*/
+ now = sched_clock_cpu(cpu);
+
/* Move task from src to dst boost group */
tasks = bg->group[src_bg].tasks - 1;
bg->group[src_bg].tasks = max(0, tasks);
bg->group[dst_bg].tasks += 1;
+ bg->group[dst_bg].ts = now;
+
+ /* Expire the hold so boost_max is recomputed on the next query */
+ bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS;
raw_spin_unlock(&bg->lock);
unlock_rq_of(rq, task, &irq_flags);
-
- /* Update CPU boost group */
- if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
- schedtune_cpu_update(task_cpu(task));
-
}
return 0;
@@ -666,8 +718,15 @@ void schedtune_exit_task(struct task_struct *tsk)
int schedtune_cpu_boost(int cpu)
{
struct boost_groups *bg;
+ u64 now;
bg = &per_cpu(cpu_boost_groups, cpu);
+ now = sched_clock_cpu(cpu);
+
+ /* check to see if we have a hold in effect */
+ if (schedtune_boost_timeout(now, bg->boost_ts))
+ schedtune_cpu_update(cpu, now);
+
return bg->boost_max;
}
@@ -770,6 +829,7 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
static void schedtune_attach(struct cgroup_taskset *tset)
{
+#ifdef CONFIG_SCHED_HMP
struct task_struct *task;
struct cgroup_subsys_state *css;
struct schedtune *st;
@@ -782,6 +842,7 @@ static void schedtune_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sync_cgroup_colocation(task, colocate);
+#endif
}
static struct cftype files[] = {
@@ -829,6 +890,7 @@ schedtune_boostgroup_init(struct schedtune *st)
bg = &per_cpu(cpu_boost_groups, cpu);
bg->group[st->idx].boost = 0;
bg->group[st->idx].tasks = 0;
+ bg->group[st->idx].ts = 0;
}
return 0;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..675228037d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -10,6 +10,7 @@
#include <linux/wait.h>
#include <linux/hash.h>
#include <linux/kthread.h>
+#include <linux/poll.h>
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
@@ -156,6 +157,13 @@ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(wait_queue_head_t *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, (void *)(POLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 8d25ffbe4fed..0162ff4647b6 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -55,12 +55,7 @@ __read_mostly unsigned int walt_ravg_window =
static unsigned int sync_cpu;
static ktime_t ktime_last;
-static bool walt_ktime_suspended;
-
-static unsigned int task_load(struct task_struct *p)
-{
- return p->ravg.demand;
-}
+static __read_mostly bool walt_ktime_suspended;
static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
{
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index de7edac43674..34c72a0fcf39 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -54,6 +54,8 @@ static inline void walt_set_window_start(struct rq *rq) { }
static inline void walt_migrate_sync_cpu(int cpu) { }
static inline void walt_init_cpu_efficiency(void) { }
static inline u64 walt_ktime_clock(void) { return 0; }
+static inline void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock) { }
#define walt_cpu_high_irqload(cpu) false