author    Blagovest Kolenichev <bkolenichev@codeaurora.org>  2017-11-06 07:02:23 -0800
committer Blagovest Kolenichev <bkolenichev@codeaurora.org>  2017-11-06 15:58:47 -0800
commit    985aecee1dfefe1471290deffdebd2a9a2f073c9 (patch)
tree      8c55a0ea6933c9ee3e0649ccda5fbd03c013367b /kernel
parent    22b18281457de02c9c830504fd4d48726db86f31 (diff)
parent    ceee5bdd470586fddfbbb8c6d0287ba792525d3f (diff)
Merge android-4.4@ceee5bd (v4.4.95) into msm-4.4
* refs/heads/tmp-ceee5bd:
  BACKPORT: arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux
  sched/core: fix have_sched_energy_data build warning
  sched/core: Warn if ENERGY_AWARE is enabled but data is missing
  sched: walt: Correct WALT window size initialization
  FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide
  sched: WALT: account cumulative window demand
  sched/fair: remove useless variable in find_best_target
  sched/tune: access schedtune_initialized under CGROUP_SCHEDTUNE
  sched/fair: consider task utilization in group_max_util()
  sched/fair: consider task utilization in group_norm_util()
  sched/fair: enforce EAS mode
  sched/fair: ignore backup CPU when not valid
  sched/fair: trace energy_diff for non boosted tasks
  UPSTREAM: sched/fair: Sync task util before slow-path wakeup
  UPSTREAM: sched/fair: Fix usage of find_idlest_group() when the local group is idlest
  UPSTREAM: sched/fair: Fix usage of find_idlest_group() when no groups are allowed
  BACKPORT: sched/fair: Fix find_idlest_group when local group is not allowed
  UPSTREAM: sched/fair: Remove unnecessary comparison with -1
  BACKPORT: sched/fair: Move select_task_rq_fair slow-path into its own function
  UPSTREAM: sched/fair: Force balancing on nohz balance if local group has capacity
  UPSTREAM: sched/core: Add missing update_rq_clock() call in set_user_nice()
  UPSTREAM: sched/core: Add missing update_rq_clock() call for task_hot()
  UPSTREAM: sched/core: Add missing update_rq_clock() in detach_task_cfs_rq()
  UPSTREAM: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg()
  UPSTREAM: sched/core: Fix find_idlest_group() for fork
  BACKPORT: sched/fair: Fix PELT integrity for new tasks
  BACKPORT: sched/cgroup: Fix cpu_cgroup_fork() handling
  UPSTREAM: sched/fair: Fix and optimize the fork() path
  BACKPORT: sched/fair: Make it possible to account fair load avg consistently
  cpufreq/sched: Consider max cpu capacity when choosing frequencies
  Linux 4.4.95
  FS-Cache: fix dereference of NULL user_key_payload
  fscrypto: require write access to mount to set encryption policy
  KEYS: Fix race between updating and finding a negative key
  fscrypt: fix dereference of NULL user_key_payload
  f2fs crypto: add missing locking for keyring_key access
  f2fs crypto: replace some BUG_ON()'s with error checks
  sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task()
  parisc: Fix double-word compare and exchange in LWS code on 32-bit kernels
  parisc: Avoid trashing sr2 and sr3 in LWS code
  pkcs7: Prevent NULL pointer dereference, since sinfo is not always set.
  KEYS: don't let add_key() update an uninstantiated key
  lib/digsig: fix dereference of NULL user_key_payload
  KEYS: encrypted: fix dereference of NULL user_key_payload
  rtlwifi: rtl8821ae: Fix connection lost problem
  clockevents/drivers/cs5535: Improve resilience to spurious interrupts
  bus: mbus: fix window size calculation for 4GB windows
  brcmsmac: make some local variables 'static const' to reduce stack size
  i2c: ismt: Separate I2C block read from SMBus block read
  ALSA: hda: Remove superfluous '-' added by printk conversion
  ALSA: seq: Enable 'use' locking in all configurations
  drm/nouveau/mmu: flush tlbs before deleting page tables
  drm/nouveau/bsp/g92: disable by default
  can: esd_usb2: Fix can_dlc value for received RTR, frames
  usb: musb: Check for host-mode using is_host_active() on reset interrupt
  usb: musb: sunxi: Explicitly release USB PHY on exit
  can: gs_usb: fix busy loop if no more TX context is available
  ALSA: usb-audio: Add native DSD support for Pro-Ject Pre Box S2 Digital
  usb: hub: Allow reset retry for USB2 devices on connect bounce
  usb: quirks: add quirk for WORLDE MINI MIDI keyboard
  usb: cdc_acm: Add quirk for Elatec TWN3
  USB: serial: metro-usb: add MS7820 device id
  USB: core: fix out-of-bounds access bug in usb_get_bos_descriptor()
  USB: devio: Revert "USB: devio: Don't corrupt user memory"
  ANDROID: binder: show high watermark of alloc->pages.
  ANDROID: binder: Add thread->process_todo flag.
  UPSTREAM: arm64: compat: Remove leftover variable declaration
  ANDROID: sched/fair: Select correct capacity state for energy_diff
  Revert "UPSTREAM: efi/libstub/arm64: Set -fpie when building the EFI stub"
  cpufreq: schedutil: clamp util to CPU maximum capacity
  FROMLIST: android: binder: Fix null ptr dereference in debug msg
  FROMLIST: android: binder: Change binder_shrinker to static
  cpufreq/sched: Use cpu max freq rather than policy max

Conflicts:
	include/linux/sched.h
	kernel/sched/core.c
	kernel/sched/fair.c

Change-Id: I2751f851df741f00e797deaf2119872b3dced655
Signed-off-by: Blagovest Kolenichev <bkolenichev@codeaurora.org>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/sched/auto_group.c        |  23
-rw-r--r-- kernel/sched/core.c              | 184
-rw-r--r-- kernel/sched/cpufreq_sched.c     |   2
-rw-r--r-- kernel/sched/cpufreq_schedutil.c |   3
-rw-r--r-- kernel/sched/deadline.c          |   3
-rw-r--r-- kernel/sched/fair.c              | 371
-rw-r--r-- kernel/sched/idle_task.c         |   3
-rw-r--r-- kernel/sched/rt.c                |   3
-rw-r--r-- kernel/sched/sched.h             |  37
-rw-r--r-- kernel/sched/stop_task.c         |   3
-rw-r--r-- kernel/sched/walt.c              | 102
-rw-r--r-- kernel/sched/walt.h              |   2
12 files changed, 516 insertions(+), 220 deletions(-)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 750ed601ddf7..8620fd01b3d0 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
-
/*
- * We can only assume the task group can't go away on us if
- * autogroup_move_group() can see us on ->thread_group list.
+ * If we race with autogroup_move_group() the caller can use the old
+ * value of signal->autogroup but in this case sched_move_task() will
+ * be called again before autogroup_kref_put().
*/
- if (p->flags & PF_EXITING)
- return false;
-
return true;
}
@@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
}
p->signal->autogroup = autogroup_kref_get(ag);
-
- if (!READ_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
+ /*
+ * We can't avoid sched_move_task() after we changed signal->autogroup,
+ * this process can already run with task_group() == prev->tg or we can
+ * race with cgroup code which can read autogroup = prev under rq->lock.
+ * In the latter case for_each_thread() can not miss a migrating thread,
+ * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
+ * can't be removed from thread list, we hold ->siglock.
+ */
for_each_thread(p, t)
sched_move_task(t);
-out:
+
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
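
The race this closes, sketched as a timeline (illustrative annotation, not part of the patch):

/*
 *   exiting thread T                   autogroup_move_group(p, ag)
 *   ----------------                   ---------------------------
 *   PF_EXITING set
 *   old task_wants_autogroup() == false
 *     -> sched_move_task(T) skipped    p->signal->autogroup = ag
 *   T keeps running in the stale       for_each_thread(p, t)
 *   task_group                             sched_move_task(t)  <- now runs
 *                                          unconditionally, under ->siglock,
 *                                          so T is always moved too
 */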
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c408280ddd12..eacfd2ac56a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -99,6 +99,10 @@
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head);
+#ifdef CONFIG_SMP
+static bool have_sched_energy_data(void);
+#endif
+
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -201,6 +205,11 @@ static int sched_feat_set(char *cmp)
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
} else {
+#ifdef CONFIG_SMP
+ if (i == __SCHED_FEAT_ENERGY_AWARE)
+ WARN(!have_sched_energy_data(),
+ "Missing sched energy data\n");
+#endif
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
@@ -554,6 +563,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
return;
+ head->count++;
+
get_task_struct(task);
/*
@@ -563,6 +574,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
head->lastp = &node->next;
}
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ int sibling_count_hint);
+
void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;
@@ -577,10 +592,10 @@ void wake_up_q(struct wake_q_head *head)
task->wake_q.next = NULL;
/*
- * wake_up_process() implies a wmb() to pair with the queueing
+ * try_to_wake_up() implies a wmb() to pair with the queueing
* in wake_q_add() so as not to miss wakeups.
*/
- wake_up_process(task);
+ try_to_wake_up(task, TASK_NORMAL, 0, head->count);
put_task_struct(task);
}
}
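
How the new hint flows end to end; a minimal sketch, not part of this diff. It assumes WAKE_Q_HEAD_INITIALIZER() zeroes head->count (that change lands in include/linux/sched.h, outside this kernel/-limited diffstat), and nr_waiters/waiter[] are hypothetical names:

	WAKE_Q(wake_q);                         /* head->count starts at 0 */

	for (i = 0; i < nr_waiters; i++)
		wake_q_add(&wake_q, waiter[i]); /* head->count ends up == nr_waiters */

	/* each wakeup below becomes try_to_wake_up(task, TASK_NORMAL, 0, nr_waiters),
	 * so wake_wide() can treat the whole batch as siblings woken in one event */
	wake_up_q(&wake_q);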
@@ -1702,14 +1717,16 @@ out:
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
+ int sibling_count_hint)
{
bool allow_isolated = (p->flags & PF_KTHREAD);
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1)
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
+ sibling_count_hint);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2007,6 +2024,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
+ * @sibling_count_hint: A hint at the number of threads that are being woken up
+ * in this event.
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
@@ -2018,7 +2037,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* or @state didn't match @p's state.
*/
static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+ int sibling_count_hint)
{
unsigned long flags;
int cpu, src_cpu, success = 0;
@@ -2134,7 +2154,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (p->sched_class->task_waking)
p->sched_class->task_waking(p);
- cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+ cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
+ sibling_count_hint);
/* Refresh src_cpu as it could have changed since we last read it */
src_cpu = task_cpu(p);
@@ -2236,7 +2257,7 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_NORMAL, 0);
+ return try_to_wake_up(p, TASK_NORMAL, 0, 1);
}
EXPORT_SYMBOL(wake_up_process);
@@ -2256,13 +2277,13 @@ EXPORT_SYMBOL(wake_up_process);
int wake_up_process_no_notif(struct task_struct *p)
{
WARN_ON(task_is_stopped_or_traced(p));
- return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER);
+ return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1);
}
EXPORT_SYMBOL(wake_up_process_no_notif);
int wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up(p, state, 0, 1);
}
/*
@@ -2337,9 +2358,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
+#ifdef CONFIG_SCHED_WALT
+ p->last_sleep_ts = 0;
+#endif
INIT_LIST_HEAD(&p->se.group_node);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
+
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2429,11 +2457,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2470,8 +2498,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2481,7 +2508,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2614,6 +2647,8 @@ void wake_up_new_task(struct task_struct *p)
add_new_task_to_grp(p);
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ p->state = TASK_RUNNING;
+
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
@@ -2621,11 +2656,15 @@ void wake_up_new_task(struct task_struct *p)
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
#endif
rq = __task_rq_lock(p);
mark_task_starting(p);
+ update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
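
Putting the fork-path hunks together, the intended ordering is roughly (a hedged sketch of the call sequence):

/*
 *   copy_process()
 *     sched_fork()
 *       p->state = TASK_NEW;            child invisible to wakeups
 *       __set_task_cpu(p, cpu);         first cpu assignment, no migrate hook
 *       p->sched_class->task_fork(p);   now under p->pi_lock
 *   a concurrent cgroup migration: cpu_cgroup_can_attach() returns
 *     -EINVAL while p->state == TASK_NEW, so PELT never detach+attaches
 *     a task that was never attached in the first place.
 *   wake_up_new_task()
 *     p->state = TASK_RUNNING;
 *     __set_task_cpu(p, select_task_rq(...));   fork balancing
 *     post_init_entity_util_avg(&p->se);        single PELT attach point
 */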
@@ -3071,7 +3110,7 @@ void sched_exec(void)
raw_spin_lock_irqsave(&p->pi_lock, flags);
curr_cpu = task_cpu(p);
- dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+ dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
if (dest_cpu == smp_processor_id())
goto unlock;
@@ -3171,7 +3210,9 @@ static void sched_freq_tick_pelt(int cpu)
* utilization and to harm its performance the least, request
* a jump to a higher OPP as soon as the margin of free capacity
* is impacted (specified by capacity_margin).
+ * Remember CPU utilization in sched_capacity_reqs should be normalised.
*/
+ cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, cpu_utilization);
}
@@ -3198,7 +3239,9 @@ static void sched_freq_tick_walt(int cpu)
* It is likely that the load is growing so we
* keep the added margin in our request as an
* extra boost.
+ * Remember CPU utilization in sched_capacity_reqs should be normalised.
*/
+ cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, cpu_utilization);
}
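
The normalisation in both tick paths is plain scaling; a worked example with illustrative figures:

/*
 * capacity_orig_of(cpu) == 512, cpu_utilization == 256:
 *   256 * SCHED_CAPACITY_SCALE / 512 == 256 * 1024 / 512 == 512
 * i.e. 50% of this CPU's capacity becomes 512 on the normalised
 * 0..1024 scale that sched_capacity_reqs expects.
 */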
@@ -3579,6 +3622,10 @@ static void __sched notrace __schedule(bool preempt)
if (!is_idle_task(prev) && !prev->on_rq)
update_avg_burst(prev);
+#ifdef CONFIG_SCHED_WALT
+ if (!prev->on_rq)
+ prev->last_sleep_ts = wallclock;
+#endif
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3755,7 +3802,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
- return try_to_wake_up(curr->private, mode, wake_flags);
+ return try_to_wake_up(curr->private, mode, wake_flags, 1);
}
EXPORT_SYMBOL(default_wake_function);
@@ -3781,6 +3828,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3876,6 +3924,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -4303,6 +4353,7 @@ recheck:
* runqueue lock must be held.
*/
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
/*
* Changing the policy of the stop threads its a very bad idea
@@ -7151,6 +7202,19 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
+static bool have_sched_energy_data(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (!rcu_dereference(per_cpu(sd_scs, cpu)) ||
+ !rcu_dereference(per_cpu(sd_ea, cpu)))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Check that the per-cpu provided sd energy data is consistent for all cpus
* within the mask.
@@ -7967,6 +8031,9 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
rcu_read_unlock();
+ WARN(sched_feat(ENERGY_AWARE) && !have_sched_energy_data(),
+ "Missing data for energy aware scheduling\n");
+
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
@@ -8784,27 +8851,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- unsigned long flags;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &flags);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -8817,11 +8866,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -9258,15 +9333,28 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
- sched_move_task(task);
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &flags);
+
+ update_rq_clock(rq);
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &flags);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -9277,8 +9365,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
index 6ffb23adbcef..ec0aed7a8f96 100644
--- a/kernel/sched/cpufreq_sched.c
+++ b/kernel/sched/cpufreq_sched.c
@@ -202,7 +202,7 @@ static void update_fdomain_capacity_request(int cpu)
}
/* Convert the new maximum capacity request into a cpu frequency */
- freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+ freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT;
if (cpufreq_frequency_table_target(policy, policy->freq_table,
freq_new, CPUFREQ_RELATION_L,
&index_new))
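
Why cpuinfo.max_freq matters here; a worked example with illustrative numbers:

/*
 * capacity == 512 (half of SCHED_CAPACITY_SCALE), cpuinfo.max_freq ==
 * 2000000 kHz, policy->max temporarily clamped to 1500000 kHz:
 *   old: 512 * 1500000 >> 10 ==  750000 kHz   (understates the request)
 *   new: 512 * 2000000 >> 10 == 1000000 kHz   (50% of the real maximum)
 * cpufreq_frequency_table_target(..., CPUFREQ_RELATION_L, ...) then maps
 * this to the nearest supported entry; policy min/max limits are still
 * enforced by the cpufreq core when the request is applied.
 */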
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 28977799017b..d3765f0cb699 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -216,8 +216,9 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time)
*util = boosted_cpu_util(cpu);
if (likely(use_pelt()))
- *util = min((*util + rt), max_cap);
+ *util = *util + rt;
+ *util = min(*util, max_cap);
*max = max_cap;
}
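
The reordering makes the clamp unconditional; a worked example (illustrative, max_cap == 1024):

/*
 *   PELT path:  *util == 900, rt == 300 -> min(1200, 1024) == 1024 (as before)
 *   WALT path:  *util == 1100 (boosted) -> previously passed through at
 *               1100; now clamped to 1024, so schedutil never requests
 *               more than the CPU's maximum capacity.
 */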
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 167a1038cff0..bb22bcf499f8 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1107,7 +1107,8 @@ static void yield_task_dl(struct rq *rq)
static int find_later_rq(struct task_struct *task);
static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
struct task_struct *curr;
struct rq *rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e3ab49c262a..9263ffd5673f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -762,7 +762,9 @@ void init_entity_runnable_average(struct sched_entity *se)
}
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
static void attach_entity_cfs_rq(struct sched_entity *se);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
/*
* With new tasks being created, their initial util_avgs are extrapolated
@@ -833,7 +835,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
attach_entity_cfs_rq(se);
}
-#else
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
@@ -4412,11 +4414,14 @@ void remove_entity_load_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
sync_entity_load_avg(se);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
@@ -5824,7 +5829,7 @@ static void update_capacity_of(int cpu)
if (!sched_freq())
return;
- /* Convert scale-invariant capacity to cpu. */
+ /* Normalize scale-invariant capacity to cpu. */
req_cap = boosted_cpu_util(cpu);
req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, req_cap);
@@ -5867,7 +5872,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -6023,7 +6028,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (rq->cfs.nr_running)
update_capacity_of(cpu_of(rq));
else if (sched_freq())
- set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+ set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */
}
}
@@ -6446,6 +6451,7 @@ struct energy_env {
int util_delta;
int src_cpu;
int dst_cpu;
+ int trg_cpu;
int energy;
int payoff;
struct task_struct *task;
@@ -6462,11 +6468,14 @@ struct energy_env {
} cap;
};
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
/*
* __cpu_norm_util() returns the cpu util relative to a specific capacity,
- * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
- * energy calculations. Using the scale-invariant util returned by
- * cpu_util() and approximating scale-invariant util by:
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
+ * energy calculations.
+ *
+ * Since util is a scale-invariant utilization defined as:
*
* util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
*
@@ -6476,34 +6485,32 @@ struct energy_env {
*
* norm_util = running_time/time ~ util/capacity
*/
-static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
{
- int util = __cpu_util(cpu, delta);
-
if (util >= capacity)
return SCHED_CAPACITY_SCALE;
return (util << SCHED_CAPACITY_SHIFT)/capacity;
}
-static int calc_util_delta(struct energy_env *eenv, int cpu)
+static unsigned long group_max_util(struct energy_env *eenv)
{
- if (cpu == eenv->src_cpu)
- return -eenv->util_delta;
- if (cpu == eenv->dst_cpu)
- return eenv->util_delta;
- return 0;
-}
-
-static
-unsigned long group_max_util(struct energy_env *eenv)
-{
- int i, delta;
unsigned long max_util = 0;
+ unsigned long util;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+ util = cpu_util_wake(cpu, eenv->task);
- for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
- delta = calc_util_delta(eenv, i);
- max_util = max(max_util, __cpu_util(i, delta));
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->trg_cpu))
+ util += eenv->util_delta;
+
+ max_util = max(max_util, util);
}
return max_util;
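
Numerically, with illustrative figures:

/*
 * __cpu_norm_util(): util == 300 at a capacity state of 600:
 *   (300 << SCHED_CAPACITY_SHIFT) / 600 == 307200 / 600 == 512,
 *   i.e. a 50% busy ratio; any util >= capacity saturates at 1024.
 *
 * group_max_util() with trg_cpu == CPU1 and eenv->util_delta == 100:
 *   CPU0: cpu_util_wake == 200         -> 200
 *   CPU1: cpu_util_wake == 300, +100   -> 400
 *   max_util == 400, the waking task's own demand now counted on the
 *   target CPU instead of a raw delta on src/dst.
 */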
@@ -6511,44 +6518,56 @@ unsigned long group_max_util(struct energy_env *eenv)
/*
* group_norm_util() returns the approximated group util relative to it's
- * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
- * energy calculations. Since task executions may or may not overlap in time in
- * the group the true normalized util is between max(cpu_norm_util(i)) and
- * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
- * latter is used as the estimate as it leads to a more pessimistic energy
+ * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
+ * in energy calculations.
+ *
+ * Since task executions may or may not overlap in time in the group the true
+ * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
+ * when iterating over all CPUs in the group.
+ * The latter estimate is used as it leads to a more pessimistic energy
* estimate (more busy).
*/
static unsigned
long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
{
- int i, delta;
- unsigned long util_sum = 0;
unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+ unsigned long util, util_sum = 0;
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ util = cpu_util_wake(cpu, eenv->task);
+
+ /*
+ * If we are looking at the target CPU specified by the eenv,
+ * then we should add the (estimated) utilization of the task
+ * assuming we will wake it up on that CPU.
+ */
+ if (unlikely(cpu == eenv->trg_cpu))
+ util += eenv->util_delta;
- for_each_cpu(i, sched_group_cpus(sg)) {
- delta = calc_util_delta(eenv, i);
- util_sum += __cpu_norm_util(i, capacity, delta);
+ util_sum += __cpu_norm_util(util, capacity);
}
- if (util_sum > SCHED_CAPACITY_SCALE)
- return SCHED_CAPACITY_SCALE;
- return util_sum;
+ return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
}
static int find_new_capacity(struct energy_env *eenv,
const struct sched_group_energy * const sge)
{
- int idx;
+ int idx, max_idx = sge->nr_cap_states - 1;
unsigned long util = group_max_util(eenv);
+ /* default is max_cap if we don't find a match */
+ eenv->cap_idx = max_idx;
+
for (idx = 0; idx < sge->nr_cap_states; idx++) {
- if (sge->cap_states[idx].cap >= util)
+ if (sge->cap_states[idx].cap >= util) {
+ eenv->cap_idx = idx;
break;
+ }
}
- eenv->cap_idx = idx;
-
- return idx;
+ return eenv->cap_idx;
}
static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
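
Continuing the same illustrative numbers through the two helpers above:

/*
 * group_norm_util() at cap == 600 over {CPU0, CPU1}:
 *   (200 << 10)/600 + (400 << 10)/600 == 341 + 682 == 1023, just under
 *   the SCHED_CAPACITY_SCALE clamp; the SUM() estimate is deliberately
 *   more pessimistic (more busy) than MAX() == 682.
 *
 * find_new_capacity() with cap_states {400, 600, 1024} and
 * group_max_util() == 400: idx 0 is too small, idx 1 fits, cap_idx == 1.
 * If util exceeded every state, the new default keeps cap_idx at the
 * top state instead of storing the out-of-range loop index as the old
 * code could.
 */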
@@ -6721,6 +6740,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
+static inline unsigned long task_util(struct task_struct *p);
+
/*
* energy_diff(): Estimate the energy impact of changing the utilization
* distribution. eenv specifies the change: utilisation amount, source, and
@@ -6736,11 +6757,13 @@ static inline int __energy_diff(struct energy_env *eenv)
int diff, margin;
struct energy_env eenv_before = {
- .util_delta = 0,
+ .util_delta = task_util(eenv->task),
.src_cpu = eenv->src_cpu,
.dst_cpu = eenv->dst_cpu,
+ .trg_cpu = eenv->src_cpu,
.nrg = { 0, 0, 0, 0},
.cap = { 0, 0, 0 },
+ .task = eenv->task,
};
if (eenv->src_cpu == eenv->dst_cpu)
@@ -6799,7 +6822,11 @@ static inline int __energy_diff(struct energy_env *eenv)
#ifdef CONFIG_SCHED_TUNE
struct target_nrg schedtune_target_nrg;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
extern bool schedtune_initialized;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
/*
* System energy normalization
* Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
@@ -6810,9 +6837,11 @@ normalize_energy(int energy_diff)
{
u32 normalized_nrg;
+#ifdef CONFIG_CGROUP_SCHEDTUNE
/* during early setup, we don't know the extents */
if (unlikely(!schedtune_initialized))
return energy_diff < 0 ? -1 : 1 ;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
#ifdef CONFIG_SCHED_DEBUG
{
@@ -6848,8 +6877,14 @@ energy_diff(struct energy_env *eenv)
__energy_diff(eenv);
/* Return energy diff when boost margin is 0 */
- if (boost == 0)
+ if (boost == 0) {
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ 0, -eenv->nrg.diff);
return eenv->nrg.diff;
+ }
/* Compute normalized energy diff */
nrg_delta = normalize_energy(eenv->nrg.diff);
@@ -6892,15 +6927,18 @@ energy_diff(struct energy_env *eenv)
* being client/server, worker/dispatcher, interrupt source or whatever is
* irrelevant, spread criteria is apparent partner count exceeds socket size.
*/
-static int wake_wide(struct task_struct *p)
+static int wake_wide(struct task_struct *p, int sibling_count_hint)
{
unsigned int master = current->wakee_flips;
unsigned int slave = p->wakee_flips;
- int factor = this_cpu_read(sd_llc_size);
+ int llc_size = this_cpu_read(sd_llc_size);
+
+ if (sibling_count_hint >= llc_size)
+ return 1;
if (master < slave)
swap(master, slave);
- if (slave < factor || master < slave * factor)
+ if (slave < llc_size || master < slave * llc_size)
return 0;
return 1;
}
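
Effect of the hint, with illustrative numbers (llc_size == 4):

/*
 * A single wake_up_q() waking 5 workers passes sibling_count_hint == 5:
 *   5 >= llc_size, so wake_wide() returns 1 immediately and the wakeup
 *   takes the slow path, spreading the batch beyond the waker's LLC.
 * A 1:1 waker/wakee pair keeps the hint at 1 and falls back to the
 * existing wakee_flips master/slave heuristic unchanged.
 */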
@@ -7106,8 +7144,6 @@ boosted_task_util(struct task_struct *task)
return util + margin;
}
-static int cpu_util_wake(int cpu, struct task_struct *p);
-
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{
return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
@@ -7116,6 +7152,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -7123,7 +7161,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
- unsigned long min_load = ULONG_MAX, this_load = 0;
+ unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -7191,23 +7229,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
* utilized systems if we require spare_capacity > task_util(p),
* so we allow for some task stuffing by using
* spare_capacity > task_util(p)/2.
+ *
+ * Spare capacity can't be used for fork because the utilization has
+ * not been set yet, we must first select a rq to compute the initial
+ * utilization.
*/
+ if (sd_flag & SD_BALANCE_FORK)
+ goto skip_spare;
+
if (this_spare > task_util(p) / 2 &&
imbalance*this_spare > 100*most_spare)
return NULL;
else if (most_spare > task_util(p) / 2)
return most_spare_sg;
+skip_spare:
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -7254,6 +7300,68 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
}
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ }
+
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = cpu;
+ int wu = sd_flag & SD_BALANCE_WAKE;
+ int cas_cpu = -1;
+
+ if (wu) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
+ schedstat_inc(this_rq(), eas_stats.cas_attempts);
+ }
+
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (wu)
+ schedstat_inc(sd, eas_stats.cas_attempts);
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = cas_cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ if (wu && (cas_cpu >= 0)) {
+ schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
+ schedstat_inc(this_rq(), eas_stats.cas_count);
+ }
+
+ return new_cpu;
}
/*
@@ -7397,7 +7505,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
unsigned long target_capacity = ULONG_MAX;
unsigned long min_wake_util = ULONG_MAX;
unsigned long target_max_spare_cap = 0;
- unsigned long target_util = ULONG_MAX;
unsigned long best_active_util = ULONG_MAX;
int best_idle_cstate = INT_MAX;
struct sched_domain *sd;
@@ -7536,6 +7643,19 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
}
/*
+ * Enforce EAS mode
+ *
+ * For non latency sensitive tasks, skip CPUs that
+ * will be overutilized by moving the task there.
+ *
+ * The goal here is to remain in EAS mode as long as
+ * possible at least for !prefer_idle tasks.
+ */
+ if ((new_util * capacity_margin) >
+ (capacity_orig * SCHED_CAPACITY_SCALE))
+ continue;
+
+ /*
* Case B) Non latency sensitive tasks on IDLE CPUs.
*
* Find an optimal backup IDLE CPU for non latency
@@ -7613,7 +7733,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
target_max_spare_cap = capacity_orig - new_util;
target_capacity = capacity_orig;
- target_util = new_util;
target_cpu = i;
}
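
The over-utilization filter above in numbers (illustrative, assuming the default capacity_margin == 1280, i.e. ~25% headroom):

/*
 * capacity_orig == 1024:
 *   new_util == 900: 900 * 1280 == 1152000 > 1024 * 1024 == 1048576
 *     -> CPU skipped for this !prefer_idle task
 *   new_util == 700: 700 * 1280 ==  896000 <= 1048576
 *     -> CPU stays eligible
 * keeping placements below the threshold that would otherwise tip the
 * system out of EAS mode.
 */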
@@ -7734,6 +7853,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
.src_cpu = prev_cpu,
.dst_cpu = target_cpu,
.task = p,
+ .trg_cpu = target_cpu,
};
@@ -7752,7 +7872,9 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
/* No energy saving for target_cpu, try backup */
target_cpu = tmp_backup;
eenv.dst_cpu = target_cpu;
- if (tmp_backup < 0 || energy_diff(&eenv) >= 0) {
+ if (tmp_backup < 0 ||
+ tmp_backup == prev_cpu ||
+ energy_diff(&eenv) >= 0) {
schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
target_cpu = prev_cpu;
@@ -7787,7 +7909,8 @@ unlock:
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+ int sibling_count_hint)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -7799,9 +7922,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return select_best_cpu(p, prev_cpu, 0, sync);
#endif
- if (sd_flag & SD_BALANCE_WAKE)
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ if (sd_flag & SD_BALANCE_WAKE) {
+ record_wakee(p);
+ want_affine = !wake_wide(p, sibling_count_hint) &&
+ !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
+ }
if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
return select_energy_cpu_brute(p, prev_cpu, sync);
@@ -7833,61 +7959,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
new_cpu = cpu;
}
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+ /*
+ * We're going to need the task's util for capacity_spare_wake
+ * in find_idlest_group. Sync it up to prev_cpu's
+ * last_update_time.
+ */
+ sync_entity_load_avg(&p->se);
+ }
+
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else {
- int wu = sd_flag & SD_BALANCE_WAKE;
- int cas_cpu = -1;
-
- if (wu) {
- schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
- schedstat_inc(this_rq(), eas_stats.cas_attempts);
- }
-
- while (sd) {
- struct sched_group *group;
- int weight;
-
- if (wu)
- schedstat_inc(sd, eas_stats.cas_attempts);
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = cas_cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
- }
-
- if (wu && (cas_cpu >= 0)) {
- schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
- schedstat_inc(this_rq(), eas_stats.cas_count);
- }
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
@@ -10040,8 +10126,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+ /*
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+ * capacities from resulting in underutilization due to avg_load.
+ */
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
@@ -10410,6 +10499,7 @@ redo:
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
+ update_rq_clock(busiest);
/* The world might have changed. Validate assumptions */
if (busiest->nr_running <= 1) {
@@ -10867,6 +10957,7 @@ static int active_load_balance_cpu_stop(void *data)
if (likely(sd)) {
env.sd = sd;
schedstat_inc(sd, alb_count);
+ update_rq_clock(busiest_rq);
p = detach_one_task(&env);
if (p) {
@@ -11530,31 +11621,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -11567,8 +11644,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -11760,6 +11836,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -11772,6 +11856,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -12003,7 +12100,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
#ifdef CONFIG_SCHED_HMP
.inc_hmp_sched_stats = inc_hmp_sched_stats_fair,
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 36c6634236fb..d562efb04775 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,8 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 23b68b051cee..47e97ef57eb8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1479,7 +1479,8 @@ task_may_not_preempt(struct task_struct *task, int cpu)
}
static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
struct task_struct *curr;
struct rq *rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3f17dd1d9d2c..cc5ae5ddee6b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -340,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
extern struct task_group *css_tg(struct cgroup_subsys_state *css);
#else /* CONFIG_CGROUP_SCHED */
@@ -804,6 +812,7 @@ struct rq {
u64 cur_irqload;
u64 avg_irqload;
u64 irqload_ts;
+ u64 cum_window_demand;
#endif /* CONFIG_SCHED_WALT */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1750,6 +1759,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
@@ -2036,7 +2046,8 @@ struct sched_class {
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
- int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+ int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
+ int sibling_count_hint);
void (*migrate_task_rq)(struct task_struct *p);
void (*task_waking) (struct task_struct *task);
@@ -2069,8 +2080,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group)(struct task_struct *p, int type);
#endif
#ifdef CONFIG_SCHED_HMP
void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p);
@@ -2342,7 +2356,7 @@ static inline unsigned long capacity_orig_of(int cpu)
extern unsigned int sysctl_sched_use_walt_cpu_util;
extern unsigned int walt_ravg_window;
-extern unsigned int walt_disabled;
+extern bool walt_disabled;
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
@@ -2418,6 +2432,10 @@ static inline bool sched_freq(void)
return static_key_false(&__sched_freq);
}
+/*
+ * sched_capacity_reqs expects capacity requests to be normalised.
+ * All capacities should sum to the range of 0-1024.
+ */
DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
void update_cpu_capacity_request(int cpu, bool request);
@@ -2887,6 +2905,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_SCHED_WALT
+
+static inline bool
+walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+ return cpu_of(rq) == task_cpu(p) &&
+ (p->on_rq || p->last_sleep_ts >= rq->window_start);
+}
+
+#endif /* CONFIG_SCHED_WALT */
+
#ifdef arch_scale_freq_capacity
#ifndef arch_scale_freq_invariant
#define arch_scale_freq_invariant() (true)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 134da1cc8fce..3278c81cefb1 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,8 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
+ int sibling_count_hint)
{
return task_cpu(p); /* stop tasks as never migrate */
}
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
index 441cba01bc04..8d25ffbe4fed 100644
--- a/kernel/sched/walt.c
+++ b/kernel/sched/walt.c
@@ -41,25 +41,17 @@ static __read_mostly unsigned int walt_io_is_busy = 0;
unsigned int sysctl_sched_walt_init_task_load_pct = 15;
-/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
-unsigned int __read_mostly walt_disabled = 0;
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
-/* Window size (in ns) */
-__read_mostly unsigned int walt_ravg_window = 20000000;
-
-/* Min window size (in ns) = 10ms */
-#ifdef CONFIG_HZ_300
/*
- * Tick interval becomes to 3333333 due to
- * rounding error when HZ=300.
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
*/
-#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
-#else
-#define MIN_SCHED_RAVG_WINDOW 10000000
-#endif
-
-/* Max window size (in ns) = 1s */
-#define MAX_SCHED_RAVG_WINDOW 1000000000
+__read_mostly unsigned int walt_ravg_window =
+ (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
static unsigned int sync_cpu;
static ktime_t ktime_last;
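
The tick-adjusted window in numbers; a worked example for CONFIG_HZ=300 (TICK_NSEC == 3333333):

/*
 *   walt_ravg_window == (20000000 / 3333333) * 3333333
 *                    == 6 * 3333333 == 19999998 ns
 * an exact multiple of the tick, replacing the old hard-coded HZ=300
 * special case; for HZ=100/250/1000 the division is exact and the
 * window stays 20000000 ns.
 */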
@@ -70,11 +62,28 @@ static unsigned int task_load(struct task_struct *p)
return p->ravg.demand;
}
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+ rq->cum_window_demand += delta;
+ if (unlikely((s64)rq->cum_window_demand < 0))
+ rq->cum_window_demand = 0;
+}
+
void
walt_inc_cumulative_runnable_avg(struct rq *rq,
struct task_struct *p)
{
rq->cumulative_runnable_avg += p->ravg.demand;
+
+ /*
+ * Add a task's contribution to the cumulative window demand when
+ *
+ * (1) task is enqueued with on_rq = 1 i.e migration,
+ * prio/cgroup/class change.
+ * (2) task is waking for the first time in this window.
+ */
+ if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+ fixup_cum_window_demand(rq, p->ravg.demand);
}
void
@@ -83,6 +92,14 @@ walt_dec_cumulative_runnable_avg(struct rq *rq,
{
rq->cumulative_runnable_avg -= p->ravg.demand;
BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+ /*
+ * on_rq will be 1 for sleeping tasks. So check if the task
+ * is migrating or dequeuing in RUNNING state to change the
+ * prio/cgroup/class.
+ */
+ if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+ fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
}
static void
@@ -95,6 +112,8 @@ fixup_cumulative_runnable_avg(struct rq *rq,
if ((s64)rq->cumulative_runnable_avg < 0)
panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
task_load_delta, task_load(p));
+
+ fixup_cum_window_demand(rq, task_load_delta);
}
u64 walt_ktime_clock(void)
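
Taken together, the enqueue/dequeue hooks above implement these accounting rules (a hedged summary sketch):

/*
 * enqueue (walt_inc_cumulative_runnable_avg):
 *   p->on_rq already set (migration, prio/cgroup/class change), or the
 *   task's first wakeup this window (last_sleep_ts < window_start):
 *       cum_window_demand += p->ravg.demand;
 * dequeue (walt_dec_cumulative_runnable_avg):
 *   task_on_rq_migrating(p) or dequeue in TASK_RUNNING state:
 *       cum_window_demand -= p->ravg.demand;
 *   dequeue to sleep: no change, the window keeps the task's demand.
 * demand update (fixup_cumulative_runnable_avg):
 *       cum_window_demand += task_load_delta;
 */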
@@ -153,10 +172,28 @@ static int exiting_task(struct task_struct *p)
static int __init set_walt_ravg_window(char *str)
{
+ unsigned int adj_window;
+ bool no_walt = walt_disabled;
+
get_option(&str, &walt_ravg_window);
- walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
- walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+ /* Adjust for CONFIG_HZ */
+ adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+ /* Warn if we're a bit too far away from the expected window size */
+ WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+ "tick-adjusted window size %u, original was %u\n", adj_window,
+ walt_ravg_window);
+
+ walt_ravg_window = adj_window;
+
+ walt_disabled = walt_disabled ||
+ (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+ WARN(!no_walt && walt_disabled,
+ "invalid window size, disabling WALT\n");
+
return 0;
}
@@ -180,6 +217,8 @@ update_window_start(struct rq *rq, u64 wallclock)
nr_windows = div64_u64(delta, walt_ravg_window);
rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+ rq->cum_window_demand = rq->cumulative_runnable_avg;
}
/*
@@ -568,10 +607,20 @@ static void update_history(struct rq *rq, struct task_struct *p,
* A throttled deadline sched class task gets dequeued without
* changing p->on_rq. Since the dequeue decrements hmp stats
* avoid decrementing it here again.
+ *
+ * When window is rolled over, the cumulative window demand
+ * is reset to the cumulative runnable average (contribution from
+ * the tasks on the runqueue). If the current task is dequeued
+ * already, it's demand is not included in the cumulative runnable
+ * average. So add the task demand separately to cumulative window
+ * demand.
*/
- if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
- !p->dl.dl_throttled))
- fixup_cumulative_runnable_avg(rq, p, demand);
+ if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+ if (task_on_rq_queued(p))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+ else if (rq->curr == p)
+ fixup_cum_window_demand(rq, demand);
+ }
p->ravg.demand = demand;
@@ -792,6 +841,17 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+ /*
+ * When a task is migrating during the wakeup, adjust
+ * the task's contribution towards cumulative window
+ * demand.
+ */
+ if (p->state == TASK_WAKING &&
+ p->last_sleep_ts >= src_rq->window_start) {
+ fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+ fixup_cum_window_demand(dest_rq, p->ravg.demand);
+ }
+
if (p->ravg.curr_window) {
src_rq->curr_runnable_sum -= p->ravg.curr_window;
dest_rq->curr_runnable_sum += p->ravg.curr_window;
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
index f56c4da16d0b..de7edac43674 100644
--- a/kernel/sched/walt.h
+++ b/kernel/sched/walt.h
@@ -59,6 +59,6 @@ static inline u64 walt_ktime_clock(void) { return 0; }
#endif /* CONFIG_SCHED_WALT */
-extern unsigned int walt_disabled;
+extern bool walt_disabled;
#endif