From d34d2c97ae56961ca73fc8704aec2304bb820668 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Jun 2017 12:12:49 +0100 Subject: cpufreq/sched: Use cpu max freq rather than policy max When we convert capacity into frequency, we used policy->max to get the max freq of the cpu. Since this can be changed by userspace policy or thermal events, we are potentially asking for a lower frequency than the utilization demands. Change over to using cpuinfo.max which is the max freq supported by that cpu rather than the currently-chosen max. Frequency granted still honours the max policy. Tested by setting a userspace policy and observing the relevant vars in a trace. In this instance, we ask for around 1ghz instead of 620MHz. freq_new=1013512 unfixed_freq_new=624487 capacity=546 cpuinfo_max=1900800 policy_max=1171200 Change-Id: I8c5694db42243c6fb78bb9be9046b06ac81295e7 Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 6ffb23adbcef..ec0aed7a8f96 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -202,7 +202,7 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) -- cgit v1.2.3 From 774481506a83d305353330e44b8e0f6ad80a34a8 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 15 Sep 2017 08:25:32 +0800 Subject: cpufreq: schedutil: clamp util to CPU maximum capacity The code is to get the CPU util by accumulate different scheduling classes and when the total util value is larger than CPU capacity then it clamps util to CPU maximum capacity. So we can get correct util value when use PELT signal but if with WALT signal it misses to clamp util value. On the other hand, WALT doesn't accumulate different class utilization but it needs to applying boost margin for WALT signal the CPU util value is possible to be larger than CPU capacity; so this patch is to always clamp util to CPU maximum capacity. Change-Id: I05481ddbf20246bb9be15b6bd21b6ec039015ea8 Signed-off-by: Leo Yan --- kernel/sched/cpufreq_schedutil.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28977799017b..d3765f0cb699 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -216,8 +216,9 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) *util = boosted_cpu_util(cpu); if (likely(use_pelt())) - *util = min((*util + rt), max_cap); + *util = *util + rt; + *util = min(*util, max_cap); *max = max_cap; } -- cgit v1.2.3 From 4f8767d1ca307ab8c03889534a3c5a525ce7485d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 25 Oct 2017 17:25:20 +0100 Subject: ANDROID: sched/fair: Select correct capacity state for energy_diff The util returned from group_max_util is not capped at the max util present in the group, so it can be larger than the capacity stored in the array. Ensure that when this happens, we always use the last entry in the array to fetch energy from. Tested with synthetics on Juno board. 
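For illustration only (not part of the patch), the out-of-bounds hazard and the fix are easy to see in a standalone sketch of the selection loop, with simplified names and types:

struct cap_state_sketch { unsigned long cap, power; };

/*
 * With util not capped to the group's maximum, "break when cap >= util"
 * can fall off the end of the table: the loop then finishes with
 * idx == nr, and a cap_states[idx] lookup in the caller reads past the
 * array. Defaulting to the last (highest) entry keeps the energy lookup
 * inside the table.
 */
static int pick_cap_idx(const struct cap_state_sketch *cs, int nr,
                        unsigned long util)
{
        int found = nr - 1;     /* default: highest capacity state */
        int idx;

        for (idx = 0; idx < nr; idx++) {
                if (cs[idx].cap >= util) {
                        found = idx;
                        break;
                }
        }
        return found;
}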
Bug: 38159576 Change-Id: I89fb52fb7e68fa3e682e308acc232596672d03f7 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1df3873b6fd..5cac6a77b2bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5384,17 +5384,20 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy * const sge) { - int idx; + int idx, max_idx = sge->nr_cap_states - 1; unsigned long util = group_max_util(eenv); + /* default is max_cap if we don't find a match */ + eenv->cap_idx = max_idx; + for (idx = 0; idx < sge->nr_cap_states; idx++) { - if (sge->cap_states[idx].cap >= util) + if (sge->cap_states[idx].cap >= util) { + eenv->cap_idx = idx; break; + } } - eenv->cap_idx = idx; - - return idx; + return eenv->cap_idx; } static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) -- cgit v1.2.3 From 0f85c0954be46bbd36960191daa447ad86b98f0b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:09 +0100 Subject: sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task() commit 18f649ef344127ef6de23a5a4272dbe2fdb73dde upstream. The PF_EXITING check in task_wants_autogroup() is no longer needed. Remove it, but see the next patch. However the comment is correct in that autogroup_move_group() must always change task_group() for every thread so the sysctl_ check is very wrong; we can race with cgroups and even sys_setsid() is not safe because a task running with task_group() == ag->tg must participate in refcounting: int main(void) { int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY); assert(sctl > 0); if (fork()) { wait(NULL); // destroy the child's ag/tg pause(); } assert(pwrite(sctl, "1\n", 2, 0) == 2); assert(setsid() > 0); if (fork()) pause(); kill(getppid(), SIGKILL); sleep(1); // The child has gone, the grandchild runs with kref == 1 assert(pwrite(sctl, "0\n", 2, 0) == 2); assert(setsid() > 0); // runs with the freed ag/tg for (;;) sleep(1); return 0; } crashes the kernel. It doesn't really need sleep(1), it doesn't matter if autogroup_move_group() actually frees the task_group or this happens later. Reported-by: Vern Lovejoy Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Sumit Semwal [sumits: submit to 4.4 LTS, post testing on Hikey] Signed-off-by: Greg Kroah-Hartman --- kernel/sched/auto_group.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601ddf7..8620fd01b3d0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). 
*/ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } -- cgit v1.2.3 From fac311be26e5af64612c386f5a041984fe7c59a2 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 25 Apr 2017 10:37:58 +0100 Subject: cpufreq/sched: Consider max cpu capacity when choosing frequencies When using schedfreq on cpus with max capacity significantly smaller than 1024, the tick update uses non-normalised capacities - this leads to selecting an incorrect OPP as we were scaling the frequency as if the max capacity achievable was 1024 rather than the max for that particular cpu or group. This could result in a cpu being stuck at the lowest OPP and unable to generate enough utilisation to climb out if the max capacity is significantly smaller than 1024. Instead, normalize the capacity to be in the range 0-1024 in the tick so that when we later select a frequency, we get the correct one. Also comments updated to be clearer about what is needed. Change-Id: Id84391c7ac015311002ada21813a353ee13bee60 Signed-off-by: Chris Redpath --- kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83f7c682032b..9cf530a6123e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2987,7 +2987,9 @@ static void sched_freq_tick_pelt(int cpu) * utilization and to harm its performance the least, request * a jump to a higher OPP as soon as the margin of free capacity * is impacted (specified by capacity_margin). + * Remember CPU utilization in sched_capacity_reqs should be normalised. */ + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, cpu_utilization); } @@ -3014,7 +3016,9 @@ static void sched_freq_tick_walt(int cpu) * It is likely that the load is growing so we * keep the added margin in our request as an * extra boost. + * Remember CPU utilization in sched_capacity_reqs should be normalised. */ + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, cpu_utilization); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cac6a77b2bc..e6b2461d07d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4671,7 +4671,7 @@ static void update_capacity_of(int cpu) if (!sched_freq()) return; - /* Convert scale-invariant capacity to cpu. */ + /* Normalize scale-invariant capacity to cpu. 
*/ req_cap = boosted_cpu_util(cpu); req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); @@ -4864,7 +4864,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (rq->cfs.nr_running) update_capacity_of(cpu_of(rq)); else if (sched_freq()) - set_cfs_cpu_capacity(cpu_of(rq), false, 0); + set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */ } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f3d89faacdc..5256f05a26e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1621,6 +1621,10 @@ static inline bool sched_freq(void) return static_key_false(&__sched_freq); } +/* + * sched_capacity_reqs expects capacity requests to be normalised. + * All capacities should sum to the range of 0-1024. + */ DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); void update_cpu_capacity_request(int cpu, bool request); -- cgit v1.2.3 From 792510d9b392b392c763c5e395f046d5c246c66e Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 30 May 2017 14:51:53 +0100 Subject: BACKPORT: sched/fair: Make it possible to account fair load avg consistently While set_task_rq_fair() is introduced in mainline by commit ad936d8658fd ("sched/fair: Make it possible to account fair load avg consistently"), the function results to be introduced here by the backport of commit 09a43ace1f98 ("sched/fair: Propagate load during synchronous attach/detach"). The problem (apart from the confusion introduced by the backport) is actually that set_task_rq_fair() is currently not called at all. Fix the problem by backporting again commit ad936d8658fd ("sched/fair: Make it possible to account fair load avg consistently"). Original change log: The current code accounts for the time a task was absent from the fair class (per ATTACH_AGE_LOAD). However it does not work correctly when a task got migrated or moved to another cgroup while outside of the fair class. This patch tries to address that by aging on migration. We locklessly read the 'last_update_time' stamp from both the old and new cfs_rq, ages the load upto the old time, and sets it to the new time. These timestamps should in general not be more than 1 tick apart from one another, so there is a definite bound on things. 
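Since the fair.c body of set_task_rq_fair() is not part of this hunk (it already landed via the backport of 09a43ace1f98 mentioned above), here is a condensed sketch of what the now-wired-up helper does, ignoring the lockless 64-bit read loop; illustrative only, not the exact backported code:

static void set_task_rq_fair_sketch(struct sched_entity *se,
                                    struct cfs_rq *prev, struct cfs_rq *next)
{
        u64 p_last_update_time, n_last_update_time;

        if (!se->avg.last_update_time || !prev)
                return;

        p_last_update_time = prev->avg.last_update_time;
        n_last_update_time = next->avg.last_update_time;

        /* Age the entity's load up to the old cfs_rq's timestamp... */
        __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
                          &se->avg, 0, 0, NULL);
        /* ...then continue accounting on the new cfs_rq's timeline. */
        se->avg.last_update_time = n_last_update_time;
}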
Signed-off-by: Byungchul Park [ Changelog, a few edits and !SMP build fix ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445616981-29904-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry-picked from ad936d8658fd348338cb7d42c577dac77892b074) Signed-off-by: Juri Lelli Signed-off-by: Chris Redpath Change-Id: I17294ab0ada3901d35895014715fd60952949358 Signed-off-by: Brendan Jackman --- kernel/sched/core.c | 4 ++++ kernel/sched/sched.h | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9cf530a6123e..e3242eed60e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2173,6 +2173,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); walt_init_new_task_load(p); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5256f05a26e8..1d52ca8a613c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,7 +335,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -987,6 +995,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif -- cgit v1.2.3 From 6b02ab68ec78f41f647ebc32b1c37bd27fdb034e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 18:51:48 +0200 Subject: UPSTREAM: sched/fair: Fix and optimize the fork() path The task_fork_fair() callback already calls __set_task_cpu() and takes rq->lock. If we move the sched_class::task_fork callback in sched_fork() under the existing p->pi_lock, right after its set_task_cpu() call, we can avoid doing two such calls and omit the IRQ disabling on the rq->lock. Change to __set_task_cpu() to skip the migration bits, this is a new task, not a migration. Similarly, make wake_up_new_task() use __set_task_cpu() for the same reason, the task hasn't actually migrated as it hasn't ever ran. This cures the problem of calling migrate_task_rq_fair(), which does remove_entity_from_load_avg() on tasks that have never been added to the load avg to begin with. This bug would result in transiently messed up load_avg values, averaged out after a few dozen milliseconds. This is probably the reason why this bug was not found for such a long time. 
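The effective ordering change, summarised as a sketch (the real hunks follow below):

/*
 *  before:  sched_fork()
 *             ->task_fork()           takes rq->lock, irqs disabled
 *           wake_up_new_task()
 *             set_task_cpu()          -> migrate_task_rq_fair() on an
 *                                        se that was never attached
 *
 *  after:   sched_fork()
 *             lock(p->pi_lock)
 *             __set_task_cpu(p, cpu)  no migration callbacks
 *             ->task_fork()           rq->lock taken without irqsave
 *             unlock(p->pi_lock)
 *           wake_up_new_task()
 *             __set_task_cpu(...)     likewise, no migration callbacks
 */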
Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit e210bffd39d01b649c94b820c28ff112673266dd) Change-Id: Icbddbaa6e8c1071859673d8685bc3f38955cf144 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 16 +++++++++++----- kernel/sched/fair.c | 27 ++++++--------------------- 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e3242eed60e5..7e696bb3291a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2300,9 +2300,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2311,7 +2308,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2453,8 +2456,11 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p); post_init_entity_util_avg(&p->se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6b2461d07d6..d42a54f7b2e3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4712,7 +4712,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -9901,31 +9901,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. 
- */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -9938,8 +9924,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* -- cgit v1.2.3 From 138a670d97ca84a4cab83515a0920d4ef8eeb22a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Jun 2016 13:38:55 +0200 Subject: BACKPORT: sched/cgroup: Fix cpu_cgroup_fork() handling A new fair task is detached and attached from/to task_group with: cgroup_post_fork() ss->fork(child) := cpu_cgroup_fork() sched_move_task() task_move_group_fair() Which is wrong, because at this point in fork() the task isn't fully initialized and it cannot 'move' to another group, because its not attached to any group as yet. In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we can just call this small part directly instead sched_move_task(). And the task doesn't really migrate because it is not yet attached so we need the following sequence: do_fork() sched_fork() __set_task_cpu() cgroup_post_fork() set_task_rq() # set task group and runqueue wake_up_new_task() select_task_rq() can select a new cpu __set_task_cpu post_init_entity_util_avg attach_task_cfs_rq() activate_task enqueue_task This patch makes that happen. BACKPORT: Difference from original commit: - Removed use of DEQUEUE_MOVE (which isn't defined in 4.4) in dequeue_task flags - Replaced "struct rq_flags rf" with "unsigned long flags". Signed-off-by: Vincent Guittot [ Added TASK_SET_GROUP to set depth properly. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit ea86cb4b7621e1298a37197005bf0abcc86348d4) Change-Id: I8126fd923288acf961218431ffd29d6bf6fd8d72 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 63 ++++++++++++++++++++++++++++++++++------------------ kernel/sched/fair.c | 23 ++++++++++++++++++- kernel/sched/sched.h | 5 ++++- 3 files changed, 67 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e696bb3291a..0b5c588929e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8188,27 +8188,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. 
- */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -8221,11 +8203,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8662,9 +8670,20 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task, void *private) { - sched_move_task(task); + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(task, &flags); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &flags); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d42a54f7b2e3..2afd81bfda99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10116,6 +10116,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? 
se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -10128,6 +10136,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -10354,7 +10375,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d52ca8a613c..8d3712107e61 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1293,8 +1293,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group)(struct task_struct *p, int type); #endif }; -- cgit v1.2.3 From 97cb74f48599ca1ae6c17955882f167a4c3aaad2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 13:29:28 +0200 Subject: BACKPORT: sched/fair: Fix PELT integrity for new tasks Vincent and Yuyang found another few scenarios in which entity tracking goes wobbly. The scenarios are basically due to the fact that new tasks are not immediately attached and thereby differ from the normal situation -- a task is always attached to a cfs_rq load average (such that it includes its blocked contribution) and are explicitly detached/attached on migration to another cfs_rq. Scenario 1: switch to fair class p->sched_class = fair_class; if (queued) enqueue_task(p); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() check_class_changed() switched_from() (!fair) switched_to() (fair) switched_to_fair() attach_entity_load_avg() If @p is a new task that hasn't been fair before, it will have !last_update_time and, per the above, end up in attach_entity_load_avg() _twice_. Scenario 2: change between cgroups sched_move_group(p) if (queued) dequeue_task() task_move_group_fair() detach_task_cfs_rq() detach_entity_load_avg() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() if (queued) enqueue_task(); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() Similar as with scenario 1, if @p is a new task, it will have !load_update_time and we'll end up in attach_entity_load_avg() _twice_. Furthermore, notice how we do a detach_entity_load_avg() on something that wasn't attached to begin with. As stated above; the problem is that the new task isn't yet attached to the load tracking and thereby violates the invariant assumption. This patch remedies this by ensuring a new task is indeed properly attached to the load tracking on creation, through post_init_entity_util_avg(). Of course, this isn't entirely as straightforward as one might think, since the task is hashed before we call wake_up_new_task() and thus can be poked at. We avoid this by adding TASK_NEW and teaching cpu_cgroup_can_attach() to refuse such tasks. 
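The serialization this relies on can be sketched as follows (simplified from the hunks below):

/*
 *   wake_up_new_task()                 cpu_cgroup_can_attach()
 *   ------------------                 -----------------------
 *   lock(p->pi_lock)                   lock(p->pi_lock)
 *   p->state = TASK_RUNNING;           if (p->state == TASK_NEW)
 *   ...                                        ret = -EINVAL;
 *   attach to load tracking            unlock(p->pi_lock)
 *   unlock(p->pi_lock)
 *
 * Either the attach observes TASK_NEW and is refused, or it observes a
 * task that has fully passed through wake_up_new_task() and can safely
 * be moved with the usual detach+attach pair.
 */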
.:: BACKPORT Complicated by the fact that mch of the lines changed by the original of this commit were then changed by: df217913e72e sched/fair: Factorize attach/detach entity and then d31b1a66cbe0 sched/fair: Factorize PELT update , which have both already been backported here. Reported-by: Yuyang Du Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 7dc603c9028ea5d4354e0e317e8481df99b06d7e) Change-Id: Ibc59eb52310a62709d49a744bd5a24e8b97c4ae8 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 26 +++++++++++++++++++++++--- kernel/sched/fair.c | 15 ++++++++++----- 2 files changed, 33 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b5c588929e9..31cb76a915a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2259,11 +2259,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2300,6 +2300,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } + init_entity_runnable_average(&p->se); + /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2446,6 +2448,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; walt_init_new_task_load(p); @@ -8690,6 +8693,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8700,8 +8704,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. 
+ */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2afd81bfda99..76af6e25e82e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -766,7 +766,9 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); static void attach_entity_cfs_rq(struct sched_entity *se); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -837,7 +839,7 @@ void post_init_entity_util_avg(struct sched_entity *se) attach_entity_cfs_rq(se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } @@ -3312,11 +3314,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); -- cgit v1.2.3 From c14c9b6e3e489062efe08c364196e76e705b8ea3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 8 Dec 2016 17:56:53 +0100 Subject: UPSTREAM: sched/core: Fix find_idlest_group() for fork During fork, the utilization of a task is init once the rq has been selected because the current utilization level of the rq is used to set the utilization of the fork task. As the task's utilization is still 0 at this step of the fork sequence, it doesn't make sense to look for some spare capacity that can fit the task's utilization. Furthermore, I can see perf regressions for the test: hackbench -P -g 1 because the least loaded policy is always bypassed and tasks are not spread during fork. With this patch and the fix below, we are back to same performances as for v4.8. The fix below is only a temporary one used for the test until a smarter solution is found because we can't simply remove the test which is useful for others benchmarks | @@ -5708,13 +5708,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | | avg_cost = this_sd->avg_scan_cost; | | - /* | - * Due to large variance we need a large fuzz factor; hackbench in | - * particularly is sensitive here. 
| - */ | - if ((avg_idle / 512) < avg_cost) | - return -1; | - | time = local_clock(); | | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { Tested-by: Matt Fleming Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Acked-by: Morten Rasmussen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: kernellwp@gmail.com Cc: umgwanakikbuti@gmail.com Cc: yuyang.du@intel.comc Link: http://lkml.kernel.org/r/1481216215-24651-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit f519a3f1c6b7a990e5aed37a8f853c6ecfdee945) Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath Change-Id: I86cc2ad81af3467c0b2f82b995111f428248baa4 --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76af6e25e82e..4a20c392c0eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6051,13 +6051,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && imbalance*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; -- cgit v1.2.3 From 4863faf5e4df76322999ad06502cbe27d0ec86dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:20:59 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg() Address this rq-clock update bug: WARNING: CPU: 0 PID: 0 at ../kernel/sched/sched.h:797 post_init_entity_util_avg() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: __warn() post_init_entity_util_avg() wake_up_new_task() _do_fork() kernel_thread() rest_init() start_kernel() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 4126bad6717336abe5d666440ae15555563ca53f) Change-Id: Ibe9a73386896377f96483d195e433259218755a5 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 31cb76a915a8..7da9ce69e707 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2466,6 +2466,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); walt_mark_task_starting(p); -- cgit v1.2.3 From bea1b621d952079c4cebc1178ea580c290d0446e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:28:37 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() in detach_task_cfs_rq() Instead of adding the update_rq_clock() all the way at the bottom of the callstack, add one at the top, this to aid later effort to minimize update_rq_lock() calls. 
WARNING: CPU: 0 PID: 1 at ../kernel/sched/sched.h:797 detach_task_cfs_rq() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() detach_task_cfs_rq() switched_from_fair() __sched_setscheduler() _sched_setscheduler() sched_set_stop_task() cpu_stop_create() __smpboot_create_thread.part.2() smpboot_register_percpu_thread_cpumask() cpu_stop_init() do_one_initcall() ? print_cpu_info() kernel_init_freeable() ? rest_init() kernel_init() ret_from_fork() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 80f5c1b84baa8180c3c27b7e227429712cd967b6) Change-Id: Ibffde077d18eabec4c2984158bd9d6d73bd0fb96 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7da9ce69e707..4097f9fa541b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3578,6 +3578,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -4095,6 +4096,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -8685,6 +8687,7 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) rq = task_rq_lock(task, &flags); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &flags); -- cgit v1.2.3 From bab39eb879251debe02e573f5453bd93ad5350bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:35:32 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() call for task_hot() Add the update_rq_clock() call at the top of the callstack instead of at the bottom where we find it missing, this to aid later effort to minimize the number of update_rq_lock() calls. 
WARNING: CPU: 30 PID: 194 at ../kernel/sched/sched.h:797 assert_clock_updated() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() assert_clock_updated.isra.63.part.64() can_migrate_task() load_balance() pick_next_task_fair() __schedule() schedule() worker_thread() kthread() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 3bed5e2166a5e433bf62162f3cd3c5174d335934) Change-Id: Ief5070dcce486535334dcb739ee16b989ea9df42 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4a20c392c0eb..f6e730bcde27 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8998,6 +8998,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -9395,6 +9396,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd, alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { -- cgit v1.2.3 From fd4a95dab858fa1350cc94ce4b435b7295db3ed3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:44:25 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() call in set_user_nice() Address this rq-clock update bug: WARNING: CPU: 30 PID: 195 at ../kernel/sched/sched.h:797 set_next_entity() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() set_next_entity() ? _raw_spin_lock() set_curr_task_fair() set_user_nice.part.85() set_user_nice() create_worker() worker_thread() kthread() ret_from_fork() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 2fb8d36787affe26f3536c3d8ec094995a48037d) Change-Id: I53ba056e72820c7fadb3f022e4ee3b821c0de17d Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4097f9fa541b..ca9d72fa8e66 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3670,6 +3670,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected -- cgit v1.2.3 From 795a6867cfe1ea0bbe967d2b037ae856484556a4 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 7 Aug 2017 17:39:00 +0100 Subject: UPSTREAM: sched/fair: Force balancing on nohz balance if local group has capacity The "goto force_balance" here is intended to mitigate the fact that avg_load calculations can result in bad placement decisions when priority is asymmetrical. The original commit that adds it: fab476228ba3 ("sched: Force balancing on newidle balance if local group has capacity") explains: Under certain situations, such as a niced down task (i.e. nice = -15) in the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and kicks away other tasks because of its large weight. This leads to sub-optimal utilization of the machine. 
Even though the sched group has capacity, it does not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL. A similar but inverted issue also affects ARM big.LITTLE (asymmetrical CPU capacity) systems - consider 8 always-running, same-priority tasks on a system with 4 "big" and 4 "little" CPUs. Suppose that 5 of them end up on the "big" CPUs (which will be represented by one sched_group in the DIE sched_domain) and 3 on the "little" (the other sched_group in DIE), leaving one CPU unused. Because the "big" group has a higher group_capacity its avg_load may not present an imbalance that would cause migrating a task to the idle "little". The force_balance case here solves the problem but currently only for CPU_NEWLY_IDLE balances, which in theory might never happen on the unused CPU. Including CPU_IDLE in the force_balance case means there's an upper bound on the time before we can attempt to solve the underutilization: after DIE's sd->balance_interval has passed the next nohz balance kick will help us out. Change-Id: I807ba5cba0ef1b8bbec02cbcd4755fd32af10135 Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170807163900.25180-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 583ffd99d765 tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6e730bcde27..ae94c1124655 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8707,8 +8707,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; -- cgit v1.2.3 From 0f743ce7458c3c7d4be2367c8ed538069f1472b6 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:57:58 +0100 Subject: BACKPORT: sched/fair: Move select_task_rq_fair slow-path into its own function In preparation for changes that would otherwise require adding a new level of indentation to the while(sd) loop, create a new function find_idlest_cpu() which contains this loop, and rename the existing find_idlest_cpu() to find_idlest_group_cpu(). Code inside the while(sd) loop is unchanged. @new_cpu is added as a variable in the new function, with the same initial value as the @new_cpu in select_task_rq_fair(). 
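The resulting shape of the slow path, as a sketch of the call structure only (the loop body itself is moved unchanged):

/*
 *   select_task_rq_fair()
 *     ...
 *     new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 *
 *   find_idlest_cpu():                                  new wrapper
 *     while (sd) {
 *       group   = find_idlest_group(sd, p, cpu, sd_flag);
 *       new_cpu = find_idlest_group_cpu(group, p, cpu);  the old find_idlest_cpu
 *       ...descend to sd->child and repeat...
 *     }
 *     return new_cpu;
 */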
Change-Id: I9842308cab00dc9cd6c513fc38c609089a1aaaaf Suggested-by: Peter Zijlstra Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-2-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (reworked for eas/cas schedstats added in Android) (cherry-picked commit 18bd1b4bd53a from tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae94c1124655..561d1c505512 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6072,10 +6072,10 @@ skip_spare: } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -6122,6 +6122,65 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; + } + +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = prev_cpu; + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); + } + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); + } + + return new_cpu; } /* @@ -6698,56 +6757,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else { - int wu = sd_flag & SD_BALANCE_WAKE; - int cas_cpu = -1; - - if (wu) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); - schedstat_inc(this_rq(), eas_stats.cas_attempts); - } - - while (sd) { - struct sched_group *group; - int weight; - - if (wu) - schedstat_inc(sd, eas_stats.cas_attempts); - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at 
a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = cas_cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - if (wu && (cas_cpu >= 0)) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_count); - schedstat_inc(this_rq(), eas_stats.cas_count); - } + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); -- cgit v1.2.3 From 529def2ffe532855f570a3a00e9f78ce59c8d84b Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:57:59 +0100 Subject: UPSTREAM: sched/fair: Remove unnecessary comparison with -1 Since commit: 83a0a96a5f26 ("sched/fair: Leverage the idle state info when choosing the "idlest" cpu") find_idlest_group_cpu() (formerly find_idlest_cpu) no longer returns -1, so we can simplify the checking of the return value in find_idlest_cpu(). Change-Id: I98f4b9f178cd93a30408e024e608d36771764c7b Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-3-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from commit e90381eaecf6 in tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 561d1c505512..cf28d82fad41 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6156,7 +6156,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } new_cpu = find_idlest_group_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { + if (new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; -- cgit v1.2.3 From 9c825cf6165c1662ce588a1fe11a912eaeaec928 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:58:00 +0100 Subject: BACKPORT: sched/fair: Fix find_idlest_group when local group is not allowed When the local group is not allowed we do not modify this_*_load from their initial value of 0. That means that the load checks at the end of find_idlest_group cause us to incorrectly return NULL. Fixing the initial values to ULONG_MAX means we will instead return the idlest remote group in that case. BACKPORT: Note 4.4 is missing commit 6b94780e45c1 "sched/core: Use load_avg for selecting idlest group", so we only have to fix this_load instead of this_runnable_load and this_avg_load. 
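For reference, the check that went wrong is the tail of find_idlest_group() as it appears earlier in this series:

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;

With the local group not allowed, this_load kept its initial value of 0, so the comparison was trivially true and we returned NULL ("local group is idlest") even though the local group was never a candidate. Starting from ULONG_MAX instead lets any real remote group win that comparison and be returned as idlest.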
Change-Id: I41f775b0e7c8f5e675c2780f955bb130a563cba7 Signed-off-by: Brendan Jackman Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Vincent Guittot Cc: Josef Bacik Cc: Ingo Molnar Cc: Morten Rasmussen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171005114516.18617-4-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 0d10ab952e99 tip:sched/core) (backport changes described above) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf28d82fad41..dc685b67d08b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5983,7 +5983,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; -- cgit v1.2.3 From 411654764590c071ef55242ffd50c4e74930353b Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:58:01 +0100 Subject: UPSTREAM: sched/fair: Fix usage of find_idlest_group() when no groups are allowed When 'p' is not allowed on any of the CPUs in the sched_domain, we currently return NULL from find_idlest_group(), and pointlessly continue the search on lower sched_domain levels (where 'p' is also not allowed) before returning prev_cpu regardless (as we have not updated new_cpu). Add an explicit check for this case, and add a comment to find_idlest_group(). Now when find_idlest_group() returns NULL, it always means that the local group is allowed and idlest. Change-Id: I5f2648d2f7fb0465677961ecb7473df3d06f0057 Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-5-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 6fee85ccbc76 tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc685b67d08b..0e3a9935515c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5976,6 +5976,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -6136,6 +6138,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p schedstat_inc(this_rq(), eas_stats.cas_attempts); } + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + while (sd) { struct sched_group *group; struct sched_domain *tmp; -- cgit v1.2.3 From 5a8663664915417aec806da2b0bc534d675f413d Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:58:02 +0100 Subject: UPSTREAM: sched/fair: Fix usage of find_idlest_group() when the local group is idlest find_idlest_group() returns NULL when the local group is idlest. 
The caller then continues the find_idlest_group() search at a lower level of the current CPU's sched_domain hierarchy. find_idlest_group_cpu() is not consulted and, crucially, @new_cpu is not updated. This means the search is pointless and we return @prev_cpu from select_task_rq_fair(). This is fixed by initialising @new_cpu to @cpu instead of @prev_cpu. Change-Id: Ie531f5bb29775952bdc4c148b6e974b2f5f32b7a Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-6-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 93f50f90247e tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e3a9935515c..bf7b7a7e778b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6129,7 +6129,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { - int new_cpu = prev_cpu; + int new_cpu = cpu; int wu = sd_flag & SD_BALANCE_WAKE; int cas_cpu = -1; -- cgit v1.2.3 From 2f30db8df4076c54f92ac2ebe26734e27c164fd3 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 1 Aug 2017 15:48:37 +0100 Subject: UPSTREAM: sched/fair: Sync task util before slow-path wakeup We use task_util() in find_idlest_group() via capacity_spare_wake(). This task_util() updated in wake_cap(). However wake_cap() is not the only reason for ending up in find_idlest_group() - we could have been sent there by wake_wide(). So explicitly sync the task util with prev_cpu when we are about to head to find_idlest_group(). We could simply do this at the beginning of select_task_rq_fair() (i.e. irrespective of whether we're heading to select_idle_sibling() or find_idlest_group() & co), but I didn't want to slow down the select_idle_sibling() path more than necessary. Don't do this during fork balancing, we won't need the task_util and we'd just clobber the last_update_time, which is supposed to be 0. Change-Id: I935f4bfdfec3e8b914457aac3387ce264d5fd484 Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Andres Oportus Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20170808095519.10077-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit ea16f0ea6c3d tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf7b7a7e778b..5d8081b77f8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6757,6 +6757,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.2.3 From 2aada289d7be37116f21d70177cb06a929b5d961 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 7 Sep 2017 12:24:45 +0100 Subject: sched/fair: trace energy_diff for non boosted tasks In systems where SchedTune is enabled, we do not report energy diff for non boosted tasks. Let's fix this by always genereting an energy_diff event where however: nrg.delta = 0, since we skip energy normalization payoff = nrg.diff, since the payoff is defined just by the energy difference Change-Id: I9a11ec19b6f56da04147f5ae5b47daf1dd180445 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d8081b77f8a..a17820385ee2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5702,8 +5702,14 @@ energy_diff(struct energy_env *eenv) __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ - if (boost == 0) + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); return eenv->nrg.diff; + } /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); -- cgit v1.2.3 From 4edc5b0e387a906a00cb45af2abf76aa3a930438 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 7 Sep 2017 12:27:56 +0100 Subject: sched/fair: ignore backup CPU when not valid The find_best_target can sometimes not return a valid backup CPU, either because it cannot find one or just becasue it returns prev_cpu as a backup. In these cases we should skip the energy_diff evaluation for the backup CPU. Change-Id: I3787dbdfe74122348dd7a7485b88c4679051bd32 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a17820385ee2..417b373e4074 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6686,7 +6686,9 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync /* No energy saving for target_cpu, try backup */ target_cpu = tmp_backup; eenv.dst_cpu = target_cpu; - if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; -- cgit v1.2.3 From ca42e804464bc9cb81e2ad07fe689c9f8514d5fe Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 17 Jul 2017 15:54:39 +0100 Subject: sched/fair: enforce EAS mode For non latency sensitive tasks the goal is to optimize for energy efficiency. Thus, we should try our best to avoid moving a task on a CPU which is then going to be marked as overutilized. Let's use the capacity_margin metric to verify if a candidate target CPU should be considered without risking to bail out of EAS mode. 
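A rough standalone sketch of the check this patch adds (not part of the patch itself): a candidate CPU is skipped when the task's prospective utilization, inflated by capacity_margin, exceeds the CPU's original capacity. The capacity_margin value of 1280 (1024 * 1.25, i.e. roughly 20% headroom) and the example numbers are assumptions for illustration only; SCHED_CAPACITY_SCALE is 1024 as in the kernel.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL
#define CAPACITY_MARGIN		1280UL	/* assumed: 1024 * 1.25, ~20% headroom */

/* Return 1 if placing this much utilization on a CPU of the given original
 * capacity would exceed the margin, i.e. the CPU would end up flagged as
 * overutilized and EAS mode would be at risk. */
static int would_overutilize(unsigned long new_util, unsigned long capacity_orig)
{
	return (new_util * CAPACITY_MARGIN) > (capacity_orig * SCHED_CAPACITY_SCALE);
}

int main(void)
{
	/* 430 * 1280 > 512 * 1024, so a LITTLE CPU of capacity 512 is skipped */
	printf("util=430 cap=512  -> skip=%d\n", would_overutilize(430, 512));
	/* the same utilization fits comfortably on a big CPU of capacity 1024 */
	printf("util=430 cap=1024 -> skip=%d\n", would_overutilize(430, 1024));
	return 0;
}

With these numbers the 512-capacity CPU is rejected while the 1024-capacity CPU remains an acceptable EAS candidate, which is exactly the "stay in EAS mode" behaviour the commit message describes for !prefer_idle tasks.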
Change-Id: Ib3697106f4073aedf4a6c6ce42bd5d000fa8c007 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 417b373e4074..0784864eeb30 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6469,6 +6469,19 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, continue; } + /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + /* * Case B) Non latency sensitive tasks on IDLE CPUs. * -- cgit v1.2.3 From 5f8b3a757d6561e5668cb09c75b856347263718b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 12 Sep 2017 14:44:24 +0100 Subject: sched/fair: consider task utilization in group_norm_util() The group_norm_util() function is used to compute the normalized utilization of a SG given a certain energy_env configuration. The main client of this function is the energy_diff function when it comes to compute the SG energy for one of the before/after scheduling candidates. Currently, the energy_diff function sets util_delta = 0 when it wants to compute the energy corresponding to the scheduling candidate where the task runs in the previous CPU. This implies that, for the task waking up in the previous CPU we consider only its blocked load tracked by the CPU RQ. However, in case of a medium-big task which is waking up on a long time idle CPU, this blocked load can be already completely decayed. More in general, the current approach is biased towards under-estimating the energy consumption for the "before" scheduling candidate. This patch fixes this by: - always use the cpu_util_wake() to properly get the utilization of a CPU without any (partially decayed) contribution of the waking up task - adding the task utilization to the cpu_util_wake just for the target cpu The "target CPU" is defined by the energy_env to be either the src_cpu or the dst_cpu, depending on which scheduling candidate we are considering. This patch update also the definition of __cpu_norm_util(), which is currently called just by the group_norm_util() function. This allows to simplify the code by using this function just to normalize a specified utilization with respect to a given capacity. This update allows to completely remove any dependency of group_norm_util() from calc_util_delta(). Change-Id: I3b6ec50ce8decb1521faae660e326ab3319d3c82 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 58 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0784864eeb30..9575473f0cf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5297,6 +5297,7 @@ struct energy_env { int util_delta; int src_cpu; int dst_cpu; + int trg_cpu; int energy; int payoff; struct task_struct *task; @@ -5313,11 +5314,14 @@ struct energy_env { } cap; }; +static int cpu_util_wake(int cpu, struct task_struct *p); + /* * __cpu_norm_util() returns the cpu util relative to a specific capacity, - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for - * energy calculations. 
Using the scale-invariant util returned by - * cpu_util() and approximating scale-invariant util by: + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. + * + * Since util is a scale-invariant utilization defined as: * * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time * @@ -5327,10 +5331,8 @@ struct energy_env { * * norm_util = running_time/time ~ util/capacity */ -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) { - int util = __cpu_util(cpu, delta); - if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -5362,28 +5364,37 @@ unsigned long group_max_util(struct energy_env *eenv) /* * group_norm_util() returns the approximated group util relative to it's - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in - * energy calculations. Since task executions may or may not overlap in time in - * the group the true normalized util is between max(cpu_norm_util(i)) and - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The - * latter is used as the estimate as it leads to a more pessimistic energy + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy * estimate (more busy). */ static unsigned long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i, delta; - unsigned long util_sum = 0; unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long util, util_sum = 0; + int cpu; - for_each_cpu(i, sched_group_cpus(sg)) { - delta = calc_util_delta(eenv, i); - util_sum += __cpu_norm_util(i, capacity, delta); + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); } - if (util_sum > SCHED_CAPACITY_SCALE) - return SCHED_CAPACITY_SCALE; - return util_sum; + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } static int find_new_capacity(struct energy_env *eenv, @@ -5575,6 +5586,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. 
eenv specifies the change: utilisation amount, source, and @@ -5590,11 +5603,13 @@ static inline int __energy_diff(struct energy_env *eenv) int diff, margin; struct energy_env eenv_before = { - .util_delta = 0, + .util_delta = task_util(eenv->task), .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .trg_cpu = eenv->src_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, + .task = eenv->task, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -5972,8 +5987,6 @@ boosted_task_util(struct task_struct *task) return util + margin; } -static int cpu_util_wake(int cpu, struct task_struct *p); - static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); @@ -6681,6 +6694,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .src_cpu = prev_cpu, .dst_cpu = target_cpu, .task = p, + .trg_cpu = target_cpu, }; -- cgit v1.2.3 From 3c71cbb896fe15de4f365223af45096c4098c309 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 1 Jun 2017 16:40:22 +0100 Subject: sched/fair: consider task utilization in group_max_util() The group_max_util() function is used to compute the maximum utilization across the CPUs of a certain energy_env configuration. Its main client is the energy_diff function when it needs to compute the SG capacity for one of the before/after scheduling candidates. Currently, the energy_diff function sets util_delta = 0 when it wants to compute the energy corresponding to the scheduling candidate where the task runs in the previous CPU. This implies that, for the task waking up in the previous CPU we consider only its blocked load tracked by the CPU RQ. However, in case of a medium-big task which is waking up on a long time idle CPU, this blocked load can be already completely decayed. More in general, the current approach is biased towards under-estimating the capacity requirements for the "before" scheduling candidate. This patch fixes this by: - always use the cpu_util_wake() to properly get the utilization of a CPU without any (partially decayed) contribution of the waking up task - adding the task utilization to the cpu_util_wake just for the target cpu The "target CPU" is defined by the energy_env to be either the src_cpu or the dst_cpu, depending on which scheduling candidate we are considering. Finally, since this update removes the last usage of calc_util_delta() this function is now safely removed. 
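The following is a self-contained toy model (not part of either patch) of how group_norm_util() and group_max_util() now account for the waking task: every CPU contributes its cpu_util_wake()-style utilization, and only the target CPU additionally carries the task's own utilization. The per-CPU numbers, the capacity and the two-CPU group below are all hypothetical.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

/* __cpu_norm_util()-style busy ratio: util/capacity in [0..1024] */
static unsigned long norm_util(unsigned long util, unsigned long cap)
{
	if (util >= cap)
		return SCHED_CAPACITY_SCALE;
	return (util * SCHED_CAPACITY_SCALE) / cap;
}

int main(void)
{
	unsigned long util_wake[2] = { 300, 150 };	/* per-CPU util, waking task removed */
	unsigned long cap = 512;			/* capacity at the selected cap state */
	unsigned long task_util = 200;			/* utilization of the waking task     */
	int trg_cpu = 0;				/* candidate CPU for the task         */
	unsigned long max_util = 0, norm_sum = 0;

	for (int cpu = 0; cpu < 2; cpu++) {
		unsigned long util = util_wake[cpu];

		if (cpu == trg_cpu)			/* only the target CPU sees the task */
			util += task_util;

		if (util > max_util)			/* -> group_max_util()  */
			max_util = util;
		norm_sum += norm_util(util, cap);	/* -> group_norm_util() */
	}
	if (norm_sum > SCHED_CAPACITY_SCALE)
		norm_sum = SCHED_CAPACITY_SCALE;

	printf("group_max_util ~ %lu, group_norm_util ~ %lu/1024\n", max_util, norm_sum);
	return 0;
}

The sum of normalized utilizations is clamped at SCHED_CAPACITY_SCALE, matching the pessimistic group estimate described above.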
Change-Id: I20ee1bcf40cee6bf6e265fb2d32ef79061ad6ced Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9575473f0cf6..e9919aeacdd5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5339,24 +5339,24 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static int calc_util_delta(struct energy_env *eenv, int cpu) +static unsigned long group_max_util(struct energy_env *eenv) { - if (cpu == eenv->src_cpu) - return -eenv->util_delta; - if (cpu == eenv->dst_cpu) - return eenv->util_delta; - return 0; -} - -static -unsigned long group_max_util(struct energy_env *eenv) -{ - int i, delta; unsigned long max_util = 0; + unsigned long util; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { - delta = calc_util_delta(eenv, i); - max_util = max(max_util, __cpu_util(i, delta)); + max_util = max(max_util, util); } return max_util; -- cgit v1.2.3 From e3ba92c160d3fb21af5b28a21c63851fad21b168 Mon Sep 17 00:00:00 2001 From: Russ Weight Date: Thu, 8 Jun 2017 11:38:59 -0700 Subject: sched/tune: access schedtune_initialized under CGROUP_SCHEDTUNE schedtune_initialized is protected by CONFIG_CGROUP_SCHEDTUNE, but is being used without CONFIG_CGROUP_SCHEDTUNE being defined. Add appropriate ifdefs around the usage of schedtune_initialized to avoid a compilation error when CONFIG_CGROUP_SCHEDTUNE is not defined. Change-Id: Iab79bf053d74db3eeb84c09d71d43b4e39746ed2 Signed-off-by: Russ Weight Signed-off-by: Fei Yang --- kernel/sched/fair.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e9919aeacdd5..6023d9e3a9f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5668,7 +5668,11 @@ static inline int __energy_diff(struct energy_env *eenv) #ifdef CONFIG_SCHED_TUNE struct target_nrg schedtune_target_nrg; + +#ifdef CONFIG_CGROUP_SCHEDTUNE extern bool schedtune_initialized; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + /* * System energy normalization * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], @@ -5679,9 +5683,11 @@ normalize_energy(int energy_diff) { u32 normalized_nrg; +#ifdef CONFIG_CGROUP_SCHEDTUNE /* during early setup, we don't know the extents */ if (unlikely(!schedtune_initialized)) return energy_diff < 0 ? -1 : 1 ; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ #ifdef CONFIG_SCHED_DEBUG { -- cgit v1.2.3 From effc721b3c9b3bd258313450cdfd3d0e644f4d85 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 7 Aug 2017 18:14:37 +0800 Subject: sched/fair: remove useless variable in find_best_target Patch 5680f23f20c7 ("sched/fair: streamline find_best_target heuristics") has reworked function find_best_target, as result the variable "target_util" is useless now. So remove it. 
Change-Id: I5447062419e5828a49115119984fac6cd37db034 Signed-off-by: Leo Yan --- kernel/sched/fair.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6023d9e3a9f5..9bc717ef81f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6350,7 +6350,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; - unsigned long target_util = ULONG_MAX; unsigned long best_active_util = ULONG_MAX; int best_idle_cstate = INT_MAX; struct sched_domain *sd; @@ -6579,7 +6578,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, target_max_spare_cap = capacity_orig - new_util; target_capacity = capacity_orig; - target_util = new_util; target_cpu = i; } -- cgit v1.2.3 From 43bd960dfe728284e1059f11d6c686d23887c1c6 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 3 Feb 2017 11:15:31 -0800 Subject: sched: WALT: account cumulative window demand Energy cost estimation has been a long lasting challenge for WALT because WALT guides CPU frequency based on the CPU utilization of previous window. Consequently it's not possible to know newly waking-up task's energy cost until WALT's end of the current window. The WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum) and 'Cumulative Runnable Average' (cr_avg). They are designed for CPU frequency guidance and task placement but unfortunately both are not suitable for the energy cost estimation. It's because using prev_runnable_sum for energy cost calculation would make us to account CPU and task's energy solely based on activity in the previous window so for example, any task didn't have an activity in the previous window will be accounted as a 'zero energy cost' task. Energy estimation with cr_avg is what energy_diff() relies on at present. However cr_avg can only represent instantaneous picture of energy cost thus for example, if a CPU was fully occupied for an entire WALT window and became idle just before window boundary, and if there is a wake-up, energy_diff() accounts that CPU is a 'zero energy cost' CPU. As a result, introduce a new accounting unit 'Cumulative Window Demand'. The cumulative window demand tracks all the tasks' demands have seen in current window which is neither instantaneous nor actual execution time. Because task demand represents estimated scaled execution time when the task runs a full window, accumulation of all the demands represents predicted CPU load at the end of window. Thus we can estimate CPU's frequency at the end of current WALT window with the cumulative window demand. The use of prev_runnable_sum for the CPU frequency guidance and cr_avg for the task placement have not changed and these are going to be used for both purpose while this patch aims to add an additional statistics. 
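A toy model (not part of the patch) of the bookkeeping rules described above. The names mirror the patch, but there is no real window or time accounting here, only the add/subtract/reset behaviour of the cumulative window demand with made-up demand values.

#include <stdio.h>

struct toy_rq {
	long long cum_window_demand;		/* demand seen in the current window      */
	long long cumulative_runnable_avg;	/* demand of tasks currently on the queue */
};

static void fixup_cum_window_demand(struct toy_rq *rq, long long delta)
{
	rq->cum_window_demand += delta;
	if (rq->cum_window_demand < 0)		/* never allowed to go negative */
		rq->cum_window_demand = 0;
}

static void enqueue(struct toy_rq *rq, long long demand)
{
	rq->cumulative_runnable_avg += demand;
	fixup_cum_window_demand(rq, demand);	/* task now contributes to this window */
}

static void dequeue_sleep(struct toy_rq *rq, long long demand)
{
	rq->cumulative_runnable_avg -= demand;
	/* no fixup: the task did run in this window, keep its contribution */
}

static void dequeue_migrate(struct toy_rq *rq, long long demand)
{
	rq->cumulative_runnable_avg -= demand;
	fixup_cum_window_demand(rq, -demand);	/* contribution moves to the new CPU */
}

static void window_rollover(struct toy_rq *rq)
{
	/* a new window starts from what is still runnable on this CPU */
	rq->cum_window_demand = rq->cumulative_runnable_avg;
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	enqueue(&rq, 300);		/* task A wakes up here              */
	enqueue(&rq, 200);		/* task B wakes up here              */
	enqueue(&rq, 100);		/* task C wakes up here              */
	dequeue_sleep(&rq, 200);	/* B finishes and sleeps mid-window  */
	dequeue_migrate(&rq, 100);	/* C is migrated to another CPU      */
	printf("mid-window:     cra=%lld cum_window_demand=%lld\n",
	       rq.cumulative_runnable_avg, rq.cum_window_demand);
	window_rollover(&rq);
	printf("after rollover: cra=%lld cum_window_demand=%lld\n",
	       rq.cumulative_runnable_avg, rq.cum_window_demand);
	return 0;
}

The point of the example: task B slept before the window ended, so cumulative_runnable_avg drops but cum_window_demand still reflects the work B placed in this window; only the rollover resets it to the currently runnable demand.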
Change-Id: I9908c77ead9973a26dea2b36c001c2baf944d4f5 Signed-off-by: Joonwoo Park --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 12 +++++++++++ kernel/sched/walt.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 73 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca9d72fa8e66..563f316f5330 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2170,6 +2170,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_WALT + p->last_sleep_ts = 0; +#endif + INIT_LIST_HEAD(&p->se.group_node); walt_init_new_task_load(p); @@ -3379,6 +3383,10 @@ static void __sched notrace __schedule(bool preempt) rq->clock_skip_update = 0; if (likely(prev != next)) { +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif rq->nr_switches++; rq->curr = next; ++*switch_count; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8d3712107e61..deba080af845 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -690,6 +690,7 @@ struct rq { u64 cur_irqload; u64 avg_irqload; u64 irqload_ts; + u64 cum_window_demand; #endif /* CONFIG_SCHED_WALT */ @@ -2101,6 +2102,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + #ifdef arch_scale_freq_capacity #ifndef arch_scale_freq_invariant #define arch_scale_freq_invariant() (true) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 441cba01bc04..93f61486f989 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -70,11 +70,28 @@ static unsigned int task_load(struct task_struct *p) return p->ravg.demand; } +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) +{ + rq->cum_window_demand += delta; + if (unlikely((s64)rq->cum_window_demand < 0)) + rq->cum_window_demand = 0; +} + void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { rq->cumulative_runnable_avg += p->ravg.demand; + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + fixup_cum_window_demand(rq, p->ravg.demand); } void @@ -83,6 +100,14 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, { rq->cumulative_runnable_avg -= p->ravg.demand; BUG_ON((s64)rq->cumulative_runnable_avg < 0); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
+ */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + fixup_cum_window_demand(rq, -(s64)p->ravg.demand); } static void @@ -95,6 +120,8 @@ fixup_cumulative_runnable_avg(struct rq *rq, if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", task_load_delta, task_load(p)); + + fixup_cum_window_demand(rq, task_load_delta); } u64 walt_ktime_clock(void) @@ -180,6 +207,8 @@ update_window_start(struct rq *rq, u64 wallclock) nr_windows = div64_u64(delta, walt_ravg_window); rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; + + rq->cum_window_demand = rq->cumulative_runnable_avg; } /* @@ -568,10 +597,20 @@ static void update_history(struct rq *rq, struct task_struct *p, * A throttled deadline sched class task gets dequeued without * changing p->on_rq. Since the dequeue decrements hmp stats * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. */ - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - fixup_cumulative_runnable_avg(rq, p, demand); + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p)) + fixup_cumulative_runnable_avg(rq, p, demand); + else if (rq->curr == p) + fixup_cum_window_demand(rq, demand); + } p->ravg.demand = demand; @@ -792,6 +831,17 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. + */ + if (p->state == TASK_WAKING && + p->last_sleep_ts >= src_rq->window_start) { + fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand); + fixup_cum_window_demand(dest_rq, p->ravg.demand); + } + if (p->ravg.curr_window) { src_rq->curr_runnable_sum -= p->ravg.curr_window; dest_rq->curr_runnable_sum += p->ravg.curr_window; -- cgit v1.2.3 From 38ddcff85af052ad99e4f3f0b6e9659b0ca10dcf Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 7 Aug 2017 15:46:13 +0100 Subject: FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide (from https://patchwork.kernel.org/patch/9895261/) This patch adds a parameter to select_task_rq, sibling_count_hint allowing the caller, where it has this information, to inform the sched_class the number of tasks that are being woken up as part of the same event. The wake_q mechanism is one case where this information is available. select_task_rq_fair can then use the information to detect that it needs to widen the search space for task placement in order to avoid overloading the last-level cache domain's CPUs. * * * The reason I am investigating this change is the following use case on ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep until the last task finishes its X and hits the barrier). 
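For reference, a minimal userspace version of that workload (an editorial sketch, not part of the patch). The thread count, iteration count and busy-loop length are arbitrary; build with -pthread.

#include <pthread.h>
#include <stdio.h>

#define NTHREADS	4
#define ITERS		100

static pthread_barrier_t barrier;

static void *worker(void *arg)
{
	volatile unsigned long sink = 0;

	(void)arg;
	for (int i = 0; i < ITERS; i++) {
		/* "X amount of work": burn some CPU */
		for (unsigned long j = 0; j < 5000000UL; j++)
			sink += j;
		/* sleep until the slowest thread arrives, then all wake together */
		pthread_barrier_wait(&barrier);
	}
	return NULL;
}

int main(void)
{
	pthread_t threads[NTHREADS];

	pthread_barrier_init(&barrier, NULL, NTHREADS);
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(threads[i], NULL);
	pthread_barrier_destroy(&barrier);
	return 0;
}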
On big.LITTLE, the tasks which get a "big" CPU finish faster, and then those CPUs pull over the tasks that are still running: v CPU v ->time-> ------------- 0 (big) 11111 /333 ------------- 1 (big) 22222 /444| ------------- 2 (LITTLE) 333333/ ------------- 3 (LITTLE) 444444/ ------------- Now when task 4 hits the barrier (at |) and wakes the others up, there are 4 tasks with prev_cpu= and 0 tasks with prev_cpu=. want_affine therefore means that we'll only look in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs until the next load balance, something like this: v CPU v ->time-> ------------------------ 0 (big) 11111 /333 31313\33333 ------------------------ 1 (big) 22222 /444|424\4444444 ------------------------ 2 (LITTLE) 333333/ \222222 ------------------------ 3 (LITTLE) 444444/ \1111 ------------------------ ^^^ underutilization So, I'm trying to get want_affine = 0 for these tasks. I don't _think_ any incarnation of the wakee_flips mechanism can help us here because which task is waker and which tasks are wakees generally changes with each iteration. However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the nice property that we know exactly how many tasks are being woken, so we can cheat. It might be a disadvantage that we "widen" _every_ task that's woken in an event, while select_idle_sibling would work fine for the first sd_llc_size - 1 tasks. IIUC, if wake_affine() behaves correctly this trick wouldn't be necessary on SMP systems, so it might be best guarded by the presence of SD_ASYM_CPUCAPACITY? * * * Final note.. In order to observe "perfect" behaviour for this use case, I also had to disable the TTWU_QUEUE sched feature. Suppose during the wakeup above we are working through the work queue and have placed tasks 3 and 2, and are about to place task 1: v CPU v ->time-> -------------- 0 (big) 11111 /333 3 -------------- 1 (big) 22222 /444|4 -------------- 2 (LITTLE) 333333/ 2 -------------- 3 (LITTLE) 444444/ <- Task 1 should go here -------------- If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having instead sent a reschedule IPI) or attached its load to CPU 2. So we are likely to also place task 1 on cpu 2. Disabling TTWU_QUEUE means that we enqueue task 2 before placing task 1, solving this issue. TTWU_QUEUE is there to minimise rq lock contention, and I guess that this contention is less of an issue on big.LITTLE systems since they have relatively few CPUs, which suggests the trade-off makes sense here. 
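A standalone sketch of the resulting wake_wide() decision (not part of the patch): the hint short-circuits the wakee-flip heuristic whenever a single wakeup event wakes more tasks than fit in the LLC domain. The llc_size and flip counts below are hypothetical.

#include <stdio.h>

/* Returns 1 when the wakeup should escape the LLC (affine) domain. */
static int wake_wide(unsigned int waker_flips, unsigned int wakee_flips,
		     unsigned int llc_size, unsigned int sibling_count_hint)
{
	unsigned int master = waker_flips;
	unsigned int slave = wakee_flips;

	/* new with this patch: a wakeup of more tasks than the LLC domain
	 * holds goes wide immediately, regardless of flip history */
	if (sibling_count_hint >= llc_size)
		return 1;

	if (master < slave) {
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	if (slave < llc_size || master < slave * llc_size)
		return 0;
	return 1;
}

int main(void)
{
	/* llc_size = 2, e.g. two big CPUs sharing a cache */
	printf("barrier wake of 4 tasks : %d\n", wake_wide(0, 0, 2, 4));
	printf("1:1 waker/wakee pair    : %d\n", wake_wide(1, 1, 2, 1));
	printf("fan-out waker           : %d\n", wake_wide(64, 8, 2, 1));
	return 0;
}

In the barrier scenario above the hint is the wake_q length, so all the woken tasks go wide even though the flip history alone would have kept them affine to the waker's LLC domain.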
Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Josef Bacik Cc: Joel Fernandes Cc: Mike Galbraith Cc: Matt Fleming --- kernel/sched/core.c | 35 +++++++++++++++++++++++------------ kernel/sched/deadline.c | 3 ++- kernel/sched/fair.c | 21 ++++++++++++++------- kernel/sched/idle_task.c | 3 ++- kernel/sched/rt.c | 3 ++- kernel/sched/sched.h | 3 ++- kernel/sched/stop_task.c | 3 ++- 7 files changed, 47 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 563f316f5330..18d607f9a417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -546,6 +546,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; + head->count++; + get_task_struct(task); /* @@ -555,6 +557,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint); + void wake_up_q(struct wake_q_head *head) { struct wake_q_node *node = head->first; @@ -569,10 +575,10 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing + * try_to_wake_up() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + try_to_wake_up(task, TASK_NORMAL, 0, head->count); put_task_struct(task); } } @@ -1642,12 +1648,14 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, + int sibling_count_hint) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1932,6 +1940,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) + * @sibling_count_hint: A hint at the number of threads that are being woken up + * in this event. * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -1943,7 +1953,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * or @state didn't match @p's state. 
*/ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint) { unsigned long flags; int cpu, success = 0; @@ -2044,8 +2055,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, + sibling_count_hint); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -2127,13 +2138,13 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0); + return try_to_wake_up(p, TASK_NORMAL, 0, 1); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 1); } /* @@ -2467,7 +2478,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p); update_rq_clock(rq); @@ -2905,7 +2916,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3560,7 +3571,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 1); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ab1a9a99660d..cdf2a0b97611 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1070,7 +1070,8 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9bc717ef81f5..b904a023be95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5773,15 +5773,18 @@ energy_diff(struct energy_env *eenv) * being client/server, worker/dispatcher, interrupt source or whatever is * irrelevant, spread criteria is apparent partner count exceeds socket size. */ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -6754,7 +6757,8 @@ unlock: * preempt must be disabled. 
*/ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -6762,9 +6766,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; - if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + want_affine = !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); + } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..33d7003fa1b8 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3715473fd8f8..069f8982867f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1372,7 +1372,8 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index deba080af845..a2d5de8415e1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1261,7 +1261,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags, + int subling_count_hint); void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 61f852d46858..a5567ccd8803 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -12,7 +12,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* stop tasks as never migrate */ } -- cgit v1.2.3 From e79f447a9762f68d6ecf8371bcf3e970bceb662a Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 10 Aug 2017 17:26:20 -0700 Subject: sched: walt: Correct WALT window size initialization It is preferable that WALT window rollover occurs just before a tick, since the tick is an opportune moment to record a complete window's statistics, as well as report those stats to the cpu frequency governor. When CONFIG_HZ results in a TICK_NSEC that isn't a integral number, this requirement may be violated. Account for this by reducing the WALT window size to the nearest multiple of TICK_NSEC. 
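The alignment just described is plain integer arithmetic; the sketch below (not part of the patch) works it through for a few HZ values. TICK_NSEC is approximated here as NSEC_PER_SEC / HZ, which is close enough for illustration.

#include <stdio.h>

#define NSEC_PER_SEC	1000000000UL

int main(void)
{
	unsigned long hz_values[] = { 100, 250, 300, 1000 };
	unsigned long window = 20000000UL;	/* requested 20 ms window */
	int n = sizeof(hz_values) / sizeof(hz_values[0]);

	for (int i = 0; i < n; i++) {
		unsigned long tick_nsec = NSEC_PER_SEC / hz_values[i];
		unsigned long adjusted = (window / tick_nsec) * tick_nsec;

		printf("HZ=%-4lu tick=%8lu ns  window %lu -> %lu ns\n",
		       hz_values[i], tick_nsec, window, adjusted);
	}
	return 0;
}

For HZ=300 the 20 ms request becomes 19,999,998 ns, i.e. exactly six ticks, so the window boundary lands just before a tick as intended; for HZ=100, 250 and 1000 the window is already an integral number of ticks and is left unchanged.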
Commit d368c6faa19b ("sched: walt: fix window misalignment when HZ=300") attempted to do this but WALT isn't using MIN_SCHED_RAVG_WINDOW as the window size and the patch was doing nothing. Also, change the type of 'walt_disabled' to bool and warn if an invalid window size causes WALT to be disabled. Change-Id: Ie3dcfc21a3df4408254ca1165a355bbe391ed5c7 Signed-off-by: Vikram Mulukutla --- kernel/sched/sched.h | 2 +- kernel/sched/walt.c | 46 ++++++++++++++++++++++++++++------------------ kernel/sched/walt.h | 2 +- 3 files changed, 30 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a2d5de8415e1..dd86072eaf4e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1560,7 +1560,7 @@ static inline unsigned long capacity_orig_of(int cpu) extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int walt_ravg_window; -extern unsigned int walt_disabled; +extern bool walt_disabled; /* * cpu_util returns the amount of capacity of a CPU that is used by CFS diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 93f61486f989..8d25ffbe4fed 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -41,25 +41,17 @@ static __read_mostly unsigned int walt_io_is_busy = 0; unsigned int sysctl_sched_walt_init_task_load_pct = 15; -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ -unsigned int __read_mostly walt_disabled = 0; +/* true -> use PELT based load stats, false -> use window-based load stats */ +bool __read_mostly walt_disabled = false; -/* Window size (in ns) */ -__read_mostly unsigned int walt_ravg_window = 20000000; - -/* Min window size (in ns) = 10ms */ -#ifdef CONFIG_HZ_300 /* - * Tick interval becomes to 3333333 due to - * rounding error when HZ=300. + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. 
*/ -#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) -#else -#define MIN_SCHED_RAVG_WINDOW 10000000 -#endif - -/* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +__read_mostly unsigned int walt_ravg_window = + (20000000 / TICK_NSEC) * TICK_NSEC; +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) static unsigned int sync_cpu; static ktime_t ktime_last; @@ -180,10 +172,28 @@ static int exiting_task(struct task_struct *p) static int __init set_walt_ravg_window(char *str) { + unsigned int adj_window; + bool no_walt = walt_disabled; + get_option(&str, &walt_ravg_window); - walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || - walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + /* Adjust for CONFIG_HZ */ + adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + walt_ravg_window); + + walt_ravg_window = adj_window; + + walt_disabled = walt_disabled || + (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + + WARN(!no_walt && walt_disabled, + "invalid window size, disabling WALT\n"); + return 0; } diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index f56c4da16d0b..de7edac43674 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -59,6 +59,6 @@ static inline u64 walt_ktime_clock(void) { return 0; } #endif /* CONFIG_SCHED_WALT */ -extern unsigned int walt_disabled; +extern bool walt_disabled; #endif -- cgit v1.2.3 From a21299785a502ca4b3592a0f977aa1202b105260 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 22 Sep 2016 12:25:56 +0100 Subject: sched/core: Warn if ENERGY_AWARE is enabled but data is missing If the EAS energy model is missing or incomplete, i.e. sd_scs is NULL, then sched_group_energy will return -EINVAL on the assumption that it raced with a CPU hotplug event. In that case, energy_diff will return 0 and the energy-aware wake path will silently fail to trigger any migrations. This case can be triggered by disabling CONFIG_SCHED_MC on existing platforms, so that there are no sched_groups with the SD_SHARE_CAP_STATES flag, so that sd_scs is NULL. Add checks so that a warning is printed if EAS is ever enabled while the necessary data is not present. 
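Conceptually the check boils down to "every possible CPU must expose both energy-model pointers". The sketch below (not part of the patch) models that with plain arrays standing in for the kernel's per-CPU sd_scs and sd_ea pointers; the CPU count and the deliberately missing entry are made up.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define NR_CPUS	4

static void *sd_scs[NR_CPUS];	/* stand-in: domain with SD_SHARE_CAP_STATES */
static void *sd_ea[NR_CPUS];	/* stand-in: highest domain carrying energy data */

static bool have_sched_energy_data(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!sd_scs[cpu] || !sd_ea[cpu])
			return false;
	}
	return true;
}

int main(void)
{
	static int dummy_model;		/* stands in for real energy tables */

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		sd_scs[cpu] = &dummy_model;
		sd_ea[cpu] = &dummy_model;
	}
	sd_ea[3] = NULL;		/* one CPU lost its energy data, e.g. missing topology level */

	if (!have_sched_energy_data())
		fprintf(stderr, "warning: missing sched energy data, EAS would silently do nothing\n");
	return 0;
}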
Change-Id: Id233a510b5ad8b7fcecac0b1d789e730bbfc7c4a Signed-off-by: Brendan Jackman --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18d607f9a417..4f11b84eaf0a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,6 +91,8 @@ #include #include "walt.h" +static bool have_sched_energy_data(void); + DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -193,6 +195,10 @@ static int sched_feat_set(char *cmp) sysctl_sched_features &= ~(1UL << i); sched_feat_disable(i); } else { + if (i == __SCHED_FEAT_ENERGY_AWARE) + WARN(!have_sched_energy_data(), + "Missing sched energy data\n"); + sysctl_sched_features |= (1UL << i); sched_feat_enable(i); } @@ -6649,6 +6655,19 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +static bool have_sched_energy_data(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (!rcu_dereference(per_cpu(sd_scs, cpu)) || + !rcu_dereference(per_cpu(sd_ea, cpu))) + return false; + } + + return true; +} + /* * Check that the per-cpu provided sd energy data is consistent for all cpus * within the mask. @@ -7461,6 +7480,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, } rcu_read_unlock(); + WARN(sched_feat(ENERGY_AWARE) && !have_sched_energy_data(), + "Missing data for energy aware scheduling\n"); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); -- cgit v1.2.3 From a899b9085c8d5d581b214c24dc707466e8cb479f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 27 Oct 2017 13:23:05 -0700 Subject: sched/core: fix have_sched_energy_data build warning have_sched_energy_data is defined only for CONFIG_SMP, so declare it only with CONFIG_SMP. Fixes warning from intel bot: tree: https://android.googlesource.com/kernel/msm android-4.4 head: a21299785a502ca4b3592a0f977aa1202b105260 commit: a21299785a502ca4b3592a0f977aa1202b105260 [5/5] sched/core: Warn if ENERGY_AWARE is enabled but data is missing config: i386-randconfig-x002-201743 (attached as .config) compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901 reproduce: git checkout a21299785a502ca4b3592a0f977aa1202b105260 # save the attached .config to linux build tree make ARCH=i386 All warnings (new ones prefixed by >>): >> kernel//sched/core.c:94:13: warning: 'have_sched_energy_data' used but never defined static bool have_sched_energy_data(void); ^~~~~~~~~~~~~~~~~~~~~~ vim +/have_sched_energy_data +94 kernel//sched/core.c 93 > 94 static bool have_sched_energy_data(void); 95 Change-Id: I266b63ece6fb31d2b5b11821a8244e147ba6d3a4 Signed-off-by: Joel Fernandes --- kernel/sched/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f11b84eaf0a..4e5298a1977e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,9 @@ #include #include "walt.h" +#ifdef CONFIG_SMP static bool have_sched_energy_data(void); +#endif DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -195,10 +197,11 @@ static int sched_feat_set(char *cmp) sysctl_sched_features &= ~(1UL << i); sched_feat_disable(i); } else { +#ifdef CONFIG_SMP if (i == __SCHED_FEAT_ENERGY_AWARE) WARN(!have_sched_energy_data(), "Missing sched energy data\n"); - +#endif sysctl_sched_features |= (1UL << i); sched_feat_enable(i); } -- cgit v1.2.3
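The underlying pattern, shown here as a small standalone example (not part of the patch): a static function that is only defined and used under a configuration option should also be forward-declared only under that option, otherwise configurations without it see a declaration that is never defined. FEATURE_X is a made-up stand-in for CONFIG_SMP.

#include <stdio.h>

#define FEATURE_X 1	/* flip to 0 to see why the matching guard matters */

#if FEATURE_X
static int feature_ready(void);	/* forward declaration, guarded like the definition */
#endif

static void set_flag(int enable)
{
#if FEATURE_X
	if (enable && !feature_ready())
		printf("warning: enabling feature without its data\n");
#endif
	printf("flag=%d\n", enable);
}

#if FEATURE_X
static int feature_ready(void)
{
	return 0;	/* pretend the required data is missing */
}
#endif

int main(void)
{
	set_flag(1);
	return 0;
}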