From ebf270d24640abda4ddc8061e615facdb9b074d0 Mon Sep 17 00:00:00 2001
From: John Dias
Date: Mon, 6 Aug 2018 14:08:03 -0700
Subject: sched/fair: vruntime should normalize when switching from fair

When rt_mutex_setprio changes a task's scheduling class to RT, we're
seeing cases where the task's vruntime is not updated correctly upon
return to the fair class. Specifically, the following is being
observed:
- task is deactivated while still in the fair class
- task is boosted to RT via rt_mutex_setprio, which changes the task
  to RT and calls check_class_changed.
- check_class_changed leads to detach_task_cfs_rq, at which point the
  vruntime_normalized check sees that the task's state is TASK_WAKING,
  which results in skipping the subtraction of the rq's min_vruntime
  from the task's vruntime
- later, when the prio is deboosted and the task is moved back to the
  fair class, the fair rq's min_vruntime is added to the task's
  vruntime, even though it wasn't subtracted earlier.

The immediate result is inflation of the task's vruntime, giving it
lower priority (starving it if there's enough available work). The
longer-term effect is inflation of all vruntimes, because the task's
vruntime becomes the rq's min_vruntime when the higher-priority tasks
go idle. That leads to a vicious cycle, where the vruntime inflation
is repeatedly doubled.

The change here is to detect when vruntime_normalized is called for a
task that is waking, but waking in another class, and to conclude that
this is a case where vruntime has not been normalized.

Bug: 80502612
Change-Id: If0bb02eb16939ca5e91ef282b7f9119ff68622c4
Signed-off-by: John Dias
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 51443a801af5..3c0a8050b77d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11772,7 +11772,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
 	 * - A task which has been woken up by try_to_wake_up() and
 	 *   waiting for actually being woken up by sched_ttwu_pending().
 	 */
-	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+	if (!se->sum_exec_runtime ||
+	    (p->state == TASK_WAKING && p->sched_class == &fair_sched_class))
 		return true;

 	return false;
-- cgit v1.2.3

From a9314f9d8ad402f17e107f2f4a11636e50301cfa Mon Sep 17 00:00:00 2001
From: Maria Yu
Date: Mon, 15 Apr 2019 12:41:12 +0800
Subject: sched/fair: Allow load bigger task load balance when nr_running is 2

When there are only 2 tasks on the src cpu and the other task is
currently running, allow the bigger-load task to be balanced, since it
is the only task that can be migrated.

Change-Id: I489e9624ba010f9293272a67585e8209a786b787
Signed-off-by: Maria Yu
---
 kernel/sched/fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c0a8050b77d..941424604fdd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8953,7 +8953,17 @@ redo:
 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 			goto next;

-		if ((load / 2) > env->imbalance)
+		/*
+		 * p is not the running task when we get here, so if p is one
+		 * of the 2 tasks on the src cpu rq and not the running one,
+		 * it is the only task that can be balanced.
+		 * Only skip a task whose load is bigger than 2*imbalance
+		 * when other tasks can be balanced instead, or when big
+		 * tasks are being ignored.
+		 */
+		if (((cpu_rq(env->src_cpu)->nr_running > 2) ||
+			(env->flags & LBF_IGNORE_BIG_TASKS)) &&
+			((load / 2) > env->imbalance))
 			goto next;

 		detach_task(p, env);
-- cgit v1.2.3

From eccc8acbe705a20e0911ea776371d84eba53cc8e Mon Sep 17 00:00:00 2001
From: Maria Yu
Date: Fri, 26 Apr 2019 15:20:18 +0800
Subject: sched/fair: Avoid unnecessary active load balance

When finding the busiest group, load balance is already avoided if
only 1 task is running on the src cpu. But different cpus can do a
newly-idle load balance at the same time, so check the src cpu's
nr_running again to avoid an unnecessary active load balance.

An example of the race condition:
1) cpu2 has 2 tasks, so cpu2 rq->nr_running == 2 and
   cfs.h_nr_running == 2.
2) cpu4 and cpu5 do a newly-idle load balance at the same time.
3) cpu4 and cpu5 both see cpu2's sched_load_balance_sg_stats
   sum_nr_run=2, so both see cpu2 as the busiest rq.
4) cpu5 successfully migrates a task from cpu2, so cpu2 has only
   1 task left: cpu2 rq->nr_running == 1 and cfs.h_nr_running == 1.
5) cpu4 goes to no_move because the src cpu (cpu2) now only has
   1 task, which is currently running.
6) cpu4 then reaches this check to decide whether cpu2 needs an
   active load balance.

Change-Id: Ia9539a43e9769c4936f06ecfcc11864984c50c29
Signed-off-by: Maria Yu
---
 kernel/sched/fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 941424604fdd..42f05c742846 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10456,8 +10456,10 @@ static int need_active_balance(struct lb_env *env)
 	 * It's worth migrating the task if the src_cpu's capacity is reduced
 	 * because of other sched_class or IRQs if more capacity stays
 	 * available on dst_cpu.
+	 * Avoid pulling the CFS task if it is the only task running.
 	 */
 	if ((env->idle != CPU_NOT_IDLE) &&
+	    (env->src_rq->nr_running > 1) &&
 	    (env->src_rq->cfs.h_nr_running == 1)) {
 		if ((check_cpu_capacity(env->src_rq, sd)) &&
 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
-- cgit v1.2.3

From 8d8a48aecde5c4be6c57b9108dc22e8e0cd7f235 Mon Sep 17 00:00:00 2001
From: Jeffrey Hugo
Date: Fri, 19 May 2017 23:49:11 -0400
Subject: sched/fair: Fix load_balance() affinity redo path

If load_balance() fails to migrate any tasks because all tasks were
affined, load_balance() removes the source cpu from consideration and
attempts to redo and balance among the new subset of cpus.

There is a bug in this code path where the algorithm considers all
active cpus in the system (minus the source that was just masked out).
This is not valid for two reasons: some active cpus may not be in the
current scheduling domain, and one of the active cpus is dst_cpu.
These cpus should not be considered, as we cannot pull load from them.

Instead of failing out of load_balance(), we may end up redoing the
search with no valid cpus and incorrectly concluding the domain is
balanced. Additionally, if the group_imbalance flag was just set, it
may also be incorrectly unset, thus the flag will not be seen by other
cpus in future load_balance() runs, as that algorithm intends.

Fix the check by removing cpus not in the current domain and the
dst_cpu from consideration, thus limiting the evaluation to valid
remaining cpus from which load might be migrated.
Co-authored-by: Austin Christ Co-authored-by: Dietmar Eggemann Signed-off-by: Jeffrey Hugo Tested-by: Tyler Baicar Change-Id: Ife6701c9c62e7155493d9db9398f08c4474e94b3 --- kernel/sched/fair.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42f05c742846..a2f52c35c76a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10697,7 +10697,24 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * dst_cpu is not a valid busiest cpu in the following + * check since load cannot be pulled from dst_cpu to be + * put on dst_cpu. + */ + cpumask_clear_cpu(env.dst_cpu, cpus); + /* + * Go back to "redo" iff the load-balance cpumask + * contains other potential busiest cpus for the + * current sched domain. + */ + if (cpumask_intersects(cpus, sched_domain_span(env.sd))) { + /* + * Now that the check has passed, reenable + * dst_cpu so that load can be calculated on + * it in the redo path. + */ + cpumask_set_cpu(env.dst_cpu, cpus); env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; -- cgit v1.2.3 From c0fa7577022c4169e1aaaf1bd9e04f63d285beb2 Mon Sep 17 00:00:00 2001 From: Ethan Chen Date: Sat, 20 Jan 2018 16:35:53 -0800 Subject: sched/walt: Re-add code to allow WALT to function Change-Id: Ieb1067c5e276f872ed4c722b7d1fabecbdad87e7 --- kernel/sched/fair.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2f52c35c76a..08e608a04f5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,12 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5961,6 +5967,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; @@ -5969,6 +5976,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -6005,6 +6013,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); if (energy_aware() && !se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -6042,6 +6051,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ @@ -6062,6 +6072,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int 
flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -7098,6 +7109,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -7656,6 +7673,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (new_util > capacity_orig) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif + /* * Case A) Latency sensitive tasks * -- cgit v1.2.3 From ebdb82f7b34aeab34623d7a5e4dd673fc2807842 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 20 Jul 2017 23:46:56 -0700 Subject: sched/fair: Skip frequency updates if CPU about to idle If CPU is about to idle, prevent a frequency update. With the number of schedutil governor wake ups are reduced by more than half on a test playing bluetooth audio. Test: sugov wake ups drop by more than half when playing music with screen off (476 / 1092) Bug: 64689959 Change-Id: I400026557b4134c0ac77f51c79610a96eb985b4a Signed-off-by: Joel Fernandes --- kernel/sched/fair.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08e608a04f5b..ee5f8e686a31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4332,6 +4332,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) */ #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 +#define SKIP_CPUFREQ 0x4 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct sched_entity *se, int flags) @@ -4352,7 +4353,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ)); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -4528,6 +4529,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define SKIP_CPUFREQ 0x3 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -4750,6 +4752,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int update_flags; + /* * Update run-time statistics of the 'current'. */ @@ -4763,7 +4767,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. 
*/ - update_load_avg(se, UPDATE_TG); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -6038,6 +6047,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + if (task_sleep && rq->nr_running == 1) + flags |= DEQUEUE_IDLE; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6078,7 +6090,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(se, UPDATE_TG | (flags & DEQUEUE_IDLE)); update_cfs_shares(se); } -- cgit v1.2.3 From b775cb29f66382f04ba4c1e7ad385081a020269b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 27 Feb 2018 15:29:09 +0000 Subject: ANDROID: Move schedtune en/dequeue before schedutil update triggers CPU rq util updates happen when rq signals are updated as part of enqueue and dequeue operations. Doing these updates triggers a call to the registered util update handler, which takes schedtune boosting into account. Enqueueing the task in the correct schedtune group after this happens means that we will potentially not see the boost for an entire throttle period. Move the enqueue/dequeue operations for schedtune before the signal updates which can trigger OPP changes. Change-Id: I4236e6b194bc5daad32ff33067d4be1987996780 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 63 +++++++++++++++++++++++++---------------------------- 1 file changed, 30 insertions(+), 33 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee5f8e686a31..ef6046d3a016 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5951,6 +5951,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); #endif /* @@ -6001,26 +6020,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - - /* - * Update SchedTune accounting. - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - * - * We do it also in the case where we enqueue a throttled task; - * we could argue that a throttled task should not boost a CPU, - * however: - * a) properly implementing CPU boosting considering throttled - * tasks will increase a lot the complexity of the solution - * b) it's not easy to quantify the benefits introduced by - * such a more complex solution. 
- * Thus, for the time being we go for the simple solution and boost - * also for throttled RQs. - */ - schedtune_enqueue_task(p, cpu_of(rq)); - if (energy_aware() && !se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && @@ -6050,6 +6049,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_sleep && rq->nr_running == 1) flags |= DEQUEUE_IDLE; +#ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6099,19 +6109,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dec_rq_hmp_stats(rq, p, 1); } -#ifdef CONFIG_SMP - - /* - * Update SchedTune accounting - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - */ - schedtune_dequeue_task(p, cpu_of(rq)); - -#endif /* CONFIG_SMP */ - hrtick_update(rq); } -- cgit v1.2.3 From 891a63210e1d65ed226050ce7921dcec210a671a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 18 Aug 2017 13:29:46 -0700 Subject: sched/fair: Fix issue where frequency update not skipped This patch fixes one of the infrequent conditions in commit 54b6baeca500 ("sched/fair: Skip frequency updates if CPU about to idle") where we could have skipped a frequency update. The fix is to use the correct flag which skips freq updates. Note that this is a rare issue (can show up only during CFS throttling) and even then we just do an additional frequency update which we were doing anyway before the above patch. Bug: 64689959 Change-Id: I0117442f395cea932ad56617065151bdeb9a3b53 Signed-off-by: Joel Fernandes --- kernel/sched/fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef6046d3a016..2015cf048a44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4529,7 +4529,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 -#define SKIP_CPUFREQ 0x3 +#define SKIP_CPUFREQ 0x0 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -6092,6 +6092,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { + int update_flags; + cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); @@ -6100,7 +6102,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG | (flags & DEQUEUE_IDLE)); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); update_cfs_shares(se); } -- cgit v1.2.3 From 82d3f23d6dc53c564c1c8550f9ee6ac72f85c004 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:33 +0800 Subject: sched/fair: Fix bandwidth timer clock drift condition commit 512ac999d2755d2b7109e996a76b6fb8b888631d upstream. 
I noticed that cgroup task groups constantly get throttled even if
they have low CPU usage, and this causes some jitter in the response
time of some of our business containers when enabling CPU quotas.

It's very simple to reproduce:

  mkdir /sys/fs/cgroup/cpu/test
  cd /sys/fs/cgroup/cpu/test
  echo 100000 > cpu.cfs_quota_us
  echo $$ > tasks

then repeat:

  cat cpu.stat | grep nr_throttled  # nr_throttled will increase steadily

After some analysis, we found that cfs_rq::runtime_remaining will be
cleared by expire_cfs_rq_runtime() due to two equal but stale
"cfs_{b|q}->runtime_expires" after the period timer is re-armed.

The current condition used to judge clock drift in
expire_cfs_rq_runtime() is wrong: the two runtime_expires are actually
the same when clock drift happens, so this condition can never hit.
The original design was correctly done by this commit:

  a9cf55b28610 ("sched: Expire invalid runtime")

... but was changed to the current implementation due to its locking
bug.

This patch introduces another way: it adds a new field to both
structures cfs_rq and cfs_bandwidth to record the expiration update
sequence, and uses them to figure out if clock drift happens (true if
they are equal).

Change-Id: I8168fe3b45785643536f289ea823d1a62d9d8ab2
Signed-off-by: Xunlei Pang
Signed-off-by: Peter Zijlstra (Intel)
[alakeshh: backport: Fixed merge conflicts:
 - sched.h: Fix the indentation and order in which the variables are
   declared to match the coding style of the existing code in 4.14.
   Struct members of the same type were declared on separate lines in
   the upstream patch, which has been changed back to having multiple
   members of the same type on the same line. e.g. int a; int b; ->
   int a, b; ]
Signed-off-by: Alakesh Haloi
Reviewed-by: Ben Segall
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: # 4.14.x
Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com
Signed-off-by: Ingo Molnar
Signed-off-by: Greg Kroah-Hartman
---
 kernel/sched/fair.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel/sched/fair.c')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2015cf048a44..63f9ad66cf11 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5068,6 +5068,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
 	now = sched_clock_cpu(smp_processor_id());
 	cfs_b->runtime = cfs_b->quota;
 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+	cfs_b->expires_seq++;
 }

 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -5090,6 +5091,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 	u64 amount = 0, min_amount, expires;
+	int expires_seq;

 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -5106,6 +5108,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 			cfs_b->idle = 0;
 		}
 	}
+	expires_seq = cfs_b->expires_seq;
 	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);

@@ -5115,8 +5118,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	 * spread between our sched_clock and the one on which runtime was
 	 * issued.
*/ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -5142,12 +5147,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { -- cgit v1.2.3 From b933e4d37bc023d27c7394626669bae0a201da52 Mon Sep 17 00:00:00 2001 From: Dave Chiluk Date: Tue, 23 Jul 2019 11:44:26 -0500 Subject: sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream. It has been observed, that highly-threaded, non-cpu-bound applications running under cpu.cfs_quota_us constraints can hit a high percentage of periods throttled while simultaneously not consuming the allocated amount of quota. This use case is typical of user-interactive non-cpu bound applications, such as those running in kubernetes or mesos when run on multiple cpu cores. This has been root caused to cpu-local run queue being allocated per cpu bandwidth slices, and then not fully using that slice within the period. At which point the slice and quota expires. This expiration of unused slice results in applications not being able to utilize the quota for which they are allocated. The non-expiration of per-cpu slices was recently fixed by 'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")'. Prior to that it appears that this had been broken since at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")' which was introduced in v3.16-rc1 in 2014. That added the following conditional which resulted in slices never being expired. if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; Because this was broken for nearly 5 years, and has recently been fixed and is now being noticed by many users running kubernetes (https://github.com/kubernetes/kubernetes/issues/67577) it is my opinion that the mechanisms around expiring runtime should be removed altogether. This allows quota already allocated to per-cpu run-queues to live longer than the period boundary. This allows threads on runqueues that do not use much CPU to continue to use their remaining slice over a longer period of time than cpu.cfs_period_us. However, this helps prevent the above condition of hitting throttling while also not fully utilizing your cpu quota. This theoretically allows a machine to use slightly more than its allotted quota in some periods. This overflow would be bounded by the remaining quota left on each per-cpu runqueueu. This is typically no more than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will change nothing, as they should theoretically fully utilize all of their quota in each period. 
For user-interactive tasks as described above this provides a much better user/application experience as their cpu utilization will more closely match the amount they requested when they hit throttling. This means that cpu limits no longer strictly apply per period for non-cpu bound applications, but that they are still accurate over longer timeframes. This greatly improves performance of high-thread-count, non-cpu bound applications with low cfs_quota_us allocation on high-core-count machines. In the case of an artificial testcase (10ms/100ms of quota on 80 CPU machine), this commit resulted in almost 30x performance improvement, while still maintaining correct cpu quota restrictions. That testcase is available at https://github.com/indeedeng/fibtest. Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition") Change-Id: I7d7a39fb554ec0c31f9381f492165f43c70b3924 Signed-off-by: Dave Chiluk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Cc: Ingo Molnar Cc: John Hammond Cc: Jonathan Corbet Cc: Kyle Anderson Cc: Gabriel Munos Cc: Peter Oskolkov Cc: Cong Wang Cc: Brendan Gregg Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 70 ++++++----------------------------------------------- 1 file changed, 7 insertions(+), 63 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63f9ad66cf11..266fc95f6c0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5067,8 +5067,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5090,8 +5088,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -5108,61 +5105,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. 
- * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. - */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; @@ -5396,8 +5349,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cpu_temp(cpu_of(rq))); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -5418,7 +5370,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -5443,7 +5394,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -5471,8 +5422,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. 
This can potentially race with an unthrottled group @@ -5485,8 +5434,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -5563,8 +5511,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -5596,7 +5543,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -5613,7 +5559,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -5622,11 +5567,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } -- cgit v1.2.3
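The reproduction steps quoted in the two bandwidth patches above lend themselves to a small userspace monitor. What follows is a minimal sketch, not part of the patch series: it assumes a cgroup-v1 cpu controller mounted at /sys/fs/cgroup/cpu and the "test" group created by the steps in the clock-drift commit message, and it simply polls cpu.stat once per second to report how many enforcement periods ended up throttled, which is the metric both commit messages reason about.

/*
 * cpu_stat_watch.c - poll a cgroup's cpu.stat and report throttling.
 *
 * The path and group name below are assumptions taken from the
 * reproduction steps in the commit message, not from the patches.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define CPU_STAT_PATH "/sys/fs/cgroup/cpu/test/cpu.stat"

static int read_cpu_stat(unsigned long long *periods,
			 unsigned long long *throttled)
{
	char key[64];
	unsigned long long val;
	FILE *f = fopen(CPU_STAT_PATH, "r");

	if (!f)
		return -1;

	/* cpu.stat is "name value" pairs: nr_periods, nr_throttled, ... */
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "nr_periods"))
			*periods = val;
		else if (!strcmp(key, "nr_throttled"))
			*throttled = val;
	}
	fclose(f);
	return 0;
}

int main(void)
{
	unsigned long long periods = 0, throttled = 0;

	for (;;) {
		if (read_cpu_stat(&periods, &throttled)) {
			perror(CPU_STAT_PATH);
			return 1;
		}
		printf("nr_periods=%llu nr_throttled=%llu (%.1f%% of periods throttled)\n",
		       periods, throttled,
		       periods ? 100.0 * throttled / periods : 0.0);
		sleep(1);
	}
	return 0;
}

For a non-cpu-bound workload such as the fibtest case cited above, the throttled percentage reported here should drop noticeably once the final patch is applied, while a fully cpu-bound workload should look roughly the same before and after.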