From 6adb092856e806d91f3fc22dff0ef36506dd0bae Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Tue, 6 Jun 2017 11:58:27 -0700
Subject: sched: cpufreq: Limit governor updates to WALT changes alone

It's not necessary to keep reporting load to the governor if it doesn't
change in a window. Limit updates to when we expect load changes - after
window rollover and when we send updates related to intercluster
migrations.

[beykerykt]: Adapt for HMP

Change-Id: I3232d40f3d54b0b81cfafdcdb99b534df79327bf
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/sched.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 90cc450dff7e..40da1a509ded 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2856,8 +2856,10 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 #ifdef CONFIG_SCHED_HMP
 	/*
 	 * Skip if we've already reported, but not if this is an inter-cluster
-	 * migration
+	 * migration. Also only allow WALT update sites.
 	 */
+	if (!(flags & SCHED_CPUFREQ_WALT))
+		return;
 	if (!sched_disable_window_stats &&
 		(rq->load_reported_window == rq->window_start) &&
 		!(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
--
cgit v1.2.3


From ef3fb04c7df43dfa1793e33f764a2581cda96310 Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Mon, 8 May 2017 19:20:22 -0700
Subject: sched: cpufreq: Use sched_clock instead of rq_clock when updating
 schedutil

rq_clock may not be updated often enough for schedutil or other cpufreq
governors to work correctly when it's passed as the timestamp for a load
report. Use sched_clock instead.

[beykerykt]: Switch to sched_ktime_clock()

Change-Id: I745b727870a31da25f766c2c2f37527f568c20da
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 40da1a509ded..e78a3e867472 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2869,7 +2869,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 
 	data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
 	if (data)
-		data->func(data, rq_clock(rq), flags);
+		data->func(data, sched_ktime_clock(), flags);
 }
 
 static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
--
cgit v1.2.3


From 4dbe44554792f83b785eed187aa1bcd69e84094c Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Tue, 9 May 2017 17:49:47 -0700
Subject: sched: cpufreq: Use per_cpu_ptr instead of this_cpu_ptr when
 reporting load

We need cpufreq_update_util to report load for the CPU corresponding to
the rq that is passed in as an argument, rather than the CPU executing
cpufreq_update_util.
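For illustration only (not part of the original commit message), a minimal
sketch of the distinction this change relies on; report_load_for() is a
hypothetical wrapper used just for this example, while the per-cpu accessors
and cpufreq_update_util_data follow the kernel code shown in the diffs:

	/*
	 * Sketch: this_cpu_ptr() resolves against whichever CPU happens to
	 * run this code, which is wrong when reporting load on behalf of a
	 * remote rq; per_cpu_ptr() takes the rq's own CPU id explicitly.
	 */
	static inline void report_load_for(struct rq *rq, u64 now,
					   unsigned int flags)
	{
		struct update_util_data *data;

		/* look up the governor callback registered for rq's CPU */
		data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
							  cpu_of(rq)));
		if (data)
			data->func(data, now, flags);	/* report the load */
	}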
Change-Id: I8473f230d40928d5920c614760e96fef12745d5a
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e78a3e867472..1196276eddf6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2867,7 +2867,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 	rq->load_reported_window = rq->window_start;
 #endif
 
-	data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+						  cpu_of(rq)));
 	if (data)
 		data->func(data, sched_ktime_clock(), flags);
 }
--
cgit v1.2.3


From c0fa7577022c4169e1aaaf1bd9e04f63d285beb2 Mon Sep 17 00:00:00 2001
From: Ethan Chen
Date: Sat, 20 Jan 2018 16:35:53 -0800
Subject: sched/walt: Re-add code to allow WALT to function

Change-Id: Ieb1067c5e276f872ed4c722b7d1fabecbdad87e7
---
 kernel/sched/sched.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1196276eddf6..284cc86d3ad4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -511,6 +511,10 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
+#ifdef CONFIG_SCHED_WALT
+	u64 cumulative_runnable_avg;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 #ifdef CONFIG_SCHED_HMP
@@ -819,6 +823,7 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SCHED_WALT
+	unsigned int cur_freq;
 	u64 cumulative_runnable_avg;
 	u64 window_start;
 	u64 curr_runnable_sum;
--
cgit v1.2.3


From ebdb82f7b34aeab34623d7a5e4dd673fc2807842 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 20 Jul 2017 23:46:56 -0700
Subject: sched/fair: Skip frequency updates if CPU about to idle

If the CPU is about to idle, prevent a frequency update. With this, the
number of schedutil governor wake ups is reduced by more than half on a
test playing bluetooth audio.

Test: sugov wake ups drop by more than half when playing music with
screen off (476 / 1092)

Bug: 64689959
Change-Id: I400026557b4134c0ac77f51c79610a96eb985b4a
Signed-off-by: Joel Fernandes
---
 kernel/sched/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 284cc86d3ad4..bafa2931c898 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2032,6 +2032,7 @@ static const u32 prio_to_wmult[40] = {
 #define DEQUEUE_SLEEP		0x01
 #define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_IDLE		0x80 /* The last dequeue before IDLE */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
--
cgit v1.2.3


From 82d3f23d6dc53c564c1c8550f9ee6ac72f85c004 Mon Sep 17 00:00:00 2001
From: Xunlei Pang
Date: Wed, 20 Jun 2018 18:18:33 +0800
Subject: sched/fair: Fix bandwidth timer clock drift condition

commit 512ac999d2755d2b7109e996a76b6fb8b888631d upstream.

I noticed that cgroup task groups constantly get throttled even if they
have low CPU usage; this causes some jitters in the response time of
some of our business containers when enabling CPU quotas.
It's very simple to reproduce:

	mkdir /sys/fs/cgroup/cpu/test
	cd /sys/fs/cgroup/cpu/test
	echo 100000 > cpu.cfs_quota_us
	echo $$ > tasks

then repeat:

	cat cpu.stat | grep nr_throttled  # nr_throttled will increase steadily

After some analysis, we found that cfs_rq::runtime_remaining will be
cleared by expire_cfs_rq_runtime() due to two equal but stale
"cfs_{b|q}->runtime_expires" after the period timer is re-armed.

The current condition used to judge clock drift in expire_cfs_rq_runtime()
is wrong: the two runtime_expires are actually the same when clock drift
happens, so this condition can never hit.

The original design was correctly done by this commit:

  a9cf55b28610 ("sched: Expire invalid runtime")

... but was changed to the current implementation due to its locking bug.

This patch introduces another way: it adds a new field to both cfs_rq and
cfs_bandwidth to record the expiration update sequence, and uses them to
figure out whether clock drift has happened (true if they are equal).

Change-Id: I8168fe3b45785643536f289ea823d1a62d9d8ab2
Signed-off-by: Xunlei Pang
Signed-off-by: Peter Zijlstra (Intel)
[alakeshh: backport: Fixed merge conflicts:
 - sched.h: Fix the indentation and order in which the variables are
   declared to match the coding style of the existing code in 4.14.
   Struct members of the same type were declared on separate lines in the
   upstream patch, which has been changed back to having multiple members
   of the same type on the same line, e.g.
       int a;
       int b;
   ->
       int a, b; ]
Signed-off-by: Alakesh Haloi
Reviewed-by: Ben Segall
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: # 4.14.x
Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com
Signed-off-by: Ingo Molnar
Signed-off-by: Greg Kroah-Hartman
---
 kernel/sched/sched.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bafa2931c898..eaf5d3af2e92 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,8 +227,9 @@ struct cfs_bandwidth {
 	u64 quota, runtime;
 	s64 hierarchical_quota;
 	u64 runtime_expires;
+	int expires_seq;
 
-	int idle, period_active;
+	short idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
@@ -522,6 +523,7 @@ struct cfs_rq {
 #endif
 
 	int runtime_enabled;
+	int expires_seq;
 	u64 runtime_expires;
 	s64 runtime_remaining;
--
cgit v1.2.3


From b933e4d37bc023d27c7394626669bae0a201da52 Mon Sep 17 00:00:00 2001
From: Dave Chiluk
Date: Tue, 23 Jul 2019 11:44:26 -0500
Subject: sched/fair: Fix low cpu usage with high throttling by removing
 expiration of cpu-local slices

commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream.

It has been observed that highly-threaded, non-cpu-bound applications
running under cpu.cfs_quota_us constraints can hit a high percentage of
periods throttled while simultaneously not consuming the allocated
amount of quota. This use case is typical of user-interactive non-cpu
bound applications, such as those running in kubernetes or mesos when
run on multiple cpu cores.

This has been root caused to cpu-local run queues being allocated per-cpu
bandwidth slices, and then not fully using those slices within the
period, at which point the slice and quota expire. This expiration of
unused slices results in applications not being able to utilize the quota
for which they are allocated.
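As a rough illustration of the slice mechanism described above - a
simplified sketch loosely based on the 4.14-era assign_cfs_rq_runtime(),
with the expiration bookkeeping and some locking detail trimmed; this is
not the literal kernel code:

	/*
	 * Sketch: each CPU's cfs_rq tops itself up from the group-wide
	 * cfs_bandwidth pool one slice at a time (5ms by default). Runtime
	 * left unused at the period boundary is what expiration reclaims.
	 */
	static int assign_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq)
	{
		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
		u64 amount = 0, min_amount;

		/* top up to one full slice (runtime_remaining is <= 0 here) */
		min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

		raw_spin_lock(&cfs_b->lock);
		if (cfs_b->quota == RUNTIME_INF) {
			amount = min_amount;		/* no quota: always succeed */
		} else if (cfs_b->runtime > 0) {
			amount = min(cfs_b->runtime, min_amount);
			cfs_b->runtime -= amount;	/* draw down the global pool */
		}
		raw_spin_unlock(&cfs_b->lock);

		cfs_rq->runtime_remaining += amount;

		/* non-zero means we got runtime and need not throttle yet */
		return cfs_rq->runtime_remaining > 0;
	}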
The non-expiration of per-cpu slices was recently fixed by
'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift
condition")'. Prior to that, it appears that this had been broken since
at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some
cfs_b->quota/period")', which was introduced in v3.16-rc1 in 2014. That
added the following conditional, which resulted in slices never being
expired:

	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
		/* extend local deadline, drift is bounded above by 2 ticks */
		cfs_rq->runtime_expires += TICK_NSEC;

Because this was broken for nearly 5 years, and has recently been fixed
and is now being noticed by many users running kubernetes
(https://github.com/kubernetes/kubernetes/issues/67577), it is my opinion
that the mechanisms around expiring runtime should be removed altogether.

This allows quota already allocated to per-cpu run-queues to live longer
than the period boundary. This allows threads on runqueues that do not
use much CPU to continue to use their remaining slice over a longer
period of time than cpu.cfs_period_us. However, this helps prevent the
above condition of hitting throttling while also not fully utilizing
your cpu quota.

This theoretically allows a machine to use slightly more than its
allotted quota in some periods. This overflow would be bounded by the
remaining quota left on each per-cpu runqueue. This is typically no more
than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will change
nothing, as they should theoretically fully utilize all of their quota in
each period. For user-interactive tasks as described above this provides
a much better user/application experience, as their cpu utilization will
more closely match the amount they requested when they hit throttling.
This means that cpu limits no longer strictly apply per period for
non-cpu bound applications, but that they are still accurate over longer
timeframes.

This greatly improves performance of high-thread-count, non-cpu bound
applications with low cfs_quota_us allocation on high-core-count
machines. In the case of an artificial testcase (10ms/100ms of quota on
an 80-CPU machine), this commit resulted in an almost 30x performance
improvement, while still maintaining correct cpu quota restrictions.
That testcase is available at https://github.com/indeedeng/fibtest.
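For illustration, a simplified sketch of the per-tick accounting path once
expiration is removed (an approximation, not the literal patched code;
the helpers referenced are the existing kernel ones):

	/*
	 * Sketch: quota is simply drawn down and topped up from the global
	 * pool; the old call to expire_cfs_rq_runtime(), which clawed back
	 * unused local runtime at the period boundary, is gone.
	 */
	static void account_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq,
						  u64 delta_exec)
	{
		cfs_rq->runtime_remaining -= delta_exec;

		/* still inside the local slice: nothing to do */
		if (likely(cfs_rq->runtime_remaining > 0))
			return;

		/*
		 * Try to pull another slice from the global pool; if that
		 * fails, reschedule so the hierarchy can be throttled.
		 */
		if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
			resched_curr(rq_of(cfs_rq));
	}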
Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")
Change-Id: I7d7a39fb554ec0c31f9381f492165f43c70b3924
Signed-off-by: Dave Chiluk
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Phil Auld
Reviewed-by: Ben Segall
Cc: Ingo Molnar
Cc: John Hammond
Cc: Jonathan Corbet
Cc: Kyle Anderson
Cc: Gabriel Munos
Cc: Peter Oskolkov
Cc: Cong Wang
Cc: Brendan Gregg
Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com
Signed-off-by: Greg Kroah-Hartman
---
 kernel/sched/sched.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eaf5d3af2e92..4e1afb33166b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -226,8 +226,6 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchical_quota;
-	u64 runtime_expires;
-	int expires_seq;
 
 	short idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
@@ -523,8 +521,6 @@ struct cfs_rq {
 #endif
 
 	int runtime_enabled;
-	int expires_seq;
-	u64 runtime_expires;
 	s64 runtime_remaining;
 
 	u64 throttled_clock, throttled_clock_task;
--
cgit v1.2.3


From 7d11b1a7a11c598a07687f853ded9eca97d89043 Mon Sep 17 00:00:00 2001
From: Georg Veichtlbauer
Date: Wed, 26 Jul 2023 21:00:09 +0200
Subject: Revert "sched: cpufreq: Use sched_clock instead of rq_clock when
 updating schedutil"

That commit should have changed rq_clock to sched_clock, but instead
used sched_ktime_clock(), which kept schedutil from making correct
decisions.

This reverts commit ef3fb04c7df43dfa1793e33f764a2581cda96310.

Change-Id: Id4118894388c33bf2b2d3d5ee27eb35e82dc4a96
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4e1afb33166b..78ba150f2016 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2874,7 +2874,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
 						  cpu_of(rq)));
 	if (data)
-		data->func(data, sched_ktime_clock(), flags);
+		data->func(data, rq_clock(rq), flags);
 }
 
 static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
--
cgit v1.2.3
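Taken together, the hunks above leave cpufreq_update_util() in
kernel/sched/sched.h looking roughly like this (reconstructed from the
diffs in this series, not copied from the tree):

	static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
	{
		struct update_util_data *data;

	#ifdef CONFIG_SCHED_HMP
		/*
		 * Skip if we've already reported, but not if this is an
		 * inter-cluster migration. Also only allow WALT update sites.
		 */
		if (!(flags & SCHED_CPUFREQ_WALT))
			return;
		if (!sched_disable_window_stats &&
		    (rq->load_reported_window == rq->window_start) &&
		    !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
			return;
		rq->load_reported_window = rq->window_start;
	#endif

		/* look up the governor hook registered for this rq's CPU ... */
		data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
							  cpu_of(rq)));
		if (data)
			/* ... and report load with the rq clock as the timestamp */
			data->func(data, rq_clock(rq), flags);
	}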