From 6adb092856e806d91f3fc22dff0ef36506dd0bae Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Tue, 6 Jun 2017 11:58:27 -0700
Subject: sched: cpufreq: Limit governor updates to WALT changes alone

It's not necessary to keep reporting load to the governor if it doesn't
change in a window. Limit updates to when we expect load changes - after
window rollover and when we send updates related to intercluster
migrations.

[beykerykt]: Adapt for HMP

Change-Id: I3232d40f3d54b0b81cfafdcdb99b534df79327bf
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/sched.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 90cc450dff7e..40da1a509ded 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2856,8 +2856,10 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 #ifdef CONFIG_SCHED_HMP
 	/*
 	 * Skip if we've already reported, but not if this is an inter-cluster
-	 * migration
+	 * migration. Also only allow WALT update sites.
 	 */
+	if (!(flags & SCHED_CPUFREQ_WALT))
+		return;
 	if (!sched_disable_window_stats &&
 		(rq->load_reported_window == rq->window_start) &&
 		!(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
--
cgit v1.2.3


From ef3fb04c7df43dfa1793e33f764a2581cda96310 Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Mon, 8 May 2017 19:20:22 -0700
Subject: sched: cpufreq: Use sched_clock instead of rq_clock when updating
 schedutil

rq_clock may not be updated often enough for schedutil or other cpufreq
governors to work correctly when it's passed as the timestamp for a load
report. Use sched_clock instead.

[beykerykt]: Switch to sched_ktime_clock()

Change-Id: I745b727870a31da25f766c2c2f37527f568c20da
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 40da1a509ded..e78a3e867472 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2869,7 +2869,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 
 	data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
 	if (data)
-		data->func(data, rq_clock(rq), flags);
+		data->func(data, sched_ktime_clock(), flags);
 }
 
 static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
--
cgit v1.2.3


From 4dbe44554792f83b785eed187aa1bcd69e84094c Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Tue, 9 May 2017 17:49:47 -0700
Subject: sched: cpufreq: Use per_cpu_ptr instead of this_cpu_ptr when
 reporting load

We need cpufreq_update_util to report load for the CPU corresponding to
the rq that is passed in as an argument, rather than the CPU executing
cpufreq_update_util.
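For illustration only (not part of the original commit message), a minimal
sketch of the distinction this change relies on; report_load_for() is a
hypothetical wrapper used just for this example, while the per-cpu accessors
and cpufreq_update_util_data follow the kernel code shown in the diffs:

	/*
	 * Sketch: this_cpu_ptr() resolves against whichever CPU happens to
	 * run this code, which is wrong when reporting load on behalf of a
	 * remote rq; per_cpu_ptr() takes the rq's own CPU id explicitly.
	 */
	static inline void report_load_for(struct rq *rq, u64 now,
					   unsigned int flags)
	{
		struct update_util_data *data;

		/* look up the governor callback registered for rq's CPU */
		data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
							  cpu_of(rq)));
		if (data)
			data->func(data, now, flags);	/* report the load */
	}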
Change-Id: I8473f230d40928d5920c614760e96fef12745d5a
Signed-off-by: Vikram Mulukutla
---
 kernel/sched/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e78a3e867472..1196276eddf6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2867,7 +2867,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 	rq->load_reported_window = rq->window_start;
 #endif
 
-	data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
+						  cpu_of(rq)));
 	if (data)
 		data->func(data, sched_ktime_clock(), flags);
 }
--
cgit v1.2.3


From c0fa7577022c4169e1aaaf1bd9e04f63d285beb2 Mon Sep 17 00:00:00 2001
From: Ethan Chen
Date: Sat, 20 Jan 2018 16:35:53 -0800
Subject: sched/walt: Re-add code to allow WALT to function

Change-Id: Ieb1067c5e276f872ed4c722b7d1fabecbdad87e7
---
 kernel/sched/sched.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1196276eddf6..284cc86d3ad4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -511,6 +511,10 @@ struct cfs_rq {
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
+#ifdef CONFIG_SCHED_WALT
+	u64 cumulative_runnable_avg;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
 #ifdef CONFIG_SCHED_HMP
@@ -819,6 +823,7 @@ struct rq {
 #endif
 
 #ifdef CONFIG_SCHED_WALT
+	unsigned int cur_freq;
 	u64 cumulative_runnable_avg;
 	u64 window_start;
 	u64 curr_runnable_sum;
--
cgit v1.2.3


From ebdb82f7b34aeab34623d7a5e4dd673fc2807842 Mon Sep 17 00:00:00 2001
From: Joel Fernandes
Date: Thu, 20 Jul 2017 23:46:56 -0700
Subject: sched/fair: Skip frequency updates if CPU about to idle

If the CPU is about to idle, prevent a frequency update. With this, the
number of schedutil governor wake ups is reduced by more than half on a
test playing bluetooth audio.

Test: sugov wake ups drop by more than half when playing music with
screen off (476 / 1092)

Bug: 64689959
Change-Id: I400026557b4134c0ac77f51c79610a96eb985b4a
Signed-off-by: Joel Fernandes
---
 kernel/sched/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 284cc86d3ad4..bafa2931c898 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2032,6 +2032,7 @@ static const u32 prio_to_wmult[40] = {
 #define DEQUEUE_SLEEP		0x01
 #define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_IDLE		0x80 /* The last dequeue before IDLE */
 
 #define ENQUEUE_WAKEUP		0x01
 #define ENQUEUE_RESTORE		0x02
--
cgit v1.2.3


From 82d3f23d6dc53c564c1c8550f9ee6ac72f85c004 Mon Sep 17 00:00:00 2001
From: Xunlei Pang
Date: Wed, 20 Jun 2018 18:18:33 +0800
Subject: sched/fair: Fix bandwidth timer clock drift condition

commit 512ac999d2755d2b7109e996a76b6fb8b888631d upstream.

I noticed that cgroup task groups constantly get throttled even if they
have low CPU usage; this causes some jitters in the response time of
some of our business containers when enabling CPU quotas.
It's very simple to reproduce:

	mkdir /sys/fs/cgroup/cpu/test
	cd /sys/fs/cgroup/cpu/test
	echo 100000 > cpu.cfs_quota_us
	echo $$ > tasks

then repeat:

	cat cpu.stat | grep nr_throttled  # nr_throttled will increase steadily

After some analysis, we found that cfs_rq::runtime_remaining will be
cleared by expire_cfs_rq_runtime() due to two equal but stale
"cfs_{b|q}->runtime_expires" after the period timer is re-armed.

The current condition used to judge clock drift in expire_cfs_rq_runtime()
is wrong: the two runtime_expires are actually the same when clock drift
happens, so this condition can never hit.

The original design was correctly done by this commit:

  a9cf55b28610 ("sched: Expire invalid runtime")

... but was changed to the current implementation due to its locking bug.

This patch introduces another way: it adds a new field to both cfs_rq and
cfs_bandwidth to record the expiration update sequence, and uses them to
figure out whether clock drift has happened (true if they are equal).

Change-Id: I8168fe3b45785643536f289ea823d1a62d9d8ab2
Signed-off-by: Xunlei Pang
Signed-off-by: Peter Zijlstra (Intel)
[alakeshh: backport: Fixed merge conflicts:
 - sched.h: Fix the indentation and order in which the variables are
   declared to match the coding style of the existing code in 4.14.
   Struct members of the same type were declared on separate lines in the
   upstream patch, which has been changed back to having multiple members
   of the same type on the same line, e.g.
       int a;
       int b;
   ->
       int a, b; ]
Signed-off-by: Alakesh Haloi
Reviewed-by: Ben Segall
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: # 4.14.x
Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")
Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com
Signed-off-by: Ingo Molnar
Signed-off-by: Greg Kroah-Hartman
---
 kernel/sched/sched.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bafa2931c898..eaf5d3af2e92 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,8 +227,9 @@ struct cfs_bandwidth {
 	u64 quota, runtime;
 	s64 hierarchical_quota;
 	u64 runtime_expires;
+	int expires_seq;
 
-	int idle, period_active;
+	short idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
@@ -522,6 +523,7 @@ struct cfs_rq {
 #endif
 
 	int runtime_enabled;
+	int expires_seq;
 	u64 runtime_expires;
 	s64 runtime_remaining;
--
cgit v1.2.3


From b933e4d37bc023d27c7394626669bae0a201da52 Mon Sep 17 00:00:00 2001
From: Dave Chiluk
Date: Tue, 23 Jul 2019 11:44:26 -0500
Subject: sched/fair: Fix low cpu usage with high throttling by removing
 expiration of cpu-local slices

commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream.

It has been observed that highly-threaded, non-cpu-bound applications
running under cpu.cfs_quota_us constraints can hit a high percentage of
periods throttled while simultaneously not consuming the allocated
amount of quota. This use case is typical of user-interactive non-cpu
bound applications, such as those running in kubernetes or mesos when
run on multiple cpu cores.

This has been root caused to cpu-local run queues being allocated per-cpu
bandwidth slices, and then not fully using those slices within the
period, at which point the slice and quota expire. This expiration of
unused slices results in applications not being able to utilize the quota
for which they are allocated.
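As a rough illustration of the slice mechanism described above - a
simplified sketch loosely based on the 4.14-era assign_cfs_rq_runtime(),
with the expiration bookkeeping and some locking detail trimmed; this is
not the literal kernel code:

	/*
	 * Sketch: each CPU's cfs_rq tops itself up from the group-wide
	 * cfs_bandwidth pool one slice at a time (5ms by default). Runtime
	 * left unused at the period boundary is what expiration reclaims.
	 */
	static int assign_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq)
	{
		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
		u64 amount = 0, min_amount;

		/* top up to one full slice (runtime_remaining is <= 0 here) */
		min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;

		raw_spin_lock(&cfs_b->lock);
		if (cfs_b->quota == RUNTIME_INF) {
			amount = min_amount;		/* no quota: always succeed */
		} else if (cfs_b->runtime > 0) {
			amount = min(cfs_b->runtime, min_amount);
			cfs_b->runtime -= amount;	/* draw down the global pool */
		}
		raw_spin_unlock(&cfs_b->lock);

		cfs_rq->runtime_remaining += amount;

		/* non-zero means we got runtime and need not throttle yet */
		return cfs_rq->runtime_remaining > 0;
	}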
The non-expiration of per-cpu slices was recently fixed by
'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift
condition")'. Prior to that, it appears that this had been broken since
at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some
cfs_b->quota/period")', which was introduced in v3.16-rc1 in 2014. That
added the following conditional, which resulted in slices never being
expired:

	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
		/* extend local deadline, drift is bounded above by 2 ticks */
		cfs_rq->runtime_expires += TICK_NSEC;

Because this was broken for nearly 5 years, and has recently been fixed
and is now being noticed by many users running kubernetes
(https://github.com/kubernetes/kubernetes/issues/67577), it is my opinion
that the mechanisms around expiring runtime should be removed altogether.

This allows quota already allocated to per-cpu run-queues to live longer
than the period boundary. This allows threads on runqueues that do not
use much CPU to continue to use their remaining slice over a longer
period of time than cpu.cfs_period_us. However, this helps prevent the
above condition of hitting throttling while also not fully utilizing
your cpu quota.

This theoretically allows a machine to use slightly more than its
allotted quota in some periods. This overflow would be bounded by the
remaining quota left on each per-cpu runqueue. This is typically no more
than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will change
nothing, as they should theoretically fully utilize all of their quota in
each period. For user-interactive tasks as described above this provides
a much better user/application experience, as their cpu utilization will
more closely match the amount they requested when they hit throttling.
This means that cpu limits no longer strictly apply per period for
non-cpu bound applications, but that they are still accurate over longer
timeframes.

This greatly improves performance of high-thread-count, non-cpu bound
applications with low cfs_quota_us allocation on high-core-count
machines. In the case of an artificial testcase (10ms/100ms of quota on
an 80-CPU machine), this commit resulted in an almost 30x performance
improvement, while still maintaining correct cpu quota restrictions.
That testcase is available at https://github.com/indeedeng/fibtest.
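For illustration, a simplified sketch of the per-tick accounting path once
expiration is removed (an approximation, not the literal patched code;
the helpers referenced are the existing kernel ones):

	/*
	 * Sketch: quota is simply drawn down and topped up from the global
	 * pool; the old call to expire_cfs_rq_runtime(), which clawed back
	 * unused local runtime at the period boundary, is gone.
	 */
	static void account_cfs_rq_runtime_sketch(struct cfs_rq *cfs_rq,
						  u64 delta_exec)
	{
		cfs_rq->runtime_remaining -= delta_exec;

		/* still inside the local slice: nothing to do */
		if (likely(cfs_rq->runtime_remaining > 0))
			return;

		/*
		 * Try to pull another slice from the global pool; if that
		 * fails, reschedule so the hierarchy can be throttled.
		 */
		if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
			resched_curr(rq_of(cfs_rq));
	}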
Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")
Change-Id: I7d7a39fb554ec0c31f9381f492165f43c70b3924
Signed-off-by: Dave Chiluk
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Phil Auld
Reviewed-by: Ben Segall
Cc: Ingo Molnar
Cc: John Hammond
Cc: Jonathan Corbet
Cc: Kyle Anderson
Cc: Gabriel Munos
Cc: Peter Oskolkov
Cc: Cong Wang
Cc: Brendan Gregg
Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com
Signed-off-by: Greg Kroah-Hartman
---
 kernel/sched/sched.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eaf5d3af2e92..4e1afb33166b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -226,8 +226,6 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchical_quota;
-	u64 runtime_expires;
-	int expires_seq;
 
 	short idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
@@ -523,8 +521,6 @@ struct cfs_rq {
 #endif
 
 	int runtime_enabled;
-	int expires_seq;
-	u64 runtime_expires;
 	s64 runtime_remaining;
 
 	u64 throttled_clock, throttled_clock_task;
--
cgit v1.2.3


From 7d11b1a7a11c598a07687f853ded9eca97d89043 Mon Sep 17 00:00:00 2001
From: Georg Veichtlbauer
Date: Wed, 26 Jul 2023 21:00:09 +0200
Subject: Revert "sched: cpufreq: Use sched_clock instead of rq_clock when
 updating schedutil"

That commit should have changed rq_clock to sched_clock, but instead
used sched_ktime_clock(), which kept schedutil from making correct
decisions.

This reverts commit ef3fb04c7df43dfa1793e33f764a2581cda96310.

Change-Id: Id4118894388c33bf2b2d3d5ee27eb35e82dc4a96
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4e1afb33166b..78ba150f2016 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2874,7 +2874,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 	data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
 						  cpu_of(rq)));
 	if (data)
-		data->func(data, sched_ktime_clock(), flags);
+		data->func(data, rq_clock(rq), flags);
 }
 
 static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
--
cgit v1.2.3
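Taken together, the hunks above leave cpufreq_update_util() in
kernel/sched/sched.h looking roughly like this (reconstructed from the
diffs in this series, not copied from the tree):

	static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
	{
		struct update_util_data *data;

	#ifdef CONFIG_SCHED_HMP
		/*
		 * Skip if we've already reported, but not if this is an
		 * inter-cluster migration. Also only allow WALT update sites.
		 */
		if (!(flags & SCHED_CPUFREQ_WALT))
			return;
		if (!sched_disable_window_stats &&
		    (rq->load_reported_window == rq->window_start) &&
		    !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
			return;
		rq->load_reported_window = rq->window_start;
	#endif

		/* look up the governor hook registered for this rq's CPU ... */
		data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
							  cpu_of(rq)));
		if (data)
			/* ... and report load with the rq clock as the timestamp */
			data->func(data, rq_clock(rq), flags);
	}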