From 259bd4cd8a4c5776e86ae1a86f1f8e1168e84212 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Thu, 17 Sep 2015 16:10:56 +0100
Subject: cpufreq: Frequency invariant scheduler load-tracking support

Implements cpufreq_scale_freq_capacity() to provide the scheduler with a
frequency scaling correction factor for more accurate load-tracking.

The factor is:

	current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu)

In fact, freq_scale should be a struct cpufreq_policy data member. But
this would require that the scheduler hot path (__update_load_avg()) would
have to grab the cpufreq lock. This can be avoided by using per-cpu data
initialized to SCHED_CAPACITY_SCALE for freq_scale.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 include/linux/cpufreq.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 177c7680c1a8..11c65f536b6c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -616,4 +616,7 @@ unsigned int cpufreq_generic_get(unsigned int cpu);
 int cpufreq_generic_init(struct cpufreq_policy *policy,
 		struct cpufreq_frequency_table *table,
 		unsigned int transition_latency);
+
+struct sched_domain;
+unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
 #endif /* _LINUX_CPUFREQ_H */
-- 
cgit v1.2.3


From 0b3bda54d245ebcfeefc61f0ff763e76ca2a8784 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Fri, 14 Nov 2014 16:08:45 +0000
Subject: sched: Introduce energy data structures

The struct sched_group_energy represents the per sched_group related
data which is needed for energy aware scheduling. It contains:

  (1) number of elements of the idle state array
  (2) pointer to the idle state array which comprises 'power consumption'
      for each idle state
  (3) number of elements of the capacity state array
  (4) pointer to the capacity state array which comprises 'compute
      capacity and power consumption' tuples for each capacity state

The struct sched_group obtains a pointer to a struct sched_group_energy.

The function pointer sched_domain_energy_f is introduced into struct
sched_domain_topology_level which will allow the arch to pass a particular
struct sched_group_energy from the topology shim layer into the scheduler
core.

The function pointer sched_domain_energy_f has an 'int cpu' parameter
since the folding of two adjacent sd levels via sd degenerate doesn't work
for all sd levels. I.e. it is not possible for example to use this feature
to provide per-cpu energy in sd level DIE on ARM's TC2 platform.

It was discussed that the folding of sd levels approach is preferable
over the cpu parameter approach, simply because the user (the arch
specifying the sd topology table) can introduce less errors. But since
it is not working, the 'int cpu' parameter is the only way out. It's
possible to use the folding of sd levels approach for
sched_domain_flags_f and the cpu parameter approach for the
sched_domain_energy_f at the same time though. With the use of the
'int cpu' parameter, an extra check function has to be provided to make
sure that all cpus spanned by a sched group are provisioned with the same
energy data.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 include/linux/sched.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index fa39434e3fdd..5bee272a86d8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1021,6 +1021,22 @@ struct sched_domain_attr {
 
 extern int sched_domain_level_max;
 
+struct capacity_state {
+	unsigned long cap;	/* compute capacity */
+	unsigned long power;	/* power consumption at this compute capacity */
+};
+
+struct idle_state {
+	unsigned long power;	 /* power consumption in this idle state */
+};
+
+struct sched_group_energy {
+	unsigned int nr_idle_states;	/* number of idle states */
+	struct idle_state *idle_states;	/* ptr to idle state array */
+	unsigned int nr_cap_states;	/* number of capacity states */
+	struct capacity_state *cap_states; /* ptr to capacity state array */
+};
+
 struct sched_group;
 
 struct sched_domain {
@@ -1119,6 +1135,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
+typedef
+const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu);
 
 #define SDTL_OVERLAP	0x01
 
@@ -1131,6 +1149,7 @@ struct sd_data {
 struct sched_domain_topology_level {
 	sched_domain_mask_f mask;
 	sched_domain_flags_f sd_flags;
+	sched_domain_energy_f energy;
 	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
-- 
cgit v1.2.3


From f0f739d887a4f144ab4937e619288d4cede2cc91 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <morten.rasmussen@arm.com>
Date: Tue, 13 Jan 2015 13:50:46 +0000
Subject: sched: Introduce SD_SHARE_CAP_STATES sched_domain flag

cpufreq is currently keeping it a secret which cpus are sharing
clock source. The scheduler needs to know about clock domains as well
to become more energy aware. The SD_SHARE_CAP_STATES domain flag
indicates whether cpus belonging to the sched_domain share capacity
states (P-states).

There is no connection with cpufreq (yet). The flag must be set by
the arch specific topology code.

cc: Russell King <linux@arm.linux.org.uk>
cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5bee272a86d8..09e2d43406eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -989,6 +989,7 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 #define SD_NUMA			0x4000	/* cross-node balancing */
+#define SD_SHARE_CAP_STATES	0x8000  /* Domain members share capacity state */
 
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
-- 
cgit v1.2.3


From 19a5ebe13dded57ab2b04defd2e379579da05197 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <morten.rasmussen@arm.com>
Date: Tue, 27 Jan 2015 13:48:07 +0000
Subject: sched, cpuidle: Track cpuidle state index in the scheduler

The idle-state of each cpu is currently pointed to by rq->idle_state but
there isn't any information in the struct cpuidle_state that can used to
look up the idle-state energy model data stored in struct
sched_group_energy. For this purpose is necessary to store the idle
state index as well. Ideally, the idle-state data should be unified.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
---
 include/linux/cpuidle.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 786ad32631a6..6eae1576499e 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
 #endif
 
 /* kernel/sched/idle.c */
-extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
 extern void default_idle_call(void);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
-- 
cgit v1.2.3


From d2b3db032e34938c970b148e71cc401fc4c541e8 Mon Sep 17 00:00:00 2001
From: Robin Randhawa <robin.randhawa@arm.com>
Date: Mon, 29 Jun 2015 18:01:58 +0100
Subject: sched: Support for extracting EAS energy costs from DT

This patch implements support for extracting energy cost data from DT.
The data should conform to the DT bindings for energy cost data needed
by EAS (energy aware scheduling).

Signed-off-by: Robin Randhawa <robin.randhawa@arm.com>
---
 include/linux/sched_energy.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 include/linux/sched_energy.h

(limited to 'include/linux')

diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644
index 000000000000..a3f1627ac609
--- /dev/null
+++ b/include/linux/sched_energy.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0   0
+#define SD_LEVEL1   1
+#define SD_LEVEL2   2
+#define SD_LEVEL3   3
+#define SD_LEVEL4   4
+#define SD_LEVEL5   5
+#define SD_LEVEL6   6
+#define SD_LEVEL7   7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level)		    \
+	for (level = 0; level < NR_SD_LEVELS; level++)
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+void init_sched_energy_costs(void);
+
+#endif
-- 
cgit v1.2.3


From b03f1ba3d5bb9db0ca495c8c33ef88588daaae50 Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Tue, 22 Sep 2015 16:47:48 +0100
Subject: cpufreq: Max freq invariant scheduler load-tracking and cpu capacity
 support

Implements cpufreq_scale_max_freq_capacity() to provide the scheduler
with a maximum frequency scaling correction factor for more accurate
load-tracking and cpu capacity handling by being able to deal with
frequency capping.

This scaling factor describes the influence of running a cpu with a
current maximum frequency lower than the absolute possible maximum
frequency on load tracking and cpu capacity.

The factor is:

	current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu)

In fact, max_freq_scale should be a struct cpufreq_policy data member.
But this would require that the scheduler hot path (__update_load_avg())
would have to grab the cpufreq lock. This can be avoided by using per-cpu
data initialized to SCHED_CAPACITY_SCALE for max_freq_scale.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---
 include/linux/cpufreq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 11c65f536b6c..9b1d61e1f1d7 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -619,4 +619,5 @@ int cpufreq_generic_init(struct cpufreq_policy *policy,
 
 struct sched_domain;
 unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu);
+unsigned long cpufreq_scale_max_freq_capacity(int cpu);
 #endif /* _LINUX_CPUFREQ_H */
-- 
cgit v1.2.3


From c3b2e765af4af80ada13e5e34a26a28e3081dd0e Mon Sep 17 00:00:00 2001
From: Michael Turquette <mturquette@baylibre.com>
Date: Tue, 30 Jun 2015 12:45:27 +0100
Subject: cpufreq: introduce cpufreq_driver_is_slow

Some architectures and platforms perform CPU frequency transitions
through a non-blocking method, while some might block or sleep. Even
when frequency transitions do not block or sleep they may be very slow.
This distinction is important when trying to change frequency from
a non-interruptible context in a scheduler hot path.

Describe this distinction with a cpufreq driver flag,
CPUFREQ_DRIVER_FAST. The default is to not have this flag set,
thus erring on the side of caution.

cpufreq_driver_is_slow() is also introduced in this patch. Setting
the above flag will allow this function to return false.

[smuckle@linaro.org: change flag/API to include drivers that are too
 slow for scheduler hot paths, in addition to those that block/sleep]

Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Signed-off-by: Steve Muckle <smuckle@linaro.org>
---
 include/linux/cpufreq.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 9b1d61e1f1d7..b5ef79aae048 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
+bool cpufreq_driver_is_slow(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
@@ -317,6 +318,14 @@ struct cpufreq_driver {
  */
 #define CPUFREQ_NEED_INITIAL_FREQ_CHECK	(1 << 5)
 
+/*
+ * Indicates that it is safe to call cpufreq_driver_target from
+ * non-interruptable context in scheduler hot paths.  Drivers must
+ * opt-in to this flag, as the safe default is that they might sleep
+ * or be too slow for hot path use.
+ */
+#define CPUFREQ_DRIVER_FAST		(1 << 6)
+
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 
-- 
cgit v1.2.3


From a967a45c71ae8978b2c0d06ee0de23f47737cfd5 Mon Sep 17 00:00:00 2001
From: Michael Turquette <mturquette@baylibre.com>
Date: Tue, 30 Jun 2015 12:45:48 +0100
Subject: sched: scheduler-driven cpu frequency selection

Scheduler-driven CPU frequency selection hopes to exploit both
per-task and global information in the scheduler to improve frequency
selection policy, achieving lower power consumption, improved
responsiveness/performance, and less reliance on heuristics and
tunables. For further discussion on the motivation of this integration
see [0].

This patch implements a shim layer between the Linux scheduler and the
cpufreq subsystem. The interface accepts capacity requests from the
CFS, RT and deadline sched classes. The requests from each sched class
are summed on each CPU with a margin applied to the CFS and RT
capacity requests to provide some headroom. Deadline requests are
expected to be precise enough given their nature to not require
headroom. The maximum total capacity request for a CPU in a frequency
domain drives the requested frequency for that domain.

Policy is determined by both the sched classes and this shim layer.

Note that this algorithm is event-driven. There is no polling loop to
check cpu idle time nor any other method which is unsynchronized with
the scheduler, aside from a throttling mechanism to ensure frequency
changes are not attempted faster than the hardware can accommodate them.

Thanks to Juri Lelli <juri.lelli@arm.com> for contributing design ideas,
code and test results, and to Ricky Liang <jcliang@chromium.org>
for initialization and static key inc/dec fixes.

[0] http://article.gmane.org/gmane.linux.kernel/1499836

[smuckle@linaro.org: various additions and fixes, revised commit text]

CC: Ricky Liang <jcliang@chromium.org>
Signed-off-by: Michael Turquette <mturquette@baylibre.com>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Steve Muckle <smuckle@linaro.org>
---
 include/linux/cpufreq.h | 3 +++
 include/linux/sched.h   | 8 ++++++++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index b5ef79aae048..b5433349938a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -496,6 +496,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand;
 #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE)
 extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
+#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED)
+extern struct cpufreq_governor cpufreq_gov_sched;
+#define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_sched)
 #endif
 
 /*********************************************************************
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 09e2d43406eb..1460c6a4f65f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -927,6 +927,14 @@ enum cpu_idle_type {
 #define SCHED_CAPACITY_SHIFT	10
 #define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
 
+struct sched_capacity_reqs {
+	unsigned long cfs;
+	unsigned long rt;
+	unsigned long dl;
+
+	unsigned long total;
+};
+
 /*
  * Wake-queues are lists of tasks with a pending wakeup, whose
  * callers have already marked the task as woken internally,
-- 
cgit v1.2.3


From 344f4ec4293b75d83282eb13ede965995e5f312f Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Mon, 22 Jun 2015 18:11:44 +0100
Subject: sched/tune: add sysctl interface to define a boost value

The current (CFS) scheduler implementation does not allow "to boost"
tasks performance by running them at a higher OPP compared to the
minimum required to meet their workload demands.

To support tasks performance boosting the scheduler should provide a
"knob" which allows to tune how much the system is going to be optimised
for energy efficiency vs performance.

This patch is the first of a series which provides a simple interface to
define a tuning knob. One system-wide "boost" tunable is exposed via:
  /proc/sys/kernel/sched_cfs_boost
which can be configured in the range [0..100], to define a percentage
where:
  - 0%   boost requires to operate in "standard" mode by scheduling
         tasks at the minimum capacities required by the workload demand
  - 100% boost requires to push at maximum the task performances,
         "regardless" of the incurred energy consumption

A boost value in between these two boundaries is used to bias the
power/performance trade-off, the higher the boost value the more the
scheduler is biased toward performance boosting instead of energy
efficiency.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
---
 include/linux/sched/sysctl.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index c9e4731cf10b..4479e48c7712 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -77,6 +77,22 @@ extern int sysctl_sched_rt_runtime;
 extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 #endif
 
+#ifdef CONFIG_SCHED_TUNE
+extern unsigned int sysctl_sched_cfs_boost;
+int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+				   void __user *buffer, size_t *length,
+				   loff_t *ppos);
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return sysctl_sched_cfs_boost;
+}
+#else
+static inline unsigned int get_sysctl_sched_cfs_boost(void)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SCHED_AUTOGROUP
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
-- 
cgit v1.2.3


From 13001f47c9a610705219700af4636386b647e231 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Tue, 23 Jun 2015 09:17:54 +0100
Subject: sched/tune: add initial support for CGroups based boosting

To support task performance boosting, the usage of a single knob has the
advantage to be a simple solution, both from the implementation and the
usability standpoint.  However, on a real system it can be difficult to
identify a single value for the knob which fits the needs of multiple
different tasks. For example, some kernel threads and/or user-space
background services should be better managed the "standard" way while we
still want to be able to boost the performance of specific workloads.

In order to improve the flexibility of the task boosting mechanism this
patch is the first of a small series which extends the previous
implementation to introduce a "per task group" support.
This first patch introduces just the basic CGroups support, a new
"schedtune" CGroups controller is added which allows to configure
different boost value for different groups of tasks.
To keep the implementation simple but still effective for a boosting
strategy, the new controller:
  1. allows only a two layer hierarchy
  2. supports only a limited number of boost groups

A two layer hierarchy allows to place each task either:
  a) in the root control group
     thus being subject to a system-wide boosting value
  b) in a child of the root group
     thus being subject to the specific boost value defined by that
     "boost group"

The limited number of "boost groups" supported is mainly motivated by
the observation that in a real system it could be useful to have only
few classes of tasks which deserve different treatment.
For example, background vs foreground or interactive vs low-priority.
As an additional benefit, a limited number of boost groups allows also
to have a simpler implementation especially for the code required to
compute the boost value for CPUs which have runnable tasks belonging to
different boost groups.

cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
---
 include/linux/cgroup_subsys.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1a96fdaa33d5..e133705d794a 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -26,6 +26,10 @@ SUBSYS(cpu)
 SUBSYS(cpuacct)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE)
+SUBSYS(schedtune)
+#endif
+
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
 SUBSYS(io)
 #endif
-- 
cgit v1.2.3


From dd2460f3872f1b89839aedb21b999e802542c32b Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@arm.com>
Date: Thu, 30 Apr 2015 17:35:23 +0100
Subject: DEBUG: sched,cpufreq: add cpu_capacity change tracepoint

This is useful when we want to compare cpu utilization and
cpu curr capacity side by side.

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
---
 include/linux/sched.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1460c6a4f65f..b9b11b2dbabd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,6 +1046,8 @@ struct sched_group_energy {
 	struct capacity_state *cap_states; /* ptr to capacity state array */
 };
 
+unsigned long capacity_curr_of(int cpu);
+
 struct sched_group;
 
 struct sched_domain {
-- 
cgit v1.2.3


From 3a400abdc57e047786d0b35d39617a794e2bb97e Mon Sep 17 00:00:00 2001
From: Joseph Lo <josephl@nvidia.com>
Date: Mon, 22 Apr 2013 14:39:18 +0800
Subject: CHROMIUM: sched: update the average of nr_running

Doing a Exponential moving average per nr_running++/-- does not
guarantee a fixed sample rate which induces errors if there are lots of
threads being enqueued/dequeued from the rq (Linpack mt). Instead of
keeping track of the avg, the scheduler now keeps track of the integral
of nr_running and allows the readers to perform filtering on top.

Original-author: Sai Charan Gurrappadi <sgurrappadi@nvidia.com>

Change-Id: Id946654f32fa8be0eaf9d8fa7c9a8039b5ef9fab
Signed-off-by: Joseph Lo <josephl@nvidia.com>
Signed-off-by: Andrew Bresticker <abrestic@chromium.org>
Reviewed-on: https://chromium-review.googlesource.com/174694
Reviewed-on: https://chromium-review.googlesource.com/272853
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/sched.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b9b11b2dbabd..ee59abcab30d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -173,6 +173,9 @@ extern bool single_task_running(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
+#ifdef CONFIG_CPU_QUIET
+extern u64 nr_running_integral(unsigned int cpu);
+#endif
 
 extern void calc_global_load(unsigned long ticks);
 
-- 
cgit v1.2.3


From 765c2ab363a02a71f9ef4c55a7f598e3dd401e0b Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Fri, 22 Jul 2016 11:35:59 +0100
Subject: FIXUP: sched: fix build for non-SMP target

Currently the build for a single-core (e.g. user-mode) Linux is broken
and this configuration is required (at least) to run some network tests.

The main issues for the current code support on single-core systems are:
1. {se,rq}::sched_avg is not available nor maintained for !SMP systems
   This means that load and utilisation signals are NOT available in single
   core systems. All the EAS code depends on these signals.
2. sched_group_energy is also SMP dependant. Again this means that all the
   EAS setup and preparation code (energyn model initialization) has to be
   properly guarded/disabled for !SMP systems.
3. SchedFreq depends on utilization signal, which is not available on
   !SMP systems.
4. SchedTune is useless on unicore systems if SchedFreq is not available.
5. WALT machinery is not required on single-core systems.

This patch addresses all these issues by enforcing some constraints for
single-core systems:
a) WALT, SchedTune and SchedTune are now dependant on SMP
b) The default governor for !SMP systems is INTERACTIVE
c) The energy model initialisation/build functions are
d) Other minor code re-arrangements and CONFIG_SMP guarding to enable
   single core builds.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
---
 include/linux/sched_energy.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
index a3f1627ac609..1daf3e1f98a7 100644
--- a/include/linux/sched_energy.h
+++ b/include/linux/sched_energy.h
@@ -29,8 +29,16 @@
 #define for_each_possible_sd_level(level)		    \
 	for (level = 0; level < NR_SD_LEVELS; level++)
 
+#ifdef CONFIG_SMP
+
 extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
 
 void init_sched_energy_costs(void);
 
+#else
+
+#define init_sched_energy_costs() do { } while (0)
+
+#endif /* CONFIG_SMP */
+
 #endif
-- 
cgit v1.2.3


From 2e9abbc942e6211f9ac4dadb14debc39bb9d1418 Mon Sep 17 00:00:00 2001
From: Srinath Sridharan <srinathsr@google.com>
Date: Thu, 14 Jul 2016 09:57:29 +0100
Subject: sched: EAS: take cstate into account when selecting idle core

Introduce a new sysctl for this option, 'sched_cstate_aware'.
When this is enabled, select_idle_sibling in CFS is modified to
choose the idle CPU in the sibling group which has the lowest
idle state index - idle state indexes are assumed to increase
as sleep depth and hence wakeup latency increase. In this way,
we attempt to minimise wakeup latency when an idle CPU is
required.

Signed-off-by: Srinath Sridharan <srinathsr@google.com>

Includes:
sched: EAS: fix select_idle_sibling

when sysctl_sched_cstate_aware is enabled, best_idle cpu will not be chosen
in the original flow because it will goto done directly

Bug: 30107557
Change-Id: Ie09c2e3960cafbb976f8d472747faefab3b4d6ac
Signed-off-by: martin_liu <martin_liu@htc.com>
---
 include/linux/sched/sysctl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4479e48c7712..7d021393b0da 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_cstate_aware;
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
-- 
cgit v1.2.3


From 4a5e890ec60d2e341fa560d8149997f5f0c48d67 Mon Sep 17 00:00:00 2001
From: Juri Lelli <juri.lelli@arm.com>
Date: Fri, 29 Jul 2016 14:04:11 +0100
Subject: sched/fair: add tunable to force selection at cpu granularity

EAS assumes that clusters with smaller capacity cores are more
energy-efficient. This may not be true on non-big-little devices,
so EAS can make incorrect cluster selections when finding a CPU
to wake. The "sched_is_big_little" hint can be used to cause a
cpu-based selection instead of cluster-based selection.

This change incorporates the addition of the sync hint enable patch

EAS did not honour synchronous wakeup hints, a new sysctl is
created to ask EAS to use this information when selecting a CPU.
The control is called "sched_sync_hint_enable".

Also contains:

EAS: sched/fair: for SMP bias toward idle core with capacity

For SMP devices, on wakeup bias towards idle cores that have capacity
vs busy devices that need a higher OPP

eas: favor idle cpus for boosted tasks

BUG: 29533997
BUG: 29512132
Change-Id: I0cc9a1b1b88fb52916f18bf2d25715bdc3634f9c
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
Signed-off-by: Srinath Sridharan <srinathsr@google.com>

eas/sched/fair: Favoring busy cpus with low OPPs

BUG: 29533997
BUG: 29512132
Change-Id: I9305b3239698d64278db715a2e277ea0bb4ece79

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
---
 include/linux/sched/sysctl.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 7d021393b0da..4883dcf3e1a9 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,8 @@ extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_is_big_little;
+extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_cstate_aware;
 
 enum sched_tunable_scaling {
-- 
cgit v1.2.3


From c50cc2299cb1a9ae769c955f5dd09a9648b1600e Mon Sep 17 00:00:00 2001
From: Todd Kjos <tkjos@google.com>
Date: Fri, 11 Mar 2016 16:44:16 -0800
Subject: sched/fair: add tunable to set initial task load

The choice of initial task load upon fork has a large influence
on CPU and OPP selection when scheduler-driven DVFS is in use.
Make this tuneable by adding a new sysctl "sched_initial_task_util".

If the sched governor is not used, the default remains at SCHED_LOAD_SCALE
Otherwise, the value from the sysctl is used. This defaults to 0.

Signed-off-by: "Todd Kjos <tkjos@google.com>"
---
 include/linux/sched/sysctl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4883dcf3e1a9..2834841c507e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -41,6 +41,7 @@ extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_is_big_little;
 extern unsigned int sysctl_sched_sync_hint_enable;
+extern unsigned int sysctl_sched_initial_task_util;
 extern unsigned int sysctl_sched_cstate_aware;
 
 enum sched_tunable_scaling {
-- 
cgit v1.2.3


From efb86bd08a2e9217d0b3c33753cf63d27e7c86da Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri <vatsa@codeaurora.org>
Date: Tue, 31 May 2016 09:08:38 -0700
Subject: sched: Introduce Window Assisted Load Tracking (WALT)

use a window based view of time in order to track task
demand and CPU utilization in the scheduler.

Window Assisted Load Tracking (WALT) implementation credits:
 Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
 Pavan Kumar Kondeti, Olav Haugan

2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
            and Todd Kjos

Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708

Includes fixes for issues:

eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a
race resulting in watchdog resets
BUG: 29353986
Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb

Handle walt accounting anomoly during resume

During resume, there is a corner case where on wakeup, a task's
prev_runnable_sum can go negative. This is a workaround that
fixes the condition and warns (instead of crashing).

BUG: 29464099
Change-Id: I173e7874324b31a3584435530281708145773508

Signed-off-by: Todd Kjos <tkjos@google.com>
Signed-off-by: Srinath Sridharan <srinathsr@google.com>
Signed-off-by: Juri Lelli <juri.lelli@arm.com>
[jstultz: fwdported to 4.4]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/sched.h        | 53 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/sched/sysctl.h |  5 +++++
 2 files changed, 58 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ee59abcab30d..ce8db12f9e72 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -317,6 +317,15 @@ extern char ___assert_task_state[1 - 2*!!(
 /* Task command name length */
 #define TASK_COMM_LEN 16
 
+enum task_event {
+	PUT_PREV_TASK   = 0,
+	PICK_NEXT_TASK  = 1,
+	TASK_WAKE       = 2,
+	TASK_MIGRATE    = 3,
+	TASK_UPDATE     = 4,
+	IRQ_UPDATE	= 5,
+};
+
 #include <linux/spinlock.h>
 
 /*
@@ -1274,6 +1283,41 @@ struct sched_statistics {
 };
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX  5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+	/*
+	 * 'mark_start' marks the beginning of an event (task waking up, task
+	 * starting to execute, task being preempted) within a window
+	 *
+	 * 'sum' represents how runnable a task has been within current
+	 * window. It incorporates both running time and wait time and is
+	 * frequency scaled.
+	 *
+	 * 'sum_history' keeps track of history of 'sum' seen over previous
+	 * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+	 * ignored.
+	 *
+	 * 'demand' represents maximum sum seen over previous
+	 * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+	 * demand for tasks.
+	 *
+	 * 'curr_window' represents task's contribution to cpu busy time
+	 * statistics (rq->curr_runnable_sum) in current window
+	 *
+	 * 'prev_window' represents task's contribution to cpu busy time
+	 * statistics (rq->prev_runnable_sum) in previous window
+	 */
+	u64 mark_start;
+	u32 sum, demand;
+	u32 sum_history[RAVG_HIST_SIZE_MAX];
+	u32 curr_window, prev_window;
+	u16 active_windows;
+};
+#endif
+
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
@@ -1431,6 +1475,15 @@ struct task_struct {
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#ifdef CONFIG_SCHED_WALT
+	struct ravg ravg;
+	/*
+	 * 'init_load_pct' represents the initial task load assigned to children
+	 * of this task
+	 */
+	u32 init_load_pct;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 2834841c507e..710f58a28d63 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -43,6 +43,11 @@ extern unsigned int sysctl_sched_is_big_little;
 extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_initial_task_util;
 extern unsigned int sysctl_sched_cstate_aware;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+#endif
 
 enum sched_tunable_scaling {
 	SCHED_TUNABLESCALING_NONE,
-- 
cgit v1.2.3


From 519c62750eb6ebbb5783315272398ced72d7a036 Mon Sep 17 00:00:00 2001
From: Srinath Sridharan <srinathsr@google.com>
Date: Fri, 22 Jul 2016 13:21:15 +0100
Subject: sched/walt: Accounting for number of irqs pending on each core

Schedules on a core whose irq count is less than a threshold.
Improves I/O performance of EAS.

Change-Id: I08ff7dd0d22502a0106fc636b1af2e6fe9e758b5
---
 include/linux/sched/sysctl.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 710f58a28d63..d68e88c9d4d7 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -47,6 +47,7 @@ extern unsigned int sysctl_sched_cstate_aware;
 extern unsigned int sysctl_sched_use_walt_cpu_util;
 extern unsigned int sysctl_sched_use_walt_task_util;
 extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
 #endif
 
 enum sched_tunable_scaling {
-- 
cgit v1.2.3


From 142b2acc79777f3cac93144d7ebf9caa53e97f9b Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 14 Jan 2016 15:21:40 -0800
Subject: vmstat: make vmstat_updater deferrable again and shut down on idle

Currently the vmstat updater is not deferrable as a result of commit
ba4877b9ca51 ("vmstat: do not use deferrable delayed work for
vmstat_update").  This in turn can cause multiple interruptions of the
applications because the vmstat updater may run at

Make vmstate_update deferrable again and provide a function that folds
the differentials when the processor is going to idle mode thus
addressing the issue of the above commit in a clean way.

Note that the shepherd thread will continue scanning the differentials
from another processor and will reenable the vmstat workers if it
detects any changes.

Change-Id: Idf256cfacb40b4dc8dbb6795cf06b34e8fec7a06
Fixes: ba4877b9ca51 ("vmstat: do not use deferrable delayed work for vmstat_update")
Signed-off-by: Christoph Lameter <cl@linux.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Git-commit: 0eb77e9880321915322d42913c3b53241739c8aa
[shashim@codeaurora.org: resolve minor merge conflicts]
Signed-off-by: Shiraz Hashim <shashim@codeaurora.org>
[jstultz: fwdport to 4.4]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/vmstat.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 3e5d9075960f..73fae8c4a5fb 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -189,6 +189,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item);
 extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
+void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
@@ -249,6 +250,7 @@ static inline void __dec_zone_page_state(struct page *page,
 
 static inline void refresh_zone_stat_thresholds(void) { }
 static inline void cpu_vm_stats_fold(int cpu) { }
+static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
 			struct per_cpu_pageset *pset) { }
-- 
cgit v1.2.3