summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSrivatsa Vaddagiri <vatsa@codeaurora.org>2014-11-21 18:25:11 +0530
committerDavid Keitel <dkeitel@codeaurora.org>2016-03-23 20:01:03 -0700
commit8e3aa6790ca4ff4744e30720e97c458375a35237 (patch)
treeb850a59db6aaa036957f304fb6ee6edae1c3a8c4
parent2365b0cbd64fe7a00ec2cfd3b7d8a20df640e095 (diff)
sched: Packing support until a frequency threshold
Add another dimension for task packing based on frequency. This patch adds a per-cpu tunable, rq->mostly_idle_freq, which when set will result in tasks being packed on a single cpu in cluster as long as cluster frequency is less than set threshold. Change-Id: I318e9af6c8788ddf5dfcda407d621449ea5343c0 Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org>
-rw-r--r--Documentation/scheduler/sched-hmp.txt84
-rw-r--r--drivers/base/cpu.c39
-rw-r--r--include/linux/sched.h3
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/fair.c62
-rw-r--r--kernel/sched/sched.h1
6 files changed, 188 insertions, 2 deletions
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index d3321cb208e9..d0a998345799 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -22,7 +22,7 @@ CONTENTS
5.3 Scheduler Tick
5.4 Load Balancer
5.5 Real Time Tasks
- 5.6 Stop-Class Tasks
+ 5.6 Task packing
6. Frequency Guidance
6.1 Per-CPU Window-Based Stats
6.1 Per-task Window-Based Stats
@@ -571,15 +571,19 @@ both tasks and CPUs to aid in the placement of tasks.
the scheduler is tracking the demand of each task it can make an educated
guess as to whether a CPU will become idle in the near future.
- There are two tunable parameters which are used to determine whether
+ There are three tunable parameters which are used to determine whether
a CPU is mostly idle:
/sys/devices/system/cpu/cpuX/sched_mostly_idle_nr_run
/sys/devices/system/cpu/cpuX/sched_mostly_idle_load
+ /sys/devices/system/cpu/cpuX/sched_mostly_idle_freq
Note that these tunables are per-cpu. If a CPU does not have more than
sched_mostly_idle_nr_run runnable tasks and is not more than
sched_mostly_idle_load percent busy, it is considered mostly idle.
+ Additionally if a cpu's sched_mostly_idle_freq is non-zero and its current
+ frequency is less than threshold, then scheduler will attempt to pack
+ tasks on the most power-efficient cpu in the cluster.
- spill threshold
@@ -894,6 +898,71 @@ HMP scheduler brings in a change which avoids fast-path and always resorts to
slow-path. Further cpu with lowest power-rating from candidate list of cpus is
chosen as cpu for placing waking real-time task.
+*** 5.6 Task packing
+
+Task packing is letting one cpu take up more than one task in an attempt to
+improve power (and in some cases performance). Power benefit is derived by
+avoiding wakeup cost for idle cpus from their deep sleep states. For example,
+consider a system with one cpu busy while other cpus are idle and in deep
+sleep state. A small task in this situation needs to be placed on a suitable
+cpu. Placing the small task on the busy cpu will likely not hurt its
+performance (it is after all a low-demand task) while helping gain on power
+because we avoid the cost associated with waking idle cpu from deep sleep
+state.
+
+Task packing can have good or bad implications for power and performance.
+
+a. Power implications
+
+As described in the small task wakeup example, task packing can be beneficial
+for power. However, the adverse impact on power can arise when packing on one
+cpu can increase its busy time and hence result in frequency raise.
+
+b. Performance implications
+
+The most obvious negative impact of packing on performance is the increased
+scheduling latency that tasks can experience. Positive impact on
+performance from packing has also been seen. This arises from the fact
+that a waking task, when woken to busy cpu because of packing, will incur very
+low latency to run immediately, when compared to being woken to an idle cpu in
+deep sleep state. In the latter case, the task has to wait for the cpu to exit
+its sleep state, a delay considerable enough in some cases to hurt performance.
+
+Packing thus is a delicate matter to play with! The following parameters control
+packing behavior.
+
+- sched_small_task
+ This parameter specifies demand threshold below which a task will be
+classified as "small". As described in Sec 5.2 ("Task Wakeup and
+select_best_cpu()"), for small task wakeups, a busy cpu is preferred as target
+rather than idle cpu.
+
+- mostly_idle_load and mostly_idle_nr_run
+
+These are per-cpu parameters that define mostly_idle thresholds for a cpu. A cpu
+whose load <= mostly_idle_load AND whose nr_running is <= mostly_idle_nr_run is
+classified as mostly_idle. See further description of "mostly_idle" thresholds
+in Sec 5.
+
+- mostly_idle_freq
+
+This is a per-cpu parameter. If non-zero for a cpu which is part of a cluster
+and cluster current frequency is less than this threshold, then scheduler will
+pack all tasks on a single cpu in cluster. The cpu chosen is the first most
+power-efficient cpu found while scanning cluster's online cpus.
+
+For some low band of frequency, spreading tasks over all available cpus can be
+grossly power-inefficient. As an example, consider two tasks that each need
+500MHz. Packing them on one cpu could lead to 1GHz. In spread case, we incur
+cost of two cpus running at 500MHz, while in packed case, we incur the cost of
+one cpu running at 1GHz. Based on the silicon characteristics, where leakage
+power can be dominant factor, former can be worse on power than the latter.
+Running at slow frequency (in spread case) can actually make it worse on
+leakage power (especially if 500MHz and 1GHz share the same voltage point).
+sched_mostly_idle_freq is set based on silicon characteristics and can provide
+a winning argument for both power and performance.
+
+
=====================
6. FREQUENCY GUIDANCE
=====================
@@ -1271,6 +1340,17 @@ comparison. Scheduler will request a raise in cpu frequency when heavy tasks
wakeup after at least one window of sleep, where window size is defined by
sched_ravg_window. Value 0 will disable this feature.
+** 7.21 sched_mostly_idle_freq
+
+Appears at: /sys/devices/system/cpu/cpuX/sched_mostly_idle_freq
+
+Default value: 0
+
+This tunable is intended to achieve task packing behavior based on cluster
+frequency. Hence it is strongly advised to have all cpus in a cluster have the
+same value for mostly_idle_freq. For more details, see section on "Task
+packing" (sec 5.6).
+
=========================
8. HMP SCHEDULER TRACE POINTS
=========================
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 763fd00c697b..a59fa57ef42e 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -217,6 +217,42 @@ static ssize_t __ref store_sched_mostly_idle_load(struct device *dev,
return err;
}
+/*
+ * sysfs "show" handler for the per-cpu sched_mostly_idle_freq attribute.
+ *
+ * Resolves the cpu number from the struct cpu that embeds @dev, reads that
+ * cpu's mostly_idle_freq threshold via sched_get_cpu_mostly_idle_freq() and
+ * formats it into @buf as a decimal number followed by a newline.
+ *
+ * Returns the value returned by snprintf().
+ */
+static ssize_t show_sched_mostly_idle_freq(struct device *dev,
+		 struct device_attribute *attr, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	unsigned int mostly_idle_freq;
+
+	mostly_idle_freq = sched_get_cpu_mostly_idle_freq(cpu->dev.id);
+
+	/* The threshold is unsigned, so format it with %u rather than %d. */
+	return snprintf(buf, PAGE_SIZE-2, "%u\n", mostly_idle_freq);
+}
+
+/*
+ * sysfs "store" handler for the per-cpu sched_mostly_idle_freq attribute.
+ *
+ * Strips surrounding whitespace from @buf, parses it as an unsigned integer
+ * (base 0 is passed to the parser) and forwards the value to
+ * sched_set_cpu_mostly_idle_freq() for the cpu that embeds @dev.
+ *
+ * Returns @count when the setter returns a non-negative value; otherwise
+ * returns the error from the parser or from the setter.
+ */
+static ssize_t __ref store_sched_mostly_idle_freq(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	int cpuid = cpu->dev.id, err;
+	unsigned int mostly_idle_freq;
+
+	/*
+	 * The destination is an unsigned int, so use the unsigned parser.
+	 * kstrtoint() takes an int * and would also accept negative input.
+	 */
+	err = kstrtouint(strstrip((char *)buf), 0, &mostly_idle_freq);
+	if (err)
+		return err;
+
+	err = sched_set_cpu_mostly_idle_freq(cpuid, mostly_idle_freq);
+	if (err >= 0)
+		err = count;
+
+	return err;
+}
+
static ssize_t show_sched_mostly_idle_nr_run(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -253,6 +289,8 @@ static ssize_t __ref store_sched_mostly_idle_nr_run(struct device *dev,
return err;
}
+static DEVICE_ATTR(sched_mostly_idle_freq, 0664, show_sched_mostly_idle_freq,
+ store_sched_mostly_idle_freq);
static DEVICE_ATTR(sched_mostly_idle_load, 0664, show_sched_mostly_idle_load,
store_sched_mostly_idle_load);
static DEVICE_ATTR(sched_mostly_idle_nr_run, 0664,
@@ -261,6 +299,7 @@ static DEVICE_ATTR(sched_mostly_idle_nr_run, 0664,
static struct attribute *hmp_sched_cpu_attrs[] = {
&dev_attr_sched_mostly_idle_load.attr,
&dev_attr_sched_mostly_idle_nr_run.attr,
+ &dev_attr_sched_mostly_idle_freq.attr,
NULL
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 701611dad0fd..fd4d79416cdc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2335,6 +2335,9 @@ extern int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct);
extern int sched_get_cpu_mostly_idle_load(int cpu);
extern int sched_set_cpu_mostly_idle_nr_run(int cpu, int nr_run);
extern int sched_get_cpu_mostly_idle_nr_run(int cpu);
+extern int
+sched_set_cpu_mostly_idle_freq(int cpu, unsigned int mostly_idle_freq);
+extern unsigned int sched_get_cpu_mostly_idle_freq(int cpu);
#else
static inline int sched_set_boost(int enable)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7324abaeb3f..d43925c6e560 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9134,6 +9134,7 @@ void __init sched_init(void)
rq->hmp_flags = 0;
rq->mostly_idle_load = pct_to_real(20);
rq->mostly_idle_nr_run = 3;
+ rq->mostly_idle_freq = 0;
#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->old_busy_time = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 087f5e072e35..1238a6825e7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,6 +2703,25 @@ int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct)
return 0;
}
+/*
+ * Set the mostly_idle_freq threshold of @cpu's runqueue.
+ *
+ * Values above the runqueue's max_possible_freq are rejected with -EINVAL
+ * and leave the stored threshold untouched; otherwise the value is stored
+ * and 0 is returned.
+ */
+int sched_set_cpu_mostly_idle_freq(int cpu, unsigned int mostly_idle_freq)
+{
+	struct rq *cpu_runq = cpu_rq(cpu);
+
+	if (mostly_idle_freq <= cpu_runq->max_possible_freq) {
+		cpu_runq->mostly_idle_freq = mostly_idle_freq;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Return the mostly_idle_freq threshold stored in @cpu's runqueue. */
+unsigned int sched_get_cpu_mostly_idle_freq(int cpu)
+{
+	return cpu_rq(cpu)->mostly_idle_freq;
+}
+
int sched_get_cpu_mostly_idle_load(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -3109,6 +3128,42 @@ static int skip_cpu(struct task_struct *p, int cpu, int reason)
return skip;
}
+/*
+ * Choose one cpu to pack onto, but only while best_cpu's current frequency
+ * is below its mostly_idle_freq threshold; otherwise best_cpu is returned
+ * unchanged.
+ */
+static int select_packing_target(struct task_struct *p, int best_cpu)
+{
+	struct rq *rq = cpu_rq(best_cpu);
+	struct cpumask candidates;
+	int cheapest_cpu = best_cpu;
+	int cheapest_cost = INT_MAX;
+	int cpu;
+
+	/*
+	 * Don't pack once the threshold has been reached, nor when max_freq is
+	 * itself at or below the threshold (current freq is low because of
+	 * throttling).
+	 */
+	if (rq->cur_freq >= rq->mostly_idle_freq ||
+	    rq->max_freq <= rq->mostly_idle_freq)
+		return best_cpu;
+
+	/* Candidates: cpus allowed for p, online, and in rq's freq domain */
+	cpumask_and(&candidates, tsk_cpus_allowed(p), cpu_online_mask);
+	cpumask_and(&candidates, &candidates, &rq->freq_domain_cpumask);
+
+	/* Only a strictly lower cost replaces the pick, so the first wins ties */
+	for_each_cpu(cpu, &candidates) {
+		int cost = power_cost(p, cpu);
+
+		if (cost >= cheapest_cost)
+			continue;
+
+		cheapest_cost = cost;
+		cheapest_cpu = cpu;
+	}
+
+	return cheapest_cpu;
+}
+
+
/* return cheapest cpu that can fit this task */
static int select_best_cpu(struct task_struct *p, int target, int reason)
{
@@ -3220,6 +3275,9 @@ done:
best_cpu = fallback_idle_cpu;
}
+ if (cpu_rq(best_cpu)->mostly_idle_freq)
+ best_cpu = select_packing_target(p, best_cpu);
+
return best_cpu;
}
@@ -9286,6 +9344,10 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
struct sched_domain *sd;
int i;
+ if (rq->mostly_idle_freq && rq->cur_freq < rq->mostly_idle_freq
+ && rq->max_freq > rq->mostly_idle_freq)
+ return 0;
+
if (rq->nr_running >= 2 && (rq->nr_running - rq->nr_small_tasks >= 2 ||
rq->nr_running > rq->mostly_idle_nr_run ||
cpu_load(cpu) > rq->mostly_idle_load)) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d0193932354f..fcdf4063ac11 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -660,6 +660,7 @@ struct rq {
u64 window_start;
u32 mostly_idle_load;
int mostly_idle_nr_run;
+ int mostly_idle_freq;
#ifdef CONFIG_SCHED_FREQ_INPUT
unsigned int old_busy_time;