summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--Documentation/scheduler/sched-hmp.txt29
-rw-r--r--drivers/base/cpu.c95
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/linux/sched/sysctl.h2
-rw-r--r--kernel/sched/core.c10
-rw-r--r--kernel/sched/debug.c3
-rw-r--r--kernel/sched/fair.c80
-rw-r--r--kernel/sched/sched.h8
-rw-r--r--kernel/sysctl.c14
9 files changed, 182 insertions(+), 65 deletions(-)
diff --git a/Documentation/scheduler/sched-hmp.txt b/Documentation/scheduler/sched-hmp.txt
index 63ef1f558ce0..d3321cb208e9 100644
--- a/Documentation/scheduler/sched-hmp.txt
+++ b/Documentation/scheduler/sched-hmp.txt
@@ -564,23 +564,22 @@ both tasks and CPUs to aid in the placement of tasks.
The "mostly_idle" classification applies to CPUs. This
classification attempts to answer the following question: if a task
- is put on this CPU, is it likely to be able to run soon? One
- possible way to answer this question would be to just check whether
- the CPU is idle or not. That may be too conservative however. The
- CPU may be currently executing a very small task and could become
- idle soon. Since the scheduler is tracking the demand of each task
- it can make an educated guess as to whether a CPU will become idle
- in the near future.
+ is put on this CPU, is it likely to be able to run with low contention for
+ bandwidth? One possible way to answer this question would be to just check
+ whether the CPU is idle or not. That may be too conservative however. The CPU
+ may be currently executing a very small task and could become idle soon. Since
+ the scheduler is tracking the demand of each task it can make an educated
+ guess as to whether a CPU will become idle in the near future.
There are two tunable parameters which are used to determine whether
a CPU is mostly idle:
- /proc/sys/kernel/sched_mostly_idle_nr_run
- /proc/sys/kernel/sched_mostly_idle_load
+ /sys/devices/system/cpu/cpuX/sched_mostly_idle_nr_run
+ /sys/devices/system/cpu/cpuX/sched_mostly_idle_load
- If a CPU does not have more than sched_mostly_idle_nr_run runnable
- tasks and is not more than sched_mostly_idle_load percent busy, it
- is considered mostly idle.
+ Note that these tunables are per-cpu. If a CPU does not have more than
+ sched_mostly_idle_nr_run runnable tasks and is not more than
+ sched_mostly_idle_load percent busy, it is considered mostly idle.
- spill threshold
@@ -1042,9 +1041,9 @@ IRQ_UPDATE
*** 7.1 sched_mostly_idle_nr_run
-Appears at: /proc/sys/kernel/sched_mostly_idle_nr_run
+Appears at: /sys/devices/system/cpu/cpuX/sched_mostly_idle_nr_run
-Default value: 4
+Default value: 3
If a CPU has this many runnable tasks (or less), it is considered
"mostly idle." A mostly idle CPU is a preferred destination for a
@@ -1054,7 +1053,7 @@ than sched_mostly_idle_load percent busy.
*** 7.2 sched_mostly_idle_load
-Appears at: /proc/sys/kernel/sched_mostly_idle_load
+Appears at: /sys/devices/system/cpu/cpuX/sched_mostly_idle_load
Default value: 20
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 91bbb1959d8d..763fd00c697b 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -180,10 +180,102 @@ static struct attribute_group crash_note_cpu_attr_group = {
};
#endif
+#ifdef CONFIG_SCHED_HMP
+static ssize_t show_sched_mostly_idle_load(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
+ ssize_t rc;
+ int cpunum;
+ int mostly_idle_pct;
+
+ cpunum = cpu->dev.id;
+
+ mostly_idle_pct = sched_get_cpu_mostly_idle_load(cpunum);
+
+ rc = snprintf(buf, PAGE_SIZE-2, "%d\n", mostly_idle_pct);
+
+ return rc;
+}
+
+static ssize_t __ref store_sched_mostly_idle_load(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
+ int cpuid = cpu->dev.id;
+ int mostly_idle_load, err;
+
+ err = kstrtoint(strstrip((char *)buf), 0, &mostly_idle_load);
+ if (err)
+ return err;
+
+ err = sched_set_cpu_mostly_idle_load(cpuid, mostly_idle_load);
+ if (err >= 0)
+ err = count;
+
+ return err;
+}
+
+static ssize_t show_sched_mostly_idle_nr_run(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
+ ssize_t rc;
+ int cpunum;
+ int mostly_idle_nr_run;
+
+ cpunum = cpu->dev.id;
+
+ mostly_idle_nr_run = sched_get_cpu_mostly_idle_nr_run(cpunum);
+
+ rc = snprintf(buf, PAGE_SIZE-2, "%d\n", mostly_idle_nr_run);
+
+ return rc;
+}
+
+static ssize_t __ref store_sched_mostly_idle_nr_run(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpu *cpu = container_of(dev, struct cpu, dev);
+ int cpuid = cpu->dev.id;
+ int mostly_idle_nr_run, err;
+
+ err = kstrtoint(strstrip((char *)buf), 0, &mostly_idle_nr_run);
+ if (err)
+ return err;
+
+ err = sched_set_cpu_mostly_idle_nr_run(cpuid, mostly_idle_nr_run);
+ if (err >= 0)
+ err = count;
+
+ return err;
+}
+
+static DEVICE_ATTR(sched_mostly_idle_load, 0664, show_sched_mostly_idle_load,
+ store_sched_mostly_idle_load);
+static DEVICE_ATTR(sched_mostly_idle_nr_run, 0664,
+ show_sched_mostly_idle_nr_run, store_sched_mostly_idle_nr_run);
+
+static struct attribute *hmp_sched_cpu_attrs[] = {
+ &dev_attr_sched_mostly_idle_load.attr,
+ &dev_attr_sched_mostly_idle_nr_run.attr,
+ NULL
+};
+
+static struct attribute_group sched_hmp_cpu_attr_group = {
+ .attrs = hmp_sched_cpu_attrs,
+};
+
+#endif /* CONFIG_SCHED_HMP */
static const struct attribute_group *common_cpu_attr_groups[] = {
#ifdef CONFIG_KEXEC
&crash_note_cpu_attr_group,
#endif
+#ifdef CONFIG_SCHED_HMP
+ &sched_hmp_cpu_attr_group,
+#endif
NULL
};
@@ -191,6 +283,9 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
#ifdef CONFIG_KEXEC
&crash_note_cpu_attr_group,
#endif
+#ifdef CONFIG_SCHED_HMP
+ &sched_hmp_cpu_attr_group,
+#endif
NULL
};
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 740696b0a57d..701611dad0fd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2327,9 +2327,15 @@ sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
#endif
#ifdef CONFIG_SCHED_HMP
+
extern int sched_set_boost(int enable);
extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct);
extern u32 sched_get_init_task_load(struct task_struct *p);
+extern int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct);
+extern int sched_get_cpu_mostly_idle_load(int cpu);
+extern int sched_set_cpu_mostly_idle_nr_run(int cpu, int nr_run);
+extern int sched_get_cpu_mostly_idle_nr_run(int cpu);
+
#else
static inline int sched_set_boost(int enable)
{
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 8cadba92aee0..bd2abda891cd 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -59,9 +59,7 @@ extern int sysctl_sched_freq_dec_notify;
#ifdef CONFIG_SCHED_HMP
extern unsigned int sysctl_sched_spill_nr_run;
-extern unsigned int sysctl_sched_mostly_idle_nr_run;
extern unsigned int sysctl_sched_spill_load_pct;
-extern unsigned int sysctl_sched_mostly_idle_load_pct;
extern unsigned int sysctl_sched_small_task_pct;
extern unsigned int sysctl_sched_upmigrate_pct;
extern unsigned int sysctl_sched_downmigrate_pct;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7ec7b5442f41..07aac49174dd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2045,6 +2045,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
u64 start_ts = sched_clock();
int reason = WINDOW_CHANGE;
unsigned int old = 0, new = 0;
+ unsigned int old_window_size = sched_ravg_window;
disable_window_stats();
@@ -2067,8 +2068,13 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- if (window_start)
+ if (window_start) {
+ u32 mostly_idle_load = rq->mostly_idle_load;
+
rq->window_start = window_start;
+ rq->mostly_idle_load = div64_u64((u64)mostly_idle_load *
+ (u64)sched_ravg_window, (u64)old_window_size);
+ }
#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
#endif
@@ -9126,6 +9132,8 @@ void __init sched_init(void)
rq->window_start = 0;
rq->nr_small_tasks = rq->nr_big_tasks = 0;
rq->hmp_flags = 0;
+ rq->mostly_idle_load = pct_to_real(20);
+ rq->mostly_idle_nr_run = 3;
#ifdef CONFIG_SCHED_FREQ_INPUT
rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
rq->old_busy_time = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f9bb03279152..752a0de12871 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -310,6 +310,8 @@ do { \
P(cpu_capacity);
#endif
#ifdef CONFIG_SCHED_HMP
+ P(mostly_idle_load);
+ P(mostly_idle_nr_run);
P(load_scale_factor);
P(capacity);
P(max_possible_capacity);
@@ -402,7 +404,6 @@ static void sched_debug_header(struct seq_file *m)
P(sysctl_sched_child_runs_first);
P(sysctl_sched_features);
#ifdef CONFIG_SCHED_HMP
- P(sched_mostly_idle_load);
P(sched_small_task);
P(sched_upmigrate);
P(sched_downmigrate);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 833225a9fe57..dcc3f57668e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2570,14 +2570,6 @@ unsigned int __read_mostly sched_enable_hmp = 0;
unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
/*
- * A cpu is considered practically idle, if:
- *
- * rq->nr_running <= sysctl_sched_mostly_idle_nr_run &&
- * rq->cumulative_runnable_avg <= sched_mostly_idle_load
- */
-unsigned int __read_mostly sysctl_sched_mostly_idle_nr_run = 3;
-
-/*
* Control whether or not individual CPU power consumption is used to
* guide task placement.
*/
@@ -2591,16 +2583,6 @@ unsigned int __read_mostly sched_enable_power_aware = 0;
unsigned int __read_mostly sysctl_sched_powerband_limit_pct = 20;
/*
- * Conversion of *_pct to absolute form is based on max_task_load().
- *
- * For example:
- * sched_mostly_idle_load =
- * (sysctl_sched_mostly_idle_load_pct * max_task_load()) / 100;
- */
-unsigned int __read_mostly sched_mostly_idle_load;
-unsigned int __read_mostly sysctl_sched_mostly_idle_load_pct = 20;
-
-/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
* load higher than this level, tasks becomes eligible for inter
@@ -2666,17 +2648,11 @@ static inline int available_cpu_capacity(int cpu)
return rq->capacity;
}
-#define pct_to_real(tunable) \
- (div64_u64((u64)tunable * (u64)max_task_load(), 100))
-
void set_hmp_defaults(void)
{
sched_spill_load =
pct_to_real(sysctl_sched_spill_load_pct);
- sched_mostly_idle_load =
- pct_to_real(sysctl_sched_mostly_idle_load_pct);
-
sched_small_task =
pct_to_real(sysctl_sched_small_task_pct);
@@ -2715,6 +2691,44 @@ int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
return 0;
}
+int sched_set_cpu_mostly_idle_load(int cpu, int mostly_idle_pct)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (mostly_idle_pct < 0 || mostly_idle_pct > 100)
+ return -EINVAL;
+
+ rq->mostly_idle_load = pct_to_real(mostly_idle_pct);
+
+ return 0;
+}
+
+int sched_get_cpu_mostly_idle_load(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ int mostly_idle_pct;
+
+ mostly_idle_pct = real_to_pct(rq->mostly_idle_load);
+
+ return mostly_idle_pct;
+}
+
+int sched_set_cpu_mostly_idle_nr_run(int cpu, int nr_run)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->mostly_idle_nr_run = nr_run;
+
+ return 0;
+}
+
+int sched_get_cpu_mostly_idle_nr_run(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return rq->mostly_idle_nr_run;
+}
+
/*
* 'load' is in reference to "best cpu" at its best frequency.
* Scale that in reference to a given cpu, accounting for how bad it is
@@ -2776,9 +2790,12 @@ spill_threshold_crossed(struct task_struct *p, struct rq *rq, int cpu)
int mostly_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
+ int mostly_idle;
+
+ mostly_idle = (cpu_load(cpu) <= rq->mostly_idle_load
+ && rq->nr_running <= rq->mostly_idle_nr_run);
- return (cpu_load(cpu) <= sched_mostly_idle_load
- && rq->nr_running <= sysctl_sched_mostly_idle_nr_run);
+ return mostly_idle;
}
static int boost_refcount;
@@ -3344,10 +3361,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
return ret;
if ((sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) ||
- (sysctl_sched_mostly_idle_load_pct >
- sysctl_sched_spill_load_pct) || *data > 100) {
- *data = old_val;
- return -EINVAL;
+ *data > 100) {
+ *data = old_val;
+ return -EINVAL;
}
/*
@@ -9254,8 +9270,8 @@ static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type)
int i;
if (rq->nr_running >= 2 && (rq->nr_running - rq->nr_small_tasks >= 2 ||
- rq->nr_running > sysctl_sched_mostly_idle_nr_run ||
- cpu_load(cpu) > sched_mostly_idle_load)) {
+ rq->nr_running > rq->mostly_idle_nr_run ||
+ cpu_load(cpu) > rq->mostly_idle_load)) {
if (rq->capacity == max_capacity)
return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4c7dde3c892..d0193932354f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -658,6 +658,8 @@ struct rq {
int capacity;
int max_possible_capacity;
u64 window_start;
+ u32 mostly_idle_load;
+ int mostly_idle_nr_run;
#ifdef CONFIG_SCHED_FREQ_INPUT
unsigned int old_busy_time;
@@ -1013,6 +1015,12 @@ dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
BUG_ON((s64)rq->cumulative_runnable_avg < 0);
}
+#define pct_to_real(tunable) \
+ (div64_u64((u64)tunable * (u64)max_task_load(), 100))
+
+#define real_to_pct(tunable) \
+ (div64_u64((u64)tunable * (u64)100, (u64)max_task_load()))
+
#else /* CONFIG_SCHED_HMP */
static inline int pct_task_load(struct task_struct *p) { return 0; }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e7b03e816bd7..2281f24194db 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -366,20 +366,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_hmp_proc_update_handler,
},
{
- .procname = "sched_mostly_idle_load",
- .data = &sysctl_sched_mostly_idle_load_pct,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_hmp_proc_update_handler,
- },
- {
- .procname = "sched_mostly_idle_nr_run",
- .data = &sysctl_sched_mostly_idle_nr_run,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "sched_spill_load",
.data = &sysctl_sched_spill_load_pct,
.maxlen = sizeof(unsigned int),