summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched/sysctl.h1
-rw-r--r--kernel/sched/fair.c55
-rw-r--r--kernel/sysctl.c7
3 files changed, 50 insertions, 13 deletions
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4479e48c7712..7d021393b0da 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -39,6 +39,7 @@ extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
+extern unsigned int sysctl_sched_cstate_aware;
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b12b14756ec..14c8527d7bcb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -51,6 +51,7 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_cstate_aware = 1;
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -5468,15 +5469,20 @@ static int select_idle_sibling(struct task_struct *p, int target)
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);
+ int best_idle = -1;
+ int best_idle_cstate = -1;
+ int best_idle_capacity = INT_MAX;
- if (idle_cpu(target))
- return target;
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target))
+ return target;
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+ }
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
@@ -5489,18 +5495,41 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ struct rq *rq = cpu_rq(i);
+ int idle_idx = idle_get_state_idx(rq);
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target))
+ return target;
+
+ if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+ best_idle = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- target = cpumask_first_and(sched_group_cpus(sg),
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+ if (best_idle > 0)
+ target = best_idle;
+
done:
return target;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 98c0f6ef01f1..cc0bf3ecdbda 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -304,6 +304,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_granularity_ns,
},
{
+ .procname = "sched_cstate_aware",
+ .data = &sysctl_sched_cstate_aware,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
.maxlen = sizeof(unsigned int),