-rw-r--r--  include/linux/sched/sysctl.h |   1
-rw-r--r--  kernel/sched/core.c          | 215
-rw-r--r--  kernel/sched/fair.c          |   6
-rw-r--r--  kernel/sched/sched.h         |  35
-rw-r--r--  kernel/sysctl.c              |   7
5 files changed, 211 insertions(+), 53 deletions(-)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index fb51198716fa..411812931a26 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -45,6 +45,7 @@ extern unsigned int sysctl_sched_window_stats_policy;
extern unsigned int sysctl_sched_account_wait_time;
extern unsigned int sysctl_sched_ravg_hist_size;
extern unsigned int sysctl_sched_freq_legacy_mode;
+extern unsigned int sysctl_sched_gov_response_time;
#if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
extern unsigned int sysctl_sched_init_task_load_pct;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e93e65cd074..a23148908738 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1211,6 +1211,12 @@ __read_mostly int sysctl_sched_freq_dec_notify_slack_pct = INT_MAX;
static __read_mostly unsigned int sched_io_is_busy;
/*
+ * Force-issue a notification to the governor if we have waited long enough
+ * since sending the last notification and have not seen any freq change.
+ */
+__read_mostly unsigned int sysctl_sched_gov_response_time = 10000000;
+
+/*
* Maximum possible frequency across all cpus. Task demand and cpu
* capacity (cpu_power) metrics are scaled in reference to it.
*/
@@ -1236,33 +1242,119 @@ static u64 sched_clock_at_init_jiffy;
#define PREV_WINDOW_CONTRIB 2
#define DONT_ACCOUNT 4
-/* Returns how undercommitted a CPU is given its current frequency and
- * task load (as measured in the previous window). Returns this value
- * as a percentage of the CPU's maximum frequency. A negative value
- * means the CPU is overcommitted at its current frequency.
- */
-int rq_freq_margin(struct rq *rq)
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+/* Is freq_required within the notification slack band around cur_freq? */
+static inline int
+nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
{
- unsigned int freq_required;
int margin;
- u64 demand;
+
+ margin = cur_freq - freq_required;
+ margin *= 100;
+ margin /= (int)cur_freq;
+
+ /*
+ * + margin implies cur_freq > req_freq
+ * - margin implies cur_freq < req_freq
+ */
+
+ return (margin > sysctl_sched_freq_inc_notify_slack_pct &&
+ margin < sysctl_sched_freq_dec_notify_slack_pct);
+}
+
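As a rough illustration of the slack-band test above (the slack percentages here are hypothetical, not the kernel defaults): the margin is positive when cur_freq exceeds freq_required, negative otherwise, and no notification is needed while it stays inside the band.

    /* Standalone sketch of the nearly_same_freq() test; slack values assumed. */
    #include <stdio.h>

    static const int inc_slack_pct = 10;    /* assumed, not a kernel default */
    static const int dec_slack_pct = 25;    /* assumed, not a kernel default */

    static int nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
    {
            int margin = ((int)cur_freq - (int)freq_required) * 100 / (int)cur_freq;

            /* true (no notification needed) when margin lies inside the slack band */
            return margin > inc_slack_pct && margin < dec_slack_pct;
    }

    int main(void)
    {
            /* 1,000,000 kHz current vs 850,000 kHz required: margin = 15%, inside band */
            printf("%d\n", nearly_same_freq(1000000, 850000));
            /* 1,000,000 kHz current vs 1,200,000 kHz required: margin = -20%, outside band */
            printf("%d\n", nearly_same_freq(1000000, 1200000));
            return 0;
    }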
+/* Is governor late in responding? */
+static inline int freq_request_timeout(struct rq *rq)
+{
+ u64 now = sched_clock();
+
+ return ((now - rq->freq_requested_ts) > sysctl_sched_gov_response_time);
+}
+
+/* Should the scheduler alert the governor to change frequency? */
+static int send_notification(struct rq *rq, unsigned int freq_required)
+{
+ int cpu, rc = 0;
+ unsigned int freq_requested = rq->freq_requested;
+ struct rq *domain_rq;
+ unsigned long flags;
+
+ if (freq_required > rq->max_freq)
+ freq_required = rq->max_freq;
+ else if (freq_required < rq->min_freq)
+ freq_required = rq->min_freq;
+
+ if (nearly_same_freq(rq->cur_freq, freq_required))
+ return 0;
+
+ if (freq_requested && nearly_same_freq(freq_requested, freq_required) &&
+ !freq_request_timeout(rq))
+ return 0;
+
+ cpu = cpumask_first(&rq->freq_domain_cpumask);
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ domain_rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&domain_rq->lock, flags);
+ freq_requested = domain_rq->freq_requested;
+ if (!freq_requested ||
+ !nearly_same_freq(freq_requested, freq_required) ||
+ freq_request_timeout(domain_rq)) {
+
+ u64 now = sched_clock();
+
+ /*
+ * Cache the newly requested frequency in the rq of every cpu in
+ * the same freq domain. This avoids frequently grabbing
+ * domain_rq->lock.
+ */
+ for_each_cpu(cpu, &rq->freq_domain_cpumask) {
+ cpu_rq(cpu)->freq_requested = freq_required;
+ cpu_rq(cpu)->freq_requested_ts = now;
+ }
+ rc = 1;
+ }
+ raw_spin_unlock_irqrestore(&domain_rq->lock, flags);
+
+ return rc;
+}
+
+/* Alert governor if there is a need to change frequency */
+void check_for_freq_change(struct rq *rq)
+{
+ unsigned int freq_required;
+ int i, max_demand_cpu = 0;
+ u64 max_demand = 0;
if (!sched_enable_hmp)
- return INT_MAX;
+ return;
+
+ /* Find the max demand across cpus in the same frequency domain */
+ for_each_cpu(i, &rq->freq_domain_cpumask) {
+ if (cpu_rq(i)->prev_runnable_sum > max_demand) {
+ max_demand = cpu_rq(i)->prev_runnable_sum;
+ max_demand_cpu = i;
+ }
+ }
- demand = scale_load_to_cpu(rq->prev_runnable_sum, rq->cpu);
- demand *= 128;
- demand = div64_u64(demand, max_task_load());
+ max_demand = scale_load_to_cpu(max_demand, rq->cpu);
+ max_demand *= 128;
+ max_demand = div64_u64(max_demand, max_task_load());
- freq_required = demand * rq->max_possible_freq;
+ freq_required = max_demand * rq->max_possible_freq;
freq_required /= 128;
- margin = rq->cur_freq - freq_required;
- margin *= 100;
- margin /= (int)rq->max_possible_freq;
- return margin;
+ if (!send_notification(rq, freq_required))
+ return;
+
+ atomic_notifier_call_chain(
+ &load_alert_notifier_head, 0,
+ (void *)(long)max_demand_cpu);
}
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
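A worked sketch of the freq_required arithmetic above, with assumed numbers (the real max_task_load() depends on the window size and load scaling): demand is first normalized so that a fully loaded window maps to 128, then converted into a fraction of the highest frequency available anywhere in the system.

    /* Sketch of the freq_required arithmetic in check_for_freq_change(),
     * using assumed demand and frequency values. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t max_task_load = 10000000;      /* assumed: one full 10 ms window, in ns */
            uint64_t max_demand = 5000000;          /* assumed: busiest CPU ran 5 ms of work */
            uint64_t max_possible_freq = 2000000;   /* assumed: 2 GHz, in kHz */
            uint64_t freq_required;

            /* normalize demand to a 0..128 scale of the maximum task load */
            max_demand = max_demand * 128 / max_task_load;          /* 64 */
            /* convert the normalized demand into a target frequency */
            freq_required = max_demand * max_possible_freq / 128;   /* 1,000,000 kHz */

            printf("freq_required = %llu kHz\n", (unsigned long long)freq_required);
            return 0;
    }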
/*
* Called when new window is starting for a task, to record cpu usage over
* recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1841,6 +1933,7 @@ unsigned long sched_get_busy(int cpu)
{
unsigned long flags;
struct rq *rq = cpu_rq(cpu);
+ u64 load;
/*
* This function could be called in timer context, and the
@@ -1851,8 +1944,17 @@ unsigned long sched_get_busy(int cpu)
update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
raw_spin_unlock_irqrestore(&rq->lock, flags);
- return div64_u64(scale_load_to_cpu(rq->prev_runnable_sum, cpu),
- NSEC_PER_USEC);
+ /*
+ * Scale load in reference to rq->max_possible_freq.
+ *
+ * Note that scale_load_to_cpu() scales load in reference to
+ * rq->max_freq
+ */
+ load = scale_load_to_cpu(rq->prev_runnable_sum, cpu);
+ load = div64_u64(load * (u64)rq->max_freq, (u64)rq->max_possible_freq);
+ load = div64_u64(load, NSEC_PER_USEC);
+
+ return load;
}
void sched_set_io_is_busy(int val)
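For illustration, the busy-time scaling in sched_get_busy() with assumed values: the window load (already scaled to this CPU's max_freq by scale_load_to_cpu()) is rescaled to the system-wide max_possible_freq and then reported in microseconds.

    /* Sketch of the busy-time scaling in sched_get_busy(), with assumed values. */
    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_USEC 1000ULL

    int main(void)
    {
            uint64_t load = 8000000;                /* ns, already scaled to rq->max_freq (assumed) */
            uint64_t max_freq = 1500000;            /* kHz, this CPU's max (assumed) */
            uint64_t max_possible_freq = 2000000;   /* kHz, system-wide max (assumed) */

            /* rescale from the max_freq reference to the max_possible_freq reference */
            load = load * max_freq / max_possible_freq;
            /* report in microseconds, as the governor expects */
            load /= NSEC_PER_USEC;

            printf("busy time: %llu us\n", (unsigned long long)load);   /* 6000 us */
            return 0;
    }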
@@ -2081,6 +2183,23 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
cpu_rq(cpu)->cur_freq = new_freq;
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ /* clear freq request for CPUs in the same freq domain */
+ if (!rq->freq_requested)
+ return 0;
+
+ /* The first CPU (and its rq lock) in a freq domain is used to
+ * serialize all freq change tests and notifications for CPUs
+ * in that domain. */
+ cpu = cpumask_first(&rq->freq_domain_cpumask);
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_cpu(cpu, &rq->freq_domain_cpumask)
+ cpu_rq(cpu)->freq_requested = 0;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
return 0;
}
@@ -2122,7 +2241,6 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
struct rq *src_rq = task_rq(p);
struct rq *dest_rq = cpu_rq(new_cpu);
u64 wallclock;
- int freq_notify = 0;
if (p->state == TASK_WAKING)
double_rq_lock(src_rq, dest_rq);
@@ -2130,7 +2248,6 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
if (sched_disable_window_stats)
goto done;
- freq_notify = 1;
wallclock = sched_clock();
update_task_ravg(task_rq(p)->curr, task_rq(p),
@@ -2185,29 +2302,25 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
done:
if (p->state == TASK_WAKING)
double_rq_unlock(src_rq, dest_rq);
+}
- if (!freq_notify && cpumask_test_cpu(new_cpu,
- &src_rq->freq_domain_cpumask))
- return;
-
- /* Evaluate possible frequency notifications for
- * source and destination CPUs in different frequency
- * domains. */
- if (rq_freq_margin(dest_rq) <
- sysctl_sched_freq_inc_notify_slack_pct)
- atomic_notifier_call_chain(
- &load_alert_notifier_head, 0,
- (void *)(long)new_cpu);
+/* A long sleep is defined as sleeping at least one full window prior
+ * to the current window start. */
+static inline int is_long_sleep(struct rq *rq, struct task_struct *p)
+{
+ if (p->ravg.mark_start > rq->window_start)
+ return 0;
- if (rq_freq_margin(src_rq) >
- sysctl_sched_freq_dec_notify_slack_pct)
- atomic_notifier_call_chain(
- &load_alert_notifier_head, 0,
- (void *)(long)task_cpu(p));
+ return ((rq->window_start - p->ravg.mark_start) > sched_ravg_window);
}
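A minimal sketch of the long-sleep test, assuming a 10 ms ravg window and nanosecond timestamps (both are assumptions for illustration, not values taken from this hunk):

    /* Sketch of the is_long_sleep() test with assumed nanosecond values. */
    #include <stdio.h>
    #include <stdint.h>

    #define SCHED_RAVG_WINDOW 10000000ULL   /* assumed 10 ms window, in ns */

    static int is_long_sleep(uint64_t window_start, uint64_t mark_start)
    {
            if (mark_start > window_start)
                    return 0;       /* task was already active in the current window */
            return (window_start - mark_start) > SCHED_RAVG_WINDOW;
    }

    int main(void)
    {
            uint64_t window_start = 500000000ULL;                           /* assumed */

            printf("%d\n", is_long_sleep(window_start, 495000000ULL));      /* 0: slept less than one window */
            printf("%d\n", is_long_sleep(window_start, 450000000ULL));      /* 1: slept several windows */
            return 0;
    }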
#else /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+static inline int is_long_sleep(struct rq *rq, struct task_struct *p)
+{
+ return 0;
+}
+
static inline void
update_task_ravg(struct task_struct *p, struct rq *rq,
int event, u64 wallclock, u64 irqtime)
@@ -3141,6 +3254,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
int cpu, src_cpu, success = 0;
int notify = 0;
struct migration_notify_data mnd;
+ int long_sleep = 0;
#ifdef CONFIG_SMP
struct rq *rq;
u64 wallclock;
@@ -3208,6 +3322,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock(&rq->lock);
wallclock = sched_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+ long_sleep = is_long_sleep(rq, p);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
raw_spin_unlock(&rq->lock);
@@ -3224,14 +3339,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (src_cpu != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
- } else {
-#ifdef CONFIG_SCHED_FREQ_INPUT
- if (rq_freq_margin(cpu_rq(cpu)) <
- sysctl_sched_freq_inc_notify_slack_pct)
- atomic_notifier_call_chain(
- &load_alert_notifier_head, 0,
- (void *)(long)cpu);
-#endif
}
#endif /* CONFIG_SMP */
@@ -3263,6 +3370,11 @@ out:
atomic_notifier_call_chain(&migration_notifier_head,
0, (void *)&mnd);
+ if (long_sleep || !same_freq_domain(src_cpu, cpu))
+ check_for_freq_change(cpu_rq(cpu));
+ if (!long_sleep && !same_freq_domain(src_cpu, cpu))
+ check_for_freq_change(cpu_rq(src_cpu));
+
return success;
}
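Purely for readability, the two conditions added above can be enumerated; cross_domain below stands for !same_freq_domain(src_cpu, cpu):

    /* Enumerate which runqueue gets a frequency check on wakeup,
     * mirroring the two conditions in try_to_wake_up() above. */
    #include <stdio.h>

    int main(void)
    {
            for (int long_sleep = 0; long_sleep <= 1; long_sleep++) {
                    for (int cross_domain = 0; cross_domain <= 1; cross_domain++) {
                            int check_dest = long_sleep || cross_domain;
                            int check_src  = !long_sleep && cross_domain;

                            printf("long_sleep=%d cross_domain=%d -> dest=%d src=%d\n",
                                   long_sleep, cross_domain, check_dest, check_src);
                    }
            }
            return 0;
    }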
@@ -3662,13 +3774,6 @@ void wake_up_new_task(struct task_struct *p)
rq = __task_rq_lock(p);
mark_task_starting(p);
-#ifdef CONFIG_SCHED_FREQ_INPUT
- if (rq_freq_margin(task_rq(p)) <
- sysctl_sched_freq_inc_notify_slack_pct)
- atomic_notifier_call_chain(
- &load_alert_notifier_head, 0,
- (void *)(long)task_cpu(p));
-#endif
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
@@ -3685,6 +3790,8 @@ void wake_up_new_task(struct task_struct *p)
}
#endif
task_rq_unlock(rq, p, &flags);
+ if (init_task_load)
+ check_for_freq_change(rq);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -8815,6 +8922,8 @@ void __init sched_init(void)
rq->capacity = 1024;
rq->load_scale_factor = 1024;
rq->window_start = 0;
+ rq->freq_requested = 0;
+ rq->freq_requested_ts = 0;
#endif
rq->max_idle_balance_cost = sysctl_sched_migration_cost;
rq->cstate = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6dc05ff2c6be..5e6f9a6047e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8464,6 +8464,12 @@ more_balance:
per_cpu(dbs_boost_load_moved, this_cpu) = 0;
}
+
+ /* Assumes one 'busiest' cpu that we pulled tasks from */
+ if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+ check_for_freq_change(this_rq);
+ check_for_freq_change(busiest);
+ }
}
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5a804e108e32..decbbd2c6f48 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -651,6 +651,8 @@ struct rq {
*/
unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
struct cpumask freq_domain_cpumask;
+ unsigned int freq_requested;
+ u64 freq_requested_ts;
u64 cumulative_runnable_avg;
int efficiency; /* Differentiate cpus with different IPC capability */
@@ -1036,6 +1038,39 @@ static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
#endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
+#ifdef CONFIG_SCHED_FREQ_INPUT
+extern void check_for_freq_change(struct rq *rq);
+
+/* Are the frequencies of two cpus synchronized with each other? */
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ struct rq *rq = cpu_rq(src_cpu);
+
+ if (src_cpu == dst_cpu)
+ return 1;
+
+ return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
+}
+
+#ifdef CONFIG_SCHED_HMP
+#define init_task_load sysctl_sched_init_task_load_pct
+#else
+#define init_task_load 0
+#endif
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+#define init_task_load 0
+
+static inline void check_for_freq_change(struct rq *rq) { }
+
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+ return 1;
+}
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
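A userspace sketch of the same_freq_domain() membership test, assuming a hypothetical two-cluster topology (CPUs 0-3 and 4-7 in separate frequency domains):

    /* Sketch only: two assumed frequency domains, CPUs 0-3 and 4-7. */
    #include <stdio.h>

    static unsigned long freq_domain_mask(int cpu)
    {
            return (cpu < 4) ? 0x0fUL : 0xf0UL;     /* assumed topology */
    }

    static int same_freq_domain(int src_cpu, int dst_cpu)
    {
            if (src_cpu == dst_cpu)
                    return 1;
            return !!(freq_domain_mask(src_cpu) & (1UL << dst_cpu));
    }

    int main(void)
    {
            printf("%d\n", same_freq_domain(1, 3));         /* 1: same cluster */
            printf("%d\n", same_freq_domain(1, 5));         /* 0: crosses domains */
            return 0;
    }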
#ifdef CONFIG_SCHED_HMP
#define BOOST_KICK 0
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7fbe9b146343..7d54b6b1bfed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -338,6 +338,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_window_update_handler,
},
{
+ .procname = "sched_gov_response_time",
+ .data = &sysctl_sched_gov_response_time,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "sched_wakeup_load_threshold",
.data = &sysctl_sched_wakeup_load_threshold,
.maxlen = sizeof(unsigned int),
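Since the new entry sits in kern_table, the knob should appear as /proc/sys/kernel/sched_gov_response_time; the value is compared against sched_clock() deltas, so the default of 10000000 amounts to roughly 10 ms. A small, hypothetical userspace reader:

    /* Hypothetical helper: read the governor response timeout from procfs. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int ns = 0;
            FILE *f = fopen("/proc/sys/kernel/sched_gov_response_time", "r");

            if (f && fscanf(f, "%u", &ns) == 1)
                    printf("sched_gov_response_time: %u ns (~%u ms)\n", ns, ns / 1000000);
            if (f)
                    fclose(f);
            return 0;
    }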