 include/linux/sched/sysctl.h |   1
 kernel/sched/core.c          | 215
 kernel/sched/fair.c          |   6
 kernel/sched/sched.h         |  35
 kernel/sysctl.c              |   7
 5 files changed, 211 insertions(+), 53 deletions(-)
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index fb51198716fa..411812931a26 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -45,6 +45,7 @@ extern unsigned int sysctl_sched_window_stats_policy;
 extern unsigned int sysctl_sched_account_wait_time;
 extern unsigned int sysctl_sched_ravg_hist_size;
 extern unsigned int sysctl_sched_freq_legacy_mode;
+extern unsigned int sysctl_sched_gov_response_time;
 
 #if defined(CONFIG_SCHED_FREQ_INPUT) || defined(CONFIG_SCHED_HMP)
 extern unsigned int sysctl_sched_init_task_load_pct;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e93e65cd074..a23148908738 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1211,6 +1211,12 @@ __read_mostly int sysctl_sched_freq_dec_notify_slack_pct = INT_MAX;
 static __read_mostly unsigned int sched_io_is_busy;
 
 /*
+ * Force-issue notification to governor if we waited long enough since sending
+ * last notification and did not see any freq change.
+ */
+__read_mostly unsigned int sysctl_sched_gov_response_time = 10000000;
+
+/*
  * Maximum possible frequency across all cpus. Task demand and cpu
  * capacity (cpu_power) metrics are scaled in reference to it.
  */
@@ -1236,33 +1242,119 @@ static u64 sched_clock_at_init_jiffy;
 #define PREV_WINDOW_CONTRIB	2
 #define DONT_ACCOUNT	4
 
-/* Returns how undercommitted a CPU is given its current frequency and
- * task load (as measured in the previous window). Returns this value
- * as a percentage of the CPU's maximum frequency. A negative value
- * means the CPU is overcommitted at its current frequency.
- */
-int rq_freq_margin(struct rq *rq)
+#ifdef CONFIG_SCHED_FREQ_INPUT
+
+/* Does freq_required sufficiently exceed or fall behind cur_freq? */
+static inline int
+nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
 {
-        unsigned int freq_required;
         int margin;
-        u64 demand;
+
+        margin = cur_freq - freq_required;
+        margin *= 100;
+        margin /= (int)cur_freq;
+
+        /*
+         * + margin implies cur_freq > req_freq
+         * - margin implies cur_freq < req_freq
+         */
+
+        return (margin > sysctl_sched_freq_inc_notify_slack_pct &&
+                margin < sysctl_sched_freq_dec_notify_slack_pct);
+}
+
+/* Is governor late in responding? */
+static inline int freq_request_timeout(struct rq *rq)
+{
+        u64 now = sched_clock();
+
+        return ((now - rq->freq_requested_ts) > sysctl_sched_gov_response_time);
+}
+
+/* Should scheduler alert governor for changing frequency? */
+static int send_notification(struct rq *rq, unsigned int freq_required)
+{
+        int cpu, rc = 0;
+        unsigned int freq_requested = rq->freq_requested;
+        struct rq *domain_rq;
+        unsigned long flags;
+
+        if (freq_required > rq->max_freq)
+                freq_required = rq->max_freq;
+        else if (freq_required < rq->min_freq)
+                freq_required = rq->min_freq;
+
+        if (nearly_same_freq(rq->cur_freq, freq_required))
+                return 0;
+
+        if (freq_requested && nearly_same_freq(freq_requested, freq_required) &&
+            !freq_request_timeout(rq))
+                return 0;
+
+        cpu = cpumask_first(&rq->freq_domain_cpumask);
+        if (cpu >= nr_cpu_ids)
+                return 0;
+
+        domain_rq = cpu_rq(cpu);
+        raw_spin_lock_irqsave(&domain_rq->lock, flags);
+        freq_requested = domain_rq->freq_requested;
+        if (!freq_requested ||
+            !nearly_same_freq(freq_requested, freq_required) ||
+            freq_request_timeout(domain_rq)) {
+
+                u64 now = sched_clock();
+
+                /*
+                 * Cache the new frequency requested in rq of all cpus that are
+                 * in same freq domain. This saves frequent grabbing of
+                 * domain_rq->lock
+                 */
+                for_each_cpu(cpu, &rq->freq_domain_cpumask) {
+                        cpu_rq(cpu)->freq_requested = freq_required;
+                        cpu_rq(cpu)->freq_requested_ts = now;
+                }
+                rc = 1;
+        }
+        raw_spin_unlock_irqrestore(&domain_rq->lock, flags);
+
+        return rc;
+}
+
+/* Alert governor if there is a need to change frequency */
+void check_for_freq_change(struct rq *rq)
+{
+        unsigned int freq_required;
+        int i, max_demand_cpu = 0;
+        u64 max_demand = 0;
 
         if (!sched_enable_hmp)
-                return INT_MAX;
+                return;
+
+        /* Find out max demand across cpus in same frequency domain */
+        for_each_cpu(i, &rq->freq_domain_cpumask) {
+                if (cpu_rq(i)->prev_runnable_sum > max_demand) {
+                        max_demand = cpu_rq(i)->prev_runnable_sum;
+                        max_demand_cpu = i;
+                }
+        }
 
-        demand = scale_load_to_cpu(rq->prev_runnable_sum, rq->cpu);
-        demand *= 128;
-        demand = div64_u64(demand, max_task_load());
+        max_demand = scale_load_to_cpu(max_demand, rq->cpu);
+        max_demand *= 128;
+        max_demand = div64_u64(max_demand, max_task_load());
 
-        freq_required = demand * rq->max_possible_freq;
+        freq_required = max_demand * rq->max_possible_freq;
         freq_required /= 128;
 
-        margin = rq->cur_freq - freq_required;
-        margin *= 100;
-        margin /= (int)rq->max_possible_freq;
-        return margin;
+        if (!send_notification(rq, freq_required))
+                return;
+
+        atomic_notifier_call_chain(
+                &load_alert_notifier_head, 0,
+                (void *)(long)max_demand_cpu);
 }
 
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
 /*
  * Called when new window is starting for a task, to record cpu usage over
  * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1841,6 +1933,7 @@ unsigned long sched_get_busy(int cpu)
 {
         unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
+        u64 load;
 
         /*
          * This function could be called in timer context, and the
@@ -1851,8 +1944,17 @@ unsigned long sched_get_busy(int cpu)
         update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_clock(), 0);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
 
-        return div64_u64(scale_load_to_cpu(rq->prev_runnable_sum, cpu),
-                         NSEC_PER_USEC);
+        /*
+         * Scale load in reference to rq->max_possible_freq.
+         *
+         * Note that scale_load_to_cpu() scales load in reference to
+         * rq->max_freq
+         */
+        load = scale_load_to_cpu(rq->prev_runnable_sum, cpu);
+        load = div64_u64(load * (u64)rq->max_freq, (u64)rq->max_possible_freq);
+        load = div64_u64(load, NSEC_PER_USEC);
+
+        return load;
 }
 
 void sched_set_io_is_busy(int val)
@@ -2081,6 +2183,23 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
         cpu_rq(cpu)->cur_freq = new_freq;
         raw_spin_unlock_irqrestore(&rq->lock, flags);
 
+        /* clear freq request for CPUs in the same freq domain */
+        if (!rq->freq_requested)
+                return 0;
+
+        /* The first CPU (and its rq lock) in a freq domain is used to
+         * serialize all freq change tests and notifications for CPUs
+         * in that domain. */
+        cpu = cpumask_first(&rq->freq_domain_cpumask);
+        if (cpu >= nr_cpu_ids)
+                return 0;
+
+        rq = cpu_rq(cpu);
+        raw_spin_lock_irqsave(&rq->lock, flags);
+        for_each_cpu(cpu, &rq->freq_domain_cpumask)
+                cpu_rq(cpu)->freq_requested = 0;
+        raw_spin_unlock_irqrestore(&rq->lock, flags);
+
         return 0;
 }
 
@@ -2122,7 +2241,6 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
         struct rq *src_rq = task_rq(p);
         struct rq *dest_rq = cpu_rq(new_cpu);
         u64 wallclock;
-        int freq_notify = 0;
 
         if (p->state == TASK_WAKING)
                 double_rq_lock(src_rq, dest_rq);
@@ -2130,7 +2248,6 @@ static void fixup_busy_time(struct task_struct *p, int new_cpu)
         if (sched_disable_window_stats)
                 goto done;
 
-        freq_notify = 1;
         wallclock = sched_clock();
 
         update_task_ravg(task_rq(p)->curr, task_rq(p),
@@ -2185,29 +2302,25 @@ done:
         if (p->state == TASK_WAKING)
                 double_rq_unlock(src_rq, dest_rq);
+}
 
-        if (!freq_notify && cpumask_test_cpu(new_cpu,
-                                &src_rq->freq_domain_cpumask))
-                return;
-
-        /* Evaluate possible frequency notifications for
-         * source and destination CPUs in different frequency
-         * domains. */
-        if (rq_freq_margin(dest_rq) <
-            sysctl_sched_freq_inc_notify_slack_pct)
-                atomic_notifier_call_chain(
-                        &load_alert_notifier_head, 0,
-                        (void *)(long)new_cpu);
+/* A long sleep is defined as sleeping at least one full window prior
+ * to the current window start. */
+static inline int is_long_sleep(struct rq *rq, struct task_struct *p)
+{
+        if (p->ravg.mark_start > rq->window_start)
+                return 0;
 
-        if (rq_freq_margin(src_rq) >
-            sysctl_sched_freq_dec_notify_slack_pct)
-                atomic_notifier_call_chain(
-                        &load_alert_notifier_head, 0,
-                        (void *)(long)task_cpu(p));
+        return ((rq->window_start - p->ravg.mark_start) > sched_ravg_window);
 }
 
 #else /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
 
+static inline int is_long_sleep(struct rq *rq, struct task_struct *p)
+{
+        return 0;
+}
+
 static inline void
 update_task_ravg(struct task_struct *p, struct rq *rq,
                          int event, u64 wallclock, u64 irqtime)
@@ -3141,6 +3254,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         int cpu, src_cpu, success = 0;
         int notify = 0;
         struct migration_notify_data mnd;
+        int long_sleep = 0;
 #ifdef CONFIG_SMP
         struct rq *rq;
         u64 wallclock;
@@ -3208,6 +3322,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         raw_spin_lock(&rq->lock);
         wallclock = sched_clock();
         update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+        long_sleep = is_long_sleep(rq, p);
         update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
         raw_spin_unlock(&rq->lock);
 
@@ -3224,14 +3339,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         if (src_cpu != cpu) {
                 wake_flags |= WF_MIGRATED;
                 set_task_cpu(p, cpu);
-        } else {
-#ifdef CONFIG_SCHED_FREQ_INPUT
-                if (rq_freq_margin(cpu_rq(cpu)) <
-                    sysctl_sched_freq_inc_notify_slack_pct)
-                        atomic_notifier_call_chain(
-                                &load_alert_notifier_head, 0,
-                                (void *)(long)cpu);
-#endif
         }
 #endif /* CONFIG_SMP */
 
@@ -3263,6 +3370,11 @@ out:
                 atomic_notifier_call_chain(&migration_notifier_head, 0,
                                            (void *)&mnd);
 
+        if (long_sleep || !same_freq_domain(src_cpu, cpu))
+                check_for_freq_change(cpu_rq(cpu));
+        if (!long_sleep && !same_freq_domain(src_cpu, cpu))
+                check_for_freq_change(cpu_rq(src_cpu));
+
         return success;
 }
 
@@ -3662,13 +3774,6 @@ void wake_up_new_task(struct task_struct *p)
 
         rq = __task_rq_lock(p);
         mark_task_starting(p);
-#ifdef CONFIG_SCHED_FREQ_INPUT
-        if (rq_freq_margin(task_rq(p)) <
-            sysctl_sched_freq_inc_notify_slack_pct)
-                atomic_notifier_call_chain(
-                        &load_alert_notifier_head, 0,
-                        (void *)(long)task_cpu(p));
-#endif
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
         trace_sched_wakeup_new(p);
@@ -3685,6 +3790,8 @@ void wake_up_new_task(struct task_struct *p)
         }
 #endif
         task_rq_unlock(rq, p, &flags);
+        if (init_task_load)
+                check_for_freq_change(rq);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -8815,6 +8922,8 @@ void __init sched_init(void)
                 rq->capacity = 1024;
                 rq->load_scale_factor = 1024;
                 rq->window_start = 0;
+                rq->freq_requested = 0;
+                rq->freq_requested_ts = 0;
 #endif
                 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
                 rq->cstate = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6dc05ff2c6be..5e6f9a6047e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8464,6 +8464,12 @@ more_balance:
                         per_cpu(dbs_boost_load_moved, this_cpu) = 0;
                 }
+
+                /* Assumes one 'busiest' cpu that we pulled tasks from */
+                if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
+                        check_for_freq_change(this_rq);
+                        check_for_freq_change(busiest);
+                }
         }
 
         if (likely(!active_balance)) {
                 /* We were unbalanced, so reset the balancing interval */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5a804e108e32..decbbd2c6f48 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -651,6 +651,8 @@ struct rq {
          */
         unsigned int cur_freq, max_freq, min_freq, max_possible_freq;
         struct cpumask freq_domain_cpumask;
+        unsigned int freq_requested;
+        u64 freq_requested_ts;
         u64 cumulative_runnable_avg;
         int efficiency; /* Differentiate cpus with different IPC capability */
 
@@ -1036,6 +1038,39 @@ static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
 
 #endif /* CONFIG_SCHED_FREQ_INPUT || CONFIG_SCHED_HMP */
 
+#ifdef CONFIG_SCHED_FREQ_INPUT
+extern void check_for_freq_change(struct rq *rq);
+
+/* Is frequency of two cpus synchronized with each other? */
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+        struct rq *rq = cpu_rq(src_cpu);
+
+        if (src_cpu == dst_cpu)
+                return 1;
+
+        return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask);
+}
+
+#ifdef CONFIG_SCHED_HMP
+#define init_task_load sysctl_sched_init_task_load_pct
+#else
+#define init_task_load 0
+#endif
+
+#else /* CONFIG_SCHED_FREQ_INPUT */
+
+#define init_task_load 0
+
+static inline void check_for_freq_change(struct rq *rq) { }
+
+static inline int same_freq_domain(int src_cpu, int dst_cpu)
+{
+        return 1;
+}
+
+#endif /* CONFIG_SCHED_FREQ_INPUT */
+
 #ifdef CONFIG_SCHED_HMP
 
 #define BOOST_KICK 0
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7fbe9b146343..7d54b6b1bfed 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -338,6 +338,13 @@ static struct ctl_table kern_table[] = {
                 .proc_handler   = sched_window_update_handler,
         },
         {
+                .procname       = "sched_gov_response_time",
+                .data           = &sysctl_sched_gov_response_time,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec,
+        },
+        {
                 .procname       = "sched_wakeup_load_threshold",
                 .data           = &sysctl_sched_wakeup_load_threshold,
                 .maxlen         = sizeof(unsigned int),
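A note on the notification gating introduced above: it works off the margin computed in nearly_same_freq(), the gap between the current and required frequency expressed as a percentage of the current frequency, which must fall outside the inc/dec notification slack window before the governor is alerted, and an already-pending request is only repeated once the sched_gov_response_time timeout has expired. The sketch below is a standalone user-space rendering of just the margin test, not kernel code; the slack percentages and frequencies are made-up stand-ins for the sched_freq_inc_notify_slack_pct / sched_freq_dec_notify_slack_pct sysctls.

/* User-space sketch of the nearly_same_freq() margin test.  The slack
 * values are hypothetical stand-ins for the notification slack sysctls. */
#include <stdio.h>

static int inc_slack_pct = -10; /* alert when cur_freq lags the requirement by >10% */
static int dec_slack_pct = 25;  /* alert when cur_freq exceeds the requirement by >25% */

static int nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
{
        /* margin is a percentage of cur_freq: positive when the CPU runs
         * faster than required, negative when it runs slower */
        int margin = (int)cur_freq - (int)freq_required;

        margin *= 100;
        margin /= (int)cur_freq;

        return margin > inc_slack_pct && margin < dec_slack_pct;
}

int main(void)
{
        unsigned int cur = 1000000;                             /* kHz */
        unsigned int required[] = { 950000, 1200000, 700000 };  /* kHz */
        int n = sizeof(required) / sizeof(required[0]);

        for (int i = 0; i < n; i++)
                printf("required %7u kHz -> %s\n", required[i],
                       nearly_same_freq(cur, required[i]) ?
                       "close enough, skip notification" : "notify governor");
        return 0;
}

With a 10%/25% slack the 950000 kHz requirement is absorbed, while the 1200000 kHz and 700000 kHz requirements would both trigger a notification.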
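check_for_freq_change() above turns the largest prev_runnable_sum in the frequency domain into a frequency requirement by expressing the CPU-scaled demand as a 1/128 fraction of max_task_load() and applying that fraction to max_possible_freq; send_notification() then clamps the result to the CPU's min_freq/max_freq range. The arithmetic below reproduces that conversion in user space with invented numbers; scale_load_to_cpu() and max_task_load() are kernel helpers not reproduced here, so the demand is assumed to be already scaled and max_task_load() is approximated by the window length in nanoseconds.

/* User-space sketch of the demand-to-frequency mapping in
 * check_for_freq_change(); all values are hypothetical. */
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
        uint64_t max_load_ns = 10000000;        /* stand-in for max_task_load(), ~10 ms window */
        uint64_t scaled_demand_ns = 6000000;    /* busiest CPU's scaled prev_runnable_sum */
        uint64_t max_possible_freq = 2265600;   /* kHz of the fastest CPU in the system */

        /* demand as a fraction of the maximum load, in 1/128 units */
        uint64_t frac = (scaled_demand_ns * 128) / max_load_ns;

        /* frequency needed to serve that demand */
        uint64_t freq_required = (frac * max_possible_freq) / 128;

        printf("demand %" PRIu64 " ns of %" PRIu64 " ns -> need ~%" PRIu64 " kHz\n",
               scaled_demand_ns, max_load_ns, freq_required);
        return 0;
}

With these numbers roughly 60% of the window is busy, so the required frequency comes out to about 1.35 GHz, which send_notification() would then compare against the domain's current and previously requested frequencies before deciding whether to raise the load alert.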