Diffstat (limited to 'kernel/sched/hmp.c')
| -rw-r--r-- | kernel/sched/hmp.c | 386 |
1 file changed, 179 insertions(+), 207 deletions(-)
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index c1c35bb582fb..968a41e0e81e 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -17,8 +17,6 @@
 #include <linux/cpufreq.h>
 #include <linux/list_sort.h>
 #include <linux/syscore_ops.h>
-#include <linux/of.h>
-#include <linux/sched/core_ctl.h>
 
 #include "sched.h"
 
@@ -231,52 +229,6 @@ fail:
 	return ret;
 }
 
-/*
- * It is possible that CPUs of the same micro architecture can have slight
- * difference in the efficiency due to other factors like cache size. The
- * BOOST_ON_BIG policy may not be optimial for such systems. The required
- * boost policy can be specified via device tree to handle this.
- */
-static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE;
-
-/*
- * This should be called after clusters are populated and
- * the respective efficiency values are initialized.
- */
-void init_sched_hmp_boost_policy(void)
-{
-	/*
-	 * Initialize the boost type here if it is not passed from
-	 * device tree.
-	 */
-	if (sched_boost_policy == SCHED_BOOST_NONE) {
-		if (max_possible_efficiency != min_possible_efficiency)
-			sched_boost_policy = SCHED_BOOST_ON_BIG;
-		else
-			sched_boost_policy = SCHED_BOOST_ON_ALL;
-	}
-}
-
-void sched_hmp_parse_dt(void)
-{
-	struct device_node *sn;
-	const char *boost_policy;
-
-	if (!sched_enable_hmp)
-		return;
-
-	sn = of_find_node_by_path("/sched-hmp");
-	if (!sn)
-		return;
-
-	if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
-		if (!strcmp(boost_policy, "boost-on-big"))
-			sched_boost_policy = SCHED_BOOST_ON_BIG;
-		else if (!strcmp(boost_policy, "boost-on-all"))
-			sched_boost_policy = SCHED_BOOST_ON_ALL;
-	}
-}
-
 unsigned int max_possible_efficiency = 1;
 unsigned int min_possible_efficiency = UINT_MAX;
 
@@ -680,29 +632,6 @@ int __init set_sched_enable_hmp(char *str)
 
 early_param("sched_enable_hmp", set_sched_enable_hmp);
 
-int got_boost_kick(void)
-{
-	int cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(cpu);
-
-	return test_bit(BOOST_KICK, &rq->hmp_flags);
-}
-
-inline void clear_boost_kick(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	clear_bit(BOOST_KICK, &rq->hmp_flags);
-}
-
-inline void boost_kick(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
-		smp_send_reschedule(cpu);
-}
-
 /* Clear any HMP scheduler related requests pending from or on cpu */
 void clear_hmp_request(int cpu)
 {
@@ -840,6 +769,9 @@ min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
 /* Window size (in ns) */
 __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
 
+/* Maximum allowed threshold before freq aggregation must be enabled */
+#define MAX_FREQ_AGGR_THRESH 1000
+
 /* Temporarily disable window-stats activity on all cpus */
 unsigned int __read_mostly sched_disable_window_stats;
 
@@ -919,8 +851,8 @@ static const unsigned int top_tasks_bitmap_size =
  * C1 busy time = 5 + 5 + 6 = 16ms
  *
  */
-static __read_mostly unsigned int sched_freq_aggregate;
-__read_mostly unsigned int sysctl_sched_freq_aggregate;
+static __read_mostly unsigned int sched_freq_aggregate = 1;
+__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
 
 unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
 static unsigned int __read_mostly sched_freq_aggregate_threshold;
@@ -937,14 +869,6 @@ unsigned int max_task_load(void)
 /* Use this knob to turn on or off HMP-aware task placement logic */
 unsigned int __read_mostly sched_enable_hmp;
 
-/*
- * Scheduler boost is a mechanism to temporarily place tasks on CPUs
- * with higher capacity than those where a task would have normally
- * ended up with their load characteristics. Any entity enabling
- * boost is responsible for disabling it as well.
- */
-unsigned int sysctl_sched_boost;
-
 /* A cpu can no longer accommodate more tasks if:
 *
 *	rq->nr_running > sysctl_sched_spill_nr_run ||
@@ -996,6 +920,21 @@ unsigned int __read_mostly sched_downmigrate;
 unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
 
 /*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
+
+/*
 * The load scale factor of a CPU gets boosted when its max frequency
 * is restricted due to which the tasks are migrating to higher capacity
 * CPUs early. The sched_upmigrate threshold is auto-upgraded by
@@ -1017,33 +956,46 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
 
 unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
 
-void update_up_down_migrate(void)
+static void
+_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
 {
-	unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
-	unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
 	unsigned int delta;
 
 	if (up_down_migrate_scale_factor == 1024)
-		goto done;
+		return;
 
-	delta = up_migrate - down_migrate;
+	delta = *up_migrate - *down_migrate;
 
-	up_migrate /= NSEC_PER_USEC;
-	up_migrate *= up_down_migrate_scale_factor;
-	up_migrate >>= 10;
-	up_migrate *= NSEC_PER_USEC;
+	*up_migrate /= NSEC_PER_USEC;
+	*up_migrate *= up_down_migrate_scale_factor;
+	*up_migrate >>= 10;
+	*up_migrate *= NSEC_PER_USEC;
 
-	up_migrate = min(up_migrate, sched_ravg_window);
+	*up_migrate = min(*up_migrate, sched_ravg_window);
 
-	down_migrate /= NSEC_PER_USEC;
-	down_migrate *= up_down_migrate_scale_factor;
-	down_migrate >>= 10;
-	down_migrate *= NSEC_PER_USEC;
+	*down_migrate /= NSEC_PER_USEC;
+	*down_migrate *= up_down_migrate_scale_factor;
+	*down_migrate >>= 10;
+	*down_migrate *= NSEC_PER_USEC;
 
-	down_migrate = min(down_migrate, up_migrate - delta);
-done:
+	*down_migrate = min(*down_migrate, *up_migrate - delta);
+}
+
+static void update_up_down_migrate(void)
+{
+	unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
+	unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
+
+	_update_up_down_migrate(&up_migrate, &down_migrate);
 	sched_upmigrate = up_migrate;
 	sched_downmigrate = down_migrate;
+
+	up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct);
+	down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct);
+
+	_update_up_down_migrate(&up_migrate, &down_migrate);
+	sched_group_upmigrate = up_migrate;
+	sched_group_downmigrate = down_migrate;
 }
 
 void set_hmp_defaults(void)
@@ -1134,82 +1086,6 @@ u64 cpu_load_sync(int cpu, int sync)
 	return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
 }
 
-static int boost_refcount;
-static DEFINE_SPINLOCK(boost_lock);
-static DEFINE_MUTEX(boost_mutex);
-
-static void boost_kick_cpus(void)
-{
-	int i;
-
-	for_each_online_cpu(i) {
-		if (cpu_capacity(i) != max_capacity)
-			boost_kick(i);
-	}
-}
-
-int sched_boost(void)
-{
-	return boost_refcount > 0;
-}
-
-int sched_set_boost(int enable)
-{
-	unsigned long flags;
-	int ret = 0;
-	int old_refcount;
-
-	if (!sched_enable_hmp)
-		return -EINVAL;
-
-	spin_lock_irqsave(&boost_lock, flags);
-
-	old_refcount = boost_refcount;
-
-	if (enable == 1) {
-		boost_refcount++;
-	} else if (!enable) {
-		if (boost_refcount >= 1)
-			boost_refcount--;
-		else
-			ret = -EINVAL;
-	} else {
-		ret = -EINVAL;
-	}
-
-	if (!old_refcount && boost_refcount)
-		boost_kick_cpus();
-
-	if (boost_refcount <= 1)
-		core_ctl_set_boost(boost_refcount == 1);
-	trace_sched_set_boost(boost_refcount);
-	spin_unlock_irqrestore(&boost_lock, flags);
-
-	return ret;
-}
-
-int sched_boost_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
-{
-	int ret;
-
-	mutex_lock(&boost_mutex);
-	if (!write)
-		sysctl_sched_boost = sched_boost();
-
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-	if (ret || !write)
-		goto done;
-
-	ret = (sysctl_sched_boost <= 1) ?
-		sched_set_boost(sysctl_sched_boost) : -EINVAL;
-
-done:
-	mutex_unlock(&boost_mutex);
-	return ret;
-}
-
 /*
 * Task will fit on a cpu if it's bandwidth consumption on that cpu
 * will be less than sched_upmigrate. A big task that was previously
@@ -1219,60 +1095,63 @@ done:
 * tasks with load close to the upmigrate threshold
 */
 int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
-		       enum sched_boost_type boost_type)
+		       enum sched_boost_policy boost_policy)
 {
-	int upmigrate;
+	int upmigrate = sched_upmigrate;
 
 	if (cpu_capacity(cpu) == max_capacity)
 		return 1;
 
-	if (boost_type != SCHED_BOOST_ON_BIG) {
+	if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
+		upmigrate = sched_downmigrate;
+
+	if (boost_policy != SCHED_BOOST_ON_BIG) {
 		if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
		    upmigrate_discouraged(p))
 			return 1;
 
-		upmigrate = sched_upmigrate;
-		if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
-			upmigrate = sched_downmigrate;
-
 		if (task_load < upmigrate)
 			return 1;
+	} else {
+		if (task_sched_boost(p) || task_load >= upmigrate)
+			return 0;
+
+		return 1;
 	}
 
 	return 0;
 }
 
-enum sched_boost_type sched_boost_type(void)
-{
-	if (sched_boost())
-		return sched_boost_policy;
-
-	return SCHED_BOOST_NONE;
-}
-
 int task_will_fit(struct task_struct *p, int cpu)
 {
 	u64 tload = scale_load_to_cpu(task_load(p), cpu);
 
-	return task_load_will_fit(p, tload, cpu, sched_boost_type());
+	return task_load_will_fit(p, tload, cpu, sched_boost_policy());
 }
 
-int group_will_fit(struct sched_cluster *cluster,
-	 struct related_thread_group *grp, u64 demand)
+static int
+group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
+	       u64 demand, bool group_boost)
 {
 	int cpu = cluster_first_cpu(cluster);
 	int prev_capacity = 0;
-	unsigned int threshold = sched_upmigrate;
+	unsigned int threshold = sched_group_upmigrate;
 	u64 load;
 
 	if (cluster->capacity == max_capacity)
 		return 1;
 
+	if (group_boost)
+		return 0;
+
+	if (!demand)
+		return 1;
+
 	if (grp->preferred_cluster)
 		prev_capacity = grp->preferred_cluster->capacity;
 
 	if (cluster->capacity < prev_capacity)
-		threshold = sched_downmigrate;
+		threshold = sched_group_downmigrate;
 
 	load = scale_load_to_cpu(demand, cpu);
 	if (load < threshold)
@@ -1495,6 +1374,23 @@ void post_big_task_count_change(const struct cpumask *cpus)
 
 DEFINE_MUTEX(policy_mutex);
 
+unsigned int update_freq_aggregate_threshold(unsigned int threshold)
+{
+	unsigned int old_threshold;
+
+	mutex_lock(&policy_mutex);
+
+	old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
+
+	sysctl_sched_freq_aggregate_threshold_pct = threshold;
+	sched_freq_aggregate_threshold =
+		pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+
+	mutex_unlock(&policy_mutex);
+
+	return old_threshold;
+}
+
 static inline int invalid_value_freq_input(unsigned int *data)
 {
 	if (data == &sysctl_sched_freq_aggregate)
@@ -1578,7 +1474,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
 	if (write && (old_val == *data))
 		goto done;
 
-	if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
+	if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
+				sysctl_sched_group_downmigrate_pct >
+				sysctl_sched_group_upmigrate_pct) {
 		*data = old_val;
 		ret = -EINVAL;
 		goto done;
 	}
@@ -3801,13 +3699,13 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
 }
 
 /* Return cluster which can offer required capacity for group */
-static struct sched_cluster *
-best_cluster(struct related_thread_group *grp, u64 total_demand)
+static struct sched_cluster *best_cluster(struct related_thread_group *grp,
+					  u64 total_demand, bool group_boost)
 {
 	struct sched_cluster *cluster = NULL;
 
 	for_each_sched_cluster(cluster) {
-		if (group_will_fit(cluster, grp, total_demand))
+		if (group_will_fit(cluster, grp, total_demand, group_boost))
 			return cluster;
 	}
 
@@ -3818,6 +3716,9 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
 {
 	struct task_struct *p;
 	u64 combined_demand = 0;
+	bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
+	bool group_boost = false;
+	u64 wallclock;
 
 	if (!sysctl_sched_enable_colocation) {
 		grp->last_update = sched_ktime_clock();
@@ -3825,19 +3726,36 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
 		return;
 	}
 
+	if (list_empty(&grp->tasks))
+		return;
+
+	wallclock = sched_ktime_clock();
+
 	/*
 	 * wakeup of two or more related tasks could race with each other and
 	 * could result in multiple calls to _set_preferred_cluster being issued
 	 * at same time. Avoid overhead in such cases of rechecking preferred
 	 * cluster
 	 */
-	if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
+	if (wallclock - grp->last_update < sched_ravg_window / 10)
 		return;
 
-	list_for_each_entry(p, &grp->tasks, grp_list)
+	list_for_each_entry(p, &grp->tasks, grp_list) {
+		if (boost_on_big && task_sched_boost(p)) {
+			group_boost = true;
+			break;
+		}
+
+		if (p->ravg.mark_start < wallclock -
			(sched_ravg_window * sched_ravg_hist_size))
+			continue;
+
 		combined_demand += p->ravg.demand;
 
-	grp->preferred_cluster = best_cluster(grp, combined_demand);
+	}
+
+	grp->preferred_cluster = best_cluster(grp,
			combined_demand, group_boost);
 	grp->last_update = sched_ktime_clock();
 	trace_sched_set_preferred_cluster(grp, combined_demand);
 }
@@ -3852,6 +3770,8 @@ void set_preferred_cluster(struct related_thread_group *grp)
 #define ADD_TASK 0
 #define REM_TASK 1
 
+#define DEFAULT_CGROUP_COLOC_ID 1
+
 static inline void free_group_cputime(struct related_thread_group *grp)
 {
 	free_percpu(grp->cpu_time);
@@ -4109,7 +4029,8 @@ static void remove_task_from_group(struct task_struct *p)
 
 	raw_spin_unlock(&grp->lock);
 
-	if (empty_group) {
+	/* Reserved groups cannot be destroyed */
+	if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
 		list_del(&grp->list);
 		call_rcu(&grp->rcu, free_related_thread_group);
 	}
@@ -4143,20 +4064,25 @@ void add_new_task_to_grp(struct task_struct *new)
 {
 	unsigned long flags;
 	struct related_thread_group *grp;
-	struct task_struct *parent;
+	struct task_struct *leader = new->group_leader;
+	unsigned int leader_grp_id = sched_get_group_id(leader);
 
-	if (!sysctl_sched_enable_thread_grouping)
+	if (!sysctl_sched_enable_thread_grouping &&
+	    leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
 		return;
 
 	if (thread_group_leader(new))
 		return;
 
-	parent = new->group_leader;
+	if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+		if (!same_schedtune(new, leader))
+			return;
+	}
 
 	write_lock_irqsave(&related_thread_group_lock, flags);
 
 	rcu_read_lock();
-	grp = task_related_thread_group(parent);
+	grp = task_related_thread_group(leader);
 	rcu_read_unlock();
 
 	/*
@@ -4179,6 +4105,47 @@ void add_new_task_to_grp(struct task_struct *new)
 	write_unlock_irqrestore(&related_thread_group_lock, flags);
 }
 
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore this colocation group cannot
+ * be destroyted once it has been created. All of this has been as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+	struct related_thread_group *grp = NULL;
+	unsigned long flags;
+
+	grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+	if (IS_ERR(grp)) {
+		WARN_ON(1);
+		return -ENOMEM;
+	}
+
+	write_lock_irqsave(&related_thread_group_lock, flags);
+	list_add(&grp->list, &related_thread_groups);
+	write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+	update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+	return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+	unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+	return sched_set_group_id(p, grp_id);
+}
+#endif
+
 int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 {
 	int rc = 0;
@@ -4201,6 +4168,12 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 
 	grp = lookup_related_thread_group(group_id);
 	if (!grp) {
+		/* This is a reserved id */
+		if (group_id == DEFAULT_CGROUP_COLOC_ID) {
+			rc = -EINVAL;
+			goto done;
+		}
+
 		grp = alloc_related_thread_group(group_id);
 		if (IS_ERR(grp)) {
 			rc = -ENOMEM;
@@ -4210,7 +4183,6 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
 		list_add(&grp->list, &related_thread_groups);
 	}
 
-	BUG_ON(!grp);
 	rc = add_task_to_group(p, grp);
 done:
 	write_unlock(&related_thread_group_lock);
@@ -4459,7 +4431,7 @@ bool early_detection_notify(struct rq *rq, u64 wallclock)
 	struct task_struct *p;
 	int loop_max = 10;
 
-	if (!sched_boost() || !rq->cfs.h_nr_running)
+	if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
		return 0;
 
 	rq->ed_task = NULL;
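
Note on the new group thresholds: sched_group_upmigrate / sched_group_downmigrate give a related-thread group the same up/down hysteresis that per-task migration already has, with patch defaults of 100% and 95% of cluster capacity. The standalone C sketch below is only a model of that behaviour, not kernel code; prefer_big_cluster() is a hypothetical stand-in for the group_will_fit()/best_cluster() pair, and the percentages mirror the sysctl defaults added above.

#include <stdbool.h>
#include <stdio.h>

/* Patch defaults, as percent of cluster capacity (sysctl_sched_group_*_pct). */
#define GROUP_UPMIGRATE_PCT	100
#define GROUP_DOWNMIGRATE_PCT	95

/*
 * Toy model of the group placement hysteresis: while a group already sits on
 * the big cluster, the lower downmigrate threshold applies, so it is not
 * bounced back to the little cluster the moment demand dips under 100%.
 */
static bool prefer_big_cluster(unsigned int demand_pct, bool currently_on_big)
{
	unsigned int threshold = currently_on_big ? GROUP_DOWNMIGRATE_PCT :
						    GROUP_UPMIGRATE_PCT;

	return demand_pct >= threshold;
}

int main(void)
{
	unsigned int samples[] = { 90, 101, 97, 94, 99 };
	bool on_big = false;
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		on_big = prefer_big_cluster(samples[i], on_big);
		printf("group demand %3u%% -> %s cluster\n",
		       samples[i], on_big ? "big" : "little");
	}
	return 0;
}

Run against the sample demands, the modelled group jumps to the big cluster at 101%, stays there at 97%, and only drops back once demand falls below 95%.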
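
Similarly, the reworked task_load_will_fit() changes what SCHED_BOOST_ON_BIG means for little CPUs: a task stays off them if it carries a per-task boost (task_sched_boost()) or if its load is at or above the applicable threshold. A deliberately simplified, self-contained sketch of just that branch follows (assumption: the nice-value and upmigrate_discouraged() escapes from the real function are omitted).

#include <stdbool.h>
#include <stdio.h>

enum sched_boost_policy {
	SCHED_BOOST_NONE,
	SCHED_BOOST_ON_BIG,
	SCHED_BOOST_ON_ALL,
};

/*
 * Simplified model of the reworked branch: under BOOST_ON_BIG a per-task
 * boost or a load at/above the threshold keeps the task off a little CPU;
 * otherwise only the load comparison is modelled here.
 */
static bool fits_little_cpu(enum sched_boost_policy policy, bool task_boosted,
			    unsigned long long task_load,
			    unsigned long long upmigrate)
{
	if (policy == SCHED_BOOST_ON_BIG)
		return !task_boosted && task_load < upmigrate;

	return task_load < upmigrate;
}

int main(void)
{
	/* A boosted task is rejected even though its load is below upmigrate. */
	printf("%d\n", fits_little_cpu(SCHED_BOOST_ON_BIG, true, 50, 80));  /* 0 */
	printf("%d\n", fits_little_cpu(SCHED_BOOST_ON_BIG, false, 50, 80)); /* 1 */
	printf("%d\n", fits_little_cpu(SCHED_BOOST_NONE, true, 50, 80));    /* 1 */
	return 0;
}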
