Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c  | 238
-rw-r--r--  kernel/sched/fair.c  | 141
-rw-r--r--  kernel/sched/sched.h |  37
-rw-r--r--  kernel/sysctl.c      |   9
4 files changed, 412 insertions(+), 13 deletions(-)
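The patch below introduces per-group "preferred cluster" tracking: tasks placed in the same related_thread_group have their windowed demand summed, and the group is steered towards the smallest cluster that can hold the combined load. As a rough illustration of that selection policy (not the kernel code itself), the standalone C sketch below mirrors the up-migration side of best_cluster()/group_will_fit(); the cluster layout, demand units and threshold value are made-up stand-ins, and the sched_downmigrate hysteresis and scale_load_to_cpu() details are deliberately simplified.

/*
 * Minimal sketch of the preferred-cluster selection policy added by this
 * patch, rewritten as a standalone program. The cluster/capacity values,
 * demand units and threshold are hypothetical stand-ins; only the decision
 * shape follows best_cluster()/group_will_fit() in the diff.
 */
#include <stdio.h>

struct cluster {
	const char *name;
	unsigned int capacity;	/* relative capacity, biggest cluster = 1024 */
};

/* scale the group's combined demand to a cluster, bigger on smaller clusters */
static unsigned long long scale_demand(unsigned long long demand,
				       const struct cluster *c,
				       unsigned int max_capacity)
{
	return demand * max_capacity / c->capacity;
}

/* pick the first (lowest-power) cluster whose scaled demand fits the threshold */
static const struct cluster *pick_cluster(const struct cluster *clusters,
					  int nr, unsigned long long demand,
					  unsigned int upmigrate,
					  unsigned int max_capacity)
{
	int i;

	for (i = 0; i < nr; i++) {
		const struct cluster *c = &clusters[i];

		/* the biggest cluster always fits, as in group_will_fit() */
		if (c->capacity == max_capacity)
			return c;
		if (scale_demand(demand, c, max_capacity) < upmigrate)
			return c;
	}
	return NULL;
}

int main(void)
{
	/* hypothetical two-cluster system: little cores first, big cores last */
	struct cluster clusters[] = {
		{ "little", 512 },
		{ "big", 1024 },
	};
	unsigned long long combined_demand = 300;	/* sum of ravg.demand */
	unsigned int sched_upmigrate = 500;		/* example threshold */
	const struct cluster *pref;

	pref = pick_cluster(clusters, 2, combined_demand,
			    sched_upmigrate, 1024);
	printf("preferred cluster: %s\n", pref ? pref->name : "none");
	return 0;
}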
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8cdd373a8980..da693099cc40 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1651,6 +1651,8 @@ __read_mostly unsigned int sysctl_sched_account_wait_time = 1;
 __read_mostly unsigned int sysctl_sched_cpu_high_irqload =
 					(10 * NSEC_PER_MSEC);
 
+unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
+
 #ifdef CONFIG_SCHED_FREQ_INPUT
 
 static __read_mostly unsigned int sched_migration_fixup = 1;
@@ -2575,6 +2577,8 @@ void sched_exit(struct task_struct *p)
 	struct rq *rq = cpu_rq(cpu);
 	u64 wallclock;
 
+	sched_set_group_id(p, 0);
+
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	/* rq->curr == p */
 	wallclock = sched_ktime_clock();
@@ -2979,6 +2983,206 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
 	update_up_down_migrate();
 }
 
+static LIST_HEAD(related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+static int nr_related_thread_groups;
+
+/* Return cluster which can offer required capacity for group */
+static struct sched_cluster *
+best_cluster(struct related_thread_group *grp, u64 total_demand)
+{
+	struct sched_cluster *cluster = NULL;
+
+	for_each_sched_cluster(cluster) {
+		if (group_will_fit(cluster, grp, total_demand))
+			return cluster;
+	}
+
+	return NULL;
+}
+
+static void _set_preferred_cluster(struct related_thread_group *grp)
+{
+	struct task_struct *p;
+	u64 combined_demand = 0;
+
+	if (!sysctl_sched_enable_colocation) {
+		grp->last_update = sched_ktime_clock();
+		grp->preferred_cluster = NULL;
+		return;
+	}
+
+	/*
+	 * wakeup of two or more related tasks could race with each other and
+	 * could result in multiple calls to _set_preferred_cluster being issued
+	 * at same time. Avoid overhead in such cases of rechecking preferred
+	 * cluster
+	 */
+	if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
+		return;
+
+	list_for_each_entry(p, &grp->tasks, grp_list)
+		combined_demand += p->ravg.demand;
+
+	grp->preferred_cluster = best_cluster(grp, combined_demand);
+	grp->last_update = sched_ktime_clock();
+	trace_sched_set_preferred_cluster(grp, combined_demand);
+}
+
+static void set_preferred_cluster(struct related_thread_group *grp)
+{
+	raw_spin_lock(&grp->lock);
+	_set_preferred_cluster(grp);
+	raw_spin_unlock(&grp->lock);
+}
+
+struct related_thread_group *alloc_related_thread_group(int group_id)
+{
+	struct related_thread_group *grp;
+
+	grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+	if (!grp)
+		return ERR_PTR(-ENOMEM);
+
+	grp->id = group_id;
+	INIT_LIST_HEAD(&grp->tasks);
+	INIT_LIST_HEAD(&grp->list);
+	raw_spin_lock_init(&grp->lock);
+
+	return grp;
+}
+
+struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+{
+	struct related_thread_group *grp;
+
+	list_for_each_entry(grp, &related_thread_groups, list) {
+		if (grp->id == group_id)
+			return grp;
+	}
+
+	return NULL;
+}
+
+static void remove_task_from_group(struct task_struct *p)
+{
+	struct related_thread_group *grp = p->grp;
+	struct rq *rq;
+	int empty_group = 1;
+
+	raw_spin_lock(&grp->lock);
+
+	rq = __task_rq_lock(p);
+	list_del_init(&p->grp_list);
+	p->grp = NULL;
+	__task_rq_unlock(rq);
+
+	if (!list_empty(&grp->tasks)) {
+		empty_group = 0;
+		_set_preferred_cluster(grp);
+	}
+
+	raw_spin_unlock(&grp->lock);
+
+	if (empty_group) {
+		list_del(&grp->list);
+		nr_related_thread_groups--;
+		/* See comments before preferred_cluster() */
+		kfree_rcu(grp, rcu);
+	}
+}
+
+static int
+add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
+{
+	struct rq *rq;
+
+	raw_spin_lock(&grp->lock);
+
+	/*
+	 * Change p->grp under rq->lock. Will prevent races with read-side
+	 * reference of p->grp in various hot-paths
+	 */
+	rq = __task_rq_lock(p);
+	p->grp = grp;
+	list_add(&p->grp_list, &grp->tasks);
+	__task_rq_unlock(rq);
+
+	_set_preferred_cluster(grp);
+
+	raw_spin_unlock(&grp->lock);
+
+	return 0;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+	int rc = 0, destroy = 0;
+	unsigned long flags;
+	struct related_thread_group *grp = NULL, *new = NULL;
+
+redo:
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+	if ((current != p && p->flags & PF_EXITING) ||
+			(!p->grp && !group_id) ||
+			(p->grp && p->grp->id == group_id))
+		goto done;
+
+	write_lock(&related_thread_group_lock);
+
+	if (!group_id) {
+		remove_task_from_group(p);
+		write_unlock(&related_thread_group_lock);
+		goto done;
+	}
+
+	if (p->grp && p->grp->id != group_id)
+		remove_task_from_group(p);
+
+	grp = lookup_related_thread_group(group_id);
+	if (!grp && !new) {
+		/* New group */
+		write_unlock(&related_thread_group_lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		new = alloc_related_thread_group(group_id);
+		if (IS_ERR(new))
+			return -ENOMEM;
+		destroy = 1;
+		/* Rerun checks (like task exiting), since we dropped pi_lock */
+		goto redo;
+	} else if (!grp && new) {
+		/* New group - use object allocated before */
+		destroy = 0;
+		nr_related_thread_groups++;
+		list_add(&new->list, &related_thread_groups);
+		grp = new;
+	}
+
+	BUG_ON(!grp);
+	rc = add_task_to_group(p, grp);
+	write_unlock(&related_thread_group_lock);
+done:
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	if (destroy)
+		kfree(new);
+
+	return rc;
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+	unsigned long flags;
+	unsigned int group_id;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	group_id = p->grp ? p->grp->id : 0;
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return group_id;
+}
+
 static int cpufreq_notifier_policy(struct notifier_block *nb,
 			unsigned long val, void *data)
 {
@@ -3161,6 +3365,25 @@ static void restore_orig_mark_start(struct task_struct *p, u64 mark_start)
 	p->ravg.mark_start = mark_start;
 }
 
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+					struct task_struct *p, u32 old_load)
+{
+	u32 new_load = task_load(p);
+
+	if (!grp)
+		return 0;
+
+	/*
+	 * Update if task's load has changed significantly or a complete window
+	 * has passed since we last updated preference
+	 */
+	if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+		sched_ktime_clock() - p->grp->last_update > sched_ravg_window)
+		return 1;
+
+	return 0;
+}
+
 #else	/* CONFIG_SCHED_HMP */
 
 static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
@@ -4121,8 +4344,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	struct migration_notify_data mnd;
 	int heavy_task = 0;
 #ifdef CONFIG_SMP
+	unsigned int old_load;
 	struct rq *rq;
 	u64 wallclock;
+	struct related_thread_group *grp = NULL;
 #endif
 
 	/*
@@ -4185,12 +4410,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
 	rq = cpu_rq(task_cpu(p));
 	raw_spin_lock(&rq->lock);
+	old_load = task_load(p);
+	grp = task_related_thread_group(p);
 	wallclock = sched_ktime_clock();
 	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
 	heavy_task = heavy_task_wakeup(p, rq, TASK_WAKE);
 	update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
 	raw_spin_unlock(&rq->lock);
 
+	if (update_preferred_cluster(grp, p, old_load))
+		set_preferred_cluster(grp);
+
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
@@ -5156,10 +5386,14 @@ void scheduler_tick(void)
 	struct task_struct *curr = rq->curr;
 	u64 wallclock;
 	bool early_notif;
+	u32 old_load;
+	struct related_thread_group *grp;
 
 	sched_clock_tick();
 
 	raw_spin_lock(&rq->lock);
+	old_load = task_load(curr);
+	grp = task_related_thread_group(curr);
 	set_window_start(rq);
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
@@ -5181,6 +5415,10 @@ void scheduler_tick(void)
 	trigger_load_balance(rq);
 #endif
 	rq_last_tick_reset(rq);
+
+	if (update_preferred_cluster(grp, curr, old_load))
+		set_preferred_cluster(grp);
+
 	if (curr->sched_class == &fair_sched_class)
 		check_for_migration(rq, curr);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 950ab9229cfc..1b64a6ae333c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2992,6 +2992,30 @@ static int task_will_fit(struct task_struct *p, int cpu)
 	return task_load_will_fit(p, tload, cpu);
 }
 
+int group_will_fit(struct sched_cluster *cluster,
+		struct related_thread_group *grp, u64 demand)
+{
+	int cpu = cluster_first_cpu(cluster);
+	int prev_capacity = 0;
+	unsigned int threshold = sched_upmigrate;
+	u64 load;
+
+	if (cluster->capacity == max_capacity)
+		return 1;
+
+	if (grp->preferred_cluster)
+		prev_capacity = grp->preferred_cluster->capacity;
+
+	if (cluster->capacity < prev_capacity)
+		threshold = sched_downmigrate;
+
+	load = scale_load_to_cpu(demand, cpu);
+	if (load < threshold)
+		return 1;
+
+	return 0;
+}
+
 struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
 {
 	return NULL;
@@ -3070,6 +3094,7 @@ unlock:
 
 struct cpu_select_env {
 	struct task_struct *p;
+	struct related_thread_group *rtg;
 	u8 reason;
 	u8 need_idle:1;
 	u8 boost:1;
@@ -3093,6 +3118,34 @@ struct cluster_cpu_stats {
 #define DOWN_MIGRATION		2
 #define IRQLOAD_MIGRATION	3
 
+/*
+ * Invoked from three places:
+ *	1) try_to_wake_up() -> ... -> select_best_cpu()
+ *	2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
+ *	3) can_migrate_task()
+ *
+ * Its safe to de-reference p->grp in first case (since p->pi_lock is held)
+ * but not in other cases. p->grp is hence freed after a RCU grace period and
+ * accessed under rcu_read_lock()
+ */
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+	struct related_thread_group *grp;
+	int rc = 0;
+
+	rcu_read_lock();
+
+	grp = p->grp;
+	if (!grp || !sysctl_sched_enable_colocation)
+		rc = 1;
+	else
+		rc = (grp->preferred_cluster == cluster);
+
+	rcu_read_unlock();
+	return rc;
+}
+
 static int
 spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
 {
@@ -3158,6 +3211,9 @@ acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
 static int
 skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
 {
+	if (!test_bit(cluster->id, env->candidate_list))
+		return 1;
+
 	if (!acceptable_capacity(cluster, env)) {
 		__clear_bit(cluster->id, env->candidate_list);
 		return 1;
@@ -3171,6 +3227,12 @@ select_least_power_cluster(struct cpu_select_env *env)
 {
 	struct sched_cluster *cluster;
 
+	if (env->rtg) {
+		env->task_load = scale_load_to_cpu(task_load(env->p),
+			cluster_first_cpu(env->rtg->preferred_cluster));
+		return env->rtg->preferred_cluster;
+	}
+
 	for_each_sched_cluster(cluster) {
 		if (!skip_cluster(cluster, env)) {
 			int cpu = cluster_first_cpu(cluster);
@@ -3241,6 +3303,9 @@ next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
 
 	__clear_bit(cluster->id, env->candidate_list);
 
+	if (env->rtg && preferred_cluster(cluster, env->p))
+		return NULL;
+
 	do {
 		if (bitmap_empty(env->candidate_list, num_clusters))
 			return NULL;
@@ -3397,13 +3462,26 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
 	return true;
 }
 
+static inline int
+cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
+{
+	cpumask_t tmp_mask;
+
+	cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask);
+	cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed);
+
+	return !cpumask_empty(&tmp_mask);
+}
+
+
 /* return cheapest cpu that can fit this task */
 static int select_best_cpu(struct task_struct *p, int target, int reason,
 			   int sync)
 {
-	struct sched_cluster *cluster;
+	struct sched_cluster *cluster, *pref_cluster = NULL;
 	struct cluster_cpu_stats stats;
 	bool fast_path = false;
+	struct related_thread_group *grp;
 
 	struct cpu_select_env env = {
 		.p			= p,
@@ -3413,6 +3491,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 		.sync			= sync,
 		.prev_cpu		= target,
 		.ignore_prev_cpu	= 0,
+		.rtg			= NULL,
 	};
 
 	bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
@@ -3420,26 +3499,39 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 
 	init_cluster_cpu_stats(&stats);
 
-	if (bias_to_prev_cpu(&env, &stats)) {
+	rcu_read_lock();
+
+	grp = p->grp;
+
+	if (grp && grp->preferred_cluster) {
+		pref_cluster = grp->preferred_cluster;
+		if (!cluster_allowed(p, pref_cluster))
+			clear_bit(pref_cluster->id, env.candidate_list);
+		else
+			env.rtg = grp;
+	} else if (bias_to_prev_cpu(&env, &stats)) {
 		fast_path = true;
 		goto out;
 	}
 
-	rcu_read_lock();
+retry:
 	cluster = select_least_power_cluster(&env);
 
-	if (!cluster) {
-		rcu_read_unlock();
+	if (!cluster)
 		goto out;
-	}
+
+	/*
+	 * 'cluster' now points to the minimum power cluster which can satisfy
+	 * task's perf goals. Walk down the cluster list starting with that
+	 * cluster. For non-small tasks, skip clusters that don't have
+	 * mostly_idle/idle cpus
+	 */
 	do {
 		find_best_cpu_in_cluster(cluster, &env, &stats);
 	} while ((cluster = next_best_cluster(cluster, &env)));
 
-	rcu_read_unlock();
-
 	if (stats.best_idle_cpu >= 0) {
 		target = stats.best_idle_cpu;
 	} else if (stats.best_cpu >= 0) {
@@ -3449,12 +3541,18 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
 		target = stats.best_cpu;
 	} else {
+		if (env.rtg) {
+			env.rtg = NULL;
+			goto retry;
+		}
+
 		find_backup_cluster(&env, &stats);
 		if (stats.best_capacity_cpu >= 0)
 			target = stats.best_capacity_cpu;
 	}
 
 out:
+	rcu_read_unlock();
 	trace_sched_task_load(p, sched_boost(), env.reason, env.sync,
 			      env.need_idle, fast_path, target);
 	return target;
@@ -3949,11 +4047,11 @@ static inline int migration_needed(struct task_struct *p, int cpu)
 		return IRQLOAD_MIGRATION;
 
 	nice = task_nice(p);
-	if ((nice > sched_upmigrate_min_nice || upmigrate_discouraged(p)) &&
-		cpu_capacity(cpu) > min_capacity)
+	if (!p->grp && (nice > sched_upmigrate_min_nice ||
+		upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity)
 		return DOWN_MIGRATION;
 
-	if (!task_will_fit(p, cpu))
+	if (!p->grp && !task_will_fit(p, cpu))
 		return UP_MIGRATION;
 
 	return 0;
@@ -4092,6 +4190,12 @@ inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
 static inline void
 dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
 
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+	return 1;
+}
+
 #endif /* CONFIG_SCHED_HMP */
 
@@ -4489,6 +4593,8 @@ void init_new_task_load(struct task_struct *p)
 	u32 init_load_pct = current->init_load_pct;
 
 	p->init_load_pct = 0;
+	p->grp = NULL;
+	INIT_LIST_HEAD(&p->grp_list);
 	memset(&p->ravg, 0, sizeof(struct ravg));
 
 	if (init_load_pct) {
@@ -7347,6 +7453,7 @@ enum fbq_type { regular, remote, all };
 #define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \
 				LBF_BIG_TASK_ACTIVE_BALANCE)
 #define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
 
 struct lb_env {
 	struct sched_domain	*sd;
@@ -7534,6 +7641,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
 		return 0;
 
+	if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+			!preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
+		return 0;
+
 	/*
 	 * Group imbalance can sometimes cause work to be pulled across groups
 	 * even though the group could have managed the imbalance on its own.
@@ -7644,6 +7755,8 @@ static int detach_tasks(struct lb_env *env)
 	if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
 							!sched_boost())
 		env->flags |= LBF_IGNORE_BIG_TASKS;
+	else if (!same_cluster(env->dst_cpu, env->src_cpu))
+		env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
 
 redo:
 	while (!list_empty(tasks)) {
@@ -7708,9 +7821,11 @@ next:
 		list_move_tail(&p->se.group_node, tasks);
 	}
 
-	if (env->flags & LBF_IGNORE_BIG_TASKS && !detached) {
+	if (env->flags & (LBF_IGNORE_BIG_TASKS |
+			LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
 		tasks = &env->src_rq->cfs_tasks;
-		env->flags &= ~LBF_IGNORE_BIG_TASKS;
+		env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+				LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
 		env->loop = orig_loop;
 		goto redo;
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e4f0887136c..6cd1dc3b6267 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -395,9 +395,21 @@ static inline int cluster_first_cpu(struct sched_cluster *cluster)
 	return cpumask_first(&cluster->cpus);
 }
 
+struct related_thread_group {
+	int id;
+	raw_spinlock_t lock;
+	struct list_head tasks;
+	struct list_head list;
+	struct sched_cluster *preferred_cluster;
+	struct rcu_head rcu;
+	u64 last_update;
+};
+
 extern struct list_head cluster_head;
 extern int num_clusters;
 extern struct sched_cluster *sched_cluster[NR_CPUS];
+extern int group_will_fit(struct sched_cluster *cluster,
+		struct related_thread_group *grp, u64 demand);
 
 #define for_each_sched_cluster(cluster) \
 	list_for_each_entry_rcu(cluster, &cluster_head, list)
@@ -1035,6 +1047,7 @@ extern unsigned int max_task_load(void);
 extern void sched_account_irqtime(int cpu, struct task_struct *curr,
 				 u64 delta, u64 wallclock);
 unsigned int cpu_temp(int cpu);
+int sched_set_group_id(struct task_struct *p, unsigned int group_id);
 
 extern unsigned int nr_eligible_big_tasks(int cpu);
 extern void update_up_down_migrate(void);
@@ -1188,11 +1201,18 @@ static inline int sched_cpu_high_irqload(int cpu)
 	return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
 }
 
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+	return p->grp;
+}
+
 #else	/* CONFIG_SCHED_HMP */
 
 #define sched_use_pelt 0
 
 struct hmp_sched_stats;
+struct related_thread_group;
 
 static inline u64 scale_load_to_cpu(u64 load, int cpu)
 {
@@ -1230,6 +1250,22 @@ static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
 
 static inline int sched_cpu_high_irqload(int cpu) { return 0; }
 
+static inline void set_preferred_cluster(struct related_thread_group *grp) { }
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+	return NULL;
+}
+
+static inline u32 task_load(struct task_struct *p) { return 0; }
+
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+		struct task_struct *p, u32 old_load)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_SCHED_HMP */
 
 /*
@@ -1239,6 +1275,7 @@ static inline int sched_cpu_high_irqload(int cpu) { return 0; }
 #define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
 
 #ifdef CONFIG_SCHED_FREQ_INPUT
+
 extern void check_for_freq_change(struct rq *rq);
 
 /* Is frequency of two cpus synchronized with each other? */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1f2afa6eefaf..878b64bfcc7a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -448,6 +448,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= sched_hmp_proc_update_handler,
 	},
 	{
+		.procname	= "sched_enable_colocation",
+		.data		= &sysctl_sched_enable_colocation,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
		.procname	= "sched_boost",
 		.data		= &sysctl_sched_boost,
 		.maxlen		= sizeof(unsigned int),
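For reference, the new knob is registered in kern_table, so it should surface as /proc/sys/kernel/sched_enable_colocation, presumably only on kernels built with CONFIG_SCHED_HMP since the neighbouring entries are HMP tunables. A minimal userspace sketch of flipping it follows; the path is derived from the kern_table registration above rather than stated anywhere in this diff, and note that sched_set_group_id()/sched_get_group_id() remain in-kernel APIs here — this patch adds no userspace interface for per-task group ids.

/*
 * Sketch: toggle the colocation knob added by this patch. Assumes the
 * sysctl is reachable at /proc/sys/kernel/sched_enable_colocation and
 * that the caller has permission to write it (mode 0644, so root).
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define COLOCATION_KNOB "/proc/sys/kernel/sched_enable_colocation"

static int write_knob(const char *val)
{
	int fd = open(COLOCATION_KNOB, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return -1;
	}
	if (write(fd, val, 1) != 1)
		perror("write");
	close(fd);
	return 0;
}

int main(void)
{
	/* 0: ignore preferred clusters, 1 (default): colocate related threads */
	return write_knob("1") ? 1 : 0;
}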
