Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/sched/core.c  | 238
 -rw-r--r--  kernel/sched/fair.c  | 141
 -rw-r--r--  kernel/sched/sched.h |  37
 -rw-r--r--  kernel/sysctl.c      |   9
 4 files changed, 412 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8cdd373a8980..da693099cc40 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1651,6 +1651,8 @@ __read_mostly unsigned int sysctl_sched_account_wait_time = 1;
__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
+unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
+
#ifdef CONFIG_SCHED_FREQ_INPUT
static __read_mostly unsigned int sched_migration_fixup = 1;
@@ -2575,6 +2577,8 @@ void sched_exit(struct task_struct *p)
struct rq *rq = cpu_rq(cpu);
u64 wallclock;
+ sched_set_group_id(p, 0);
+
raw_spin_lock_irqsave(&rq->lock, flags);
/* rq->curr == p */
wallclock = sched_ktime_clock();
@@ -2979,6 +2983,206 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
update_up_down_migrate();
}
+static LIST_HEAD(related_thread_groups);
+static DEFINE_RWLOCK(related_thread_group_lock);
+static int nr_related_thread_groups;
+
+/* Return the cluster that can offer the required capacity for the group */
+static struct sched_cluster *
+best_cluster(struct related_thread_group *grp, u64 total_demand)
+{
+ struct sched_cluster *cluster = NULL;
+
+ for_each_sched_cluster(cluster) {
+ if (group_will_fit(cluster, grp, total_demand))
+ return cluster;
+ }
+
+ return NULL;
+}
+
+static void _set_preferred_cluster(struct related_thread_group *grp)
+{
+ struct task_struct *p;
+ u64 combined_demand = 0;
+
+ if (!sysctl_sched_enable_colocation) {
+ grp->last_update = sched_ktime_clock();
+ grp->preferred_cluster = NULL;
+ return;
+ }
+
+ /*
+ * Wakeups of two or more related tasks can race with each other and
+ * result in multiple calls to _set_preferred_cluster() being issued at
+ * the same time. Avoid the overhead of rechecking the preferred cluster
+ * in such cases.
+ */
+ if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
+ return;
+
+ list_for_each_entry(p, &grp->tasks, grp_list)
+ combined_demand += p->ravg.demand;
+
+ grp->preferred_cluster = best_cluster(grp, combined_demand);
+ grp->last_update = sched_ktime_clock();
+ trace_sched_set_preferred_cluster(grp, combined_demand);
+}
+
+static void set_preferred_cluster(struct related_thread_group *grp)
+{
+ raw_spin_lock(&grp->lock);
+ _set_preferred_cluster(grp);
+ raw_spin_unlock(&grp->lock);
+}
+
+struct related_thread_group *alloc_related_thread_group(int group_id)
+{
+ struct related_thread_group *grp;
+
+ grp = kzalloc(sizeof(*grp), GFP_KERNEL);
+ if (!grp)
+ return ERR_PTR(-ENOMEM);
+
+ grp->id = group_id;
+ INIT_LIST_HEAD(&grp->tasks);
+ INIT_LIST_HEAD(&grp->list);
+ raw_spin_lock_init(&grp->lock);
+
+ return grp;
+}
+
+struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+{
+ struct related_thread_group *grp;
+
+ list_for_each_entry(grp, &related_thread_groups, list) {
+ if (grp->id == group_id)
+ return grp;
+ }
+
+ return NULL;
+}
+
+static void remove_task_from_group(struct task_struct *p)
+{
+ struct related_thread_group *grp = p->grp;
+ struct rq *rq;
+ int empty_group = 1;
+
+ raw_spin_lock(&grp->lock);
+
+ rq = __task_rq_lock(p);
+ list_del_init(&p->grp_list);
+ p->grp = NULL;
+ __task_rq_unlock(rq);
+
+ if (!list_empty(&grp->tasks)) {
+ empty_group = 0;
+ _set_preferred_cluster(grp);
+ }
+
+ raw_spin_unlock(&grp->lock);
+
+ if (empty_group) {
+ list_del(&grp->list);
+ nr_related_thread_groups--;
+ /* See comments before preferred_cluster() */
+ kfree_rcu(grp, rcu);
+ }
+}
+
+static int
+add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
+{
+ struct rq *rq;
+
+ raw_spin_lock(&grp->lock);
+
+ /*
+ * Change p->grp under rq->lock. This prevents races with read-side
+ * references of p->grp in various hot paths.
+ */
+ rq = __task_rq_lock(p);
+ p->grp = grp;
+ list_add(&p->grp_list, &grp->tasks);
+ __task_rq_unlock(rq);
+
+ _set_preferred_cluster(grp);
+
+ raw_spin_unlock(&grp->lock);
+
+ return 0;
+}
+
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+ int rc = 0, destroy = 0;
+ unsigned long flags;
+ struct related_thread_group *grp = NULL, *new = NULL;
+
+redo:
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+
+ if ((current != p && p->flags & PF_EXITING) ||
+ (!p->grp && !group_id) ||
+ (p->grp && p->grp->id == group_id))
+ goto done;
+
+ write_lock(&related_thread_group_lock);
+
+ if (!group_id) {
+ remove_task_from_group(p);
+ write_unlock(&related_thread_group_lock);
+ goto done;
+ }
+
+ if (p->grp && p->grp->id != group_id)
+ remove_task_from_group(p);
+
+ grp = lookup_related_thread_group(group_id);
+ if (!grp && !new) {
+ /* New group */
+ write_unlock(&related_thread_group_lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ new = alloc_related_thread_group(group_id);
+ if (IS_ERR(new))
+ return -ENOMEM;
+ destroy = 1;
+ /* Rerun checks (like task exiting), since we dropped pi_lock */
+ goto redo;
+ } else if (!grp && new) {
+ /* New group - use object allocated before */
+ destroy = 0;
+ nr_related_thread_groups++;
+ list_add(&new->list, &related_thread_groups);
+ grp = new;
+ }
+
+ BUG_ON(!grp);
+ rc = add_task_to_group(p, grp);
+ write_unlock(&related_thread_group_lock);
+done:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ if (destroy)
+ kfree(new);
+
+ return rc;
+}
+
+unsigned int sched_get_group_id(struct task_struct *p)
+{
+ unsigned long flags;
+ unsigned int group_id;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ group_id = p->grp ? p->grp->id : 0;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return group_id;
+}
+
static int cpufreq_notifier_policy(struct notifier_block *nb,
unsigned long val, void *data)
{
@@ -3161,6 +3365,25 @@ static void restore_orig_mark_start(struct task_struct *p, u64 mark_start)
p->ravg.mark_start = mark_start;
}
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ u32 new_load = task_load(p);
+
+ if (!grp)
+ return 0;
+
+ /*
+ * Update if the task's load has changed significantly or a complete
+ * window has passed since we last updated the preference.
+ */
+ if (abs(new_load - old_load) > sched_ravg_window / 4 ||
+ sched_ktime_clock() - p->grp->last_update > sched_ravg_window)
+ return 1;
+
+ return 0;
+}
+
#else /* CONFIG_SCHED_HMP */
static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { }
@@ -4121,8 +4344,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
struct migration_notify_data mnd;
int heavy_task = 0;
#ifdef CONFIG_SMP
+ unsigned int old_load;
struct rq *rq;
u64 wallclock;
+ struct related_thread_group *grp = NULL;
#endif
/*
@@ -4185,12 +4410,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
rq = cpu_rq(task_cpu(p));
raw_spin_lock(&rq->lock);
+ old_load = task_load(p);
+ grp = task_related_thread_group(p);
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
heavy_task = heavy_task_wakeup(p, rq, TASK_WAKE);
update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
raw_spin_unlock(&rq->lock);
+ if (update_preferred_cluster(grp, p, old_load))
+ set_preferred_cluster(grp);
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -5156,10 +5386,14 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
u64 wallclock;
bool early_notif;
+ u32 old_load;
+ struct related_thread_group *grp;
sched_clock_tick();
raw_spin_lock(&rq->lock);
+ old_load = task_load(curr);
+ grp = task_related_thread_group(curr);
set_window_start(rq);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
@@ -5181,6 +5415,10 @@ void scheduler_tick(void)
trigger_load_balance(rq);
#endif
rq_last_tick_reset(rq);
+
+ if (update_preferred_cluster(grp, curr, old_load))
+ set_preferred_cluster(grp);
+
if (curr->sched_class == &fair_sched_class)
check_for_migration(rq, curr);
}
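
The sched_set_group_id() path above has to allocate a new group with GFP_KERNEL, which may sleep, so it drops p->pi_lock and the group-list lock around the allocation and then jumps back to redo to recheck everything (task exiting, group created by a racing writer) before publishing. Below is a minimal user-space sketch of that drop-lock/allocate/recheck pattern, assuming a pthread mutex and a plain linked list in place of the kernel's locks and lists; all names here are illustrative, not kernel APIs.

/*
 * Illustrative stand-alone program: allocate outside the lock, retake it,
 * recheck, and discard the allocation if another thread won the race.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	int id;
	struct group *next;
};

static pthread_mutex_t groups_lock = PTHREAD_MUTEX_INITIALIZER;
static struct group *groups;

static struct group *lookup_group(int id)
{
	struct group *g;

	for (g = groups; g; g = g->next)
		if (g->id == id)
			return g;
	return NULL;
}

static struct group *get_or_create_group(int id)
{
	struct group *g, *new = NULL;

redo:
	pthread_mutex_lock(&groups_lock);
	g = lookup_group(id);
	if (!g && !new) {
		/* Drop the lock for the (potentially blocking) allocation. */
		pthread_mutex_unlock(&groups_lock);
		new = malloc(sizeof(*new));
		if (!new)
			return NULL;
		new->id = id;
		/* Conditions may have changed while unlocked: recheck. */
		goto redo;
	}
	if (!g && new) {
		/* Nobody raced us; publish the new group. */
		new->next = groups;
		groups = new;
		g = new;
		new = NULL;
	}
	pthread_mutex_unlock(&groups_lock);
	free(new);	/* lost the race (or never allocated): discard */
	return g;
}

int main(void)
{
	struct group *g = get_or_create_group(5);

	printf("group %d at %p\n", g ? g->id : -1, (void *)g);
	return 0;
}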
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 950ab9229cfc..1b64a6ae333c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2992,6 +2992,30 @@ static int task_will_fit(struct task_struct *p, int cpu)
return task_load_will_fit(p, tload, cpu);
}
+int group_will_fit(struct sched_cluster *cluster,
+ struct related_thread_group *grp, u64 demand)
+{
+ int cpu = cluster_first_cpu(cluster);
+ int prev_capacity = 0;
+ unsigned int threshold = sched_upmigrate;
+ u64 load;
+
+ if (cluster->capacity == max_capacity)
+ return 1;
+
+ if (grp->preferred_cluster)
+ prev_capacity = grp->preferred_cluster->capacity;
+
+ if (cluster->capacity < prev_capacity)
+ threshold = sched_downmigrate;
+
+ load = scale_load_to_cpu(demand, cpu);
+ if (load < threshold)
+ return 1;
+
+ return 0;
+}
+
struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
{
return NULL;
@@ -3070,6 +3094,7 @@ unlock:
struct cpu_select_env {
struct task_struct *p;
+ struct related_thread_group *rtg;
u8 reason;
u8 need_idle:1;
u8 boost:1;
@@ -3093,6 +3118,34 @@ struct cluster_cpu_stats {
#define DOWN_MIGRATION 2
#define IRQLOAD_MIGRATION 3
+/*
+ * Invoked from three places:
+ * 1) try_to_wake_up() -> ... -> select_best_cpu()
+ * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
+ * 3) can_migrate_task()
+ *
+ * It's safe to dereference p->grp in the first case (since p->pi_lock is
+ * held) but not in the others. p->grp is hence freed after an RCU grace
+ * period and accessed under rcu_read_lock().
+ */
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ struct related_thread_group *grp;
+ int rc = 0;
+
+ rcu_read_lock();
+
+ grp = p->grp;
+ if (!grp || !sysctl_sched_enable_colocation)
+ rc = 1;
+ else
+ rc = (grp->preferred_cluster == cluster);
+
+ rcu_read_unlock();
+ return rc;
+}
+
static int
spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
{
@@ -3158,6 +3211,9 @@ acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
static int
skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
{
+ if (!test_bit(cluster->id, env->candidate_list))
+ return 1;
+
if (!acceptable_capacity(cluster, env)) {
__clear_bit(cluster->id, env->candidate_list);
return 1;
@@ -3171,6 +3227,12 @@ select_least_power_cluster(struct cpu_select_env *env)
{
struct sched_cluster *cluster;
+ if (env->rtg) {
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(env->rtg->preferred_cluster));
+ return env->rtg->preferred_cluster;
+ }
+
for_each_sched_cluster(cluster) {
if (!skip_cluster(cluster, env)) {
int cpu = cluster_first_cpu(cluster);
@@ -3241,6 +3303,9 @@ next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
__clear_bit(cluster->id, env->candidate_list);
+ if (env->rtg && preferred_cluster(cluster, env->p))
+ return NULL;
+
do {
if (bitmap_empty(env->candidate_list, num_clusters))
return NULL;
@@ -3397,13 +3462,26 @@ bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
return true;
}
+static inline int
+cluster_allowed(struct task_struct *p, struct sched_cluster *cluster)
+{
+ cpumask_t tmp_mask;
+
+ cpumask_and(&tmp_mask, &cluster->cpus, cpu_active_mask);
+ cpumask_and(&tmp_mask, &tmp_mask, &p->cpus_allowed);
+
+ return !cpumask_empty(&tmp_mask);
+}
+
+
/* return cheapest cpu that can fit this task */
static int select_best_cpu(struct task_struct *p, int target, int reason,
int sync)
{
- struct sched_cluster *cluster;
+ struct sched_cluster *cluster, *pref_cluster = NULL;
struct cluster_cpu_stats stats;
bool fast_path = false;
+ struct related_thread_group *grp;
struct cpu_select_env env = {
.p = p,
@@ -3413,6 +3491,7 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
.sync = sync,
.prev_cpu = target,
.ignore_prev_cpu = 0,
+ .rtg = NULL,
};
bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
@@ -3420,26 +3499,39 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
init_cluster_cpu_stats(&stats);
- if (bias_to_prev_cpu(&env, &stats)) {
+ rcu_read_lock();
+
+ grp = p->grp;
+
+ if (grp && grp->preferred_cluster) {
+ pref_cluster = grp->preferred_cluster;
+ if (!cluster_allowed(p, pref_cluster))
+ clear_bit(pref_cluster->id, env.candidate_list);
+ else
+ env.rtg = grp;
+ } else if (bias_to_prev_cpu(&env, &stats)) {
fast_path = true;
goto out;
}
- rcu_read_lock();
+retry:
cluster = select_least_power_cluster(&env);
- if (!cluster) {
- rcu_read_unlock();
+ if (!cluster)
goto out;
- }
+
+ /*
+ * 'cluster' now points to the minimum-power cluster that can satisfy the
+ * task's perf goals. Walk down the cluster list starting with that
+ * cluster. For non-small tasks, skip clusters that don't have
+ * mostly_idle/idle cpus.
+ */
do {
find_best_cpu_in_cluster(cluster, &env, &stats);
} while ((cluster = next_best_cluster(cluster, &env)));
- rcu_read_unlock();
-
if (stats.best_idle_cpu >= 0) {
target = stats.best_idle_cpu;
} else if (stats.best_cpu >= 0) {
@@ -3449,12 +3541,18 @@ static int select_best_cpu(struct task_struct *p, int target, int reason,
target = stats.best_cpu;
} else {
+ if (env.rtg) {
+ env.rtg = NULL;
+ goto retry;
+ }
+
find_backup_cluster(&env, &stats);
if (stats.best_capacity_cpu >= 0)
target = stats.best_capacity_cpu;
}
out:
+ rcu_read_unlock();
trace_sched_task_load(p, sched_boost(), env.reason, env.sync,
env.need_idle, fast_path, target);
return target;
@@ -3949,11 +4047,11 @@ static inline int migration_needed(struct task_struct *p, int cpu)
return IRQLOAD_MIGRATION;
nice = task_nice(p);
- if ((nice > sched_upmigrate_min_nice || upmigrate_discouraged(p)) &&
- cpu_capacity(cpu) > min_capacity)
+ if (!p->grp && (nice > sched_upmigrate_min_nice ||
+ upmigrate_discouraged(p)) && cpu_capacity(cpu) > min_capacity)
return DOWN_MIGRATION;
- if (!task_will_fit(p, cpu))
+ if (!p->grp && !task_will_fit(p, cpu))
return UP_MIGRATION;
return 0;
@@ -4092,6 +4190,12 @@ inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
static inline void
dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) { }
+static inline int
+preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
+{
+ return 1;
+}
+
#endif /* CONFIG_SCHED_HMP */
@@ -4489,6 +4593,8 @@ void init_new_task_load(struct task_struct *p)
u32 init_load_pct = current->init_load_pct;
p->init_load_pct = 0;
+ p->grp = NULL;
+ INIT_LIST_HEAD(&p->grp_list);
memset(&p->ravg, 0, sizeof(struct ravg));
if (init_load_pct) {
@@ -7347,6 +7453,7 @@ enum fbq_type { regular, remote, all };
#define LBF_HMP_ACTIVE_BALANCE (LBF_SCHED_BOOST_ACTIVE_BALANCE | \
LBF_BIG_TASK_ACTIVE_BALANCE)
#define LBF_IGNORE_BIG_TASKS 0x100
+#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200
struct lb_env {
struct sched_domain *sd;
@@ -7534,6 +7641,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
return 0;
+ if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
+ !preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
+ return 0;
+
/*
* Group imbalance can sometimes cause work to be pulled across groups
* even though the group could have managed the imbalance on its own.
@@ -7644,6 +7755,8 @@ static int detach_tasks(struct lb_env *env)
if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
!sched_boost())
env->flags |= LBF_IGNORE_BIG_TASKS;
+ else if (!same_cluster(env->dst_cpu, env->src_cpu))
+ env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
redo:
while (!list_empty(tasks)) {
@@ -7708,9 +7821,11 @@ next:
list_move_tail(&p->se.group_node, tasks);
}
- if (env->flags & LBF_IGNORE_BIG_TASKS && !detached) {
+ if (env->flags & (LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) {
tasks = &env->src_rq->cfs_tasks;
- env->flags &= ~LBF_IGNORE_BIG_TASKS;
+ env->flags &= ~(LBF_IGNORE_BIG_TASKS |
+ LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
env->loop = orig_loop;
goto redo;
}
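
group_will_fit() and best_cluster() together pick the lowest-capacity cluster whose CPU-scaled view of the group's combined demand stays below the upmigrate threshold, using the downmigrate threshold instead when considering a move to a smaller cluster so there is some hysteresis. The following is a stand-alone sketch of that decision with illustrative capacities, thresholds and scaling, not the kernel's actual tunables.

/*
 * Illustrative stand-alone program: choose the smallest cluster whose
 * scaled load for the group's combined demand stays under the threshold.
 */
#include <stdio.h>

#define NR_CLUSTERS	2
#define MAX_CAPACITY	1024

static const int capacity[NR_CLUSTERS] = { 512, 1024 };	/* little, big */
static const unsigned int upmigrate = 80;	/* illustrative thresholds */
static const unsigned int downmigrate = 60;	/* hysteresis when moving down */

/* Demand is tracked against the max-capacity cluster; rescale for 'c'. */
static unsigned long long scale_demand(unsigned long long demand, int c)
{
	return demand * MAX_CAPACITY / capacity[c];
}

static int best_cluster(unsigned long long combined_demand, int prev)
{
	int c;

	for (c = 0; c < NR_CLUSTERS; c++) {	/* ordered by capacity */
		unsigned int threshold = upmigrate;

		if (capacity[c] == MAX_CAPACITY)
			return c;	/* biggest cluster always fits */
		if (prev >= 0 && capacity[c] < capacity[prev])
			threshold = downmigrate;
		if (scale_demand(combined_demand, c) < threshold)
			return c;
	}
	return NR_CLUSTERS - 1;
}

int main(void)
{
	printf("demand 30 -> cluster %d\n", best_cluster(30, -1));
	printf("demand 70 -> cluster %d\n", best_cluster(70, -1));
	return 0;
}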
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e4f0887136c..6cd1dc3b6267 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -395,9 +395,21 @@ static inline int cluster_first_cpu(struct sched_cluster *cluster)
return cpumask_first(&cluster->cpus);
}
+struct related_thread_group {
+ int id;
+ raw_spinlock_t lock;
+ struct list_head tasks;
+ struct list_head list;
+ struct sched_cluster *preferred_cluster;
+ struct rcu_head rcu;
+ u64 last_update;
+};
+
extern struct list_head cluster_head;
extern int num_clusters;
extern struct sched_cluster *sched_cluster[NR_CPUS];
+extern int group_will_fit(struct sched_cluster *cluster,
+ struct related_thread_group *grp, u64 demand);
#define for_each_sched_cluster(cluster) \
list_for_each_entry_rcu(cluster, &cluster_head, list)
@@ -1035,6 +1047,7 @@ extern unsigned int max_task_load(void);
extern void sched_account_irqtime(int cpu, struct task_struct *curr,
u64 delta, u64 wallclock);
unsigned int cpu_temp(int cpu);
+int sched_set_group_id(struct task_struct *p, unsigned int group_id);
extern unsigned int nr_eligible_big_tasks(int cpu);
extern void update_up_down_migrate(void);
@@ -1188,11 +1201,18 @@ static inline int sched_cpu_high_irqload(int cpu)
return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload;
}
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return p->grp;
+}
+
#else /* CONFIG_SCHED_HMP */
#define sched_use_pelt 0
struct hmp_sched_stats;
+struct related_thread_group;
static inline u64 scale_load_to_cpu(u64 load, int cpu)
{
@@ -1230,6 +1250,22 @@ static inline void sched_account_irqtime(int cpu, struct task_struct *curr,
static inline int sched_cpu_high_irqload(int cpu) { return 0; }
+static inline void set_preferred_cluster(struct related_thread_group *grp) { }
+
+static inline
+struct related_thread_group *task_related_thread_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+static inline u32 task_load(struct task_struct *p) { return 0; }
+
+static inline int update_preferred_cluster(struct related_thread_group *grp,
+ struct task_struct *p, u32 old_load)
+{
+ return 0;
+}
+
#endif /* CONFIG_SCHED_HMP */
/*
@@ -1239,6 +1275,7 @@ static inline int sched_cpu_high_irqload(int cpu) { return 0; }
#define group_rq_capacity(group) cpu_capacity(group_first_cpu(group))
#ifdef CONFIG_SCHED_FREQ_INPUT
+
extern void check_for_freq_change(struct rq *rq);
/* Is frequency of two cpus synchronized with each other? */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1f2afa6eefaf..878b64bfcc7a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -448,6 +448,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_hmp_proc_update_handler,
},
{
+ .procname = "sched_enable_colocation",
+ .data = &sysctl_sched_enable_colocation,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),
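
With the kern_table entry above, the knob should surface as /proc/sys/kernel/sched_enable_colocation (assuming the default procfs sysctl mount), so colocation can be toggled at runtime without rebuilding the kernel. A small user-space sketch of flipping it:

/*
 * Illustrative toggle of the colocation sysctl from user space; the path
 * assumes the standard /proc/sys/kernel location for kern_table entries.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_enable_colocation", "w");

	if (!f) {
		perror("sched_enable_colocation");
		return 1;
	}
	fputs("0\n", f);	/* 0 = disable colocation, 1 = enable */
	fclose(f);
	return 0;
}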