Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 2048 |
1 file changed, 1811 insertions(+), 237 deletions(-)
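Among the changes in the diff below is a per-task PF_WAKE_UP_IDLE hint: sched_get_wake_up_idle()/sched_set_wake_up_idle() accessors are added, and the flag is consulted by wake_to_idle() and select_idle_sibling() so that a task (or its waker) can ask to be woken on an idle CPU. A minimal stand-alone sketch of the same flag handling follows; the struct is reduced to a single field and the flag's numeric value is an illustrative assumption, not the kernel's definition.

/*
 * Stand-alone sketch of the PF_WAKE_UP_IDLE accessors added in this diff.
 * Not the kernel implementation: task_struct is reduced to the one field
 * the example needs, and PF_WAKE_UP_IDLE's value is an assumption.
 */
#include <stdio.h>

#define PF_WAKE_UP_IDLE 0x00000002	/* illustrative value only */

struct task_struct {
	unsigned int flags;
};

static unsigned int sched_get_wake_up_idle(struct task_struct *p)
{
	/* Normalize the flag bit to 0 or 1, as the kernel helper does. */
	return !!(p->flags & PF_WAKE_UP_IDLE);
}

static int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle)
{
	if (wake_up_idle)
		p->flags |= PF_WAKE_UP_IDLE;
	else
		p->flags &= ~PF_WAKE_UP_IDLE;

	return 0;
}

/* A wakeup targets an idle CPU if either the waker or the wakee set the hint. */
static int wake_to_idle(const struct task_struct *waker,
			const struct task_struct *wakee)
{
	return (waker->flags & PF_WAKE_UP_IDLE) ||
	       (wakee->flags & PF_WAKE_UP_IDLE);
}

int main(void)
{
	struct task_struct waker = { 0 }, wakee = { 0 };

	sched_set_wake_up_idle(&wakee, 1);
	printf("wake_to_idle: %d (wakee hint=%u)\n",
	       wake_to_idle(&waker, &wakee),
	       sched_get_wake_up_idle(&wakee));
	return 0;
}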
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06e77d60a510..2ea3a4337dde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,9 +32,8 @@ #include <linux/task_work.h> #include <linux/module.h> -#include <trace/events/sched.h> - #include "sched.h" +#include <trace/events/sched.h> #include "tune.h" #include "walt.h" @@ -56,12 +55,6 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; -#ifdef CONFIG_SCHED_WALT -unsigned int sysctl_sched_use_walt_cpu_util = 1; -unsigned int sysctl_sched_use_walt_task_util = 1; -__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = - (10 * NSEC_PER_MSEC); -#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -254,6 +247,9 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#ifdef CONFIG_SMP +static int active_load_balance_cpu_stop(void *data); +#endif const struct sched_class fair_sched_class; @@ -891,12 +887,56 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; } +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. 
+ */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} +#endif + /* * Task is being enqueued - update stats: */ @@ -910,23 +950,6 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -2612,7 +2635,25 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ +u32 sched_get_wake_up_idle(struct task_struct *p) +{ + u32 enabled = p->flags & PF_WAKE_UP_IDLE; + + return !!enabled; +} + +int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle) +{ + int enable = !!wake_up_idle; + + if (enable) + p->flags |= PF_WAKE_UP_IDLE; + else + p->flags &= ~PF_WAKE_UP_IDLE; + + return 0; +} + static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, @@ -2692,6 +2733,1064 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#ifdef CONFIG_SCHED_HMP + +/* CPU selection flag */ +#define SBC_FLAG_PREV_CPU 0x1 +#define SBC_FLAG_BEST_CAP_CPU 0x2 +#define SBC_FLAG_CPU_COST 0x4 +#define SBC_FLAG_MIN_COST 0x8 +#define SBC_FLAG_IDLE_LEAST_LOADED 0x10 +#define SBC_FLAG_IDLE_CSTATE 0x20 +#define SBC_FLAG_COST_CSTATE_TIE_BREAKER 0x40 +#define SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER 0x80 +#define SBC_FLAG_CSTATE_LOAD 0x100 +#define SBC_FLAG_BEST_SIBLING 0x200 +#define SBC_FLAG_WAKER_CPU 0x400 +#define SBC_FLAG_PACK_TASK 0x800 + +/* Cluster selection flag */ +#define SBC_FLAG_COLOC_CLUSTER 0x10000 +#define SBC_FLAG_WAKER_CLUSTER 0x20000 +#define SBC_FLAG_BACKUP_CLUSTER 0x40000 +#define SBC_FLAG_BOOST_CLUSTER 0x80000 + +struct cpu_select_env { + struct task_struct *p; + struct related_thread_group *rtg; + u8 reason; + u8 need_idle:1; + u8 need_waker_cluster:1; + u8 sync:1; + enum sched_boost_policy boost_policy; + u8 pack_task:1; + int prev_cpu; + DECLARE_BITMAP(candidate_list, NR_CPUS); + DECLARE_BITMAP(backup_list, NR_CPUS); + u64 task_load; + u64 cpu_load; + u32 sbc_best_flag; + u32 sbc_best_cluster_flag; + struct cpumask search_cpus; +}; + +struct cluster_cpu_stats { + int best_idle_cpu, least_loaded_cpu; + int best_capacity_cpu, best_cpu, best_sibling_cpu; + int min_cost, best_sibling_cpu_cost; + int best_cpu_wakeup_latency; + u64 
min_load, best_load, best_sibling_cpu_load; + s64 highest_spare_capacity; +}; + +/* + * Should task be woken to any available idle cpu? + * + * Waking tasks to idle cpu has mixed implications on both performance and + * power. In many cases, scheduler can't estimate correctly impact of using idle + * cpus on either performance or power. PF_WAKE_UP_IDLE allows external kernel + * module to pass a strong hint to scheduler that the task in question should be + * woken to idle cpu, generally to improve performance. + */ +static inline int wake_to_idle(struct task_struct *p) +{ + return (current->flags & PF_WAKE_UP_IDLE) || + (p->flags & PF_WAKE_UP_IDLE); +} + +static int spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq) +{ + u64 total_load; + + total_load = env->task_load + env->cpu_load; + + if (total_load > sched_spill_load || + (rq->nr_running + 1) > sysctl_sched_spill_nr_run) + return 1; + + return 0; +} + +static int skip_cpu(int cpu, struct cpu_select_env *env) +{ + int tcpu = task_cpu(env->p); + int skip = 0; + + if (!env->reason) + return 0; + + if (is_reserved(cpu)) + return 1; + + switch (env->reason) { + case UP_MIGRATION: + skip = !idle_cpu(cpu); + break; + case IRQLOAD_MIGRATION: + /* Purposely fall through */ + default: + skip = (cpu == tcpu); + break; + } + + return skip; +} + +static inline int +acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + int tcpu; + + if (!env->reason) + return 1; + + tcpu = task_cpu(env->p); + switch (env->reason) { + case UP_MIGRATION: + return cluster->capacity > cpu_capacity(tcpu); + + case DOWN_MIGRATION: + return cluster->capacity < cpu_capacity(tcpu); + + default: + break; + } + + return 1; +} + +static int +skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env) +{ + if (!test_bit(cluster->id, env->candidate_list)) + return 1; + + if (!acceptable_capacity(cluster, env)) { + __clear_bit(cluster->id, env->candidate_list); + return 1; + } + + return 0; +} + +static struct sched_cluster * +select_least_power_cluster(struct cpu_select_env *env) +{ + struct sched_cluster *cluster; + + if (env->rtg) { + int cpu = cluster_first_cpu(env->rtg->preferred_cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), cpu); + + if (task_load_will_fit(env->p, env->task_load, + cpu, env->boost_policy)) { + env->sbc_best_cluster_flag |= SBC_FLAG_COLOC_CLUSTER; + + if (env->boost_policy == SCHED_BOOST_NONE) + return env->rtg->preferred_cluster; + + for_each_sched_cluster(cluster) { + if (cluster != env->rtg->preferred_cluster) { + __set_bit(cluster->id, + env->backup_list); + __clear_bit(cluster->id, + env->candidate_list); + } + } + + return env->rtg->preferred_cluster; + } + + /* + * Since the task load does not fit on the preferred + * cluster anymore, pretend that the task does not + * have any preferred cluster. This allows the waking + * task to get the appropriate CPU it needs as per the + * non co-location placement policy without having to + * wait until the preferred cluster is updated. 
+ */ + env->rtg = NULL; + } + + for_each_sched_cluster(cluster) { + if (!skip_cluster(cluster, env)) { + int cpu = cluster_first_cpu(cluster); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cpu); + if (task_load_will_fit(env->p, env->task_load, cpu, + env->boost_policy)) + return cluster; + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + } + } + + return NULL; +} + +static struct sched_cluster * +next_candidate(const unsigned long *list, int start, int end) +{ + int cluster_id; + + cluster_id = find_next_bit(list, end, start - 1 + 1); + if (cluster_id >= end) + return NULL; + + return sched_cluster[cluster_id]; +} + +static void +update_spare_capacity(struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu, int capacity, + u64 cpu_load) +{ + s64 spare_capacity = sched_ravg_window - cpu_load; + + if (spare_capacity > 0 && + (spare_capacity > stats->highest_spare_capacity || + (spare_capacity == stats->highest_spare_capacity && + ((!env->need_waker_cluster && + capacity > cpu_capacity(stats->best_capacity_cpu)) || + (env->need_waker_cluster && + cpu_rq(cpu)->nr_running < + cpu_rq(stats->best_capacity_cpu)->nr_running))))) { + /* + * If sync waker is the only runnable of CPU, cr_avg of the + * CPU is 0 so we have high chance to place the wakee on the + * waker's CPU which likely causes preemtion of the waker. + * This can lead migration of preempted waker. Place the + * wakee on the real idle CPU when it's possible by checking + * nr_running to avoid such preemption. + */ + stats->highest_spare_capacity = spare_capacity; + stats->best_capacity_cpu = cpu; + } +} + +static inline void find_backup_cluster( +struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + int i; + struct cpumask search_cpus; + + while (!bitmap_empty(env->backup_list, num_clusters)) { + next = next_candidate(env->backup_list, 0, num_clusters); + __clear_bit(next->id, env->backup_list); + + cpumask_and(&search_cpus, &env->search_cpus, &next->cpus); + for_each_cpu(i, &search_cpus) { + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + update_spare_capacity(stats, env, i, next->capacity, + cpu_load_sync(i, env->sync)); + } + env->sbc_best_cluster_flag = SBC_FLAG_BACKUP_CLUSTER; + } +} + +struct sched_cluster * +next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env, + struct cluster_cpu_stats *stats) +{ + struct sched_cluster *next = NULL; + + __clear_bit(cluster->id, env->candidate_list); + + if (env->rtg && preferred_cluster(cluster, env->p)) + return NULL; + + do { + if (bitmap_empty(env->candidate_list, num_clusters)) + return NULL; + + next = next_candidate(env->candidate_list, 0, num_clusters); + if (next) { + if (next->min_power_cost > stats->min_cost) { + clear_bit(next->id, env->candidate_list); + next = NULL; + continue; + } + + if (skip_cluster(next, env)) + next = NULL; + } + } while (!next); + + env->task_load = scale_load_to_cpu(task_load(env->p), + cluster_first_cpu(next)); + return next; +} + +#ifdef CONFIG_SCHED_HMP_CSTATE_AWARE +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int wakeup_latency; + int prev_cpu = env->prev_cpu; + + wakeup_latency = cpu_rq(cpu)->wakeup_latency; + + if (env->need_idle) { + stats->min_cost = cpu_cost; + if (idle_cpu(cpu)) { + if (wakeup_latency < 
stats->best_cpu_wakeup_latency || + (wakeup_latency == stats->best_cpu_wakeup_latency && + cpu == prev_cpu)) { + stats->best_idle_cpu = cpu; + stats->best_cpu_wakeup_latency = wakeup_latency; + } + } else { + if (env->cpu_load < stats->min_load || + (env->cpu_load == stats->min_load && + cpu == prev_cpu)) { + stats->least_loaded_cpu = cpu; + stats->min_load = env->cpu_load; + } + } + + return; + } + + if (cpu_cost < stats->min_cost) { + stats->min_cost = cpu_cost; + stats->best_cpu_wakeup_latency = wakeup_latency; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CPU_COST; + return; + } + + /* CPU cost is the same. Start breaking the tie by C-state */ + + if (wakeup_latency > stats->best_cpu_wakeup_latency) + return; + + if (wakeup_latency < stats->best_cpu_wakeup_latency) { + stats->best_cpu_wakeup_latency = wakeup_latency; + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_TIE_BREAKER; + return; + } + + /* C-state is the same. Use prev CPU to break the tie */ + if (cpu == prev_cpu) { + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_COST_CSTATE_PREV_CPU_TIE_BREAKER; + return; + } + + if (stats->best_cpu != prev_cpu && + ((wakeup_latency == 0 && env->cpu_load < stats->best_load) || + (wakeup_latency > 0 && env->cpu_load > stats->best_load))) { + stats->best_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_CSTATE_LOAD; + } +} +#else /* CONFIG_SCHED_HMP_CSTATE_AWARE */ +static void __update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env, int cpu_cost) +{ + int prev_cpu = env->prev_cpu; + + if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) { + if (stats->best_sibling_cpu_cost > cpu_cost || + (stats->best_sibling_cpu_cost == cpu_cost && + stats->best_sibling_cpu_load > env->cpu_load)) { + stats->best_sibling_cpu_cost = cpu_cost; + stats->best_sibling_cpu_load = env->cpu_load; + stats->best_sibling_cpu = cpu; + } + } + + if ((cpu_cost < stats->min_cost) || + ((stats->best_cpu != prev_cpu && + stats->min_load > env->cpu_load) || cpu == prev_cpu)) { + if (env->need_idle) { + if (idle_cpu(cpu)) { + stats->min_cost = cpu_cost; + stats->best_idle_cpu = cpu; + } + } else { + stats->min_cost = cpu_cost; + stats->min_load = env->cpu_load; + stats->best_cpu = cpu; + env->sbc_best_flag = SBC_FLAG_MIN_COST; + } + } +} +#endif /* CONFIG_SCHED_HMP_CSTATE_AWARE */ + +static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats, + struct cpu_select_env *env) +{ + int cpu_cost; + + /* + * We try to find the least loaded *busy* CPU irrespective + * of the power cost. 
+ */ + if (env->pack_task) + cpu_cost = cpu_min_power_cost(cpu); + + else + cpu_cost = power_cost(cpu, task_load(env->p) + + cpu_cravg_sync(cpu, env->sync)); + + if (cpu_cost <= stats->min_cost) + __update_cluster_stats(cpu, stats, env, cpu_cost); +} + +static void find_best_cpu_in_cluster(struct sched_cluster *c, + struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int i; + struct cpumask search_cpus; + + cpumask_and(&search_cpus, &env->search_cpus, &c->cpus); + + env->need_idle = wake_to_idle(env->p) || c->wake_up_idle; + + for_each_cpu(i, &search_cpus) { + env->cpu_load = cpu_load_sync(i, env->sync); + + trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, task_load(env->p) + + cpu_cravg_sync(i, env->sync)), 0); + + if (skip_cpu(i, env)) + continue; + + update_spare_capacity(stats, env, i, c->capacity, + env->cpu_load); + + /* + * need_idle takes precedence over sched boost but when both + * are set, idlest CPU with in all the clusters is selected + * when boost_policy = BOOST_ON_ALL whereas idlest CPU in the + * big cluster is selected within boost_policy = BOOST_ON_BIG. + */ + if ((!env->need_idle && + env->boost_policy != SCHED_BOOST_NONE) || + env->need_waker_cluster || + sched_cpu_high_irqload(i) || + spill_threshold_crossed(env, cpu_rq(i))) + continue; + + update_cluster_stats(i, stats, env); + } +} + +static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats) +{ + stats->best_cpu = stats->best_idle_cpu = -1; + stats->best_capacity_cpu = stats->best_sibling_cpu = -1; + stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX; + stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX; + stats->highest_spare_capacity = 0; + stats->least_loaded_cpu = -1; + stats->best_cpu_wakeup_latency = INT_MAX; + /* No need to initialize stats->best_load */ +} + +static inline bool env_has_special_flags(struct cpu_select_env *env) +{ + if (env->need_idle || env->boost_policy != SCHED_BOOST_NONE || + env->reason) + return true; + + return false; +} + +static inline bool +bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats) +{ + int prev_cpu; + struct task_struct *task = env->p; + struct sched_cluster *cluster; + + if (!task->ravg.mark_start || !sched_short_sleep_task_threshold) + return false; + + prev_cpu = env->prev_cpu; + if (!cpumask_test_cpu(prev_cpu, &env->search_cpus)) + return false; + + if (task->ravg.mark_start - task->last_cpu_selected_ts >= + sched_long_cpu_selection_threshold) + return false; + + /* + * This function should be used by task wake up path only as it's + * assuming p->last_switch_out_ts as last sleep time. + * p->last_switch_out_ts can denote last preemption time as well as + * last sleep time. 
+ */ + if (task->ravg.mark_start - task->last_switch_out_ts >= + sched_short_sleep_task_threshold) + return false; + + env->task_load = scale_load_to_cpu(task_load(task), prev_cpu); + cluster = cpu_rq(prev_cpu)->cluster; + + if (!task_load_will_fit(task, env->task_load, prev_cpu, + sched_boost_policy())) { + + __set_bit(cluster->id, env->backup_list); + __clear_bit(cluster->id, env->candidate_list); + return false; + } + + env->cpu_load = cpu_load_sync(prev_cpu, env->sync); + if (sched_cpu_high_irqload(prev_cpu) || + spill_threshold_crossed(env, cpu_rq(prev_cpu))) { + update_spare_capacity(stats, env, prev_cpu, + cluster->capacity, env->cpu_load); + cpumask_clear_cpu(prev_cpu, &env->search_cpus); + return false; + } + + return true; +} + +static inline bool +wake_to_waker_cluster(struct cpu_select_env *env) +{ + return env->sync && + task_load(current) > sched_big_waker_task_load && + task_load(env->p) < sched_small_wakee_task_load; +} + +static inline bool +bias_to_waker_cpu(struct cpu_select_env *env, int cpu) +{ + return sysctl_sched_prefer_sync_wakee_to_waker && + cpu_rq(cpu)->nr_running == 1 && + cpumask_test_cpu(cpu, &env->search_cpus); +} + +static inline int +cluster_allowed(struct cpu_select_env *env, struct sched_cluster *cluster) +{ + return cpumask_intersects(&env->search_cpus, &cluster->cpus); +} + +/* return cheapest cpu that can fit this task */ +static int select_best_cpu(struct task_struct *p, int target, int reason, + int sync) +{ + struct sched_cluster *cluster, *pref_cluster = NULL; + struct cluster_cpu_stats stats; + struct related_thread_group *grp; + unsigned int sbc_flag = 0; + int cpu = raw_smp_processor_id(); + bool special; + + struct cpu_select_env env = { + .p = p, + .reason = reason, + .need_idle = wake_to_idle(p), + .need_waker_cluster = 0, + .sync = sync, + .prev_cpu = target, + .rtg = NULL, + .sbc_best_flag = 0, + .sbc_best_cluster_flag = 0, + .pack_task = false, + }; + + env.boost_policy = task_sched_boost(p) ? + sched_boost_policy() : SCHED_BOOST_NONE; + + bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS); + bitmap_zero(env.backup_list, NR_CPUS); + + cpumask_and(&env.search_cpus, tsk_cpus_allowed(p), cpu_active_mask); + cpumask_andnot(&env.search_cpus, &env.search_cpus, cpu_isolated_mask); + + init_cluster_cpu_stats(&stats); + special = env_has_special_flags(&env); + + rcu_read_lock(); + + grp = task_related_thread_group(p); + + if (grp && grp->preferred_cluster) { + pref_cluster = grp->preferred_cluster; + if (!cluster_allowed(&env, pref_cluster)) + clear_bit(pref_cluster->id, env.candidate_list); + else + env.rtg = grp; + } else if (!special) { + cluster = cpu_rq(cpu)->cluster; + if (wake_to_waker_cluster(&env)) { + if (bias_to_waker_cpu(&env, cpu)) { + target = cpu; + sbc_flag = SBC_FLAG_WAKER_CLUSTER | + SBC_FLAG_WAKER_CPU; + goto out; + } else if (cluster_allowed(&env, cluster)) { + env.need_waker_cluster = 1; + bitmap_zero(env.candidate_list, NR_CPUS); + __set_bit(cluster->id, env.candidate_list); + env.sbc_best_cluster_flag = + SBC_FLAG_WAKER_CLUSTER; + } + } else if (bias_to_prev_cpu(&env, &stats)) { + sbc_flag = SBC_FLAG_PREV_CPU; + goto out; + } + } + + if (!special && is_short_burst_task(p)) { + env.pack_task = true; + sbc_flag = SBC_FLAG_PACK_TASK; + } +retry: + cluster = select_least_power_cluster(&env); + + if (!cluster) + goto out; + + /* + * 'cluster' now points to the minimum power cluster which can satisfy + * task's perf goals. Walk down the cluster list starting with that + * cluster. 
For non-small tasks, skip clusters that don't have + * mostly_idle/idle cpus + */ + + do { + find_best_cpu_in_cluster(cluster, &env, &stats); + + } while ((cluster = next_best_cluster(cluster, &env, &stats))); + + if (env.need_idle) { + if (stats.best_idle_cpu >= 0) { + target = stats.best_idle_cpu; + sbc_flag |= SBC_FLAG_IDLE_CSTATE; + } else if (stats.least_loaded_cpu >= 0) { + target = stats.least_loaded_cpu; + sbc_flag |= SBC_FLAG_IDLE_LEAST_LOADED; + } + } else if (stats.best_cpu >= 0) { + if (stats.best_sibling_cpu >= 0 && + stats.best_cpu != task_cpu(p) && + stats.min_cost == stats.best_sibling_cpu_cost) { + stats.best_cpu = stats.best_sibling_cpu; + sbc_flag |= SBC_FLAG_BEST_SIBLING; + } + sbc_flag |= env.sbc_best_flag; + target = stats.best_cpu; + } else { + if (env.rtg && env.boost_policy == SCHED_BOOST_NONE) { + env.rtg = NULL; + goto retry; + } + + /* + * With boost_policy == SCHED_BOOST_ON_BIG, we reach here with + * backup_list = little cluster, candidate_list = none and + * stats->best_capacity_cpu points the best spare capacity + * CPU among the CPUs in the big cluster. + */ + if (env.boost_policy == SCHED_BOOST_ON_BIG && + stats.best_capacity_cpu >= 0) + sbc_flag |= SBC_FLAG_BOOST_CLUSTER; + else + find_backup_cluster(&env, &stats); + + if (stats.best_capacity_cpu >= 0) { + target = stats.best_capacity_cpu; + sbc_flag |= SBC_FLAG_BEST_CAP_CPU; + } + } + p->last_cpu_selected_ts = sched_ktime_clock(); +out: + sbc_flag |= env.sbc_best_cluster_flag; + rcu_read_unlock(); + trace_sched_task_load(p, sched_boost_policy() && task_sched_boost(p), + env.reason, env.sync, env.need_idle, sbc_flag, target); + return target; +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static inline struct task_group *next_task_group(struct task_group *tg) +{ + tg = list_entry_rcu(tg->list.next, typeof(struct task_group), list); + + return (&tg->list == &task_groups) ? 
NULL : tg; +} + +/* Iterate over all cfs_rq in a cpu */ +#define for_each_cfs_rq(cfs_rq, tg, cpu) \ + for (tg = container_of(&task_groups, struct task_group, list); \ + ((tg = next_task_group(tg)) && (cfs_rq = tg->cfs_rq[cpu]));) + +void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) +{ + struct task_group *tg; + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + + for_each_cfs_rq(cfs_rq, tg, cpu) + reset_hmp_stats(&cfs_rq->hmp_stats, reset_cra); + + rcu_read_unlock(); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra); + +/* Add task's contribution to a cpu' HMP statistics */ +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* + * Although below check is not strictly required (as + * inc/dec_nr_big_task and inc/dec_cumulative_runnable_avg called + * from inc_cfs_rq_hmp_stats() have similar checks), we gain a bit on + * efficiency by short-circuiting for_each_sched_entity() loop when + * sched_disable_window_stats + */ + if (sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + inc_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + inc_rq_hmp_stats(rq, p, change_cra); +} + +/* Remove task's contribution from a cpu' HMP statistics */ +static void +_dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, int change_cra) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + /* See comment on efficiency in _inc_hmp_sched_stats_fair */ + if (sched_disable_window_stats) + return; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dec_cfs_rq_hmp_stats(cfs_rq, p, change_cra); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Update rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) + dec_rq_hmp_stats(rq, p, change_cra); +} + +static void inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _inc_hmp_sched_stats_fair(rq, p, 1); +} + +static void dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + _dec_hmp_sched_stats_fair(rq, p, 1); +} + +static void fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + fixup_cumulative_runnable_avg(&cfs_rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&cfs_rq->hmp_stats, p, task_load_delta); + if (cfs_rq_throttled(cfs_rq)) + break; + } + + /* Fix up rq->hmp_stats only if we didn't find any throttled cfs_rq */ + if (!se) { + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, + task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); + } +} + +static int task_will_be_throttled(struct task_struct *p); + +#else /* CONFIG_CFS_BANDWIDTH */ + +inline void reset_cfs_rq_hmp_stats(int cpu, int reset_cra) { } + +static void +inc_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + inc_nr_big_task(&rq->hmp_stats, p); + inc_cumulative_runnable_avg(&rq->hmp_stats, p); 
+} + +static void +dec_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p) +{ + dec_nr_big_task(&rq->hmp_stats, p); + dec_cumulative_runnable_avg(&rq->hmp_stats, p); +} +static void +fixup_hmp_sched_stats_fair(struct rq *rq, struct task_struct *p, + u32 new_task_load, u32 new_pred_demand) +{ + s64 task_load_delta = (s64)new_task_load - task_load(p); + s64 pred_demand_delta = PRED_DEMAND_DELTA; + + fixup_cumulative_runnable_avg(&rq->hmp_stats, p, task_load_delta, + pred_demand_delta); + fixup_nr_big_tasks(&rq->hmp_stats, p, task_load_delta); +} + +static inline int task_will_be_throttled(struct task_struct *p) +{ + return 0; +} + +void _inc_hmp_sched_stats_fair(struct rq *rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&rq->hmp_stats, p); +} + +#endif /* CONFIG_CFS_BANDWIDTH */ + +/* + * Reset balance_interval at all sched_domain levels of given cpu, so that it + * honors kick. + */ +static inline void reset_balance_interval(int cpu) +{ + struct sched_domain *sd; + + if (cpu >= nr_cpu_ids) + return; + + rcu_read_lock(); + for_each_domain(cpu, sd) + sd->balance_interval = 0; + rcu_read_unlock(); +} + +/* + * Check if a task is on the "wrong" cpu (i.e its current cpu is not the ideal + * cpu as per its demand or priority) + * + * Returns reason why task needs to be migrated + */ +static inline int migration_needed(struct task_struct *p, int cpu) +{ + int nice; + struct related_thread_group *grp; + + if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1) + return 0; + + /* No need to migrate task that is about to be throttled */ + if (task_will_be_throttled(p)) + return 0; + + if (sched_boost_policy() == SCHED_BOOST_ON_BIG && + cpu_capacity(cpu) != max_capacity && task_sched_boost(p)) + return UP_MIGRATION; + + if (sched_cpu_high_irqload(cpu)) + return IRQLOAD_MIGRATION; + + nice = task_nice(p); + rcu_read_lock(); + grp = task_related_thread_group(p); + /* + * Don't assume higher capacity means higher power. If the task + * is running on the power efficient CPU, avoid migrating it + * to a lower capacity cluster. + */ + if (!grp && (nice > SCHED_UPMIGRATE_MIN_NICE || + upmigrate_discouraged(p)) && + cpu_capacity(cpu) > min_capacity && + cpu_max_power_cost(cpu) == max_power_cost) { + rcu_read_unlock(); + return DOWN_MIGRATION; + } + + if (!task_will_fit(p, cpu)) { + rcu_read_unlock(); + return UP_MIGRATION; + } + rcu_read_unlock(); + + return 0; +} + +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +static DEFINE_RAW_SPINLOCK(migration_lock); + +static bool do_migration(int reason, int new_cpu, int cpu) +{ + if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) + && same_cluster(new_cpu, cpu)) + return false; + + /* Inter cluster high irqload migrations are OK */ + return new_cpu != cpu; +} + +/* + * Check if currently running task should be migrated to a better cpu. + * + * Todo: Effect this via changes to nohz_balancer_kick() and load balance? 
+ */ +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq), new_cpu; + int active_balance = 0, reason; + + reason = migration_needed(p, cpu); + if (!reason) + return; + + raw_spin_lock(&migration_lock); + new_cpu = select_best_cpu(p, cpu, reason, 0); + + if (do_migration(reason, new_cpu, cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + mark_reserved(new_cpu); + } + + raw_spin_unlock(&migration_lock); + + if (active_balance) + stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, + &rq->active_balance_work); +} + +#ifdef CONFIG_CFS_BANDWIDTH + +static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) +{ + cfs_rq->hmp_stats.nr_big_tasks = 0; + cfs_rq->hmp_stats.cumulative_runnable_avg = 0; + cfs_rq->hmp_stats.pred_demands_sum = 0; +} + +static void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + inc_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + inc_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) +{ + dec_nr_big_task(&cfs_rq->hmp_stats, p); + if (change_cra) + dec_cumulative_runnable_avg(&cfs_rq->hmp_stats, p); +} + +static void inc_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks += cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg += + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum += cfs_rq->hmp_stats.pred_demands_sum; +} + +static void dec_throttled_cfs_rq_hmp_stats(struct hmp_sched_stats *stats, + struct cfs_rq *cfs_rq) +{ + stats->nr_big_tasks -= cfs_rq->hmp_stats.nr_big_tasks; + stats->cumulative_runnable_avg -= + cfs_rq->hmp_stats.cumulative_runnable_avg; + stats->pred_demands_sum -= cfs_rq->hmp_stats.pred_demands_sum; + + BUG_ON(stats->nr_big_tasks < 0 || + (s64)stats->cumulative_runnable_avg < 0); + BUG_ON((s64)stats->pred_demands_sum < 0); +} + +#else /* CONFIG_CFS_BANDWIDTH */ + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#endif /* CONFIG_CFS_BANDWIDTH */ + +#else /* CONFIG_SCHED_HMP */ + +static inline void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) { } + +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +#define dec_throttled_cfs_rq_hmp_stats(...) +#define inc_throttled_cfs_rq_hmp_stats(...) 
+ +#endif /* CONFIG_SCHED_HMP */ + #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 #error "load tracking assumes 2^10 as unit" #endif @@ -2815,6 +3914,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (cfs_rq) cfs_rq->runnable_load_sum += weight * scaled_delta; } + if (running) sa->util_sum += scaled_delta * scale_cpu; @@ -3383,6 +4483,12 @@ static inline int idle_balance(struct rq *rq) return 0; } +static inline void inc_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + +static inline void dec_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq, + struct task_struct *p, int change_cra) { } + #endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4009,6 +5115,35 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) return cfs_bandwidth_used() && cfs_rq->throttled; } +#ifdef CONFIG_SCHED_HMP +/* + * Check if task is part of a hierarchy where some cfs_rq does not have any + * runtime left. + * + * We can't rely on throttled_hierarchy() to do this test, as + * cfs_rq->throttle_count will not be updated yet when this function is called + * from scheduler_tick() + */ +static int task_will_be_throttled(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq; + + if (!cfs_bandwidth_used()) + return 0; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + if (!cfs_rq->runtime_enabled) + continue; + if (cfs_rq->runtime_remaining <= 0) + return 1; + } + + return 0; +} +#endif + /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { @@ -4088,13 +5223,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + dec_throttled_cfs_rq_hmp_stats(&qcfs_rq->hmp_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + dec_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, cfs_rq); + } cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -4115,6 +5253,12 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) start_cfs_bandwidth(cfs_b); raw_spin_unlock(&cfs_b->lock); + + /* Log effect on hmp stats after throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) @@ -4124,6 +5268,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se; int enqueue = 1; long task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4151,17 +5296,26 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) if (enqueue) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; + inc_throttled_cfs_rq_hmp_stats(&cfs_rq->hmp_stats, tcfs_rq); if (cfs_rq_throttled(cfs_rq)) break; } - if (!se) + if (!se) { add_nr_running(rq, task_delta); + inc_throttled_cfs_rq_hmp_stats(&rq->hmp_stats, tcfs_rq); + } /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); + + /* Log effect on hmp stats after un-throttling */ + trace_sched_cpu_load_cgroup(rq, idle_cpu(cpu_of(rq)), + sched_irqload(cpu_of(rq)), + power_cost(cpu_of(rq), 0), + cpu_temp(cpu_of(rq))); } static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, @@ -4502,6 +5656,7 @@ static void init_cfs_rq_runtime(struct cfs_rq 
*cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + init_cfs_rq_hmp_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -4617,7 +5772,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (cfs_rq->nr_running > 1) { + if (rq->cfs.h_nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -4633,8 +5788,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) /* * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. + * current task is from our class. */ static void hrtick_update(struct rq *rq) { @@ -4643,8 +5797,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -4702,7 +5855,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; - walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; } @@ -4710,7 +5863,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; - walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); + inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4719,8 +5872,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + inc_rq_hmp_stats(rq, p, 1); + } #ifdef CONFIG_SMP @@ -4743,8 +5898,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_enqueue_task(p, cpu_of(rq)); - if (!se) { - walt_inc_cumulative_runnable_avg(rq, p); + if (energy_aware() && !se) { if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -4782,7 +5936,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; - walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4802,7 +5956,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; - walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); + dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; @@ -4811,8 +5965,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(se); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + dec_rq_hmp_stats(rq, p, 1); + } #ifdef CONFIG_SMP @@ -4825,8 +5981,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_dequeue_task(p, cpu_of(rq)); - if (!se) - walt_dec_cumulative_runnable_avg(rq, p); #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -5239,11 +6393,6 @@ unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } -static inline bool energy_aware(void) -{ - return sched_feat(ENERGY_AWARE); -} - struct energy_env { struct sched_group *sg_top; struct sched_group *sg_cap; @@ 
-5810,12 +6959,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static inline unsigned long task_util(struct task_struct *p) { -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_task_util) { - unsigned long demand = p->ravg.demand; - return (demand << 10) / walt_ravg_window; - } -#endif return p->se.avg.util_avg; } @@ -6200,6 +7343,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } } + if (!(current->flags & PF_WAKE_UP_IDLE) && + !(p->flags & PF_WAKE_UP_IDLE)) + return target; + /* * Otherwise, iterate the domains and find an elegible idle cpu. */ @@ -6724,6 +7871,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int want_affine = 0; int sync = wake_flags & WF_SYNC; +#ifdef CONFIG_SCHED_HMP + return select_best_cpu(p, prev_cpu, 0, sync); +#endif + if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); want_affine = !wake_wide(p, sibling_count_hint) && @@ -7310,6 +8461,10 @@ enum group_type { #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 #define LBF_SOME_PINNED 0x08 +#define LBF_BIG_TASK_ACTIVE_BALANCE 0x80 +#define LBF_IGNORE_BIG_TASKS 0x100 +#define LBF_IGNORE_PREFERRED_CLUSTER_TASKS 0x200 +#define LBF_MOVED_RELATED_THREAD_GROUP_TASK 0x400 struct lb_env { struct sched_domain *sd; @@ -7327,6 +8482,8 @@ struct lb_env { unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; + unsigned int busiest_grp_capacity; + unsigned int busiest_nr_running; unsigned int flags; @@ -7337,6 +8494,7 @@ struct lb_env { enum fbq_type fbq_type; enum group_type busiest_group_type; struct list_head tasks; + enum sched_boost_policy boost_policy; }; /* @@ -7434,6 +8592,7 @@ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot; + int twf, group_cpus; lockdep_assert_held(&env->src_rq->lock); @@ -7480,6 +8639,39 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; + if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu)) { + if (nr_big_tasks(env->src_rq) && !is_big_task(p)) + return 0; + + if (env->boost_policy == SCHED_BOOST_ON_BIG && + !task_sched_boost(p)) + return 0; + } + + twf = task_will_fit(p, env->dst_cpu); + + /* + * Attempt to not pull tasks that don't fit. We may get lucky and find + * one that actually fits. + */ + if (env->flags & LBF_IGNORE_BIG_TASKS && !twf) + return 0; + + if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS && + !preferred_cluster(rq_cluster(cpu_rq(env->dst_cpu)), p)) + return 0; + + /* + * Group imbalance can sometimes cause work to be pulled across groups + * even though the group could have managed the imbalance on its own. + * Prevent inter-cluster migrations for big tasks when the number of + * tasks is lower than the capacity of the group. + */ + group_cpus = DIV_ROUND_UP(env->busiest_grp_capacity, + SCHED_CAPACITY_SCALE); + if (!twf && env->busiest_nr_running <= group_cpus) + return 0; + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; @@ -7487,15 +8679,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * Aggressive migration if: - * 1) destination numa is preferred - * 2) task is cache cold, or - * 3) too many balance attempts have failed. + * 1) IDLE or NEWLY_IDLE balance. 
+ * 2) destination numa is preferred + * 3) task is cache cold, or + * 4) too many balance attempts have failed. */ tsk_cache_hot = migrate_degrades_locality(p, env); if (tsk_cache_hot == -1) tsk_cache_hot = task_hot(p, env); - if (tsk_cache_hot <= 0 || + if (env->idle != CPU_NOT_IDLE || tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); @@ -7515,10 +8708,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + if (task_in_related_thread_group(p)) + env->flags |= LBF_MOVED_RELATED_THREAD_GROUP_TASK; double_unlock_balance(env->src_rq, env->dst_rq); } @@ -7547,6 +8742,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) * inside detach_tasks(). */ schedstat_inc(env->sd, lb_gained[env->idle]); + return p; } return NULL; @@ -7566,12 +8762,20 @@ static int detach_tasks(struct lb_env *env) struct task_struct *p; unsigned long load; int detached = 0; + int orig_loop = env->loop; lockdep_assert_held(&env->src_rq->lock); if (env->imbalance <= 0) return 0; + if (!same_cluster(env->dst_cpu, env->src_cpu)) + env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS; + + if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu)) + env->flags |= LBF_IGNORE_BIG_TASKS; + +redo: while (!list_empty(tasks)) { /* * We don't want to steal all, otherwise we may be treated likewise, @@ -7633,6 +8837,15 @@ next: list_move_tail(&p->se.group_node, tasks); } + if (env->flags & (LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS) && !detached) { + tasks = &env->src_rq->cfs_tasks; + env->flags &= ~(LBF_IGNORE_BIG_TASKS | + LBF_IGNORE_PREFERRED_CLUSTER_TASKS); + env->loop = orig_loop; + goto redo; + } + /* * Right now, this is one of only two places we collect this stat * so we can safely collect detach_one_task() stats here rather @@ -7651,8 +8864,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7796,6 +9009,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ +#ifdef CONFIG_SCHED_HMP + unsigned long sum_nr_big_tasks; + u64 group_cpu_load; /* Scaled load of all CPUs of the group */ +#endif unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; @@ -7839,10 +9056,64 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) .avg_load = 0UL, .sum_nr_running = 0, .group_type = group_other, +#ifdef CONFIG_SCHED_HMP + .sum_nr_big_tasks = 0UL, + .group_cpu_load = 0ULL, +#endif }, }; } +#ifdef CONFIG_SCHED_HMP + +static int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + int local_cpu, busiest_cpu; + int local_capacity, busiest_capacity; + int local_pwr_cost, busiest_pwr_cost; + int nr_cpus; + int boost = sched_boost(); + + if (!sysctl_sched_restrict_cluster_spill || + boost == FULL_THROTTLE_BOOST || boost == CONSERVATIVE_BOOST) + return 0; + + local_cpu = group_first_cpu(sds->local); + busiest_cpu = group_first_cpu(sds->busiest); + + local_capacity = 
cpu_max_possible_capacity(local_cpu); + busiest_capacity = cpu_max_possible_capacity(busiest_cpu); + + local_pwr_cost = cpu_max_power_cost(local_cpu); + busiest_pwr_cost = cpu_max_power_cost(busiest_cpu); + + if (local_pwr_cost <= busiest_pwr_cost) + return 0; + + if (local_capacity > busiest_capacity && + sds->busiest_stat.sum_nr_big_tasks) + return 0; + + nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest)); + if ((sds->busiest_stat.group_cpu_load < nr_cpus * sched_spill_load) && + (sds->busiest_stat.sum_nr_running < + nr_cpus * sysctl_sched_spill_nr_run)) + return 1; + + return 0; +} + +#else /* CONFIG_SCHED_HMP */ + +static inline int +bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HMP */ + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -7986,6 +9257,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu) struct sched_group_capacity *sgc; struct rq *rq = cpu_rq(cpu); + if (cpumask_test_cpu(cpu, cpu_isolated_mask)) + continue; /* * build_sched_domains() -> init_sched_groups_capacity() * gets here before we've attached the domains to the @@ -8017,9 +9290,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) do { struct sched_group_capacity *sgc = group->sgc; - capacity += sgc->capacity; - max_capacity = max(sgc->max_capacity, max_capacity); - min_capacity = min(sgc->min_capacity, min_capacity); + cpumask_t *cpus = sched_group_cpus(group); + + /* Revisit this later. This won't work for MT domain */ + if (!cpu_isolated(cpumask_first(cpus))) { + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); + } group = group->next; } while (group != child->groups); } @@ -8135,7 +9413,7 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) static inline enum group_type group_classify(struct sched_group *group, - struct sg_lb_stats *sgs) + struct sg_lb_stats *sgs, struct lb_env *env) { if (sgs->group_no_capacity) return group_overloaded; @@ -8204,6 +9482,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); + trace_sched_cpu_load_lb(cpu_rq(i), idle_cpu(i), + sched_irqload(i), + power_cost(i, 0), + cpu_temp(i)); + + if (cpu_isolated(i)) + continue; + /* if we are entering idle and there are CPUs with * their tick stopped, do an update for them */ @@ -8224,6 +9510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (nr_running > 1) *overload = true; +#ifdef CONFIG_SCHED_HMP + sgs->sum_nr_big_tasks += rq->hmp_stats.nr_big_tasks; + sgs->group_cpu_load += cpu_load(i); +#endif + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -8235,25 +9526,62 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) { + if (energy_aware() && cpu_overutilized(i)) { *overutilized = true; if (!sgs->group_misfit_task && rq->misfit_task) sgs->group_misfit_task = capacity_of(i); } } - /* Adjust by relative CPU capacity of the group */ - sgs->group_capacity = group->sgc->capacity; - sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; + /* Isolated CPU has no weight */ + if (!group->group_weight) { + sgs->group_capacity = 0; + sgs->avg_load = 0; + 
sgs->group_no_capacity = 1; + sgs->group_type = group_other; + sgs->group_weight = group->group_weight; + } else { + /* Adjust by relative CPU capacity of the group */ + sgs->group_capacity = group->sgc->capacity; + sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / + sgs->group_capacity; + + sgs->group_weight = group->group_weight; + + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(group, sgs, env); + } if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; +} - sgs->group_weight = group->group_weight; +#ifdef CONFIG_SCHED_HMP +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + if (env->idle != CPU_NOT_IDLE && + cpu_capacity(env->dst_cpu) > group_rq_capacity(sg)) { + if (sgs->sum_nr_big_tasks > + sds->busiest_stat.sum_nr_big_tasks) { + env->flags |= LBF_BIG_TASK_ACTIVE_BALANCE; + return true; + } + } - sgs->group_no_capacity = group_is_overloaded(env, sgs); - sgs->group_type = group_classify(group, sgs); + return false; } +#else +static bool update_sd_pick_busiest_active_balance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs) +{ + return false; +} +#endif /** * update_sd_pick_busiest - return 1 on busiest group @@ -8275,35 +9603,40 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + if (update_sd_pick_busiest_active_balance(env, sds, sg, sgs)) + return true; + if (sgs->group_type > busiest->group_type) return true; if (sgs->group_type < busiest->group_type) return false; - /* - * Candidate sg doesn't face any serious load-balance problems - * so don't pick it if the local sg is already filled up. - */ - if (sgs->group_type == group_other && - !group_has_capacity(env, &sds->local_stat)) - return false; + if (energy_aware()) { + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; - if (sgs->avg_load <= busiest->avg_load) - return false; + if (sgs->avg_load <= busiest->avg_load) + return false; - if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) - goto asym_packing; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; - /* - * Candidate sg has no more than one task per CPU and - * has higher per-CPU capacity. Migrating tasks to less - * capable CPUs may harm throughput. Maximize throughput, - * power/energy consequences are not considered. - */ - if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) - return false; + /* + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + } asym_packing: /* This is the busiest node in its class. 
*/ @@ -8411,14 +9744,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd group_has_capacity(env, &sds->local_stat) && (sgs->sum_nr_running > 1)) { sgs->group_no_capacity = 1; - sgs->group_type = group_classify(sg, sgs); + sgs->group_type = group_classify(sg, sgs, env); } /* * Ignore task groups with misfit tasks if local group has no * capacity or if per-cpu capacity isn't higher. */ - if (sgs->group_type == group_misfit_task && + if (energy_aware() && + sgs->group_type == group_misfit_task && (!group_has_capacity(env, &sds->local_stat) || !group_smaller_cpu_capacity(sg, sds->local))) sgs->group_type = group_other; @@ -8426,6 +9760,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; + env->busiest_nr_running = sgs->sum_nr_running; + env->busiest_grp_capacity = sgs->group_capacity; } next_group: @@ -8447,12 +9783,12 @@ next_group: env->dst_rq->rd->overload = overload; /* Update over-utilization (tipping point, U >= 0) indicator */ - if (env->dst_rq->rd->overutilized != overutilized) { + if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) { env->dst_rq->rd->overutilized = overutilized; trace_sched_overutilized(overutilized); } } else { - if (!env->dst_rq->rd->overutilized && overutilized) { + if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) { env->dst_rq->rd->overutilized = true; trace_sched_overutilized(true); } @@ -8604,20 +9940,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { - /* Misfitting tasks should be migrated in any case */ - if (busiest->group_type == group_misfit_task) { - env->imbalance = busiest->group_misfit_task; - return; - } + if (energy_aware()) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } - /* - * Busiest group is overloaded, local is not, use the spare - * cycles to maximize throughput - */ - if (busiest->group_type == group_overloaded && - local->group_type <= group_misfit_task) { - env->imbalance = busiest->load_per_task; - return; + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } } env->imbalance = 0; @@ -8654,7 +9992,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s ) / SCHED_CAPACITY_SCALE; /* Boost imbalance to allow misfit task to be balanced. 
*/ - if (busiest->group_type == group_misfit_task) + if (energy_aware() && busiest->group_type == group_misfit_task) env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task); @@ -8715,6 +10053,12 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) + goto force_balance; + + if (bail_inter_cluster_balance(env, &sds)) + goto out_balanced; + sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity; @@ -8735,7 +10079,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* Misfitting tasks should be dealt with regardless of the avg load */ - if (busiest->group_type == group_misfit_task) { + if (energy_aware() && busiest->group_type == group_misfit_task) { goto force_balance; } @@ -8786,6 +10130,60 @@ out_balanced: return NULL; } +#ifdef CONFIG_SCHED_HMP +static struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + struct rq *busiest = NULL, *busiest_big = NULL; + u64 max_runnable_avg = 0, max_runnable_avg_big = 0; + int max_nr_big = 0, nr_big; + bool find_big = !!(env->flags & LBF_BIG_TASK_ACTIVE_BALANCE); + int i; + cpumask_t cpus; + + cpumask_andnot(&cpus, sched_group_cpus(group), cpu_isolated_mask); + + for_each_cpu(i, &cpus) { + struct rq *rq = cpu_rq(i); + u64 cumulative_runnable_avg = + rq->hmp_stats.cumulative_runnable_avg; + + if (!cpumask_test_cpu(i, env->cpus)) + continue; + + + if (find_big) { + nr_big = nr_big_tasks(rq); + if (nr_big > max_nr_big || + (nr_big > 0 && nr_big == max_nr_big && + cumulative_runnable_avg > max_runnable_avg_big)) { + max_runnable_avg_big = cumulative_runnable_avg; + busiest_big = rq; + max_nr_big = nr_big; + continue; + } + } + + if (cumulative_runnable_avg > max_runnable_avg) { + max_runnable_avg = cumulative_runnable_avg; + busiest = rq; + } + } + + if (busiest_big) + return busiest_big; + + env->flags &= ~LBF_BIG_TASK_ACTIVE_BALANCE; + return busiest; +} +#else +static inline struct rq *find_busiest_queue_hmp(struct lb_env *env, + struct sched_group *group) +{ + return NULL; +} +#endif + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -8796,6 +10194,10 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_capacity = 1; int i; +#ifdef CONFIG_SCHED_HMP + return find_busiest_queue_hmp(env, group); +#endif + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long capacity, wl; enum fbq_type rt; @@ -8864,15 +10266,20 @@ static struct rq *find_busiest_queue(struct lb_env *env, * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but * so long as it is large enough. */ -#define MAX_PINNED_INTERVAL 512 +#define MAX_PINNED_INTERVAL 16 /* Working cpumask for load_balance and load_balance_newidle. 
*/ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +#define NEED_ACTIVE_BALANCE_THRESHOLD 10 + static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; + if (env->flags & LBF_BIG_TASK_ACTIVE_BALANCE) + return 1; + if (env->idle == CPU_NEWLY_IDLE) { /* @@ -8897,7 +10304,8 @@ static int need_active_balance(struct lb_env *env) return 1; } - if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + if (energy_aware() && + (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && env->src_rq->cfs.h_nr_running == 1 && cpu_overutilized(env->src_cpu) && @@ -8905,10 +10313,18 @@ static int need_active_balance(struct lb_env *env) return 1; } - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); + return unlikely(sd->nr_balance_failed > + sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD); } -static int active_load_balance_cpu_stop(void *data); +static int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + cpumask_t cpus; + + cpumask_and(&cpus, sched_group_cpus(sg), sched_group_mask(sg)); + cpumask_andnot(&cpus, &cpus, cpu_isolated_mask); + return cpumask_first(&cpus); +} static int should_we_balance(struct lb_env *env) { @@ -8927,7 +10343,8 @@ static int should_we_balance(struct lb_env *env) sg_mask = sched_group_mask(sg); /* Try to find first idle cpu */ for_each_cpu_and(cpu, sg_cpus, env->cpus) { - if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu) || + cpu_isolated(cpu)) continue; balance_cpu = cpu; @@ -8935,7 +10352,7 @@ static int should_we_balance(struct lb_env *env) } if (balance_cpu == -1) - balance_cpu = group_balance_cpu(sg); + balance_cpu = group_balance_cpu_not_isolated(sg); /* * First idle cpu or the first cpu(busiest) in this sched group @@ -8952,23 +10369,29 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { - int ld_moved, cur_ld_moved, active_balance = 0; + int ld_moved = 0, cur_ld_moved, active_balance = 0; struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; - struct sched_group *group; - struct rq *busiest; + struct sched_group *group = NULL; + struct rq *busiest = NULL; unsigned long flags; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { - .sd = sd, - .dst_cpu = this_cpu, - .dst_rq = this_rq, - .dst_grpmask = sched_group_cpus(sd->groups), - .idle = idle, - .loop_break = sched_nr_migrate_break, - .cpus = cpus, - .fbq_type = all, - .tasks = LIST_HEAD_INIT(env.tasks), + .sd = sd, + .dst_cpu = this_cpu, + .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), + .idle = idle, + .loop_break = sched_nr_migrate_break, + .cpus = cpus, + .fbq_type = all, + .tasks = LIST_HEAD_INIT(env.tasks), + .imbalance = 0, + .flags = 0, + .loop = 0, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .boost_policy = sched_boost_policy(), }; /* @@ -9022,6 +10445,13 @@ more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); update_rq_clock(busiest); + /* The world might have changed. 
Validate assumptions */ + if (busiest->nr_running <= 1) { + raw_spin_unlock_irqrestore(&busiest->lock, flags); + env.flags &= ~LBF_ALL_PINNED; + goto no_move; + } + /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations @@ -9109,17 +10539,22 @@ more_balance: } } +no_move: if (!ld_moved) { - schedstat_inc(sd, lb_failed[idle]); + if (!(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) + schedstat_inc(sd, lb_failed[idle]); + /* * Increment the failure counter only on periodic balance. * We do not want newidle balance, which can be very * frequent, pollute the failure counter causing * excessive cache_hot migrations and active balances. */ - if (idle != CPU_NEWLY_IDLE) - if (env.src_grp_nr_running > 1) + if (idle != CPU_NEWLY_IDLE && + !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) { + if (env.src_grp_nr_running > 1) sd->nr_balance_failed++; + } if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); @@ -9141,7 +10576,8 @@ more_balance: * ->active_balance_work. Once set, it's cleared * only after active load balance is finished. */ - if (!busiest->active_balance) { + if (!busiest->active_balance && + !cpu_isolated(cpu_of(busiest))) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; @@ -9152,17 +10588,31 @@ more_balance: stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); + *continue_balancing = 0; } /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries+1; + sd->nr_balance_failed = + sd->cache_nice_tries + + NEED_ACTIVE_BALANCE_THRESHOLD - 1; } - } else + } else { sd->nr_balance_failed = 0; + /* Assumes one 'busiest' cpu that we pulled tasks from */ + if (!same_freq_domain(this_cpu, cpu_of(busiest))) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + + check_for_freq_change(this_rq, false, check_groups); + check_for_freq_change(busiest, false, check_groups); + } else { + check_for_freq_change(this_rq, true, false); + } + } if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; @@ -9210,6 +10660,11 @@ out_one_pinned: ld_moved = 0; out: + trace_sched_load_balance(this_cpu, idle, *continue_balancing, + group ? group->cpumask[0] : 0, + busiest ? busiest->nr_running : 0, + env.imbalance, env.flags, ld_moved, + sd->balance_interval); return ld_moved; } @@ -9252,6 +10707,9 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; + if (cpu_isolated(this_cpu)) + return 0; + idle_enter_fair(this_rq); /* @@ -9306,9 +10764,12 @@ static int idle_balance(struct rq *this_rq) /* * Stop searching for tasks to pull if there are - * now runnable tasks on this rq. + * now runnable tasks on the balance rq or if + * continue_balancing has been unset (only possible + * due to active migration). 
*/ - if (pulled_task || this_rq->nr_running > 0) + if (pulled_task || this_rq->nr_running > 0 || + !continue_balancing) break; } rcu_read_unlock(); @@ -9360,13 +10821,19 @@ static int active_load_balance_cpu_stop(void *data) struct task_struct *push_task = NULL; int push_task_detached = 0; struct lb_env env = { - .sd = sd, - .dst_cpu = target_cpu, - .dst_rq = target_rq, - .src_cpu = busiest_rq->cpu, - .src_rq = busiest_rq, - .idle = CPU_IDLE, + .sd = sd, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, + .idle = CPU_IDLE, + .busiest_nr_running = 0, + .busiest_grp_capacity = 0, + .flags = 0, + .loop = 0, + .boost_policy = sched_boost_policy(), }; + bool moved = false; raw_spin_lock_irq(&busiest_rq->lock); @@ -9387,12 +10854,15 @@ static int active_load_balance_cpu_stop(void *data) BUG_ON(busiest_rq == target_rq); push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; if (push_task) { if (task_on_rq_queued(push_task) && + push_task->state == TASK_RUNNING && task_cpu(push_task) == busiest_cpu && cpu_online(target_cpu)) { detach_task(push_task, &env); push_task_detached = 1; + moved = true; } goto out_unlock; } @@ -9411,14 +10881,18 @@ static int active_load_balance_cpu_stop(void *data) update_rq_clock(busiest_rq); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); - else + moved = true; + } else { schedstat_inc(sd, alb_failed); + } } rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; + push_task = busiest_rq->push_task; + target_cpu = busiest_rq->push_cpu; if (push_task) busiest_rq->push_task = NULL; @@ -9429,6 +10903,7 @@ out_unlock: if (push_task_detached) attach_one_task(target_rq, push_task); put_task_struct(push_task); + clear_reserved(target_cpu); } if (p) @@ -9436,6 +10911,15 @@ out_unlock: local_irq_enable(); + if (moved && !same_freq_domain(busiest_cpu, target_cpu)) { + int check_groups = !!(env.flags & + LBF_MOVED_RELATED_THREAD_GROUP_TASK); + check_for_freq_change(busiest_rq, false, check_groups); + check_for_freq_change(target_rq, false, check_groups); + } else if (moved) { + check_for_freq_change(target_rq, true, false); + } + return 0; } @@ -9451,9 +10935,49 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static inline int find_new_ilb(void) + +#ifdef CONFIG_SCHED_HMP +static inline int find_new_hmp_ilb(int type) +{ + int call_cpu = raw_smp_processor_id(); + struct sched_domain *sd; + int ilb; + + rcu_read_lock(); + + /* Pick an idle cpu "closest" to call_cpu */ + for_each_domain(call_cpu, sd) { + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + sched_domain_span(sd)) { + if (idle_cpu(ilb) && (type != NOHZ_KICK_RESTRICT || + cpu_max_power_cost(ilb) <= + cpu_max_power_cost(call_cpu))) { + rcu_read_unlock(); + reset_balance_interval(ilb); + return ilb; + } + } + } + + rcu_read_unlock(); + return nr_cpu_ids; +} +#else /* CONFIG_SCHED_HMP */ +static inline int find_new_hmp_ilb(int type) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + return 0; +} +#endif /* CONFIG_SCHED_HMP */ + +static inline int find_new_ilb(int type) +{ + int ilb; + +#ifdef CONFIG_SCHED_HMP + return find_new_hmp_ilb(type); +#endif + + ilb = cpumask_first(nohz.idle_cpus_mask); if (ilb < nr_cpu_ids && idle_cpu(ilb)) return ilb; @@ -9466,13 +10990,13 @@ static inline int find_new_ilb(void) * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle * CPU (if there is one). 
*/ -static void nohz_balancer_kick(void) +static void nohz_balancer_kick(int type) { int ilb_cpu; nohz.next_balance++; - ilb_cpu = find_new_ilb(); + ilb_cpu = find_new_ilb(type); if (ilb_cpu >= nr_cpu_ids) return; @@ -9489,16 +11013,21 @@ static void nohz_balancer_kick(void) return; } +void nohz_balance_clear_nohz_mask(int cpu) +{ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } +} + static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { /* * Completely isolated CPUs don't ever set, so we must test. */ - if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); - } + nohz_balance_clear_nohz_mask(cpu); clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -9555,7 +11084,7 @@ void nohz_balance_enter_idle(int cpu) /* * If we're a completely isolated CPU, we don't play. */ - if (on_null_domain(cpu_rq(cpu))) + if (on_null_domain(cpu_rq(cpu)) || cpu_isolated(cpu)) return; cpumask_set_cpu(cpu, nohz.idle_cpus_mask); @@ -9584,7 +11113,13 @@ static DEFINE_SPINLOCK(balancing); */ void update_max_interval(void) { - max_load_balance_interval = HZ*num_online_cpus()/10; + cpumask_t avail_mask; + unsigned int available_cpus; + + cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask); + available_cpus = cpumask_weight(&avail_mask); + + max_load_balance_interval = HZ*available_cpus/10; } /* @@ -9709,12 +11244,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t cpus; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) goto end; - for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask); + + for_each_cpu(balance_cpu, &cpus) { if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; @@ -9757,6 +11295,79 @@ end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } +#ifdef CONFIG_SCHED_HMP +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + struct sched_domain *sd; + int i; + + if (rq->nr_running < 2) + return 0; + + if (!sysctl_sched_restrict_cluster_spill || + sched_boost_policy() == SCHED_BOOST_ON_ALL) + return 1; + + if (cpu_max_power_cost(cpu) == max_power_cost) + return 1; + + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(rq->sd); + if (!sd) { + rcu_read_unlock(); + return 0; + } + + for_each_cpu(i, sched_domain_span(sd)) { + if (cpu_load(i) < sched_spill_load && + cpu_rq(i)->nr_running < + sysctl_sched_spill_nr_run) { + /* Change the kick type to limit to CPUs that + * are of equal or lower capacity. + */ + *type = NOHZ_KICK_RESTRICT; + break; + } + } + rcu_read_unlock(); + return 1; +} +#else +static inline int _nohz_kick_needed_hmp(struct rq *rq, int cpu, int *type) +{ + return 0; +} +#endif + +static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) +{ + unsigned long now = jiffies; + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. 
+ */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; + +#ifdef CONFIG_SCHED_HMP + return _nohz_kick_needed_hmp(rq, cpu, type); +#endif + + if (time_before(now, nohz.next_balance)) + return 0; + + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) + return true; + + /* Do idle load balance if there have misfit task */ + if (energy_aware()) + return rq->misfit_task; + + return (rq->nr_running >= 2); +} + /* * Current heuristic for kicking the idle load balancer in the presence * of an idle cpu in the system. @@ -9768,12 +11379,14 @@ end: * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline bool nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq, int *type) { - unsigned long now = jiffies; +#ifndef CONFIG_SCHED_HMP struct sched_domain *sd; struct sched_group_capacity *sgc; - int nr_busy, cpu = rq->cpu; + int nr_busy; +#endif + int cpu = rq->cpu; bool kick = false; if (unlikely(rq->idle_balance)) @@ -9786,24 +11399,10 @@ static inline bool nohz_kick_needed(struct rq *rq) set_cpu_sd_state_busy(); nohz_balance_exit_idle(cpu); - /* - * None are in tickless mode and hence no need for NOHZ idle load - * balancing. - */ - if (likely(!atomic_read(&nohz.nr_cpus))) - return false; - - if (time_before(now, nohz.next_balance)) - return false; - - if (rq->nr_running >= 2 && - (!energy_aware() || cpu_overutilized(cpu))) + if (_nohz_kick_needed(rq, cpu, type)) return true; - /* Do idle load balance if there have misfit task */ - if (energy_aware()) - return rq->misfit_task; - +#ifndef CONFIG_SCHED_HMP rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); if (sd) { @@ -9835,6 +11434,7 @@ static inline bool nohz_kick_needed(struct rq *rq) unlock: rcu_read_unlock(); +#endif return kick; } #else @@ -9868,15 +11468,19 @@ static void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + int type = NOHZ_KICK_ANY; + + /* Don't need to rebalance while attached to NULL domain or + * cpu is isolated. 
+ */ + if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq))) return; if (time_after_eq(jiffies, rq->next_balance)) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ_COMMON - if (nohz_kick_needed(rq)) - nohz_balancer_kick(); + if (nohz_kick_needed(rq, &type)) + nohz_balancer_kick(type); #endif } @@ -9895,47 +11499,6 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } -static inline int -kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) -{ - int rc = 0; - - /* Invoke active balance to force migrate currently running task */ - raw_spin_lock(&rq->lock); - if (!rq->active_balance) { - rq->active_balance = 1; - rq->push_cpu = new_cpu; - get_task_struct(p); - rq->push_task = p; - rc = 1; - } - raw_spin_unlock(&rq->lock); - - return rc; -} - -void check_for_migration(struct rq *rq, struct task_struct *p) -{ - int new_cpu; - int active_balance; - int cpu = task_cpu(p); - - if (energy_aware() && rq->misfit_task) { - if (rq->curr->state != TASK_RUNNING || - rq->curr->nr_cpus_allowed == 1) - return; - - new_cpu = select_energy_cpu_brute(p, cpu, 0); - if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { - active_balance = kick_active_balance(rq, p, new_cpu); - if (active_balance) - stop_one_cpu_nowait(cpu, - active_load_balance_cpu_stop, - rq, &rq->active_balance_work); - } - } -} - #endif /* CONFIG_SMP */ /* @@ -9955,7 +11518,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); #ifdef CONFIG_SMP - if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + if (energy_aware() && + !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { rq->rd->overutilized = true; trace_sched_overutilized(true); } @@ -10231,11 +11795,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -10290,21 +11851,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -10388,7 +11957,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -10450,6 +12019,11 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif +#ifdef CONFIG_SCHED_HMP + .inc_hmp_sched_stats = inc_hmp_sched_stats_fair, + .dec_hmp_sched_stats = dec_hmp_sched_stats_fair, + .fixup_hmp_sched_stats = fixup_hmp_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG |
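
For reference, a minimal user-space sketch (not kernel code) of the CPU-selection heuristic introduced by the find_busiest_queue_hmp() hunk above: when the balance pass is a big-task active balance (LBF_BIG_TASK_ACTIVE_BALANCE), prefer the runqueue carrying the most "big" tasks, breaking ties by cumulative runnable average; otherwise fall back to the runqueue with the highest cumulative runnable average. The struct fake_rq type, pick_busiest() helper, and sample numbers are illustrative stand-ins for the kernel's rq/hmp_stats fields; the sketch also omits the env->cpus and cpu_isolated_mask filtering and the flag-clearing done in the real hunk.

/* Illustrative stand-in for the kernel's per-CPU runqueue statistics. */
#include <stdbool.h>
#include <stdio.h>

struct fake_rq {
	int cpu;
	int nr_big_tasks;                /* hypothetical analogue of nr_big_tasks(rq) */
	unsigned long long runnable_avg; /* analogue of hmp_stats.cumulative_runnable_avg */
};

/*
 * Mirrors the selection order of the find_busiest_queue_hmp() hunk:
 * with find_big set, the rq with the most big tasks wins, ties broken
 * by cumulative runnable average; otherwise the rq with the largest
 * cumulative runnable average wins.
 */
static struct fake_rq *pick_busiest(struct fake_rq *rqs, int nr, bool find_big)
{
	struct fake_rq *busiest = NULL, *busiest_big = NULL;
	unsigned long long max_avg = 0, max_avg_big = 0;
	int max_nr_big = 0;

	for (int i = 0; i < nr; i++) {
		struct fake_rq *rq = &rqs[i];

		if (find_big) {
			int nr_big = rq->nr_big_tasks;

			if (nr_big > max_nr_big ||
			    (nr_big > 0 && nr_big == max_nr_big &&
			     rq->runnable_avg > max_avg_big)) {
				max_avg_big = rq->runnable_avg;
				busiest_big = rq;
				max_nr_big = nr_big;
				continue;
			}
		}

		if (rq->runnable_avg > max_avg) {
			max_avg = rq->runnable_avg;
			busiest = rq;
		}
	}

	return busiest_big ? busiest_big : busiest;
}

int main(void)
{
	struct fake_rq rqs[] = {
		{ .cpu = 0, .nr_big_tasks = 0, .runnable_avg = 900 },
		{ .cpu = 1, .nr_big_tasks = 2, .runnable_avg = 400 },
		{ .cpu = 2, .nr_big_tasks = 2, .runnable_avg = 700 },
	};

	/* Big-task active balance: CPU 2 wins (same big-task count as CPU 1,
	 * higher cumulative runnable average). */
	printf("find_big: cpu %d\n", pick_busiest(rqs, 3, true)->cpu);

	/* Plain load-based pick: CPU 0 has the highest runnable average. */
	printf("default:  cpu %d\n", pick_busiest(rqs, 3, false)->cpu);

	return 0;
}

Under these assumed inputs the two calls print "cpu 2" and "cpu 0" respectively, which is the behaviour the two-pass loop in the real hunk is designed to produce.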