Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 782
1 file changed, 424 insertions, 358 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 266fc95f6c0f..43c3d2684f64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3690,68 +3690,6 @@ static inline int migration_needed(struct task_struct *p, int cpu) return 0; } -static inline int -kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) -{ - unsigned long flags; - int rc = 0; - - /* Invoke active balance to force migrate currently running task */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq->active_balance) { - rq->active_balance = 1; - rq->push_cpu = new_cpu; - get_task_struct(p); - rq->push_task = p; - rc = 1; - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return rc; -} - -static DEFINE_RAW_SPINLOCK(migration_lock); - -static bool do_migration(int reason, int new_cpu, int cpu) -{ - if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) - && same_cluster(new_cpu, cpu)) - return false; - - /* Inter cluster high irqload migrations are OK */ - return new_cpu != cpu; -} - -/* - * Check if currently running task should be migrated to a better cpu. - * - * Todo: Effect this via changes to nohz_balancer_kick() and load balance? - */ -void check_for_migration(struct rq *rq, struct task_struct *p) -{ - int cpu = cpu_of(rq), new_cpu; - int active_balance = 0, reason; - - reason = migration_needed(p, cpu); - if (!reason) - return; - - raw_spin_lock(&migration_lock); - new_cpu = select_best_cpu(p, cpu, reason, 0); - - if (do_migration(reason, new_cpu, cpu)) { - active_balance = kick_active_balance(rq, p, new_cpu); - if (active_balance) - mark_reserved(new_cpu); - } - - raw_spin_unlock(&migration_lock); - - if (active_balance) - stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, - &rq->active_balance_work); -} - #ifdef CONFIG_CFS_BANDWIDTH static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) @@ -5060,12 +4998,9 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - if (cfs_b->quota == RUNTIME_INF) return; - now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; } @@ -6062,6 +5997,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dec_rq_hmp_stats(rq, p, 1); } +#ifdef CONFIG_SMP + if (energy_aware() && !se) + walt_dec_cumulative_runnable_avg(rq, p); +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -6472,28 +6412,79 @@ unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * CPU candidates. + * + * These are labels to reference CPU candidates for an energy_diff. + * Currently we support only two possible candidates: the task's previous CPU + * and another candiate CPU. + * More advanced/aggressive EAS selection policies can consider more + * candidates. + */ +#define EAS_CPU_PRV 0 +#define EAS_CPU_NXT 1 +#define EAS_CPU_BKP 2 +#define EAS_CPU_CNT 3 + +/* + * Returns the current capacity of cpu after applying both + * cpu and min freq scaling. + */ +unsigned long capacity_min_of(int cpu) +{ + if (!sched_feat(MIN_CAPACITY_CAPPING)) + return 0; + return arch_scale_cpu_capacity(NULL, cpu) * + arch_scale_min_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +/* + * energy_diff - supports the computation of the estimated energy impact in + * moving a "task"'s "util_delta" between different CPU candidates. 
+ */ struct energy_env { - struct sched_group *sg_top; - struct sched_group *sg_cap; - int cap_idx; + /* Utilization to move */ + struct task_struct *p; int util_delta; - int src_cpu; - int dst_cpu; - int trg_cpu; - int energy; - int payoff; - struct task_struct *task; - struct { - int before; - int after; - int delta; - int diff; - } nrg; + + /* Mask of CPUs candidates to evaluate */ + cpumask_t cpus_mask; + + /* CPU candidates to evaluate */ struct { - int before; - int after; - int delta; - } cap; + + /* CPU ID, must be in cpus_mask */ + int cpu_id; + + /* + * Index (into sched_group_energy::cap_states) of the OPP the + * CPU needs to run at if the task is placed on it. + * This includes the both active and blocked load, due to + * other tasks on this CPU, as well as the task's own + * utilization. + */ + int cap_idx; + int cap; + + /* Estimated system energy */ + unsigned int energy; + + /* Estimated energy variation wrt EAS_CPU_PRV */ + int nrg_delta; + + } cpu[EAS_CPU_CNT]; + + /* + * Index (into energy_env::cpu) of the morst energy efficient CPU for + * the specified energy_env::task + */ + int next_idx; + + /* Support data */ + struct sched_group *sg_top; + struct sched_group *sg_cap; + struct sched_group *sg; }; static int cpu_util_wake(int cpu, struct task_struct *p); @@ -6521,24 +6512,33 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct energy_env *eenv) +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx) { unsigned long max_util = 0; unsigned long util; int cpu; for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { - util = cpu_util_wake(cpu, eenv->task); + util = cpu_util_wake(cpu, eenv->p); /* * If we are looking at the target CPU specified by the eenv, * then we should add the (estimated) utilization of the task * assuming we will wake it up on that CPU. */ - if (unlikely(cpu == eenv->trg_cpu)) + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) util += eenv->util_delta; max_util = max(max_util, util); + + /* + * Take into account any minimum frequency imposed + * elsewhere which limits the energy states available + * If the MIN_CAPACITY_CAPPING feature is not enabled + * capacity_min_of will return 0 (not capped). + */ + max_util = max(max_util, capacity_min_of(cpu)); + } return max_util; @@ -6556,21 +6556,21 @@ static unsigned long group_max_util(struct energy_env *eenv) * estimate (more busy). */ static unsigned -long group_norm_util(struct energy_env *eenv, struct sched_group *sg) +long group_norm_util(struct energy_env *eenv, int cpu_idx) { - unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long capacity = eenv->cpu[cpu_idx].cap; unsigned long util, util_sum = 0; int cpu; - for_each_cpu(cpu, sched_group_cpus(sg)) { - util = cpu_util_wake(cpu, eenv->task); + for_each_cpu(cpu, sched_group_cpus(eenv->sg)) { + util = cpu_util_wake(cpu, eenv->p); /* * If we are looking at the target CPU specified by the eenv, * then we should add the (estimated) utilization of the task * assuming we will wake it up on that CPU. 
*/ - if (unlikely(cpu == eenv->trg_cpu)) + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) util += eenv->util_delta; util_sum += __cpu_norm_util(util, capacity); @@ -6579,27 +6579,31 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } -static int find_new_capacity(struct energy_env *eenv, - const struct sched_group_energy * const sge) +static int find_new_capacity(struct energy_env *eenv, int cpu_idx) { + const struct sched_group_energy *sge = eenv->sg->sge; int idx, max_idx = sge->nr_cap_states - 1; - unsigned long util = group_max_util(eenv); + unsigned long util = group_max_util(eenv, cpu_idx); /* default is max_cap if we don't find a match */ - eenv->cap_idx = max_idx; + eenv->cpu[cpu_idx].cap_idx = max_idx; + eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap; for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) { - eenv->cap_idx = idx; + /* Keep track of SG's capacity */ + eenv->cpu[cpu_idx].cap_idx = idx; + eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap; break; } } - return eenv->cap_idx; + return eenv->cpu[cpu_idx].cap_idx; } -static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) +static int group_idle_state(struct energy_env *eenv, int cpu_idx) { + struct sched_group *sg = eenv->sg; int i, state = INT_MAX; int src_in_grp, dst_in_grp; long grp_util = 0; @@ -6611,8 +6615,10 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; - src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); - dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id, + sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id, + sched_group_cpus(sg)); if (src_in_grp == dst_in_grp) { /* both CPUs under consideration are in the same group or not in * either group, migration should leave idle state the same. @@ -6625,8 +6631,8 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) * achievable when we move the task. */ for_each_cpu(i, sched_group_cpus(sg)) { - grp_util += cpu_util_wake(i, eenv->task); - if (unlikely(i == eenv->trg_cpu)) + grp_util += cpu_util_wake(i, eenv->p); + if (unlikely(i == eenv->cpu[cpu_idx].cpu_id)) grp_util += eenv->util_delta; } @@ -6662,19 +6668,65 @@ end: } /* - * sched_group_energy(): Computes the absolute energy consumption of cpus - * belonging to the sched_group including shared resources shared only by - * members of the group. Iterates over all cpus in the hierarchy below the - * sched_group starting from the bottom working it's way up before going to - * the next cpu until all cpus are covered at all levels. The current - * implementation is likely to gather the same util statistics multiple times. - * This can probably be done in a faster but more complex way. - * Note: sched_group_energy() may fail when racing with sched_domain updates. + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg). + * + * This works in iterations to compute the SG's energy for each CPU + * candidate defined by the energy_env's cpu array. + * + * NOTE: in the following computations for busy_energy and idle_energy we do + * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors. 
+ * The required scaling will be performed just one time, by the calling + * functions, once we accumulated the contributons for all the SGs. */ -static int sched_group_energy(struct energy_env *eenv) +static void calc_sg_energy(struct energy_env *eenv) +{ + struct sched_group *sg = eenv->sg; + int busy_energy, idle_energy; + unsigned int busy_power; + unsigned int idle_power; + unsigned long sg_util; + int cap_idx, idle_idx; + int total_energy = 0; + int cpu_idx; + + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + + + if (eenv->cpu[cpu_idx].cpu_id == -1) + continue; + /* Compute ACTIVE energy */ + cap_idx = find_new_capacity(eenv, cpu_idx); + busy_power = sg->sge->cap_states[cap_idx].power; + /* + * in order to calculate cpu_norm_util, we need to know which + * capacity level the group will be at, so calculate that first + */ + sg_util = group_norm_util(eenv, cpu_idx); + + busy_energy = sg_util * busy_power; + + /* Compute IDLE energy */ + idle_idx = group_idle_state(eenv, cpu_idx); + idle_power = sg->sge->idle_states[idle_idx].power; + + idle_energy = SCHED_CAPACITY_SCALE - sg_util; + idle_energy *= idle_power; + + total_energy = busy_energy + idle_energy; + eenv->cpu[cpu_idx].energy += total_energy; + } +} + +/* + * compute_energy() computes the absolute variation in energy consumption by + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT. + * + * NOTE: compute_energy() may fail when racing with sched_domain updates, in + * which case we abort by returning -EINVAL. + */ +static int compute_energy(struct energy_env *eenv) { struct cpumask visit_cpus; - u64 total_energy = 0; int cpu_count; WARN_ON(!eenv->sg_top->sge); @@ -6716,41 +6768,18 @@ static int sched_group_energy(struct energy_env *eenv) break; do { - unsigned long group_util; - int sg_busy_energy, sg_idle_energy; - int cap_idx, idle_idx; - + eenv->sg_cap = sg; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; - else - eenv->sg_cap = sg; - cap_idx = find_new_capacity(eenv, sg->sge); - - if (sg->group_weight == 1) { - /* Remove capacity of src CPU (before task move) */ - if (eenv->trg_cpu == eenv->src_cpu && - cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { - eenv->cap.before = sg->sge->cap_states[cap_idx].cap; - eenv->cap.delta -= eenv->cap.before; - } - /* Add capacity of dst CPU (after task move) */ - if (eenv->trg_cpu == eenv->dst_cpu && - cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { - eenv->cap.after = sg->sge->cap_states[cap_idx].cap; - eenv->cap.delta += eenv->cap.after; - } - } - - idle_idx = group_idle_state(eenv, sg); - group_util = group_norm_util(eenv, sg); - - sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power); - sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) - * sg->sge->idle_states[idle_idx].power); - - total_energy += sg_busy_energy + sg_idle_energy; + /* + * Compute the energy for all the candidate + * CPUs in the current visited SG. + */ + eenv->sg = sg; + calc_sg_energy(eenv); + /* remove CPUs we have just visited */ if (!sd->child) { /* * cpu_count here is the number of @@ -6791,7 +6820,6 @@ next_cpu: continue; } - eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT; return 0; } @@ -6800,181 +6828,101 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } -static inline unsigned long task_util(struct task_struct *p); - /* - * energy_diff(): Estimate the energy impact of changing the utilization - * distribution. 
eenv specifies the change: utilisation amount, source, and - * destination cpu. Source or destination cpu may be -1 in which case the - * utilization is removed from or added to the system (e.g. task wake-up). If - * both are specified, the utilization is migrated. + * select_energy_cpu_idx(): estimate the energy impact of changing the + * utilization distribution. + * + * The eenv parameter specifies the changes: utilisation amount and a pair of + * possible CPU candidates (the previous CPU and a different target CPU). + * + * This function returns the index of a CPU candidate specified by the + * energy_env which corresponds to the first CPU saving energy. + * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy + * efficient than running on prev_cpu. This is also the value returned in case + * of abort due to error conditions during the computations. + * A value greater than zero means that the first energy-efficient CPU is the + * one represented by eenv->cpu[eenv->next_idx].cpu_id. */ -static inline int __energy_diff(struct energy_env *eenv) +static inline int select_energy_cpu_idx(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; - int sd_cpu = -1, energy_before = 0, energy_after = 0; - int diff, margin; - - struct energy_env eenv_before = { - .util_delta = task_util(eenv->task), - .src_cpu = eenv->src_cpu, - .dst_cpu = eenv->dst_cpu, - .trg_cpu = eenv->src_cpu, - .nrg = { 0, 0, 0, 0}, - .cap = { 0, 0, 0 }, - .task = eenv->task, - }; + int sd_cpu = -1; + int cpu_idx; + int margin; - if (eenv->src_cpu == eenv->dst_cpu) - return 0; - - sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu; + sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id; sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); - if (!sd) - return 0; /* Error */ + return EAS_CPU_PRV; - sg = sd->groups; + cpumask_clear(&eenv->cpus_mask); + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + int cpu = eenv->cpu[cpu_idx].cpu_id; - do { - if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { - eenv_before.sg_top = eenv->sg_top = sg; + if (cpu < 0) + continue; + cpumask_set_cpu(cpu, &eenv->cpus_mask); + } - if (sched_group_energy(&eenv_before)) - return 0; /* Invalid result abort */ - energy_before += eenv_before.energy; + sg = sd->groups; + do { + /* Skip SGs which do not contains a candidate CPU */ + if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg))) + continue; - /* Keep track of SRC cpu (before) capacity */ - eenv->cap.before = eenv_before.cap.before; - eenv->cap.delta = eenv_before.cap.delta; + eenv->sg_top = sg; + /* energy is unscaled to reduce rounding errors */ + if (compute_energy(eenv) == -EINVAL) + return EAS_CPU_PRV; - if (sched_group_energy(eenv)) - return 0; /* Invalid result abort */ - energy_after += eenv->energy; - } } while (sg = sg->next, sg != sd->groups); - eenv->nrg.before = energy_before; - eenv->nrg.after = energy_after; - eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; - eenv->payoff = 0; -#ifndef CONFIG_SCHED_TUNE - trace_sched_energy_diff(eenv->task, - eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, - eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, - eenv->cap.before, eenv->cap.after, eenv->cap.delta, - eenv->nrg.delta, eenv->payoff); -#endif + /* Scale energy before comparisons */ + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) + eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT; + /* - * Dead-zone margin preventing too many migrations. 
+ * Compute the dead-zone margin used to prevent too many task + * migrations with negligible energy savings. + * An energy saving is considered meaningful if it reduces the energy + * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56% */ + margin = eenv->cpu[EAS_CPU_PRV].energy >> 6; - margin = eenv->nrg.before >> 6; /* ~1.56% */ - - diff = eenv->nrg.after - eenv->nrg.before; - - eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; - - return eenv->nrg.diff; -} - -#ifdef CONFIG_SCHED_TUNE - -struct target_nrg schedtune_target_nrg; - -#ifdef CONFIG_CGROUP_SCHEDTUNE -extern bool schedtune_initialized; -#endif /* CONFIG_CGROUP_SCHEDTUNE */ - -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], - * corresponding to the specified energy variation. - */ -static inline int -normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - -#ifdef CONFIG_CGROUP_SCHEDTUNE - /* during early setup, we don't know the extents */ - if (unlikely(!schedtune_initialized)) - return energy_diff < 0 ? -1 : 1 ; -#endif /* CONFIG_CGROUP_SCHEDTUNE */ - -#ifdef CONFIG_SCHED_DEBUG - { - int max_delta; + /* + * By default the EAS_CPU_PRV CPU is considered the most energy + * efficient, with a 0 energy variation. + */ + eenv->next_idx = EAS_CPU_PRV; - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); + /* + * Compare the other CPU candidates to find a CPU which can be + * more energy efficient then EAS_CPU_PRV + */ + for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + /* Skip not valid scheduled candidates */ + if (eenv->cpu[cpu_idx].cpu_id < 0) + continue; + /* Compute energy delta wrt EAS_CPU_PRV */ + eenv->cpu[cpu_idx].nrg_delta = + eenv->cpu[cpu_idx].energy - + eenv->cpu[EAS_CPU_PRV].energy; + /* filter energy variations within the dead-zone margin */ + if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin) + eenv->cpu[cpu_idx].nrg_delta = 0; + /* update the schedule candidate with min(nrg_delta) */ + if (eenv->cpu[cpu_idx].nrg_delta < + eenv->cpu[eenv->next_idx].nrg_delta) { + eenv->next_idx = cpu_idx; + if (sched_feat(FBT_STRICT_ORDER)) + break; + } } -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_CAPACITY_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - return (energy_diff < 0) ? 
-normalized_nrg : normalized_nrg; + return eenv->next_idx; } -static inline int -energy_diff(struct energy_env *eenv) -{ - int boost = schedtune_task_boost(eenv->task); - int nrg_delta; - - /* Conpute "absolute" energy diff */ - __energy_diff(eenv); - - /* Return energy diff when boost margin is 0 */ - if (boost == 0) { - trace_sched_energy_diff(eenv->task, - eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, - eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, - eenv->cap.before, eenv->cap.after, eenv->cap.delta, - 0, -eenv->nrg.diff); - return eenv->nrg.diff; - } - - /* Compute normalized energy diff */ - nrg_delta = normalize_energy(eenv->nrg.diff); - eenv->nrg.delta = nrg_delta; - - eenv->payoff = schedtune_accept_deltas( - eenv->nrg.delta, - eenv->cap.delta, - eenv->task); - - trace_sched_energy_diff(eenv->task, - eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, - eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, - eenv->cap.before, eenv->cap.after, eenv->cap.delta, - eenv->nrg.delta, eenv->payoff); - - /* - * When SchedTune is enabled, the energy_diff() function will return - * the computed energy payoff value. Since the energy_diff() return - * value is expected to be negative by its callers, this evaluation - * function return a negative value each time the evaluation return a - * positive payoff, which is the condition for the acceptance of - * a scheduling decision - */ - return -eenv->payoff; -} -#else /* CONFIG_SCHED_TUNE */ -#define energy_diff(eenv) __energy_diff(eenv) -#endif - /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened @@ -7069,18 +7017,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return 1; } -static inline unsigned long task_util(struct task_struct *p) -{ -#ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - unsigned long demand = p->ravg.demand; - return (demand << 10) / walt_ravg_window; - } -#endif - return p->se.avg.util_avg; -} - -static inline unsigned long boosted_task_util(struct task_struct *task); +static inline unsigned long boosted_task_util(struct task_struct *p); static inline bool __task_fits(struct task_struct *p, int cpu, int util) { @@ -7157,16 +7094,16 @@ schedtune_cpu_margin(unsigned long util, int cpu) } static inline long -schedtune_task_margin(struct task_struct *task) +schedtune_task_margin(struct task_struct *p) { - int boost = schedtune_task_boost(task); + int boost = schedtune_task_boost(p); unsigned long util; long margin; if (boost == 0) return 0; - util = task_util(task); + util = task_util(p); margin = schedtune_margin(util, boost); return margin; @@ -7181,7 +7118,7 @@ schedtune_cpu_margin(unsigned long util, int cpu) } static inline int -schedtune_task_margin(struct task_struct *task) +schedtune_task_margin(struct task_struct *p) { return 0; } @@ -7200,12 +7137,12 @@ boosted_cpu_util(int cpu) } static inline unsigned long -boosted_task_util(struct task_struct *task) +boosted_task_util(struct task_struct *p) { - unsigned long util = task_util(task); - long margin = schedtune_task_margin(task); + unsigned long util = task_util(p); + long margin = schedtune_task_margin(p); - trace_sched_boost_task(task, util, margin); + trace_sched_boost_task(p, util, margin); return util + margin; } @@ -7575,6 +7512,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; unsigned long 
best_active_util = ULONG_MAX; + unsigned long target_idle_max_spare_cap = 0; int best_idle_cstate = INT_MAX; struct sched_domain *sd; struct sched_group *sg; @@ -7610,7 +7548,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { unsigned long capacity_curr = capacity_curr_of(i); unsigned long capacity_orig = capacity_orig_of(i); - unsigned long wake_util, new_util; + unsigned long wake_util, new_util, min_capped_util; if (!cpu_online(i)) continue; @@ -7632,13 +7570,18 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, * than the one required to boost the task. */ new_util = max(min_util, new_util); - if (new_util > capacity_orig) - continue; -#ifdef CONFIG_SCHED_WALT - if (walt_cpu_high_irqload(i)) + /* + * Include minimum capacity constraint: + * new_util contains the required utilization including + * boost. min_capped_util also takes into account a + * minimum capacity cap imposed on the CPU by external + * actors. + */ + min_capped_util = max(new_util, capacity_min_of(i)); + + if (new_util > capacity_orig) continue; -#endif /* * Case A) Latency sensitive tasks @@ -7759,6 +7702,12 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, /* Select idle CPU with lower cap_orig */ if (capacity_orig > best_idle_min_cap_orig) continue; + /* Favor CPUs that won't end up running at a + * high OPP. + */ + if ((capacity_orig - min_capped_util) < + target_idle_max_spare_cap) + continue; /* * Skip CPUs in deeper idle state, but only @@ -7772,6 +7721,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, /* Keep track of best idle CPU */ best_idle_min_cap_orig = capacity_orig; + target_idle_max_spare_cap = capacity_orig - + min_capped_util; best_idle_cstate = idle_idx; best_idle_cpu = i; continue; @@ -7802,10 +7753,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, continue; /* Favor CPUs with maximum spare capacity */ - if ((capacity_orig - new_util) < target_max_spare_cap) + if ((capacity_orig - min_capped_util) < + target_max_spare_cap) continue; - target_max_spare_cap = capacity_orig - new_util; + target_max_spare_cap = capacity_orig - min_capped_util; target_capacity = capacity_orig; target_cpu = i; } @@ -7877,9 +7829,11 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { - struct sched_domain *sd; - int target_cpu = prev_cpu, tmp_target, tmp_backup; bool boosted, prefer_idle; + struct sched_domain *sd; + int target_cpu; + int backup_cpu; + int next_cpu; schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); schedstat_inc(this_rq(), eas_stats.secb_attempts); @@ -7894,7 +7848,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } } - rcu_read_lock(); #ifdef CONFIG_CGROUP_SCHEDTUNE boosted = schedtune_task_boost(p) > 0; prefer_idle = schedtune_prefer_idle(p) > 0; @@ -7903,31 +7856,49 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync prefer_idle = 0; #endif - sync_entity_load_avg(&p->se); + rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + if (!sd) { + target_cpu = prev_cpu; + goto unlock; + } + + sync_entity_load_avg(&p->se); + /* Find a cpu with sufficient capacity */ - tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); + next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle); + if (next_cpu 
== -1) { + target_cpu = prev_cpu; + goto unlock; + } - if (!sd) + /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */ + if ((boosted || prefer_idle) && idle_cpu(next_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); + target_cpu = next_cpu; goto unlock; - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); - schedstat_inc(this_rq(), eas_stats.secb_idle_bt); - goto unlock; - } } - if (target_cpu != prev_cpu) { + target_cpu = prev_cpu; + if (next_cpu != prev_cpu) { int delta = 0; struct energy_env eenv = { + .p = p, .util_delta = task_util(p), - .src_cpu = prev_cpu, - .dst_cpu = target_cpu, - .task = p, - .trg_cpu = target_cpu, + /* Task's previous CPU candidate */ + .cpu[EAS_CPU_PRV] = { + .cpu_id = prev_cpu, + }, + /* Main alternative CPU candidate */ + .cpu[EAS_CPU_NXT] = { + .cpu_id = next_cpu, + }, + /* Backup alternative CPU candidate */ + .cpu[EAS_CPU_BKP] = { + .cpu_id = backup_cpu, + }, }; @@ -7940,26 +7911,21 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync if (__cpu_overutilized(prev_cpu, delta)) { schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); + target_cpu = next_cpu; goto unlock; } - if (energy_diff(&eenv) >= 0) { - /* No energy saving for target_cpu, try backup */ - target_cpu = tmp_backup; - eenv.dst_cpu = target_cpu; - eenv.trg_cpu = target_cpu; - if (tmp_backup < 0 || - tmp_backup == prev_cpu || - energy_diff(&eenv) >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); - target_cpu = prev_cpu; - goto unlock; - } + /* Check if EAS_CPU_NXT is a more energy efficient CPU */ + if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + target_cpu = eenv.cpu[eenv.next_idx].cpu_id; + goto unlock; } - schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; goto unlock; } @@ -9339,6 +9305,9 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity_orig = capacity; + capacity *= arch_scale_max_freq_capacity(sd, cpu); + capacity >>= SCHED_CAPACITY_SHIFT; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; raw_spin_lock_irqsave(&mcc->lock, flags); @@ -10373,6 +10342,17 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a cpu that could + * eventually lead to active_balancing high->low capacity. + * Higher per-cpu capacity is considered better than balancing + * average load. 
+ */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(i); /* @@ -11677,6 +11657,92 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +#ifdef CONFIG_SCHED_HMP +static DEFINE_RAW_SPINLOCK(migration_lock); + +static bool do_migration(int reason, int new_cpu, int cpu) +{ + if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) + && same_cluster(new_cpu, cpu)) + return false; + + /* Inter cluster high irqload migrations are OK */ + return new_cpu != cpu; +} + +/* + * Check if currently running task should be migrated to a better cpu. + * + * Todo: Effect this via changes to nohz_balancer_kick() and load balance? + */ +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq), new_cpu; + int active_balance = 0, reason; + + reason = migration_needed(p, cpu); + if (!reason) + return; + + raw_spin_lock(&migration_lock); + new_cpu = select_best_cpu(p, cpu, reason, 0); + + if (do_migration(reason, new_cpu, cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + mark_reserved(new_cpu); + } + + raw_spin_unlock(&migration_lock); + + if (active_balance) + stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, + &rq->active_balance_work); +} +#else +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int new_cpu; + int active_balance; + int cpu = task_cpu(p); + + if (rq->misfit_task) { + if (rq->curr->state != TASK_RUNNING || + rq->curr->nr_cpus_allowed == 1) + return; + + new_cpu = select_energy_cpu_brute(p, cpu, 0); + if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + stop_one_cpu_nowait(cpu, + active_load_balance_cpu_stop, + rq, &rq->active_balance_work); + } + } +} +#endif + #endif /* CONFIG_SMP */ /* |
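The central change above is the move from the old energy_diff()/payoff flow to a candidate-indexed energy_env (EAS_CPU_PRV/EAS_CPU_NXT/EAS_CPU_BKP) evaluated by select_energy_cpu_idx(). The standalone C sketch below (a userspace model with invented energy numbers, not the kernel code; it also omits the FBT_STRICT_ORDER early exit) illustrates the selection step: compute each candidate's delta against the previous CPU, drop deltas inside the ~1.56% dead zone (energy >> 6), and keep the smallest remaining delta:

/*
 * Standalone model of the candidate comparison in select_energy_cpu_idx():
 * per-candidate energies are assumed to be already computed and scaled.
 */
#include <stdio.h>
#include <stdlib.h>

#define EAS_CPU_PRV 0
#define EAS_CPU_NXT 1
#define EAS_CPU_BKP 2
#define EAS_CPU_CNT 3

struct eenv_cpu {
        int cpu_id;             /* -1 means "no candidate" */
        unsigned int energy;    /* estimated system energy for this placement */
        int nrg_delta;          /* energy variation wrt EAS_CPU_PRV */
};

/* Return the index of the first candidate that beats EAS_CPU_PRV, else 0. */
static int pick_energy_cpu_idx(struct eenv_cpu *cpu)
{
        int margin = cpu[EAS_CPU_PRV].energy >> 6;      /* dead zone: ~1.56% */
        int next_idx = EAS_CPU_PRV;
        int i;

        for (i = EAS_CPU_NXT; i < EAS_CPU_CNT; ++i) {
                if (cpu[i].cpu_id < 0)
                        continue;
                cpu[i].nrg_delta = (int)cpu[i].energy -
                                   (int)cpu[EAS_CPU_PRV].energy;
                /* Ignore variations inside the dead zone */
                if (abs(cpu[i].nrg_delta) < margin)
                        cpu[i].nrg_delta = 0;
                /* Keep the candidate with the smallest (most negative) delta */
                if (cpu[i].nrg_delta < cpu[next_idx].nrg_delta)
                        next_idx = i;
        }
        return next_idx;
}

int main(void)
{
        struct eenv_cpu cpu[EAS_CPU_CNT] = {
                [EAS_CPU_PRV] = { .cpu_id = 0, .energy = 1000 },
                [EAS_CPU_NXT] = { .cpu_id = 4, .energy =  940 }, /* -6%: accepted */
                [EAS_CPU_BKP] = { .cpu_id = 2, .energy =  995 }, /* -0.5%: in dead zone */
        };
        int idx = pick_energy_cpu_idx(cpu);

        printf("best candidate: idx=%d cpu=%d\n", idx, cpu[idx].cpu_id);
        return 0;
}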
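calc_sg_energy() estimates each visited group's contribution as busy energy plus idle energy, deliberately leaving out the SCHED_CAPACITY_SHIFT scaling until all groups have been accumulated (see the NOTE in the patch). A minimal userspace sketch of that arithmetic, with made-up capacity/idle-state numbers, might look like this:

/*
 * Sketch of the per-group energy estimate: busy and idle contributions are
 * accumulated unscaled; the shift is applied only once at the end.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

struct cap_state  { unsigned long cap;  unsigned long power; };
struct idle_state { unsigned long power; };

static unsigned long sg_energy(unsigned long sg_util,   /* normalized, 0..1024 */
                               const struct cap_state *cs,
                               const struct idle_state *is)
{
        unsigned long busy_energy = sg_util * cs->power;
        unsigned long idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * is->power;

        return busy_energy + idle_energy;       /* still unscaled */
}

int main(void)
{
        /* Hypothetical OPP chosen by find_new_capacity() and idle state */
        struct cap_state  cs = { .cap = 760, .power = 230 };
        struct idle_state is = { .power = 20 };
        unsigned long total = 0;

        total += sg_energy(512, &cs, &is);      /* one visited group */
        total += sg_energy(256, &cs, &is);      /* another visited group */

        /* Scale once, after all groups have been accumulated */
        printf("estimated energy: %lu\n", total >> SCHED_CAPACITY_SHIFT);
        return 0;
}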
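find_new_capacity() picks the lowest OPP whose capacity covers the group's maximum expected utilization, and with MIN_CAPACITY_CAPPING that utilization is first raised to any externally imposed minimum capacity (capacity_min_of()). Below is a sketch of that lookup, using an invented capacity table rather than a real sched_group_energy model:

#include <stdio.h>

struct cap_state { unsigned long cap; unsigned long power; };

static const struct cap_state cap_states[] = {
        { .cap =  256, .power =  50 },
        { .cap =  512, .power = 120 },
        { .cap =  768, .power = 230 },
        { .cap = 1024, .power = 410 },
};
#define NR_CAP_STATES (sizeof(cap_states) / sizeof(cap_states[0]))

/* Pick the lowest OPP whose capacity covers the (possibly capped) max util. */
static int find_new_capacity(unsigned long max_util, unsigned long capacity_min)
{
        unsigned long util = max_util > capacity_min ? max_util : capacity_min;
        int idx;

        for (idx = 0; idx < (int)NR_CAP_STATES; idx++)
                if (cap_states[idx].cap >= util)
                        return idx;

        return (int)NR_CAP_STATES - 1;  /* default to the highest OPP */
}

int main(void)
{
        /* util 300 alone fits OPP 1, but a min-capacity cap of 600 forces OPP 2 */
        printf("idx without cap: %d\n", find_new_capacity(300, 0));
        printf("idx with cap:    %d\n", find_new_capacity(300, 600));
        return 0;
}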
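In find_best_target(), spare capacity is now computed against min_capped_util = max(new_util, capacity_min_of(cpu)), so a CPU pinned to a high minimum frequency is less attractive even if its original capacity is larger. A standalone sketch of that comparison, with invented per-CPU values:

#include <stdio.h>

struct cpu_cand {
        int id;
        unsigned long capacity_orig;
        unsigned long new_util;     /* wake_util + task util + boost margin */
        unsigned long capacity_min; /* externally imposed minimum capacity */
};

/* Return the id of the candidate with the largest min-capped spare capacity. */
static int best_spare_cpu(const struct cpu_cand *c, int nr)
{
        unsigned long best_spare = 0;
        int best = -1, i;

        for (i = 0; i < nr; i++) {
                unsigned long min_capped = c[i].new_util > c[i].capacity_min ?
                                           c[i].new_util : c[i].capacity_min;
                unsigned long spare;

                if (c[i].new_util > c[i].capacity_orig) /* does not fit */
                        continue;
                spare = c[i].capacity_orig - min_capped;
                if (spare < best_spare)                 /* favor max spare */
                        continue;
                best_spare = spare;
                best = c[i].id;
        }
        return best;
}

int main(void)
{
        struct cpu_cand cands[] = {
                { .id = 1, .capacity_orig = 1024, .new_util = 300, .capacity_min = 900 },
                { .id = 2, .capacity_orig =  512, .new_util = 300, .capacity_min =   0 },
        };

        /* CPU 1 is bigger, but its minimum-capacity cap leaves less headroom */
        printf("target cpu: %d\n", best_spare_cpu(cands, 2));
        return 0;
}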
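group_norm_util() still normalizes each CPU's expected utilization against the capacity of the OPP selected for the group and clamps the sum to SCHED_CAPACITY_SCALE; only the bookkeeping moved into the per-candidate eenv->cpu[] entries. A small userspace model of the normalization, with illustrative numbers:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

/* Normalize a CPU's utilization against the capacity of the chosen OPP. */
static unsigned long cpu_norm_util(unsigned long util, unsigned long capacity)
{
        if (util >= capacity)
                return SCHED_CAPACITY_SCALE;
        return (util << SCHED_CAPACITY_SHIFT) / capacity;
}

int main(void)
{
        unsigned long cap = 760;                /* capacity of the selected OPP */
        unsigned long utils[] = { 190, 380 };   /* per-CPU wake utilization */
        unsigned long util_sum = 0;
        unsigned int i;

        for (i = 0; i < 2; i++)
                util_sum += cpu_norm_util(utils[i], cap);

        if (util_sum > SCHED_CAPACITY_SCALE)    /* clamp, as group_norm_util() does */
                util_sum = SCHED_CAPACITY_SCALE;
        printf("group_norm_util = %lu (of %lu)\n", util_sum, SCHED_CAPACITY_SCALE);
        return 0;
}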