Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 371
1 file changed, 234 insertions(+), 137 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e3ab49c262a..9263ffd5673f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -762,7 +762,9 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); static void attach_entity_cfs_rq(struct sched_entity *se); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -833,7 +835,7 @@ void post_init_entity_util_avg(struct sched_entity *se) attach_entity_cfs_rq(se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } @@ -4412,11 +4414,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); @@ -5824,7 +5829,7 @@ static void update_capacity_of(int cpu) if (!sched_freq()) return; - /* Convert scale-invariant capacity to cpu. */ + /* Normalize scale-invariant capacity to cpu. */ req_cap = boosted_cpu_util(cpu); req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); @@ -5867,7 +5872,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -6023,7 +6028,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (rq->cfs.nr_running) update_capacity_of(cpu_of(rq)); else if (sched_freq()) - set_cfs_cpu_capacity(cpu_of(rq), false, 0); + set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */ } } @@ -6446,6 +6451,7 @@ struct energy_env { int util_delta; int src_cpu; int dst_cpu; + int trg_cpu; int energy; int payoff; struct task_struct *task; @@ -6462,11 +6468,14 @@ struct energy_env { } cap; }; +static int cpu_util_wake(int cpu, struct task_struct *p); + /* * __cpu_norm_util() returns the cpu util relative to a specific capacity, - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for - * energy calculations. Using the scale-invariant util returned by - * cpu_util() and approximating scale-invariant util by: + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. 
+ * + * Since util is a scale-invariant utilization defined as: * * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time * @@ -6476,34 +6485,32 @@ struct energy_env { * * norm_util = running_time/time ~ util/capacity */ -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) { - int util = __cpu_util(cpu, delta); - if (util >= capacity) return SCHED_CAPACITY_SCALE; return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static int calc_util_delta(struct energy_env *eenv, int cpu) +static unsigned long group_max_util(struct energy_env *eenv) { - if (cpu == eenv->src_cpu) - return -eenv->util_delta; - if (cpu == eenv->dst_cpu) - return eenv->util_delta; - return 0; -} - -static -unsigned long group_max_util(struct energy_env *eenv) -{ - int i, delta; unsigned long max_util = 0; + unsigned long util; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { + util = cpu_util_wake(cpu, eenv->task); - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { - delta = calc_util_delta(eenv, i); - max_util = max(max_util, __cpu_util(i, delta)); + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + max_util = max(max_util, util); } return max_util; @@ -6511,44 +6518,56 @@ unsigned long group_max_util(struct energy_env *eenv) /* * group_norm_util() returns the approximated group util relative to it's - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in - * energy calculations. Since task executions may or may not overlap in time in - * the group the true normalized util is between max(cpu_norm_util(i)) and - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The - * latter is used as the estimate as it leads to a more pessimistic energy + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy * estimate (more busy). */ static unsigned long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i, delta; - unsigned long util_sum = 0; unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long util, util_sum = 0; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. 
+ */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; - for_each_cpu(i, sched_group_cpus(sg)) { - delta = calc_util_delta(eenv, i); - util_sum += __cpu_norm_util(i, capacity, delta); + util_sum += __cpu_norm_util(util, capacity); } - if (util_sum > SCHED_CAPACITY_SCALE) - return SCHED_CAPACITY_SCALE; - return util_sum; + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy * const sge) { - int idx; + int idx, max_idx = sge->nr_cap_states - 1; unsigned long util = group_max_util(eenv); + /* default is max_cap if we don't find a match */ + eenv->cap_idx = max_idx; + for (idx = 0; idx < sge->nr_cap_states; idx++) { - if (sge->cap_states[idx].cap >= util) + if (sge->cap_states[idx].cap >= util) { + eenv->cap_idx = idx; break; + } } - eenv->cap_idx = idx; - - return idx; + return eenv->cap_idx; } static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) @@ -6721,6 +6740,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -6736,11 +6757,13 @@ static inline int __energy_diff(struct energy_env *eenv) int diff, margin; struct energy_env eenv_before = { - .util_delta = 0, + .util_delta = task_util(eenv->task), .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .trg_cpu = eenv->src_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, + .task = eenv->task, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -6799,7 +6822,11 @@ static inline int __energy_diff(struct energy_env *eenv) #ifdef CONFIG_SCHED_TUNE struct target_nrg schedtune_target_nrg; + +#ifdef CONFIG_CGROUP_SCHEDTUNE extern bool schedtune_initialized; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + /* * System energy normalization * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], @@ -6810,9 +6837,11 @@ normalize_energy(int energy_diff) { u32 normalized_nrg; +#ifdef CONFIG_CGROUP_SCHEDTUNE /* during early setup, we don't know the extents */ if (unlikely(!schedtune_initialized)) return energy_diff < 0 ? -1 : 1 ; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ #ifdef CONFIG_SCHED_DEBUG { @@ -6848,8 +6877,14 @@ energy_diff(struct energy_env *eenv) __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ - if (boost == 0) + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); return eenv->nrg.diff; + } /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); @@ -6892,15 +6927,18 @@ energy_diff(struct energy_env *eenv) * being client/server, worker/dispatcher, interrupt source or whatever is * irrelevant, spread criteria is apparent partner count exceeds socket size. 
*/ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -7106,8 +7144,6 @@ boosted_task_util(struct task_struct *task) return util + margin; } -static int cpu_util_wake(int cpu, struct task_struct *p); - static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); @@ -7116,6 +7152,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -7123,7 +7161,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -7191,23 +7229,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && imbalance*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -7254,6 +7300,68 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } return shallowest_idle_cpu != -1 ? 
shallowest_idle_cpu : least_loaded_cpu; + } + +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = cpu; + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); + } + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); + } + + return new_cpu; } /* @@ -7397,7 +7505,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; - unsigned long target_util = ULONG_MAX; unsigned long best_active_util = ULONG_MAX; int best_idle_cstate = INT_MAX; struct sched_domain *sd; @@ -7536,6 +7643,19 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, } /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + + /* * Case B) Non latency sensitive tasks on IDLE CPUs. * * Find an optimal backup IDLE CPU for non latency @@ -7613,7 +7733,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, target_max_spare_cap = capacity_orig - new_util; target_capacity = capacity_orig; - target_util = new_util; target_cpu = i; } @@ -7734,6 +7853,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .src_cpu = prev_cpu, .dst_cpu = target_cpu, .task = p, + .trg_cpu = target_cpu, }; @@ -7752,7 +7872,9 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync /* No energy saving for target_cpu, try backup */ target_cpu = tmp_backup; eenv.dst_cpu = target_cpu; - if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; @@ -7787,7 +7909,8 @@ unlock: * preempt must be disabled. 
*/ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -7799,9 +7922,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return select_best_cpu(p, prev_cpu, 0, sync); #endif - if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + want_affine = !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); + } if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) return select_energy_cpu_brute(p, prev_cpu, sync); @@ -7833,61 +7959,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else { - int wu = sd_flag & SD_BALANCE_WAKE; - int cas_cpu = -1; - - if (wu) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); - schedstat_inc(this_rq(), eas_stats.cas_attempts); - } - - while (sd) { - struct sched_group *group; - int weight; - - if (wu) - schedstat_inc(sd, eas_stats.cas_attempts); - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = cas_cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - if (wu && (cas_cpu >= 0)) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_count); - schedstat_inc(this_rq(), eas_stats.cas_count); - } + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); @@ -10040,8 +10126,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; @@ -10410,6 +10499,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* The world might have changed. 
Validate assumptions */ if (busiest->nr_running <= 1) { @@ -10867,6 +10957,7 @@ static int active_load_balance_cpu_stop(void *data) if (likely(sd)) { env.sd = sd; schedstat_inc(sd, alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -11530,31 +11621,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -11567,8 +11644,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -11760,6 +11836,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -11772,6 +11856,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -12003,7 +12100,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif #ifdef CONFIG_SCHED_HMP .inc_hmp_sched_stats = inc_hmp_sched_stats_fair, |
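The remainder of this page is reviewer annotation, not part of the patch: a few small, standalone C sketches of the mechanisms the hunks above change. Helper names, data layouts and example numbers are illustrative stand-ins unless they also appear in the diff itself.

First, the busy-ratio normalization behind the reworked __cpu_norm_util()/group_norm_util(): utilization is divided by the capacity of the currently selected capacity state and scaled into [0..SCHED_CAPACITY_SCALE], clamping at 1024 once a CPU (or the summed group estimate) saturates. A minimal userspace sketch, assuming the kernel's 1024-based fixed-point convention:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT    10
#define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

/* Mirrors __cpu_norm_util(): busy ratio of 'util' against 'capacity'. */
static unsigned long cpu_norm_util(unsigned long util, unsigned long capacity)
{
        if (util >= capacity)
                return SCHED_CAPACITY_SCALE;

        return (util << SCHED_CAPACITY_SHIFT) / capacity;
}

int main(void)
{
        unsigned long cap = 512;                /* capacity of the selected state */
        unsigned long cpu0 = 256, cpu1 = 384;   /* per-CPU utilization (made up) */
        unsigned long sum;

        printf("cpu0: %lu\n", cpu_norm_util(cpu0, cap));        /* -> 512 (50% busy) */
        printf("cpu1: %lu\n", cpu_norm_util(cpu1, cap));        /* -> 768 (75% busy) */

        /*
         * group_norm_util() sums the per-CPU ratios and clamps, which is the
         * pessimistic end of the [MAX, SUM] range its comment describes.
         */
        sum = cpu_norm_util(cpu0, cap) + cpu_norm_util(cpu1, cap);
        if (sum > SCHED_CAPACITY_SCALE)
                sum = SCHED_CAPACITY_SCALE;
        printf("group: %lu\n", sum);                            /* clamped to 1024 */
        return 0;
}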
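The new trg_cpu field is what lets one energy_env describe "task running on CPU X": group_max_util() and group_norm_util() now read each CPU with the waking task's contribution removed (via cpu_util_wake()) and add the task's utilization back only on the target CPU. A simplified model using plain arrays in place of rq/sched_group state:

#include <stdio.h>

/*
 * util_wake[] stands in for cpu_util_wake(cpu, task): per-CPU utilization
 * with the waking task's own contribution already subtracted.
 */
static unsigned long group_max_util(const unsigned long *util_wake, int nr_cpus,
                                    unsigned long task_util, int trg_cpu)
{
        unsigned long util, max_util = 0;
        int cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                util = util_wake[cpu];
                if (cpu == trg_cpu)
                        util += task_util;      /* task woken on this CPU */
                if (util > max_util)
                        max_util = util;
        }
        return max_util;
}

int main(void)
{
        const unsigned long util_wake[] = { 100, 350, 220, 40 };

        printf("%lu\n", group_max_util(util_wake, 4, 400, 3)); /* -> 440 */
        printf("%lu\n", group_max_util(util_wake, 4, 400, 0)); /* -> 500 */
        return 0;
}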
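group_max_util() feeds find_new_capacity(), which after this patch defaults to the highest capacity state and only lowers the index when a state can actually hold the expected maximum utilization. A sketch of that selection, with an invented three-state capacity table in place of the kernel's sched_group_energy data:

#include <stdio.h>

struct cap_state {
        unsigned long cap;      /* compute capacity at this state */
        unsigned long power;    /* power at this state (unused here) */
};

static int find_new_capacity(const struct cap_state *states, int nr_cap_states,
                             unsigned long max_util)
{
        int idx, cap_idx = nr_cap_states - 1;   /* default: max capacity */

        for (idx = 0; idx < nr_cap_states; idx++) {
                if (states[idx].cap >= max_util) {
                        cap_idx = idx;
                        break;
                }
        }
        return cap_idx;
}

int main(void)
{
        const struct cap_state states[] = {
                { .cap =  446, .power = 100 },
                { .cap =  871, .power = 300 },
                { .cap = 1024, .power = 500 },
        };

        printf("%d\n", find_new_capacity(states, 3, 500));      /* -> 1 */
        printf("%d\n", find_new_capacity(states, 3, 2000));     /* -> 2 (fallback to max) */
        return 0;
}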
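__energy_diff() builds on the same idea: the "before" scenario is no longer an empty delta but the task's full utilization accounted on the source CPU, while the main eenv accounts it on the destination. A sketch of how the two scenarios differ only in trg_cpu (the struct is a cut-down stand-in for the kernel's energy_env):

#include <stdio.h>

struct energy_env_sketch {
        unsigned long util_delta;       /* the waking task's utilization */
        int src_cpu;
        int dst_cpu;
        int trg_cpu;                    /* CPU that receives util_delta */
};

static void setup_energy_envs(unsigned long task_util, int src, int dst,
                              struct energy_env_sketch *before,
                              struct energy_env_sketch *after)
{
        /* Baseline: account the task's util on the CPU it would stay on. */
        *before = (struct energy_env_sketch){
                .util_delta = task_util,
                .src_cpu = src, .dst_cpu = dst, .trg_cpu = src,
        };
        /* Candidate placement: account the same util on the destination. */
        *after = (struct energy_env_sketch){
                .util_delta = task_util,
                .src_cpu = src, .dst_cpu = dst, .trg_cpu = dst,
        };
}

int main(void)
{
        struct energy_env_sketch before, after;

        setup_energy_envs(300, 1, 4, &before, &after);
        printf("before: util on CPU %d, after: util on CPU %d\n",
               before.trg_cpu, after.trg_cpu);
        return 0;
}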
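wake_wide() keeps its wakee-flip heuristic but gains a fast path: if the caller hints that at least an LLC's worth of siblings are being woken together, spread immediately. A userspace rendering with the per-CPU and per-task state replaced by parameters:

#include <stdio.h>

static int wake_wide(unsigned int master, unsigned int slave,
                     int llc_size, int sibling_count_hint)
{
        /* New in this patch: an explicit hint that many wakeups come at once. */
        if (sibling_count_hint >= llc_size)
                return 1;

        if (master < slave) {
                unsigned int tmp = master;

                master = slave;
                slave = tmp;
        }

        if (slave < llc_size || master < slave * llc_size)
                return 0;
        return 1;
}

int main(void)
{
        /* few flips relative to an LLC of 4 CPUs: stay affine (0) */
        printf("%d\n", wake_wide(10, 2, 4, 1));
        /* large sibling hint: spread (1) regardless of flip history */
        printf("%d\n", wake_wide(10, 2, 4, 8));
        return 0;
}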
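Finally, the "Enforce EAS mode" filter added to find_best_target() skips candidate CPUs that the task would push past the capacity margin, keeping !prefer_idle tasks from tipping the system into the over-utilized state. The check below mirrors the added condition; capacity_margin = 1280 (which makes roughly 80% of capacity_orig the threshold) is an assumption here, matching the scheduler's usual margin rather than anything shown in this hunk:

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024UL

static const unsigned long capacity_margin = 1280;      /* assumed: ~1.25x headroom */

static bool cpu_would_be_overutilized(unsigned long new_util,
                                      unsigned long capacity_orig)
{
        return new_util * capacity_margin > capacity_orig * SCHED_CAPACITY_SCALE;
}

int main(void)
{
        /* 800 of 1024: 800 * 1280 = 1024000 <= 1048576, still fits */
        printf("%d\n", cpu_would_be_overutilized(800, 1024));   /* -> 0 */
        /* 850 of 1024: 850 * 1280 = 1088000 > 1048576, skip this CPU */
        printf("%d\n", cpu_would_be_overutilized(850, 1024));   /* -> 1 */
        return 0;
}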
