Diffstat (limited to 'kernel/sched/hmp.c')
| -rw-r--r-- | kernel/sched/hmp.c | 305 |
1 file changed, 186 insertions(+), 119 deletions(-)
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 301ed592f50f..a8bf39c6d7d7 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -74,11 +74,6 @@ inline void clear_ed_task(struct task_struct *p, struct rq *rq)
         rq->ed_task = NULL;
 }
 
-inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
-{
-        p->last_wake_ts = wallclock;
-}
-
 inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
 {
         p->last_switch_out_ts = wallclock;
@@ -641,14 +636,18 @@ void clear_hmp_request(int cpu)
         clear_boost_kick(cpu);
         clear_reserved(cpu);
         if (rq->push_task) {
+                struct task_struct *push_task = NULL;
+
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->push_task) {
                         clear_reserved(rq->push_cpu);
-                        put_task_struct(rq->push_task);
+                        push_task = rq->push_task;
                         rq->push_task = NULL;
                 }
                 rq->active_balance = 0;
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+                if (push_task)
+                        put_task_struct(push_task);
         }
 }
 
@@ -784,11 +783,12 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
 
 static unsigned int sync_cpu;
 
-static LIST_HEAD(related_thread_groups);
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
 static DEFINE_RWLOCK(related_thread_group_lock);
 
 #define for_each_related_thread_group(grp) \
-        list_for_each_entry(grp, &related_thread_groups, list)
+        list_for_each_entry(grp, &active_related_thread_groups, list)
 
 /*
  * Task load is categorized into buckets for the purpose of top task tracking.
@@ -956,6 +956,16 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
 
 unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
 
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds and it sleeps on an average
+ * for more than sysctl_sched_short_sleep nanoseconds, then the
+ * task is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC;
+
 static void
 _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
 {
@@ -1547,6 +1557,15 @@ void init_new_task_load(struct task_struct *p, bool idle_task)
         INIT_LIST_HEAD(&p->grp_list);
         memset(&p->ravg, 0, sizeof(struct ravg));
         p->cpu_cycles = 0;
+        p->ravg.curr_burst = 0;
+        /*
+         * Initialize the avg_burst to twice the threshold, so that
+         * a task would not be classified as short burst right away
+         * after fork. It takes at least 6 sleep-wakeup cycles for
+         * the avg_burst to go below the threshold.
+         */
+        p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+        p->ravg.avg_sleep_time = 0;
         p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
         p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
 
@@ -1767,20 +1786,20 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups)
                 if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
                         return 0;
         } else {
-                read_lock(&related_thread_group_lock);
+                read_lock_irqsave(&related_thread_group_lock, flags);
                 /*
                  * Protect from concurrent update of rq->prev_runnable_sum and
                  * group cpu load
                  */
-                raw_spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock(&rq->lock);
                 if (check_groups)
                         _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
 
                 new_load = rq->prev_runnable_sum + group_load;
                 new_load = freq_policy_load(rq, new_load);
 
-                raw_spin_unlock_irqrestore(&rq->lock, flags);
-                read_unlock(&related_thread_group_lock);
+                raw_spin_unlock(&rq->lock);
+                read_unlock_irqrestore(&related_thread_group_lock, flags);
 
                 cur_freq = load_to_freq(rq, rq->old_busy_time);
                 freq_required = load_to_freq(rq, new_load);
@@ -2733,12 +2752,14 @@ done:
         trace_sched_update_history(rq, p, runtime, samples, event);
 }
 
-static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
 {
         delta = scale_exec_time(delta, rq);
         p->ravg.sum += delta;
         if (unlikely(p->ravg.sum > sched_ravg_window))
                 p->ravg.sum = sched_ravg_window;
+
+        return delta;
 }
 
 /*
@@ -2791,13 +2812,14 @@ static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
  * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
  * depends on it!
  */
-static void update_task_demand(struct task_struct *p, struct rq *rq,
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
                                int event, u64 wallclock)
 {
         u64 mark_start = p->ravg.mark_start;
         u64 delta, window_start = rq->window_start;
         int new_window, nr_full_windows;
         u32 window_size = sched_ravg_window;
+        u64 runtime;
 
         new_window = mark_start < window_start;
         if (!account_busy_for_task_demand(p, event)) {
@@ -2811,7 +2833,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
                  * it is not necessary to account those.
                  */
                 update_history(rq, p, p->ravg.sum, 1, event);
-                return;
+                return 0;
         }
 
         if (!new_window) {
@@ -2819,8 +2841,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
                 /*
                  * The simple case - busy time contained within the existing
                  * window.
                  */
-                add_to_task_demand(rq, p, wallclock - mark_start);
-                return;
+                return add_to_task_demand(rq, p, wallclock - mark_start);
         }
         /*
@@ -2832,13 +2853,16 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
         window_start -= (u64)nr_full_windows * (u64)window_size;
 
         /* Process (window_start - mark_start) first */
-        add_to_task_demand(rq, p, window_start - mark_start);
+        runtime = add_to_task_demand(rq, p, window_start - mark_start);
 
         /* Push new sample(s) into task's demand history */
         update_history(rq, p, p->ravg.sum, 1, event);
-        if (nr_full_windows)
-                update_history(rq, p, scale_exec_time(window_size, rq),
-                               nr_full_windows, event);
+        if (nr_full_windows) {
+                u64 scaled_window = scale_exec_time(window_size, rq);
+
+                update_history(rq, p, scaled_window, nr_full_windows, event);
+                runtime += nr_full_windows * scaled_window;
+        }
 
         /*
          * Roll window_start back to current to process any remainder
@@ -2848,13 +2872,31 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 
         /* Process (wallclock - window_start) next */
         mark_start = window_start;
-        add_to_task_demand(rq, p, wallclock - mark_start);
+        runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+        return runtime;
+}
+
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, int runtime)
+{
+        /*
+         * update_task_demand() has checks for idle task and
+         * exit task. The runtime may include the wait time,
+         * so update the burst only for the cases where the
+         * task is running.
+         */
+        if (event == PUT_PREV_TASK || (event == TASK_UPDATE &&
+                                rq->curr == p))
+                p->ravg.curr_burst += runtime;
 }
 
 /* Reflect task activity on its demand and cpu's busy time statistics */
 void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
                                                 u64 wallclock, u64 irqtime)
 {
+        u64 runtime;
+
         if (!rq->window_start || sched_disable_window_stats ||
             p->ravg.mark_start == wallclock)
                 return;
@@ -2869,7 +2911,9 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
         }
 
         update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
-        update_task_demand(p, rq, event, wallclock);
+        runtime = update_task_demand(p, rq, event, wallclock);
+        if (runtime)
+                update_task_burst(p, rq, event, runtime);
         update_cpu_busy_time(p, rq, event, wallclock, irqtime);
         update_task_pred_demand(rq, p, event);
 done:
@@ -2955,6 +2999,8 @@ void reset_task_stats(struct task_struct *p)
         p->ravg.curr_window_cpu = curr_window_ptr;
         p->ravg.prev_window_cpu = prev_window_ptr;
 
+        p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
         /* Retain EXITING_TASK marker */
         p->ravg.sum_history[0] = sum;
 }
@@ -3053,7 +3099,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 
         read_unlock(&tasklist_lock);
 
-        list_for_each_entry(grp, &related_thread_groups, list) {
+        list_for_each_entry(grp, &active_related_thread_groups, list) {
                 int j;
 
                 for_each_possible_cpu(j) {
@@ -3203,14 +3249,16 @@ void sched_get_cpus_busy(struct sched_load *busy,
 
         if (unlikely(cpus == 0))
                 return;
 
+        local_irq_save(flags);
+
+        read_lock(&related_thread_group_lock);
+
         /*
          * This function could be called in timer context, and the
          * current task may have been executing for a long time. Ensure
         * that the window stats are current by doing an update.
         */
-        read_lock(&related_thread_group_lock);
-        local_irq_save(flags);
 
         for_each_cpu(cpu, query_cpus)
                 raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3310,10 +3358,11 @@ skip_early:
 
         for_each_cpu(cpu, query_cpus)
                 raw_spin_unlock(&(cpu_rq(cpu))->lock);
 
-        local_irq_restore(flags);
         read_unlock(&related_thread_group_lock);
 
+        local_irq_restore(flags);
+
         i = 0;
         for_each_cpu(cpu, query_cpus) {
                 rq = cpu_rq(cpu);
@@ -3966,47 +4015,54 @@ _group_cpu_time(struct related_thread_group *grp, int cpu)
         return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
 }
 
-struct related_thread_group *alloc_related_thread_group(int group_id)
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
 {
-        struct related_thread_group *grp;
-
-        grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
-        if (!grp)
-                return ERR_PTR(-ENOMEM);
-
-        if (alloc_group_cputime(grp)) {
-                kfree(grp);
-                return ERR_PTR(-ENOMEM);
-        }
-
-        grp->id = group_id;
-        INIT_LIST_HEAD(&grp->tasks);
-        INIT_LIST_HEAD(&grp->list);
-        raw_spin_lock_init(&grp->lock);
-
-        return grp;
+        return related_thread_groups[group_id];
 }
 
-struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+int alloc_related_thread_groups(void)
 {
+        int i, ret;
         struct related_thread_group *grp;
 
-        list_for_each_entry(grp, &related_thread_groups, list) {
-                if (grp->id == group_id)
-                        return grp;
+        /* groupd_id = 0 is invalid as it's special id to remove group. */
+        for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+                grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+                if (!grp) {
+                        ret = -ENOMEM;
+                        goto err;
+                }
+
+                if (alloc_group_cputime(grp)) {
+                        kfree(grp);
+                        ret = -ENOMEM;
+                        goto err;
+                }
+
+                grp->id = i;
+                INIT_LIST_HEAD(&grp->tasks);
+                INIT_LIST_HEAD(&grp->list);
+                raw_spin_lock_init(&grp->lock);
+
+                related_thread_groups[i] = grp;
         }
 
-        return NULL;
-}
+        return 0;
 
-/* See comments before preferred_cluster() */
-static void free_related_thread_group(struct rcu_head *rcu)
-{
-        struct related_thread_group *grp = container_of(rcu, struct
-                        related_thread_group, rcu);
+err:
+        for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+                grp = lookup_related_thread_group(i);
+                if (grp) {
+                        free_group_cputime(grp);
+                        kfree(grp);
+                        related_thread_groups[i] = NULL;
+                } else {
+                        break;
+                }
+        }
 
-        free_group_cputime(grp);
-        kfree(grp);
+        return ret;
 }
 
 static void remove_task_from_group(struct task_struct *p)
@@ -4031,10 +4087,12 @@ static void remove_task_from_group(struct task_struct *p)
         raw_spin_unlock(&grp->lock);
 
         /* Reserved groups cannot be destroyed */
-        if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
-                list_del(&grp->list);
-                call_rcu(&grp->rcu, free_related_thread_group);
-        }
+        if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+                /*
+                 * We test whether grp->list is attached with list_empty()
+                 * hence re-init the list after deletion.
+                 */
+                list_del_init(&grp->list);
 }
 
 static int
@@ -4106,53 +4164,15 @@ void add_new_task_to_grp(struct task_struct *new)
         write_unlock_irqrestore(&related_thread_group_lock, flags);
 }
 
-#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
-/*
- * We create a default colocation group at boot. There is no need to
- * synchronize tasks between cgroups at creation time because the
- * correct cgroup hierarchy is not available at boot. Therefore cgroup
- * colocation is turned off by default even though the colocation group
- * itself has been allocated. Furthermore this colocation group cannot
- * be destroyted once it has been created. All of this has been as part
- * of runtime optimizations.
- *
- * The job of synchronizing tasks to the colocation group is done when
- * the colocation flag in the cgroup is turned on.
- */
-static int __init create_default_coloc_group(void)
-{
-        struct related_thread_group *grp = NULL;
-        unsigned long flags;
-
-        grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
-        if (IS_ERR(grp)) {
-                WARN_ON(1);
-                return -ENOMEM;
-        }
-
-        write_lock_irqsave(&related_thread_group_lock, flags);
-        list_add(&grp->list, &related_thread_groups);
-        write_unlock_irqrestore(&related_thread_group_lock, flags);
-
-        update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
-        return 0;
-}
-late_initcall(create_default_coloc_group);
-
-int sync_cgroup_colocation(struct task_struct *p, bool insert)
-{
-        unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
-
-        return sched_set_group_id(p, grp_id);
-}
-#endif
-
-int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
 {
         int rc = 0;
         unsigned long flags;
         struct related_thread_group *grp = NULL;
 
+        if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+                return -EINVAL;
+
         raw_spin_lock_irqsave(&p->pi_lock, flags);
         write_lock(&related_thread_group_lock);
 
@@ -4168,29 +4188,26 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
         }
 
         grp = lookup_related_thread_group(group_id);
-        if (!grp) {
-                /* This is a reserved id */
-                if (group_id == DEFAULT_CGROUP_COLOC_ID) {
-                        rc = -EINVAL;
-                        goto done;
-                }
-
-                grp = alloc_related_thread_group(group_id);
-                if (IS_ERR(grp)) {
-                        rc = -ENOMEM;
-                        goto done;
-                }
-
-                list_add(&grp->list, &related_thread_groups);
-        }
+        if (list_empty(&grp->list))
+                list_add(&grp->list, &active_related_thread_groups);
 
         rc = add_task_to_group(p, grp);
 done:
         write_unlock(&related_thread_group_lock);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
         return rc;
 }
 
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+        /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+        if (group_id == DEFAULT_CGROUP_COLOC_ID)
+                return -EINVAL;
+
+        return __sched_set_group_id(p, group_id);
+}
+
 unsigned int sched_get_group_id(struct task_struct *p)
 {
         unsigned int group_id;
@@ -4204,6 +4221,42 @@ unsigned int sched_get_group_id(struct task_struct *p)
         return group_id;
 }
 
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore this colocation group cannot
+ * be destroyted once it has been created. All of this has been as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+        struct related_thread_group *grp = NULL;
+        unsigned long flags;
+
+        grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+        write_lock_irqsave(&related_thread_group_lock, flags);
+        list_add(&grp->list, &active_related_thread_groups);
+        write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+        update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+        return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+        unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+        return __sched_set_group_id(p, grp_id);
+}
+#endif
+
 static void update_cpu_cluster_capacity(const cpumask_t *cpus)
 {
         int i;
@@ -4451,6 +4504,20 @@ bool early_detection_notify(struct rq *rq, u64 wallclock)
         return 0;
 }
 
+void update_avg_burst(struct task_struct *p)
+{
+        update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
+        p->ravg.curr_burst = 0;
+}
+
+void note_task_waking(struct task_struct *p, u64 wallclock)
+{
+        u64 sleep_time = wallclock - p->last_switch_out_ts;
+
+        p->last_wake_ts = wallclock;
+        update_avg(&p->ravg.avg_sleep_time, sleep_time);
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
                                         struct cftype *cft)
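
Note on the short-burst packing tunables: this diff only adds the two sysctls and the per-task avg_burst/avg_sleep_time statistics maintained by update_task_burst(), update_avg_burst() and note_task_waking(); the wakeup-path check that consumes them is not part of these hunks. A minimal sketch of the eligibility test described in the comment above sysctl_sched_short_burst, with a hypothetical helper name, would look roughly like this:

    /*
     * Illustrative sketch only -- the helper name and its call site are
     * assumptions, not part of this diff. It combines the two sysctls
     * with the per-task averages exactly as the added comment describes:
     * short average bursts plus long average sleeps make the task a
     * candidate for packing instead of waking an idle CPU.
     */
    static inline bool task_eligible_for_packing(struct task_struct *p)
    {
            return p->ravg.avg_burst < sysctl_sched_short_burst &&
                   p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
    }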
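The init_new_task_load() comment claims that seeding avg_burst at twice the threshold delays short-burst classification by at least 6 sleep-wakeup cycles. That number follows if update_avg() is assumed to be the scheduler's usual 1/8-weight moving average (avg += (sample - avg) / 8): even with zero-length bursts the average decays by only one eighth per cycle, and 2T * (7/8)^n first drops below T at n = 6. A standalone sketch of that arithmetic (the threshold T is an arbitrary example value; sysctl_sched_short_burst itself is declared here without an initializer, so it is 0 until configured):

    /*
     * Best-case decay of avg_burst from its fork-time value of 2 * T,
     * assuming a 1/8-weight EWMA with zero-length burst samples and
     * ignoring integer-rounding details of the in-kernel helper.
     * Prints cycles 1..6; the value crosses below T only on the 6th update.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long T = 1000000ULL;   /* example threshold, ns */
            unsigned long long avg = 2 * T;      /* fork-time initialisation */
            int n;

            for (n = 1; avg >= T; n++) {
                    avg -= avg / 8;              /* avg += (0 - avg) / 8 */
                    printf("cycle %d: avg_burst = %llu\n", n, avg);
            }
            return 0;
    }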
