Diffstat (limited to 'kernel/sched/hmp.c')
-rw-r--r--    kernel/sched/hmp.c    715
1 file changed, 608 insertions(+), 107 deletions(-)
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 3d5de8ba70a2..d220482f4dbc 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -18,9 +18,9 @@
 #include <linux/list_sort.h>
 #include <linux/syscore_ops.h>
 #include <linux/of.h>
+#include <linux/sched/core_ctl.h>
 #include "sched.h"
-#include "core_ctl.h"
 #include <trace/events/sched.h>
@@ -201,7 +201,7 @@ int sched_update_freq_max_load(const cpumask_t *cpumask)
 		entry = &max_load->freqs[i];
 		freq = costs[i].freq;
 		hpct = get_freq_max_load(cpu, freq);
-		if (hpct <= 0 && hpct > 100)
+		if (hpct <= 0 || hpct > 100)
 			hpct = 100;
 		hfreq = div64_u64((u64)freq * hpct, 100);
 		entry->hdemand =
@@ -590,6 +590,7 @@ static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
 	cluster->dstate_wakeup_latency = 0;
 	cluster->freq_init_done = false;

+	raw_spin_lock_init(&cluster->load_lock);
 	cluster->cpus = *cpus;
 	cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
@@ -647,6 +648,7 @@ void init_clusters(void)
 {
 	bitmap_clear(all_cluster_ids, 0, NR_CPUS);
 	init_cluster.cpus = *cpu_possible_mask;
+	raw_spin_lock_init(&init_cluster.load_lock);
 	INIT_LIST_HEAD(&cluster_head);
 }
@@ -788,6 +790,12 @@ __read_mostly unsigned int sysctl_sched_new_task_windows = 5;
 #define SCHED_FREQ_ACCOUNT_WAIT_TIME 0

 /*
+ * This governs what load needs to be used when reporting CPU busy time
+ * to the cpufreq governor.
+ */
+__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
+
+/*
  * For increase, send notification if
  * freq_required - cur_freq > sysctl_sched_freq_inc_notify
  */
@@ -823,15 +831,15 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
 unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */

-/* Window size (in ns) */
-__read_mostly unsigned int sched_ravg_window = 10000000;
-
 /* Min window size (in ns) = 10ms */
 #define MIN_SCHED_RAVG_WINDOW 10000000

 /* Max window size (in ns) = 1s */
 #define MAX_SCHED_RAVG_WINDOW 1000000000

+/* Window size (in ns) */
+__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+
 /* Temporarily disable window-stats activity on all cpus */
 unsigned int __read_mostly sched_disable_window_stats;
@@ -851,6 +859,21 @@ static DEFINE_RWLOCK(related_thread_group_lock);
 	list_for_each_entry(grp, &related_thread_groups, list)

 /*
+ * Task load is categorized into buckets for the purpose of top task tracking.
+ * The entire range of load from 0 to sched_ravg_window needs to be covered
+ * by NUM_LOAD_INDICES buckets. Therefore the size of each bucket
+ * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
+ * of sched_ravg_window is MIN_SCHED_RAVG_WINDOW, use that to compute
+ * sched_load_granule.
+ */
+__read_mostly unsigned int sched_load_granule =
+		MIN_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
+
+/* Size of bitmaps maintained to track top tasks */
+static const unsigned int top_tasks_bitmap_size =
+		BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long);
+
+/*
  * Demand aggregation for frequency purpose:
  *
  * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
@@ -1505,7 +1528,7 @@ static inline int invalid_value(unsigned int *data)

 /*
  * Handle "atomic" update of sysctl_sched_window_stats_policy,
- * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables.
+ * and sysctl_sched_ravg_hist_size variables.
  */
 int sched_window_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
@@ -1611,7 +1634,7 @@ unsigned int cpu_temp(int cpu)
 		return 0;
 }

-void init_new_task_load(struct task_struct *p)
+void init_new_task_load(struct task_struct *p, bool idle_task)
 {
 	int i;
 	u32 init_load_windows = sched_init_task_load_windows;
@@ -1623,6 +1646,15 @@ void init_new_task_load(struct task_struct *p)
 	memset(&p->ravg, 0, sizeof(struct ravg));
 	p->cpu_cycles = 0;

+	p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_ATOMIC);
+	p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_ATOMIC);
+
+	/* Don't have much choice. CPU frequency would be bogus */
+	BUG_ON(!p->ravg.curr_window_cpu || !p->ravg.prev_window_cpu);
+
+	if (idle_task)
+		return;
+
 	if (init_load_pct)
 		init_load_windows = div64_u64((u64)init_load_pct *
 				(u64)sched_ravg_window, 100);
@@ -2161,6 +2193,174 @@ void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
 	p->ravg.pred_demand = new;
 }

+void clear_top_tasks_bitmap(unsigned long *bitmap)
+{
+	memset(bitmap, 0, top_tasks_bitmap_size);
+	__set_bit(NUM_LOAD_INDICES, bitmap);
+}
+
+/*
+ * Special case the last index and provide a fast path for index = 0.
+ * Note that sched_load_granule can change underneath us if we are not
+ * holding any runqueue locks while calling the two functions below.
+ */
+static u32 top_task_load(struct rq *rq)
+{
+	int index = rq->prev_top;
+	u8 prev = 1 - rq->curr_table;
+
+	if (!index) {
+		int msb = NUM_LOAD_INDICES - 1;
+
+		if (!test_bit(msb, rq->top_tasks_bitmap[prev]))
+			return 0;
+		else
+			return sched_load_granule;
+	} else if (index == NUM_LOAD_INDICES - 1) {
+		return sched_ravg_window;
+	} else {
+		return (index + 1) * sched_load_granule;
+	}
+}
+
+static int load_to_index(u32 load)
+{
+	if (load < sched_load_granule)
+		return 0;
+	else if (load >= sched_ravg_window)
+		return NUM_LOAD_INDICES - 1;
+	else
+		return load / sched_load_granule;
+}
+
+static void update_top_tasks(struct task_struct *p, struct rq *rq,
+		u32 old_curr_window, int new_window, bool full_window)
+{
+	u8 curr = rq->curr_table;
+	u8 prev = 1 - curr;
+	u8 *curr_table = rq->top_tasks[curr];
+	u8 *prev_table = rq->top_tasks[prev];
+	int old_index, new_index, update_index;
+	u32 curr_window = p->ravg.curr_window;
+	u32 prev_window = p->ravg.prev_window;
+	bool zero_index_update;
+
+	if (old_curr_window == curr_window && !new_window)
+		return;
+
+	old_index = load_to_index(old_curr_window);
+	new_index = load_to_index(curr_window);
+
+	if (!new_window) {
+		zero_index_update = !old_curr_window && curr_window;
+		if (old_index != new_index || zero_index_update) {
+			if (old_curr_window)
+				curr_table[old_index] -= 1;
+			if (curr_window)
+				curr_table[new_index] += 1;
+			if (new_index > rq->curr_top)
+				rq->curr_top = new_index;
+		}
+
+		if (!curr_table[old_index])
+			__clear_bit(NUM_LOAD_INDICES - old_index - 1,
+				rq->top_tasks_bitmap[curr]);
+
+		if (curr_table[new_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - new_index - 1,
+				rq->top_tasks_bitmap[curr]);
+
+		return;
+	}
+
+	/*
+	 * The window has rolled over for this task. By the time we get
+	 * here, curr/prev swaps have already occurred. So we need
+	 * to use prev_window for the new index.
+	 */
+	update_index = load_to_index(prev_window);
+
+	if (full_window) {
+		/*
+		 * Two cases here. Either 'p' ran for the entire window or
+		 * it didn't run at all. In either case there is no entry
+		 * in the prev table. If 'p' ran the entire window, we just
+		 * need to create a new entry in the prev table. In this
+		 * case update_index will correspond to sched_ravg_window
+		 * so we can unconditionally update the top index.
+		 */
+		if (prev_window) {
+			prev_table[update_index] += 1;
+			rq->prev_top = update_index;
+		}
+
+		if (prev_table[update_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - update_index - 1,
+				rq->top_tasks_bitmap[prev]);
+	} else {
+		zero_index_update = !old_curr_window && prev_window;
+		if (old_index != update_index || zero_index_update) {
+			if (old_curr_window)
+				prev_table[old_index] -= 1;
+
+			prev_table[update_index] += 1;
+
+			if (update_index > rq->prev_top)
+				rq->prev_top = update_index;
+
+			if (!prev_table[old_index])
+				__clear_bit(NUM_LOAD_INDICES - old_index - 1,
+					rq->top_tasks_bitmap[prev]);
+
+			if (prev_table[update_index] == 1)
+				__set_bit(NUM_LOAD_INDICES - update_index - 1,
+					rq->top_tasks_bitmap[prev]);
+		}
+	}
+
+	if (curr_window) {
+		curr_table[new_index] += 1;
+
+		if (new_index > rq->curr_top)
+			rq->curr_top = new_index;
+
+		if (curr_table[new_index] == 1)
+			__set_bit(NUM_LOAD_INDICES - new_index - 1,
+				rq->top_tasks_bitmap[curr]);
+	}
+}
+
+static inline void clear_top_tasks_table(u8 *table)
+{
+	memset(table, 0, NUM_LOAD_INDICES * sizeof(u8));
+}
+
+static u32 empty_windows[NR_CPUS];
+
+static void rollover_task_window(struct task_struct *p, bool full_window)
+{
+	u32 *curr_cpu_windows = empty_windows;
+	u32 curr_window;
+	int i;
+
+	/* Rollover the sum */
+	curr_window = 0;
+
+	if (!full_window) {
+		curr_window = p->ravg.curr_window;
+		curr_cpu_windows = p->ravg.curr_window_cpu;
+	}
+
+	p->ravg.prev_window = curr_window;
+	p->ravg.curr_window = 0;
+
+	/* Roll over individual CPU contributions */
+	for (i = 0; i < nr_cpu_ids; i++) {
+		p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
+		p->ravg.curr_window_cpu[i] = 0;
+	}
+}
+
 /*
  * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
  */
@@ -2181,6 +2381,8 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 	int prev_sum_reset = 0;
 	bool new_task;
 	struct related_thread_group *grp;
+	int cpu = rq->cpu;
+	u32 old_curr_window;

 	new_window = mark_start < window_start;
 	if (new_window) {
@@ -2240,57 +2442,43 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 	 * Handle per-task window rollover. We don't care about the idle
 	 * task or exiting tasks.
 	 */
-	if (new_window && !is_idle_task(p) && !exiting_task(p)) {
-		u32 curr_window = 0;
+	if (!is_idle_task(p) && !exiting_task(p)) {
+		old_curr_window = p->ravg.curr_window;

-		if (!full_window)
-			curr_window = p->ravg.curr_window;
-
-		p->ravg.prev_window = curr_window;
-		p->ravg.curr_window = 0;
+		if (new_window)
+			rollover_task_window(p, full_window);
 	}

 	if (flip_counters) {
 		u64 curr_sum = *curr_runnable_sum;
 		u64 nt_curr_sum = *nt_curr_runnable_sum;
+		u8 curr_table = rq->curr_table;
+		u8 prev_table = 1 - curr_table;
+		int curr_top = rq->curr_top;

-		if (prev_sum_reset)
+		clear_top_tasks_table(rq->top_tasks[prev_table]);
+		clear_top_tasks_bitmap(rq->top_tasks_bitmap[prev_table]);
+
+		if (prev_sum_reset) {
 			curr_sum = nt_curr_sum = 0;
+			curr_top = 0;
+			clear_top_tasks_table(rq->top_tasks[curr_table]);
+			clear_top_tasks_bitmap(
+					rq->top_tasks_bitmap[curr_table]);
+		}

 		*prev_runnable_sum = curr_sum;
 		*nt_prev_runnable_sum = nt_curr_sum;

 		*curr_runnable_sum = 0;
 		*nt_curr_runnable_sum = 0;
+		rq->curr_table = prev_table;
+		rq->prev_top = curr_top;
+		rq->curr_top = 0;
 	}

-	if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
-		/*
-		 * account_busy_for_cpu_time() = 0, so no update to the
-		 * task's current window needs to be made. This could be
-		 * for example
-		 *
-		 *	- a wakeup event on a task within the current
-		 *	  window (!new_window below, no action required),
-		 *	- switching to a new task from idle (PICK_NEXT_TASK)
-		 *	  in a new window where irqtime is 0 and we aren't
-		 *	  waiting on IO
-		 */
-
-		if (!new_window)
-			return;
-
-		/*
-		 * A new window has started. The RQ demand must be rolled
-		 * over if p is the current task.
-		 */
-		if (p_is_curr_task) {
-			/* p is idle task */
-			BUG_ON(p != rq->idle);
-		}
-
-		return;
-	}
+	if (!account_busy_for_cpu_time(rq, p, irqtime, event))
+		goto done;

 	if (!new_window) {
 		/*
@@ -2310,10 +2498,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (new_task)
 			*nt_curr_runnable_sum += delta;

-		if (!is_idle_task(p) && !exiting_task(p))
+		if (!is_idle_task(p) && !exiting_task(p)) {
 			p->ravg.curr_window += delta;
+			p->ravg.curr_window_cpu[cpu] += delta;
+		}

-		return;
+		goto done;
 	}

 	if (!p_is_curr_task) {
@@ -2336,8 +2526,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			 * contribution to previous completed window.
 			 */
 			delta = scale_exec_time(window_start - mark_start, rq);
-			if (!exiting_task(p))
+			if (!exiting_task(p)) {
 				p->ravg.prev_window += delta;
+				p->ravg.prev_window_cpu[cpu] += delta;
+			}
 		} else {
 			/*
 			 * Since at least one full window has elapsed,
 			 * the contribution to the previous window is the
 			 * full window (window_size).
 			 */
 			delta = scale_exec_time(window_size, rq);
-			if (!exiting_task(p))
+			if (!exiting_task(p)) {
 				p->ravg.prev_window = delta;
+				p->ravg.prev_window_cpu[cpu] = delta;
+			}
 		}

 		*prev_runnable_sum += delta;
@@ -2359,10 +2553,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (new_task)
 			*nt_curr_runnable_sum += delta;

-		if (!exiting_task(p))
+		if (!exiting_task(p)) {
 			p->ravg.curr_window = delta;
+			p->ravg.curr_window_cpu[cpu] = delta;
+		}

-		return;
+		goto done;
 	}

 	if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
@@ -2386,8 +2582,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 			 * contribution to previous completed window.
 			 */
 			delta = scale_exec_time(window_start - mark_start, rq);
-			if (!is_idle_task(p) && !exiting_task(p))
+			if (!is_idle_task(p) && !exiting_task(p)) {
 				p->ravg.prev_window += delta;
+				p->ravg.prev_window_cpu[cpu] += delta;
+			}
 		} else {
 			/*
 			 * Since at least one full window has elapsed,
 			 * the contribution to the previous window is the
 			 * full window (window_size).
 			 */
 			delta = scale_exec_time(window_size, rq);
-			if (!is_idle_task(p) && !exiting_task(p))
+			if (!is_idle_task(p) && !exiting_task(p)) {
 				p->ravg.prev_window = delta;
+				p->ravg.prev_window_cpu[cpu] = delta;
+			}
 		}

 		/*
@@ -2413,10 +2613,12 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		if (new_task)
 			*nt_curr_runnable_sum += delta;

-		if (!is_idle_task(p) && !exiting_task(p))
+		if (!is_idle_task(p) && !exiting_task(p)) {
 			p->ravg.curr_window = delta;
+			p->ravg.curr_window_cpu[cpu] = delta;
+		}

-		return;
+		goto done;
 	}

 	if (irqtime) {
@@ -2461,7 +2663,10 @@ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
 		return;
 	}

-	BUG();
+done:
+	if (!is_idle_task(p) && !exiting_task(p))
+		update_top_tasks(p, rq, old_curr_window,
+					new_window, full_window);
 }

 static inline u32 predict_and_update_buckets(struct rq *rq,
@@ -2829,11 +3034,23 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
 void reset_task_stats(struct task_struct *p)
 {
 	u32 sum = 0;
+	u32 *curr_window_ptr = NULL;
+	u32 *prev_window_ptr = NULL;

-	if (exiting_task(p))
+	if (exiting_task(p)) {
 		sum = EXITING_TASK_MARKER;
+	} else {
+		curr_window_ptr = p->ravg.curr_window_cpu;
+		prev_window_ptr = p->ravg.prev_window_cpu;
+		memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+		memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
+	}

 	memset(&p->ravg, 0, sizeof(struct ravg));
+
+	p->ravg.curr_window_cpu = curr_window_ptr;
+	p->ravg.prev_window_cpu = prev_window_ptr;
+
 	/* Retain EXITING_TASK marker */
 	p->ravg.sum_history[0] = sum;
 }
@@ -2889,7 +3106,9 @@ static void reset_all_task_stats(void)

 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
+		raw_spin_lock(&p->pi_lock);
 		reset_task_stats(p);
+		raw_spin_unlock(&p->pi_lock);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 }
@@ -2934,7 +3153,7 @@ const char *sched_window_reset_reasons[] = {
 /* Called with IRQs enabled */
 void reset_all_window_stats(u64 window_start, unsigned int window_size)
 {
-	int cpu;
+	int cpu, i;
 	unsigned long flags;
 	u64 start_ts = sched_ktime_clock();
 	int reason = WINDOW_CHANGE;
@@ -2968,6 +3187,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 	if (window_size) {
 		sched_ravg_window = window_size * TICK_NSEC;
 		set_hmp_defaults();
+		sched_load_granule = sched_ravg_window / NUM_LOAD_INDICES;
 	}

 	enable_window_stats();
@@ -2979,6 +3199,16 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 		rq->window_start = window_start;
 		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
 		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
+		for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+			memset(&rq->load_subs[i], 0,
+				sizeof(struct load_subtractions));
+			clear_top_tasks_table(rq->top_tasks[i]);
+			clear_top_tasks_bitmap(rq->top_tasks_bitmap[i]);
+		}
+
+		rq->curr_table = 0;
+		rq->curr_top = 0;
+		rq->prev_top = 0;
 		reset_cpu_hmp_stats(cpu, 1);
 	}
@@ -3011,6 +3241,59 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 			sched_ktime_clock() - start_ts, reason, old, new);
 }

+/*
+ * In this function we match the accumulated subtractions with the current
+ * and previous windows we are operating with. Ignore any entries where
+ * the window start in the load_subtraction struct does not match either
+ * the current or the previous window. This could happen whenever CPUs
+ * become idle or busy with interrupts disabled for an extended period.
+ */
+static inline void account_load_subtractions(struct rq *rq)
+{
+	u64 ws = rq->window_start;
+	u64 prev_ws = ws - sched_ravg_window;
+	struct load_subtractions *ls = rq->load_subs;
+	int i;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		if (ls[i].window_start == ws) {
+			rq->curr_runnable_sum -= ls[i].subs;
+			rq->nt_curr_runnable_sum -= ls[i].new_subs;
+		} else if (ls[i].window_start == prev_ws) {
+			rq->prev_runnable_sum -= ls[i].subs;
+			rq->nt_prev_runnable_sum -= ls[i].new_subs;
+		}
+
+		ls[i].subs = 0;
+		ls[i].new_subs = 0;
+	}
+
+	BUG_ON((s64)rq->prev_runnable_sum < 0);
+	BUG_ON((s64)rq->curr_runnable_sum < 0);
+	BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
+	BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
+}
+
+static inline u64 freq_policy_load(struct rq *rq, u64 load)
+{
+	unsigned int reporting_policy = sysctl_sched_freq_reporting_policy;
+
+	switch (reporting_policy) {
+	case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
+		load = max_t(u64, load, top_task_load(rq));
+		break;
+	case FREQ_REPORT_TOP_TASK:
+		load = top_task_load(rq);
+		break;
+	case FREQ_REPORT_CPU_LOAD:
+		break;
+	default:
+		WARN_ON_ONCE(1);
+	}
+
+	return load;
+}
+
 static inline void sync_window_start(struct rq *rq,
 		struct group_cpu_time *cpu_time);
@@ -3033,6 +3316,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
 	struct related_thread_group *grp;
 	u64 total_group_load = 0, total_ngload = 0;
 	bool aggregate_load = false;
+	struct sched_cluster *cluster = cpu_cluster(cpumask_first(query_cpus));

 	if (unlikely(cpus == 0))
 		return;
@@ -3050,6 +3334,13 @@ void sched_get_cpus_busy(struct sched_load *busy,

 	window_size = sched_ravg_window;

+	/*
+	 * We don't really need the cluster lock for this entire for loop
+	 * block. However, there is no advantage in optimizing this as rq
+	 * locks are held regardless and would prevent migration anyway.
+	 */
+	raw_spin_lock(&cluster->load_lock);
+
 	for_each_cpu(cpu, query_cpus) {
 		rq = cpu_rq(cpu);
@@ -3057,6 +3348,7 @@ void sched_get_cpus_busy(struct sched_load *busy,
 							0);
 		cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);

+		account_load_subtractions(rq);
 		load[i] = rq->old_busy_time = rq->prev_runnable_sum;
 		nload[i] = rq->nt_prev_runnable_sum;
 		pload[i] = rq->hmp_stats.pred_demands_sum;
@@ -3083,6 +3375,8 @@ void sched_get_cpus_busy(struct sched_load *busy,
 		i++;
 	}

+	raw_spin_unlock(&cluster->load_lock);
+
 	for_each_related_thread_group(grp) {
 		for_each_cpu(cpu, query_cpus) {
 			/* Protected by rq_lock */
@@ -3117,6 +3411,8 @@ void sched_get_cpus_busy(struct sched_load *busy,

 		load[i] += group_load[i];
 		nload[i] += ngload[i];
+
+		load[i] = freq_policy_load(rq, load[i]);
 		/*
 		 * Scale load in reference to cluster max_possible_freq.
 		 *
@@ -3237,6 +3533,189 @@ int sched_set_window(u64 window_start, unsigned int window_size)
 	return 0;
 }

+static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
+{
+	rq->load_subs[index].window_start = ws;
+	rq->load_subs[index].subs = 0;
+	rq->load_subs[index].new_subs = 0;
+}
+
+static int get_subtraction_index(struct rq *rq, u64 ws)
+{
+	int i;
+	u64 oldest = ULLONG_MAX;
+	int oldest_index = 0;
+
+	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
+		u64 entry_ws = rq->load_subs[i].window_start;
+
+		if (ws == entry_ws)
+			return i;
+
+		if (entry_ws < oldest) {
+			oldest = entry_ws;
+			oldest_index = i;
+		}
+	}
+
+	create_subtraction_entry(rq, ws, oldest_index);
+	return oldest_index;
+}
+
+static void update_rq_load_subtractions(int index, struct rq *rq,
+					u32 sub_load, bool new_task)
+{
+	rq->load_subs[index].subs += sub_load;
+	if (new_task)
+		rq->load_subs[index].new_subs += sub_load;
+}
+
+static void update_cluster_load_subtractions(struct task_struct *p,
+					int cpu, u64 ws, bool new_task)
+{
+	struct sched_cluster *cluster = cpu_cluster(cpu);
+	struct cpumask cluster_cpus = cluster->cpus;
+	u64 prev_ws = ws - sched_ravg_window;
+	int i;
+
+	cpumask_clear_cpu(cpu, &cluster_cpus);
+	raw_spin_lock(&cluster->load_lock);
+
+	for_each_cpu(i, &cluster_cpus) {
+		struct rq *rq = cpu_rq(i);
+		int index;
+
+		if (p->ravg.curr_window_cpu[i]) {
+			index = get_subtraction_index(rq, ws);
+			update_rq_load_subtractions(index, rq,
+				p->ravg.curr_window_cpu[i], new_task);
+			p->ravg.curr_window_cpu[i] = 0;
+		}
+
+		if (p->ravg.prev_window_cpu[i]) {
+			index = get_subtraction_index(rq, prev_ws);
+			update_rq_load_subtractions(index, rq,
+				p->ravg.prev_window_cpu[i], new_task);
+			p->ravg.prev_window_cpu[i] = 0;
+		}
+	}
+
+	raw_spin_unlock(&cluster->load_lock);
+}
+
+static inline void inter_cluster_migration_fixup
+	(struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
+{
+	struct rq *dest_rq = cpu_rq(new_cpu);
+	struct rq *src_rq = cpu_rq(task_cpu);
+
+	if (same_freq_domain(new_cpu, task_cpu))
+		return;
+
+	p->ravg.curr_window_cpu[new_cpu] = p->ravg.curr_window;
+	p->ravg.prev_window_cpu[new_cpu] = p->ravg.prev_window;
+
+	dest_rq->curr_runnable_sum += p->ravg.curr_window;
+	dest_rq->prev_runnable_sum += p->ravg.prev_window;
+
+	src_rq->curr_runnable_sum -= p->ravg.curr_window_cpu[task_cpu];
+	src_rq->prev_runnable_sum -= p->ravg.prev_window_cpu[task_cpu];
+
+	if (new_task) {
+		dest_rq->nt_curr_runnable_sum += p->ravg.curr_window;
+		dest_rq->nt_prev_runnable_sum += p->ravg.prev_window;
+
+		src_rq->nt_curr_runnable_sum -=
+				p->ravg.curr_window_cpu[task_cpu];
+		src_rq->nt_prev_runnable_sum -=
+				p->ravg.prev_window_cpu[task_cpu];
+	}
+
+	p->ravg.curr_window_cpu[task_cpu] = 0;
+	p->ravg.prev_window_cpu[task_cpu] = 0;
+
+	update_cluster_load_subtractions(p, task_cpu,
+			src_rq->window_start, new_task);
+
+	BUG_ON((s64)src_rq->prev_runnable_sum < 0);
+	BUG_ON((s64)src_rq->curr_runnable_sum < 0);
+	BUG_ON((s64)src_rq->nt_prev_runnable_sum < 0);
+	BUG_ON((s64)src_rq->nt_curr_runnable_sum < 0);
+}
+
+static int get_top_index(unsigned long *bitmap, unsigned long old_top)
+{
+	int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top);
+
+	if (index == NUM_LOAD_INDICES)
+		return 0;
+
+	return NUM_LOAD_INDICES - 1 - index;
+}
+
+static void
+migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq)
+{
+	int index;
+	int top_index;
+	u32 curr_window = p->ravg.curr_window;
+	u32 prev_window = p->ravg.prev_window;
+	u8 src = src_rq->curr_table;
+	u8 dst = dst_rq->curr_table;
+	u8 *src_table;
+	u8 *dst_table;
+
+	if (curr_window) {
+		src_table = src_rq->top_tasks[src];
+		dst_table = dst_rq->top_tasks[dst];
+		index = load_to_index(curr_window);
+		src_table[index] -= 1;
+		dst_table[index] += 1;
+
+		if (!src_table[index])
+			__clear_bit(NUM_LOAD_INDICES - index - 1,
+				src_rq->top_tasks_bitmap[src]);
+
+		if (dst_table[index] == 1)
+			__set_bit(NUM_LOAD_INDICES - index - 1,
+				dst_rq->top_tasks_bitmap[dst]);
+
+		if (index > dst_rq->curr_top)
+			dst_rq->curr_top = index;
+
+		top_index = src_rq->curr_top;
+		if (index == top_index && !src_table[index])
+			src_rq->curr_top = get_top_index(
+				src_rq->top_tasks_bitmap[src], top_index);
+	}
+
+	if (prev_window) {
+		src = 1 - src;
+		dst = 1 - dst;
+		src_table = src_rq->top_tasks[src];
+		dst_table = dst_rq->top_tasks[dst];
+		index = load_to_index(prev_window);
+		src_table[index] -= 1;
+		dst_table[index] += 1;
+
+		if (!src_table[index])
+			__clear_bit(NUM_LOAD_INDICES - index - 1,
+				src_rq->top_tasks_bitmap[src]);
+
+		if (dst_table[index] == 1)
+			__set_bit(NUM_LOAD_INDICES - index - 1,
+				dst_rq->top_tasks_bitmap[dst]);
+
+		if (index > dst_rq->prev_top)
+			dst_rq->prev_top = index;
+
+		top_index = src_rq->prev_top;
+		if (index == top_index && !src_table[index])
+			src_rq->prev_top = get_top_index(
+				src_rq->top_tasks_bitmap[src], top_index);
+	}
+}
+
 void fixup_busy_time(struct task_struct *p, int new_cpu)
 {
 	struct rq *src_rq = task_rq(p);
@@ -3246,8 +3725,6 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
 	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
 	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
 	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
-	int migrate_type;
-	struct migration_sum_data d;
 	bool new_task;
 	struct related_thread_group *grp;
@@ -3281,75 +3758,62 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
 	new_task = is_new_task(p);

 	/* Protected by rq_lock */
 	grp = p->grp;
+
+	/*
+	 * For frequency aggregation, we continue to do migration fixups
+	 * even for intra cluster migrations. This is because the aggregated
+	 * load has to be reported on a single CPU regardless.
+	 */
 	if (grp && sched_freq_aggregate) {
 		struct group_cpu_time *cpu_time;

-		migrate_type = GROUP_TO_GROUP;
-		/* Protected by rq_lock */
 		cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
-		d.src_rq = NULL;
-		d.src_cpu_time = cpu_time;
 		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
 		src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;

-		/* Protected by rq_lock */
 		cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
-		d.dst_rq = NULL;
-		d.dst_cpu_time = cpu_time;
 		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
 		dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
 		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
 		sync_window_start(dest_rq, cpu_time);
-	} else {
-		migrate_type = RQ_TO_RQ;
-		d.src_rq = src_rq;
-		d.src_cpu_time = NULL;
-		d.dst_rq = dest_rq;
-		d.dst_cpu_time = NULL;
-		src_curr_runnable_sum = &src_rq->curr_runnable_sum;
-		src_prev_runnable_sum = &src_rq->prev_runnable_sum;
-		src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
-		src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
-
-		dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
-		dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
-		dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
-		dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
-	}

-	if (p->ravg.curr_window) {
-		*src_curr_runnable_sum -= p->ravg.curr_window;
-		*dst_curr_runnable_sum += p->ravg.curr_window;
-		if (new_task) {
-			*src_nt_curr_runnable_sum -= p->ravg.curr_window;
-			*dst_nt_curr_runnable_sum += p->ravg.curr_window;
+		if (p->ravg.curr_window) {
+			*src_curr_runnable_sum -= p->ravg.curr_window;
+			*dst_curr_runnable_sum += p->ravg.curr_window;
+			if (new_task) {
+				*src_nt_curr_runnable_sum -=
+							p->ravg.curr_window;
+				*dst_nt_curr_runnable_sum +=
+							p->ravg.curr_window;
+			}
 		}
-	}

-	if (p->ravg.prev_window) {
-		*src_prev_runnable_sum -= p->ravg.prev_window;
-		*dst_prev_runnable_sum += p->ravg.prev_window;
-		if (new_task) {
-			*src_nt_prev_runnable_sum -= p->ravg.prev_window;
-			*dst_nt_prev_runnable_sum += p->ravg.prev_window;
+		if (p->ravg.prev_window) {
+			*src_prev_runnable_sum -= p->ravg.prev_window;
+			*dst_prev_runnable_sum += p->ravg.prev_window;
+			if (new_task) {
+				*src_nt_prev_runnable_sum -=
+							p->ravg.prev_window;
+				*dst_nt_prev_runnable_sum +=
+							p->ravg.prev_window;
+			}
 		}
+	} else {
+		inter_cluster_migration_fixup(p, new_cpu,
+						task_cpu(p), new_task);
 	}

+	migrate_top_tasks(p, src_rq, dest_rq);
+
 	if (p == src_rq->ed_task) {
 		src_rq->ed_task = NULL;
 		if (!dest_rq->ed_task)
 			dest_rq->ed_task = p;
 	}

-	trace_sched_migration_update_sum(p, migrate_type, &d);
-	BUG_ON((s64)*src_prev_runnable_sum < 0);
-	BUG_ON((s64)*src_curr_runnable_sum < 0);
-	BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
-	BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
-
 done:
 	if (p->state == TASK_WAKING)
 		double_rq_unlock(src_rq, dest_rq);
@@ -3501,6 +3965,9 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
 	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
 	struct migration_sum_data d;
 	int migrate_type;
+	int cpu = cpu_of(rq);
+	bool new_task = is_new_task(p);
+	int i;

 	if (!sched_freq_aggregate)
 		return;
@@ -3511,7 +3978,7 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
 	update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);

 	/* cpu_time protected by related_thread_group_lock, grp->lock rq_lock */
-	cpu_time = _group_cpu_time(grp, cpu_of(rq));
+	cpu_time = _group_cpu_time(grp, cpu);
 	if (event == ADD_TASK) {
 		sync_window_start(rq, cpu_time);
 		migrate_type = RQ_TO_GROUP;
@@ -3528,6 +3995,19 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
 		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
 		src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
 		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
+
+		*src_curr_runnable_sum -= p->ravg.curr_window_cpu[cpu];
+		*src_prev_runnable_sum -= p->ravg.prev_window_cpu[cpu];
+		if (new_task) {
+			*src_nt_curr_runnable_sum -=
+					p->ravg.curr_window_cpu[cpu];
+			*src_nt_prev_runnable_sum -=
+					p->ravg.prev_window_cpu[cpu];
+		}
+
+		update_cluster_load_subtractions(p, cpu,
+				rq->window_start, new_task);
+
 	} else {
 		migrate_type = GROUP_TO_RQ;
 		d.src_rq = NULL;
@@ -3550,21 +4030,42 @@ static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
 		dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
 		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
 		dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
+
+		*src_curr_runnable_sum -= p->ravg.curr_window;
+		*src_prev_runnable_sum -= p->ravg.prev_window;
+		if (new_task) {
+			*src_nt_curr_runnable_sum -= p->ravg.curr_window;
+			*src_nt_prev_runnable_sum -= p->ravg.prev_window;
+		}
+
+		/*
+		 * Need to reset curr/prev windows for all CPUs, not just the
+		 * ones in the same cluster. Since inter cluster migrations
+		 * did not result in the appropriate bookkeeping, the values
+		 * per CPU would be inaccurate.
+		 */
+		for_each_possible_cpu(i) {
+			p->ravg.curr_window_cpu[i] = 0;
+			p->ravg.prev_window_cpu[i] = 0;
+		}
 	}

-	*src_curr_runnable_sum -= p->ravg.curr_window;
 	*dst_curr_runnable_sum += p->ravg.curr_window;
-
-	*src_prev_runnable_sum -= p->ravg.prev_window;
 	*dst_prev_runnable_sum += p->ravg.prev_window;
-
-	if (is_new_task(p)) {
-		*src_nt_curr_runnable_sum -= p->ravg.curr_window;
+	if (new_task) {
 		*dst_nt_curr_runnable_sum += p->ravg.curr_window;
-		*src_nt_prev_runnable_sum -= p->ravg.prev_window;
 		*dst_nt_prev_runnable_sum += p->ravg.prev_window;
 	}

+	/*
+	 * When a task enters or exits a group, its curr and prev windows are
+	 * moved to a single CPU. This behavior might be sub-optimal in the
+	 * exit case; however, it saves us the overhead of handling inter
+	 * cluster migration fixups while the task is part of a related group.
+	 */
+	p->ravg.curr_window_cpu[cpu] = p->ravg.curr_window;
+	p->ravg.prev_window_cpu[cpu] = p->ravg.prev_window;
+
 	trace_sched_migration_update_sum(p, migrate_type, &d);

 	BUG_ON((s64)*src_curr_runnable_sum < 0);
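
The bucket arithmetic behind sched_load_granule is easiest to see with concrete numbers. The sketch below mirrors load_to_index() and the bucket-to-load mapping used by top_task_load(); it is a userspace illustration only, and it assumes NUM_LOAD_INDICES is 1000 and a 10 ms window (the constant's value is not shown in this diff).

#include <stdio.h>
#include <stdint.h>

#define NUM_LOAD_INDICES 1000                   /* assumed; not in this diff */

static uint32_t sched_ravg_window = 10000000;   /* 10 ms default window */
static uint32_t sched_load_granule = 10000000 / NUM_LOAD_INDICES;

/* Mirrors load_to_index(): clamp into [0, NUM_LOAD_INDICES - 1]. */
static int load_to_index(uint32_t load)
{
        if (load < sched_load_granule)
                return 0;
        else if (load >= sched_ravg_window)
                return NUM_LOAD_INDICES - 1;
        return load / sched_load_granule;
}

/*
 * Upper edge of a bucket, the way top_task_load() reports it.  Index 0 is
 * special in the patch: it reports 0 or one granule depending on the bitmap.
 */
static uint32_t index_to_load(int index)
{
        if (index == NUM_LOAD_INDICES - 1)
                return sched_ravg_window;
        return (index + 1) * sched_load_granule;
}

int main(void)
{
        uint32_t loads[] = { 9999, 10000, 4500000, 9999999, 10000000 };
        size_t i;

        for (i = 0; i < sizeof(loads) / sizeof(loads[0]); i++) {
                int idx = load_to_index(loads[i]);
                printf("load %8u ns -> bucket %3d -> reported %8u ns\n",
                       (unsigned)loads[i], idx, (unsigned)index_to_load(idx));
        }
        return 0;
}

Reporting the upper edge of the bucket means top task load is quantized to one granule (10 us here); growing NUM_LOAD_INDICES trades bitmap and table size for finer reporting.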
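
update_top_tasks() and migrate_top_tasks() record bucket index i at bit position NUM_LOAD_INDICES - i - 1, so the hottest occupied bucket is always the first set bit and a single forward scan recovers it; that is what get_top_index() exploits via find_next_bit(). A portable sketch of the reversed-index trick, with find_next_bit() replaced by a plain loop:

#include <stdio.h>
#include <stdbool.h>

#define NUM_LOAD_INDICES 1000   /* assumed; not in this diff */

/* Bit NUM_LOAD_INDICES acts as a sentinel, as in clear_top_tasks_bitmap(). */
static bool bitmap[NUM_LOAD_INDICES + 1];

static void set_index(int index)   { bitmap[NUM_LOAD_INDICES - index - 1] = true; }
static void clear_index(int index) { bitmap[NUM_LOAD_INDICES - index - 1] = false; }

/* Scan forward from a starting bit for the first set bit. */
static int get_top_index(int start_bit)
{
        int bit;

        for (bit = start_bit; bit < NUM_LOAD_INDICES; bit++)
                if (bitmap[bit])
                        return NUM_LOAD_INDICES - 1 - bit;
        return 0;       /* no occupied bucket left */
}

int main(void)
{
        set_index(7);
        set_index(42);
        set_index(640);
        printf("top bucket: %d\n", get_top_index(0));   /* 640 */

        /* Drop the top task and resume the scan at the old top's bit. */
        clear_index(640);
        printf("new top: %d\n", get_top_index(NUM_LOAD_INDICES - 1 - 640)); /* 42 */
        return 0;
}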
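
freq_policy_load() selects what sched_get_cpus_busy() hands to the cpufreq governor: the aggregate CPU load, the demand of the single busiest task, or the maximum of the two. A sketch of the three policies follows; the FREQ_REPORT_* names appear in this diff, but their numeric values and ordering are assumed here.

#include <stdio.h>
#include <stdint.h>

/* Illustrative values; the real definitions are not part of this diff. */
enum {
        FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK,      /* max(cpu load, top task) */
        FREQ_REPORT_CPU_LOAD,                   /* cpu load only */
        FREQ_REPORT_TOP_TASK,                   /* top task only */
};

static uint64_t freq_policy_load(int policy, uint64_t cpu_load, uint64_t top_task)
{
        switch (policy) {
        case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK:
                return cpu_load > top_task ? cpu_load : top_task;
        case FREQ_REPORT_TOP_TASK:
                return top_task;
        case FREQ_REPORT_CPU_LOAD:
        default:
                return cpu_load;
        }
}

int main(void)
{
        /* Several light tasks totalling 9 ms; the hottest one ran 4 ms. */
        uint64_t cpu_load = 9000000, top_task = 4000000;
        int policy;

        for (policy = FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK;
             policy <= FREQ_REPORT_TOP_TASK; policy++)
                printf("policy %d reports %llu ns\n", policy,
                       (unsigned long long)freq_policy_load(policy, cpu_load,
                                                            top_task));
        return 0;
}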
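
rollover_task_window() moves a task's current-window totals, both the scalar and the per-CPU breakdown, into the previous window at a window boundary; when the task was off-CPU for a full window, the shared all-zero empty_windows[] array rolls everything to zero without a per-CPU conditional. A compact sketch with an assumed 4-CPU layout:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define NR_CPUS 4       /* illustrative */

struct ravg {
        uint32_t curr_window, prev_window;
        uint32_t curr_window_cpu[NR_CPUS], prev_window_cpu[NR_CPUS];
};

static const uint32_t empty_windows[NR_CPUS];   /* stays all zeroes */

/* Mirrors rollover_task_window(): curr becomes prev, curr resets. */
static void rollover_task_window(struct ravg *r, bool full_window)
{
        const uint32_t *curr_cpu = full_window ? empty_windows
                                               : r->curr_window_cpu;
        int i;

        r->prev_window = full_window ? 0 : r->curr_window;
        r->curr_window = 0;

        for (i = 0; i < NR_CPUS; i++) {
                r->prev_window_cpu[i] = curr_cpu[i];
                r->curr_window_cpu[i] = 0;
        }
}

int main(void)
{
        struct ravg r = { .curr_window = 5000,
                          .curr_window_cpu = { 2000, 3000, 0, 0 } };

        rollover_task_window(&r, false);
        printf("prev=%u cpu0=%u cpu1=%u\n", r.prev_window,
               r.prev_window_cpu[0], r.prev_window_cpu[1]);

        rollover_task_window(&r, true); /* idle for one full window */
        printf("after full window: prev=%u\n", r.prev_window);
        return 0;
}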
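
get_subtraction_index() keeps one load_subtractions accumulator per tracked window and recycles the slot with the oldest window_start when an unseen window shows up; account_load_subtractions() later matches the slots against the current and previous windows and discards anything older, consistent with the comment in the patch. A sketch assuming NUM_TRACKED_WINDOWS is 2 (the constant's definition is not in this diff):

#include <stdio.h>
#include <stdint.h>

#define NUM_TRACKED_WINDOWS 2   /* assumed; not in this diff */

struct load_subtractions {
        uint64_t window_start;
        uint64_t subs;
        uint64_t new_subs;
};

static struct load_subtractions load_subs[NUM_TRACKED_WINDOWS];

/* Mirrors get_subtraction_index(): reuse a matching slot, else evict oldest. */
static int get_subtraction_index(uint64_t ws)
{
        uint64_t oldest = UINT64_MAX;
        int i, oldest_index = 0;

        for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
                if (load_subs[i].window_start == ws)
                        return i;
                if (load_subs[i].window_start < oldest) {
                        oldest = load_subs[i].window_start;
                        oldest_index = i;
                }
        }

        load_subs[oldest_index] = (struct load_subtractions){ ws, 0, 0 };
        return oldest_index;
}

int main(void)
{
        const uint64_t w = 10000000;    /* 10 ms window */
        int i;

        load_subs[get_subtraction_index(5 * w)].subs += 100; /* current window */
        load_subs[get_subtraction_index(4 * w)].subs += 200; /* previous window */
        load_subs[get_subtraction_index(6 * w)].subs += 300; /* evicts 4 * w */

        for (i = 0; i < NUM_TRACKED_WINDOWS; i++)
                printf("slot %d: ws=%llu subs=%llu\n", i,
                       (unsigned long long)load_subs[i].window_start,
                       (unsigned long long)load_subs[i].subs);
        return 0;
}

With two slots, the ring holds exactly the current and previous windows of the remote runqueue, which is all that account_load_subtractions() can apply anyway.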
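
inter_cluster_migration_fixup() leans on the invariant that a task's curr_window always equals the sum of its curr_window_cpu[] entries (likewise for prev): the destination runqueue gains the whole window, the source runqueue gives up only its own share, and shares left on sibling CPUs of the source cluster are queued through update_cluster_load_subtractions(). A simplified two-CPU check of the current-window bookkeeping, with the group and new-task sums left out:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define NR_CPUS 2       /* illustrative: two CPUs in different freq domains */

struct rq { int64_t curr_runnable_sum; };

static struct rq rqs[NR_CPUS];
static uint32_t curr_window = 7000;                     /* task's whole window */
static uint32_t curr_window_cpu[NR_CPUS] = { 7000, 0 }; /* sums to 7000 */

static void inter_cluster_migration_fixup(int new_cpu, int task_cpu)
{
        /* The whole window now counts against the destination. */
        curr_window_cpu[new_cpu] = curr_window;
        rqs[new_cpu].curr_runnable_sum += curr_window;

        /* The source loses only the share that actually ran there... */
        rqs[task_cpu].curr_runnable_sum -= curr_window_cpu[task_cpu];
        curr_window_cpu[task_cpu] = 0;

        /*
         * ...while shares on other CPUs of the source cluster would be
         * queued as load subtractions (update_cluster_load_subtractions()
         * in the patch) and settled on the next sched_get_cpus_busy().
         */
}

int main(void)
{
        rqs[0].curr_runnable_sum = 7000;        /* task ran 7000 ns on CPU 0 */

        inter_cluster_migration_fixup(1, 0);    /* migrate CPU 0 -> CPU 1 */

        /* The same invariant the patch asserts with BUG_ON(). */
        assert(rqs[0].curr_runnable_sum >= 0);
        printf("cpu0 sum=%lld, cpu1 sum=%lld\n",
               (long long)rqs[0].curr_runnable_sum,
               (long long)rqs[1].curr_runnable_sum);
        return 0;
}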
