Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile        |    5
-rw-r--r--  kernel/sched/core.c          |  359
-rw-r--r--  kernel/sched/cpufreq_sched.c |  499
-rw-r--r--  kernel/sched/cputime.c       |   16
-rw-r--r--  kernel/sched/deadline.c      |   33
-rw-r--r--  kernel/sched/energy.c        |  124
-rw-r--r--  kernel/sched/fair.c          | 1329
-rw-r--r--  kernel/sched/features.h      |    9
-rw-r--r--  kernel/sched/idle.c          |    4
-rw-r--r--  kernel/sched/rt.c            |  106
-rw-r--r--  kernel/sched/sched.h         |  265
-rw-r--r--  kernel/sched/stop_task.c     |    3
-rw-r--r--  kernel/sched/tune.c          |  945
-rw-r--r--  kernel/sched/tune.h          |   55
-rw-r--r--  kernel/sched/walt.c          | 1125
-rw-r--r--  kernel/sched/walt.h          |   62
16 files changed, 4823 insertions, 116 deletions
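Reader aid before the patch body: the fair.c hunks below add capacity_margin = 1280 together with helpers such as __task_fits() and cpu_overutilized(), which treat a CPU as suitable only while roughly 20% capacity headroom remains (1024/1280 = 0.8). The stand-alone C sketch below only illustrates that check; the fits_capacity() helper name and the example capacity values are hypothetical and are not part of the patch.

#include <stdio.h>

/* Constants taken from the fair.c hunk below: utilization and capacity are
 * compared on a 1024-based scale, and the margin constant 1280 demands ~20%
 * spare capacity (a task "fits" while util < capacity * 1024 / 1280). */
#define SCHED_CAPACITY_SCALE 1024UL
static const unsigned long capacity_margin = 1280;

/* Illustrative stand-in for the __task_fits()/cpu_overutilized() comparison. */
static int fits_capacity(unsigned long util, unsigned long capacity)
{
	return capacity * SCHED_CAPACITY_SCALE > util * capacity_margin;
}

int main(void)
{
	/* Hypothetical little (430) and big (1024) CPU capacities. */
	unsigned long caps[] = { 430, 1024 };
	unsigned long util = 400;
	int i;

	for (i = 0; i < 2; i++)
		printf("util=%lu capacity=%lu fits=%d\n",
		       util, caps[i], fits_capacity(util, caps[i]));
	return 0;
}

With util = 400 this prints fits=0 for the 430-capacity CPU and fits=1 for the 1024-capacity CPU, i.e. the same ~80% threshold the scheduler hunks apply when deciding whether a CPU is overutilized.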
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..623ce4bde0d5 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -14,8 +14,11 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o -obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o +obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20253dbc8610..1df6da0094f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -89,6 +89,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> +#include "walt.h" DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -287,6 +288,18 @@ int sysctl_sched_rt_runtime = 950000; /* cpus with isolated domains */ cpumask_var_t cpu_isolated_map; +struct rq * +lock_rq_of(struct task_struct *p, unsigned long *flags) +{ + return task_rq_lock(p, flags); +} + +void +unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags) +{ + task_rq_unlock(rq, p, flags); +} + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -1076,7 +1089,9 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(rq, cpu_rq(new_cpu)); set_task_cpu(p, new_cpu); + double_unlock_balance(rq, cpu_rq(new_cpu)); raw_spin_unlock(&rq->lock); rq = cpu_rq(new_cpu); @@ -1300,6 +1315,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->sched_class->migrate_task_rq(p); p->se.nr_migrations++; perf_event_task_migrate(p); + + walt_fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); @@ -1928,6 +1945,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; +#ifdef CONFIG_SMP + struct rq *rq; + u64 wallclock; +#endif /* * If we are going to wake up a thread waiting for CONDITION we @@ -2007,6 +2028,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); + rq = cpu_rq(task_cpu(p)); + + raw_spin_lock(&rq->lock); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + raw_spin_unlock(&rq->lock); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2014,10 +2043,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_class->task_waking(p); cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu); @@ -2066,8 +2097,13 @@ static void try_to_wake_up_local(struct task_struct *p) trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + u64 wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } ttwu_do_wakeup(rq, p, 0); ttwu_stat(p, smp_processor_id(), 0); @@ -2133,6 +2169,7 @@ static void __sched_fork(unsigned long 
clone_flags, struct task_struct *p) p->se.nr_migrations = 0; p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -2400,6 +2437,9 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + + walt_init_new_task_load(p); + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2412,7 +2452,8 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + walt_mark_task_starting(p); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -2793,6 +2834,36 @@ unsigned long nr_iowait_cpu(int cpu) return atomic_read(&this->nr_iowait); } +#ifdef CONFIG_CPU_QUIET +u64 nr_running_integral(unsigned int cpu) +{ + unsigned int seqcnt; + u64 integral; + struct rq *q; + + if (cpu >= nr_cpu_ids) + return 0; + + q = cpu_rq(cpu); + + /* + * Update average to avoid reading stalled value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + integral = do_nr_running_integral(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + integral = q->nr_running_integral; + } + + return integral; +} +#endif + void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); @@ -2879,6 +2950,93 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED + +static inline +unsigned long add_capacity_margin(unsigned long cpu_capacity) +{ + cpu_capacity = cpu_capacity * capacity_margin; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; +} + +static inline +unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = add_capacity_margin(cfs_cap + scr->rt); + return total += scr->dl; +} + +static void sched_freq_tick_pelt(int cpu) +{ + unsigned long cpu_utilization = capacity_max; + unsigned long capacity_curr = capacity_curr_of(cpu); + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to a higher OPP as soon as the margin of free capacity + * is impacted (specified by capacity_margin). + */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); +} + +#ifdef CONFIG_SCHED_WALT +static void sched_freq_tick_walt(int cpu) +{ + unsigned long cpu_utilization = cpu_util(cpu); + unsigned long capacity_curr = capacity_curr_of(cpu); + + if (walt_disabled || !sysctl_sched_use_walt_cpu_util) + return sched_freq_tick_pelt(cpu); + + /* + * Add a margin to the WALT utilization. + * NOTE: WALT tracks a single CPU signal for all the scheduling + * classes, thus this margin is going to be added to the DL class as + * well, which is something we do not do in sched_freq_tick_pelt case. + */ + cpu_utilization = add_capacity_margin(cpu_utilization); + if (cpu_utilization <= capacity_curr) + return; + + /* + * It is likely that the load is growing so we + * keep the added margin in our request as an + * extra boost. 
+ */ + set_cfs_cpu_capacity(cpu, true, cpu_utilization); + +} +#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) +#else +#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) +#endif /* CONFIG_SCHED_WALT */ + +static void sched_freq_tick(int cpu) +{ + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + _sched_freq_tick(cpu); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -2892,10 +3050,14 @@ void scheduler_tick(void) sched_clock_tick(); raw_spin_lock(&rq->lock); + walt_set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); + sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -3132,6 +3294,7 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3193,6 +3356,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -5019,6 +5185,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock(&rq->lock); __sched_fork(0, idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -5401,9 +5568,60 @@ set_table_entry(struct ctl_table *entry, } static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + +static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5436,7 +5654,19 @@ sd_alloc_ctl_domain_table(struct sched_domain 
*sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } @@ -5552,6 +5782,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; account_reset_rq(rq); break; @@ -5572,6 +5805,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -5743,7 +5977,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -5804,7 +6038,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5836,7 +6071,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5915,6 +6151,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -6020,11 +6258,13 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_ea); +DEFINE_PER_CPU(struct sched_domain *, sd_scs); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; + struct sched_domain *busy_sd = NULL, *ea_sd = NULL; int id = cpu; int size = 1; @@ -6045,6 +6285,17 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + + for_each_domain(cpu, sd) { + if (sd->groups->sge) + ea_sd = sd; + else + break; + } + rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); + + sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES); + rcu_assign_pointer(per_cpu(sd_scs, cpu), sd); } /* @@ -6205,6 +6456,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. 
*/ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the @@ -6334,6 +6586,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) } /* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. + */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + +/* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ @@ -6441,6 +6753,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_SHARE_CAP_STATES - describes shared capacity states * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -6450,7 +6763,8 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) @@ -7000,6 +7314,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7038,10 +7353,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } @@ -7050,6 +7368,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); } @@ -7331,6 +7650,7 @@ void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; + 
walt_init_cpu_efficiency(); alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -7508,6 +7828,11 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7571,6 +7896,14 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == preempt_offset); } +static int __might_sleep_init_called; +int __init __might_sleep_init(void) +{ + __might_sleep_init_called = 1; + return 0; +} +early_initcall(__might_sleep_init); + void __might_sleep(const char *file, int line, int preempt_offset) { /* @@ -7595,8 +7928,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + !is_idle_task(current)) || oops_in_progress) + return; + if (system_state != SYSTEM_RUNNING && + (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 000000000000..d751bc2d0d6e --- /dev/null +++ b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,499 @@ +/* + * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/cpufreq.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/percpu.h> +#include <linux/irq_work.h> +#include <linux/delay.h> +#include <linux/string.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/cpufreq_sched.h> + +#include "sched.h" + +#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ +#define THROTTLE_UP_NSEC 500000 /* 500us default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @up_throttle: next throttling period expiry if increasing OPP + * @down_throttle: next throttling period expiry if decreasing OPP + * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP + * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). 
+ */ +struct gov_data { + ktime_t up_throttle; + ktime_t down_throttle; + unsigned int up_throttle_nsec; + unsigned int down_throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); + gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) +{ + ktime_t now = ktime_get(); + + ktime_t throttle = gd->requested_freq < cur_freq ? + gd->down_throttle : gd->up_throttle; + + if (ktime_after(now, throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(throttle, now)); + + usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + new_request = gd->requested_freq; + if (new_request == last_request) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop()) + break; + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd, policy->cur)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. 
+ */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new > policy->max) + freq_new = policy->max; + + if (freq_new < policy->min) + freq_new = policy->min; + + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. */ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static struct attribute_group sched_attr_group_gov_pol; +static struct attribute_group *get_sysfs_attr(void) +{ + return &sched_attr_group_gov_pol; +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + int rc; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? 
+ policy->cpuinfo.transition_latency : + THROTTLE_UP_NSEC; + gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->up_throttle_nsec); + + rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); + if (rc) { + pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); + goto err; + } + + policy->governor_data = gd; + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + set_sched_freq(); + + return 0; + +err: + policy->governor_data = NULL; + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + sysfs_remove_group(&policy->kobj, get_sysfs_attr()); + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static void cpufreq_sched_limits(struct cpufreq_policy *policy) +{ + unsigned int clamp_freq; + struct gov_data *gd = policy->governor_data;; + + pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", + policy->cpu, policy->min, policy->max, + policy->cur); + + clamp_freq = clamp(gd->requested_freq, policy->min, policy->max); + + if (policy->cur != clamp_freq) + __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L); +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + cpufreq_sched_limits(policy); + break; + } + return 0; +} + +/* Tunables */ +static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->up_throttle_nsec); +} + +static ssize_t store_up_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->up_throttle_nsec = val; + return count; +} + +static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) +{ + return sprintf(buf, "%u\n", gd->down_throttle_nsec); +} + +static ssize_t store_down_throttle_nsec(struct gov_data *gd, + const char *buf, size_t count) +{ + int ret; + long unsigned int val; + + ret = kstrtoul(buf, 0, &val); + if (ret < 0) + return ret; + gd->down_throttle_nsec = val; + return count; +} + +/* + * Create show/store routines + * - sys: One governor instance for complete SYSTEM + * - pol: One governor instance per struct cpufreq_policy + */ +#define show_gov_pol_sys(file_name) \ +static ssize_t 
show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + return show_##file_name(policy->governor_data, buf); \ +} + +#define store_gov_pol_sys(file_name) \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + return store_##file_name(policy->governor_data, buf, count); \ +} + +#define gov_pol_attr_rw(_name) \ + static struct freq_attr _name##_gov_pol = \ + __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define show_store_gov_pol_sys(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name) +#define tunable_handlers(file_name) \ + show_gov_pol_sys(file_name); \ + store_gov_pol_sys(file_name); \ + gov_pol_attr_rw(file_name) + +tunable_handlers(down_throttle_nsec); +tunable_handlers(up_throttle_nsec); + +/* Per policy governor instance */ +static struct attribute *sched_attributes_gov_pol[] = { + &up_throttle_nsec_gov_pol.attr, + &down_throttle_nsec_gov_pol.attr, + NULL, +}; + +static struct attribute_group sched_attr_group_gov_pol = { + .attrs = sched_attributes_gov_pol, + .name = "sched", +}; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a1aecbedf5b1..acde1d7c763c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,7 @@ #include <linux/static_key.h> #include <linux/context_tracking.h> #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -49,6 +50,10 @@ void irqtime_account_irq(struct task_struct *curr) unsigned long flags; s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; @@ -56,6 +61,9 @@ void irqtime_account_irq(struct task_struct *curr) local_irq_save(flags); cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -70,8 +78,16 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_hardirq_time, delta); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); +#ifdef CONFIG_SCHED_WALT + else + account = false; +#endif irq_time_write_end(); +#ifdef CONFIG_SCHED_WALT + if (account) + walt_account_irqtime(cpu, curr, delta, wallclock); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e984f059e5fc..f10b1cb255b2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int 
is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". @@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! @@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); + + clear_average_bw(&p->dl, &rq->dl); } static void set_curr_task_dl(struct rq *rq) @@ -1556,7 +1579,9 @@ retry: } deactivate_task(rq, next_task, 0); + clear_average_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + clear_average_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); + add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!start_dl_timer(p)) __dl_clear_params(p); + clear_average_bw(&p->dl, &rq->dl); + /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c new file mode 100644 index 000000000000..b0656b7a93e3 --- /dev/null +++ b/kernel/sched/energy.c @@ -0,0 +1,124 @@ +/* + * Obtain energy cost data from DT and populate relevant scheduler data + * structures. + * + * Copyright (C) 2015 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#define pr_fmt(fmt) "sched-energy: " fmt + +#define DEBUG + +#include <linux/gfp.h> +#include <linux/of.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/sched_energy.h> +#include <linux/stddef.h> + +struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +static void free_resources(void) +{ + int cpu, sd_level; + struct sched_group_energy *sge; + + for_each_possible_cpu(cpu) { + for_each_possible_sd_level(sd_level) { + sge = sge_array[cpu][sd_level]; + if (sge) { + kfree(sge->cap_states); + kfree(sge->idle_states); + kfree(sge); + } + } + } +} + +void init_sched_energy_costs(void) +{ + struct device_node *cn, *cp; + struct capacity_state *cap_states; + struct idle_state *idle_states; + struct sched_group_energy *sge; + const struct property *prop; + int sd_level, i, nstates, cpu; + const __be32 *val; + + for_each_possible_cpu(cpu) { + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_warn("CPU device node missing for CPU %d\n", cpu); + return; + } + + if (!of_find_property(cn, "sched-energy-costs", NULL)) { + pr_warn("CPU device node has no sched-energy-costs\n"); + return; + } + + for_each_possible_sd_level(sd_level) { + cp = of_parse_phandle(cn, "sched-energy-costs", sd_level); + if (!cp) + break; + + prop = of_find_property(cp, "busy-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No busy-cost data, skipping sched_energy init\n"); + goto out; + } + + sge = kcalloc(1, sizeof(struct sched_group_energy), + GFP_NOWAIT); + + nstates = (prop->length / sizeof(u32)) / 2; + cap_states = kcalloc(nstates, + sizeof(struct capacity_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) { + cap_states[i].cap = be32_to_cpup(val++); + cap_states[i].power = be32_to_cpup(val++); + } + + sge->nr_cap_states = nstates; + sge->cap_states = cap_states; + + prop = of_find_property(cp, "idle-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No idle-cost data, skipping sched_energy init\n"); + goto out; + } + + nstates = (prop->length / sizeof(u32)); + idle_states = kcalloc(nstates, + sizeof(struct idle_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) + idle_states[i].power = be32_to_cpup(val++); + + sge->nr_idle_states = nstates; + sge->idle_states = idle_states; + + sge_array[cpu][sd_level] = sge; + } + } + + pr_info("Sched-energy-costs installed from DT\n"); + return; + +out: + free_resources(); +} diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8f258f437ac2..83cfb72b2d95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,10 +30,13 @@ #include <linux/mempolicy.h> #include <linux/migrate.h> #include <linux/task_work.h> +#include <linux/module.h> #include <trace/events/sched.h> #include "sched.h" +#include "tune.h" +#include "walt.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -50,6 +53,17 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_is_big_little = 0; +unsigned int sysctl_sched_sync_hint_enable = 1; +unsigned int sysctl_sched_initial_task_util = 0; +unsigned int sysctl_sched_cstate_aware = 1; + +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -682,7 +696,9 @@ void 
init_entity_runnable_average(struct sched_entity *se) sa->period_contrib = 1023; sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + sa->util_avg = sched_freq() ? + sysctl_sched_initial_task_util : + scale_load_down(SCHED_LOAD_SCALE); sa->util_sum = sa->util_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -2600,6 +2616,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; @@ -2740,6 +2757,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif + /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */ + if (cfs_rq == &rq_of(cfs_rq)->cfs) + trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + return decayed || removed; } @@ -2760,6 +2781,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2840,27 +2864,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). - */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); + + return last_update_time; +} #else - last_update_time = cfs_rq->avg.last_update_time; +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} #endif +/* + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). 
+ */ +void remove_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); @@ -2962,6 +3004,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } trace_sched_stat_blocked(tsk, delta); + trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by @@ -4157,6 +4200,28 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static bool cpu_overutilized(int cpu); +static inline unsigned long boosted_cpu_util(int cpu); +#else +#define boosted_cpu_util(cpu) cpu_util(cpu) +#endif + +#ifdef CONFIG_SMP +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. */ + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4167,6 +4232,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#ifdef CONFIG_SMP + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; +#endif for_each_sched_entity(se) { if (se->on_rq) @@ -4183,6 +4252,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -4190,6 +4260,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4201,6 +4272,47 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); +#ifdef CONFIG_SMP + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. 
+ */ + schedtune_enqueue_task(p, cpu_of(rq)); + + if (!se) { + walt_inc_cumulative_runnable_avg(rq, p); + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + */ + if (task_new || task_wakeup) + update_capacity_of(cpu_of(rq)); + } + +#endif /* CONFIG_SMP */ hrtick_update(rq); } @@ -4230,6 +4342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -4250,6 +4363,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -4261,6 +4375,38 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) sub_nr_running(rq, 1); +#ifdef CONFIG_SMP + + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); + + if (!se) { + walt_dec_cumulative_runnable_avg(rq, p); + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } + +#endif /* CONFIG_SMP */ + hrtick_update(rq); } @@ -4487,15 +4633,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4670,6 +4807,392 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif /* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + int payoff; + struct task_struct *task; + struct { + int before; + int after; + int delta; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; +}; + +/* + * __cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for + * energy calculations. 
Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +{ + int util = __cpu_util(cpu, delta); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static int calc_util_delta(struct energy_env *eenv, int cpu) +{ + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) +{ + int i, delta; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct energy_env *eenv, + const struct sched_group_energy * const sge) +{ + int idx; + unsigned long util = group_max_util(eenv); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + break; + } + + eenv->cap_idx = idx; + + return idx; +} + +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. */ + for_each_cpu(i, sched_group_cpus(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + + return state; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. 
+ */ +static int sched_group_energy(struct energy_env *eenv) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!eenv->sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? */ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + unsigned long group_util; + int sg_busy_energy, sg_idle_energy; + int cap_idx, idle_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + eenv->sg_cap = sg_shared_cap; + else + eenv->sg_cap = sg; + + cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + + idle_idx = group_idle_state(sg); + group_util = group_norm_util(eenv, sg); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) + * sg->sge->idle_states[idle_idx].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + cpumask_clear_cpu(cpu, &visit_cpus); + continue; + } + + eenv->energy = total_energy; + return 0; +} + +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static inline int __energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0, 0}, + .cap = { 0, 0, 0 }, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? 
eenv->src_cpu : eenv->dst_cpu; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + + if (!sd) + return 0; /* Error */ + + sg = sd->groups; + + do { + if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { + eenv_before.sg_top = eenv->sg_top = sg; + + if (sched_group_energy(&eenv_before)) + return 0; /* Invalid result abort */ + energy_before += eenv_before.energy; + + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + + if (sched_group_energy(eenv)) + return 0; /* Invalid result abort */ + energy_after += eenv->energy; + } + } while (sg = sg->next, sg != sd->groups); + + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; + + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + + return eenv->nrg.diff; +} + +#ifdef CONFIG_SCHED_TUNE + +struct target_nrg schedtune_target_nrg; + +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +static inline int +normalize_energy(int energy_diff) +{ + u32 normalized_nrg; +#ifdef CONFIG_SCHED_DEBUG + int max_delta; + + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +static inline int +energy_diff(struct energy_env *eenv) +{ + int boost = schedtune_task_boost(eenv->task); + int nrg_delta; + + /* Conpute "absolute" energy diff */ + __energy_diff(eenv); + + /* Return energy diff when boost margin is 0 */ + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff(eenv) __energy_diff(eenv) +#endif + +/* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened * at a frequency roughly N times higher than one of its wakees. 
In order @@ -4760,6 +5283,160 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif + return p->se.avg.util_avg; +} + +unsigned int capacity_margin = 1280; /* ~20% margin */ + +static inline unsigned long boosted_task_util(struct task_struct *task); + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += boosted_task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static inline bool task_fits_spare(struct task_struct *p, int cpu) +{ + return __task_fits(p, cpu, cpu_util(cpu)); +} + +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + +#ifdef CONFIG_SCHED_TUNE + +static long +schedtune_margin(unsigned long signal, long boost) +{ + long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S), if B is positive + * M = B * S, if B is negative + * The obtained M could be used by the caller to "boost" S. + */ + if (boost >= 0) { + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + if (boost < 0) + margin *= -1; + return margin; +} + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + int boost = schedtune_cpu_boost(cpu); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +static inline long +schedtune_task_margin(struct task_struct *task) +{ + int boost = schedtune_task_boost(task); + unsigned long util; + long margin; + + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + return 0; +} + +static inline int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + +#endif /* CONFIG_SCHED_TUNE */ + +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + long margin = schedtune_cpu_margin(util, cpu); + + trace_sched_boost_cpu(cpu, util, margin); + + return util + margin; +} + +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + long margin = schedtune_task_margin(task); + + trace_sched_boost_task(task, util, margin); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. 
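The schedtune_margin() block above relies on two conventions worth spelling out: the boost margin is B * (SCHED_LOAD_SCALE - S) scaled by roughly 1/100 via the multiply-by-1311, shift-by-17 trick, and the fit/overutilization tests compare capacity * 1024 against util * capacity_margin (1280, i.e. ~20% margin). A minimal standalone sketch, not part of the patch and using hypothetical *_model names, that reproduces the arithmetic:

#include <assert.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE	1024L
#define CAPACITY_MARGIN		1280L	/* ~20% margin, mirrors capacity_margin */

/* M = B * (SCHED_LOAD_SCALE - S) / 100, using the x1311 >> 17 trick above */
static long schedtune_margin_model(long signal, long boost)
{
	long margin = (boost >= 0) ? (SCHED_LOAD_SCALE - signal) * boost
				   : -signal * boost;

	margin = (margin * 1311) >> 17;		/* ~= margin / 100, <0.1% error */
	return (boost < 0) ? -margin : margin;
}

/* __task_fits()-style test: does util fit capacity with ~20% margin left? */
static int task_fits_model(long util, long capacity)
{
	return capacity * 1024 > util * CAPACITY_MARGIN;
}

int main(void)
{
	/* boost = 10, util = 200: margin = 10 * (1024 - 200) / 100 = 82.4 -> 82 */
	long margin = schedtune_margin_model(200, 10);

	printf("margin = %ld\n", margin);
	assert(margin == 82);

	/* boosted util 282 fits a 512-capacity cpu: 512*1024 > 282*1280 */
	assert(task_fits_model(200 + margin, 512));
	/* but not a 341-capacity cpu: 341*1024 < 282*1280 */
	assert(!task_fits_model(200 + margin, 341));
	return 0;
}

Running it prints margin = 82, i.e. the exact 824 * 10 / 100 = 82.4 rounded down by the fixed-point division.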
@@ -4769,7 +5446,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *fit_group = NULL, *spare_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long fit_capacity = ULONG_MAX; + unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4777,7 +5457,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_capacity; int local_group; int i; @@ -4800,6 +5480,25 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + /* + * Look for most energy-efficient group that can fit + * that can fit the task. + */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } + + /* + * Look for group which has most spare capacity on a + * single cpu. + */ + spare_capacity = capacity_of(i) - cpu_util(i); + if (spare_capacity > max_spare_capacity) { + max_spare_capacity = spare_capacity; + spare_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4813,6 +5512,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + + if (spare_group) + return spare_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4833,7 +5538,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4845,7 +5550,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4854,6 +5560,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -4875,15 +5588,20 @@ static int select_idle_sibling(struct task_struct *p, int target) struct sched_domain *sd; struct sched_group *sg; int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; - /* - * If the prevous cpu is cache affine and idle, don't be stupid. - */ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + /* + * If the prevous cpu is cache affine and idle, don't be stupid. 
+ */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + } /* * Otherwise, iterate the domains and find an elegible idle cpu. @@ -4896,54 +5614,262 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; - for_each_cpu(i, sched_group_cpus(sg)) { - if (i == target || !idle_cpu(i)) - goto next; - } + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; + + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } - target = cpumask_first_and(sched_group_cpus(sg), + target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); - goto done; + goto done; + } next: sg = sg->next; } while (sg != sd->groups); } + if (best_idle > 0) + target = best_idle; + done: return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). 
- */ -static int cpu_util(int cpu) +static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) { - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); + int iter_cpu; + int target_cpu = -1; + int target_util = 0; + int backup_capacity = 0; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + int backup_cpu = -1; + unsigned long task_util_boosted, new_util; + + task_util_boosted = boosted_task_util(p); + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + struct rq *rq; + int idle_idx; + + /* + * Iterate from higher cpus for boosted tasks. + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + if (new_util > capacity_orig_of(i)) + continue; + +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif + /* + * Unconditionally favoring tasks that prefer idle cpus to + * improve latency. + */ + if (idle_cpu(i) && prefer_idle) { + if (best_idle_cpu < 0) + best_idle_cpu = i; + continue; + } + + cur_capacity = capacity_curr_of(i); + rq = cpu_rq(i); + idle_idx = idle_get_state_idx(rq); + + if (new_util < cur_capacity) { + if (cpu_rq(i)->nr_running) { + if (prefer_idle) { + /* Find a target cpu with highest + * utilization. + */ + if (target_util == 0 || + target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + /* Find a target cpu with lowest + * utilization. + */ + if (target_util == 0 || + target_util > new_util) { + target_cpu = i; + target_util = new_util; + } + } + } else if (!prefer_idle) { + if (best_idle_cpu < 0 || + (sysctl_sched_cstate_aware && + best_idle_cstate > idle_idx)) { + best_idle_cstate = idle_idx; + best_idle_cpu = i; + } + } + } else if (backup_capacity == 0 || + backup_capacity > cur_capacity) { + // Find a backup cpu with least capacity. + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + + if (prefer_idle && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + else if (target_cpu < 0) + target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; + + return target_cpu; +} + +static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) +{ + struct sched_domain *sd; + struct sched_group *sg, *sg_target; + int target_max_cap = INT_MAX; + int target_cpu = task_cpu(p); + unsigned long task_util_boosted, new_util; + int i; + + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } + + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + + if (!sd) + return target; + + sg = sd->groups; + sg_target = sg; + + if (sysctl_sched_is_big_little) { + + /* + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). 
+ * load_balance() should sort it out later as we get above the tipping + * point. + */ + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); + + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. + */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); - return (util >= capacity) ? capacity : util; + task_util_boosted = boosted_task_util(p); + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + new_util = cpu_util(i) + task_util_boosted; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + } else { + /* + * Find a cpu with sufficient capacity + */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + bool boosted = schedtune_task_boost(p) > 0; + bool prefer_idle = schedtune_prefer_idle(p) > 0; +#else + bool boosted = 0; + bool prefer_idle = 0; +#endif + int tmp_target = find_best_target(p, boosted, prefer_idle); + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) + return target_cpu; + } + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + .task = p, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; } /* @@ -4968,7 +5894,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -4998,7 +5926,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { @@ -5068,6 +5998,8 @@ static void task_dead_fair(struct task_struct *p) { remove_entity_load_avg(&p->se); } +#else +#define task_fits_max(p, cpu) true #endif /* CONFIG_SMP */ static unsigned long @@ -5314,6 +6246,8 @@ again: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5335,9 +6269,12 @@ simple: if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5550,6 +6487,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -5568,6 +6512,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -5578,6 +6523,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -5759,7 +6705,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + double_lock_balance(env->src_rq, env->dst_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); } /* @@ -5904,6 +6852,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -5925,6 +6877,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -6020,12 +6977,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6041,6 +6992,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6132,19 +7084,58 @@ static unsigned long scale_rt_capacity(int cpu) used = div_u64(avg, total); + /* + * deadline bandwidth is defined at system level so we must + * weight this bandwidth with the max capacity of the system. 
+ * As a reminder, avg_bw is 20bits width and + * scale_cpu_capacity is 10 bits width + */ + used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu)); + if (likely(used < SCHED_CAPACITY_SCALE)) return SCHED_CAPACITY_SCALE - used; return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -6153,13 +7144,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6172,6 +7164,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6196,11 +7189,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6210,12 +7204,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -6310,6 +7308,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } + +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. 
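+ * With capacity_margin = 1280 and SCHED_LOAD_SCALE = 1024 this only treats sg
+ * as smaller when its max per-cpu capacity is more than 256 units (a quarter
+ * of SCHED_LOAD_SCALE) below ref's, e.g. 512 + 1280 - 1024 < 1024.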
+ */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6320,6 +7330,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6331,14 +7344,15 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6355,7 +7369,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6363,8 +7378,17 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) { + *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -6406,9 +7430,25 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + /* + * Candiate sg has no more than one task per cpu and has higher + * per-cpu capacity. No reason to pull tasks to less capable cpus. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -6470,7 +7510,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6492,7 +7532,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6514,6 +7554,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs->group_type = group_classify(sg, sgs); } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher. 
+ */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -6530,10 +7579,23 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; + + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) { + env->dst_rq->rd->overutilized = overutilized; + trace_sched_overutilized(overutilized); + } + } else { + if (!env->dst_rq->rd->overutilized && overutilized) { + env->dst_rq->rd->overutilized = true; + trace_sched_overutilized(true); + } } } @@ -6682,6 +7744,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -6715,6 +7793,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -6756,6 +7839,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -6784,6 +7871,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -6807,7 +7899,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -6820,6 +7913,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. 
Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -6878,7 +7972,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* @@ -6939,6 +8034,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -7060,6 +8162,11 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -7151,7 +8258,8 @@ more_balance: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); @@ -7283,6 +8391,7 @@ static int idle_balance(struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; + long removed_util=0; idle_enter_fair(this_rq); @@ -7292,8 +8401,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) @@ -7305,6 +8415,17 @@ static int idle_balance(struct rq *this_rq) raw_spin_unlock(&this_rq->lock); + /* + * If removed_util_avg is !0 we most probably migrated some task away + * from this_cpu. In this case we might be willing to trigger an OPP + * update, but we want to do so if we don't find anybody else to pull + * here (we will trigger an OPP update with the pulled task's enqueue + * anyway). + * + * Record removed_util before calling update_blocked_averages, and use + * it below (before returning) to see if an OPP update is required. + */ + removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg); update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -7369,6 +8490,12 @@ out: if (pulled_task) { idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } else if (removed_util) { + /* + * No task pulled and someone has been migrated away. + * Good case to trigger an OPP update. + */ + update_capacity_of(this_cpu); } return pulled_task; @@ -7428,8 +8555,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. 
+ */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } @@ -7809,12 +8941,13 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); @@ -7920,6 +9053,16 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + +#ifdef CONFIG_SMP + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) { + rq->rd->overutilized = true; + trace_sched_overutilized(true); + } + + rq->misfit_task = !task_fits_max(curr, rq->cpu); +#endif + } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..55e461055332 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -69,3 +69,12 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency. + */ +#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE +SCHED_FEAT(ENERGY_AWARE, true) +#else +SCHED_FEAT(ENERGY_AWARE, false) +#endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..917c94abf5bb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -19,9 +19,10 @@ * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. */ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; @@ -219,6 +220,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 78ae5c1d9412..541b8494450e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,6 +8,8 @@ #include <linux/slab.h> #include <linux/irq_work.h> +#include "walt.h" + int sched_rr_timeslice = RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -889,6 +891,51 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se) return rt_task_of(rt_se)->prio; } +static void dump_throttled_rt_tasks(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *rt_se; + char buf[500]; + char *pos = buf; + char *end = buf + sizeof(buf); + int idx; + + pos += snprintf(pos, sizeof(buf), + "sched: RT throttling activated for rt_rq %p (cpu %d)\n", + rt_rq, cpu_of(rq_of_rt_rq(rt_rq))); + + if (bitmap_empty(array->bitmap, MAX_RT_PRIO)) + goto out; + + pos += snprintf(pos, end - pos, "potential CPU hogs:\n"); + idx = sched_find_first_bit(array->bitmap); + while (idx < MAX_RT_PRIO) { + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pos < end) + pos += snprintf(pos, end - pos, "\t%s (%d)\n", + p->comm, p->pid); + } + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1); + } +out: +#ifdef CONFIG_PANIC_ON_RT_THROTTLING + /* + * Use pr_err() in the BUG() case since printk_sched() will + * not 
get flushed and deadlock is not a concern. + */ + pr_err("%s", buf); + BUG(); +#else + printk_deferred("%s", buf); +#endif +} + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -912,8 +959,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_deferred_once("sched: RT throttling activated\n"); + + if (!once) { + once = true; + dump_throttled_rt_tasks(rt_rq); + } } else { /* * In case we did anyway, make it go away, @@ -1261,6 +1314,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + walt_inc_cumulative_runnable_avg(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1272,6 +1326,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se); + walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); } @@ -1426,6 +1481,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +#ifdef CONFIG_SMP +static void sched_rt_update_capacity_req(struct rq *rq) +{ + u64 total, used, age_stamp, avg; + s64 delta; + + if (!sched_freq()) + return; + + sched_avg_update(rq); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); + delta = rq_clock(rq) - age_stamp; + + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; + + used = div_u64(avg, total); + if (unlikely(used > SCHED_CAPACITY_SCALE)) + used = SCHED_CAPACITY_SCALE; + + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); +} +#else +static inline void sched_rt_update_capacity_req(struct rq *rq) +{ } + +#endif + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1494,8 +1584,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) + if (!rt_rq->rt_queued) { + /* + * The next task to be picked on this rq will have a lower + * priority than rt tasks so we can spend some time to update + * the capacity used by rt tasks based on the last activity. + * This value will be the used as an estimation of the next + * activity. 
+ */ + sched_rt_update_capacity_req(rq); return NULL; + } put_prev_task(rq, prev); @@ -2211,6 +2310,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); + if (rq->rt.rt_nr_running) + sched_rt_update_capacity_req(rq); + watchdog(rq, p); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0517abd7dd73..780522c65cea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -410,6 +410,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; u64 runtime_expires; @@ -506,10 +510,18 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* This is the "average utilization" for this runqueue */ + s64 avg_bw; }; #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -528,6 +540,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). @@ -543,6 +558,9 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + /* Maximum cpu capacity in the system. */ + struct max_cpu_capacity max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -572,6 +590,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; @@ -579,6 +598,14 @@ struct rq { #ifdef CONFIG_NO_HZ_FULL unsigned long last_sched_tick; #endif + +#ifdef CONFIG_CPU_QUIET + /* time-based average load */ + u64 nr_last_stamp; + u64 nr_running_integral; + seqcount_t ave_seqcnt; +#endif + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -640,6 +667,30 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + /* + * max_freq = user or thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, min_freq, max_possible_freq; + struct cpumask freq_domain_cpumask; + + u64 cumulative_runnable_avg; + int efficiency; /* Differentiate cpus with different IPC capability */ + int load_scale_factor; + int capacity; + int max_possible_capacity; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; +#endif /* CONFIG_SCHED_WALT */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -687,6 +738,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -836,6 +888,8 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; @@ -843,7 +897,8 @@ struct 
sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -860,6 +915,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy *sge; /* * The CPUs this group covers. @@ -1163,6 +1219,7 @@ static const u32 prio_to_wmult[40] = { #endif #define ENQUEUE_REPLENISH 0x08 #define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 @@ -1248,6 +1305,7 @@ extern const struct sched_class idle_sched_class; #ifdef CONFIG_SMP +extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); @@ -1276,6 +1334,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1286,6 +1355,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); @@ -1310,7 +1388,7 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -static inline void add_nr_running(struct rq *rq, unsigned count) +static inline void __add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1338,11 +1416,48 @@ static inline void add_nr_running(struct rq *rq, unsigned count) } } -static inline void sub_nr_running(struct rq *rq, unsigned count) +static inline void __sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; } +#ifdef CONFIG_CPU_QUIET +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +static inline u64 do_nr_running_integral(struct rq *rq) +{ + s64 nr, deltax; + u64 nr_running_integral = rq->nr_running_integral; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + nr_running_integral += nr * deltax; + + return nr_running_integral; +} + +static inline void add_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __add_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} + +static inline void sub_nr_running(struct rq *rq, unsigned count) +{ + write_seqcount_begin(&rq->ave_seqcnt); + rq->nr_running_integral = do_nr_running_integral(rq); + rq->nr_last_stamp = rq->clock_task; + __sub_nr_running(rq, count); + write_seqcount_end(&rq->ave_seqcnt); +} +#else +#define add_nr_running __add_nr_running +#define sub_nr_running __sub_nr_running +#endif + static inline void rq_last_tick_reset(struct rq *rq) { #ifdef CONFIG_NO_HZ_FULL @@ -1415,10 +1530,146 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif 
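The CONFIG_CPU_QUIET wrappers above only accumulate nr_running_integral (nr_running scaled by 2^FSHIFT, multiplied by the time spent at that value) under ave_seqcnt; the consumer that turns it into an average is not part of this hunk. A small userspace model, with hypothetical names and assuming FSHIFT = 11 as in the kernel, of how two snapshots of the integral yield a time-weighted average nr_running:

#include <stdio.h>

#define FSHIFT 11				/* fixed-point shift, as in the kernel */
#define NR_AVE_SCALE(x) ((unsigned long long)(x) << FSHIFT)

struct rq_model {				/* models the new rq fields */
	unsigned long long nr_last_stamp;	/* time of last nr_running change, ns */
	unsigned long long nr_running_integral;	/* sum of nr_running * dt, FSHIFT-scaled */
	unsigned int nr_running;
};

/* Mirrors do_nr_running_integral(): fold in the time spent at the current count */
static void fold_nr_running(struct rq_model *rq, unsigned long long now)
{
	rq->nr_running_integral += NR_AVE_SCALE(rq->nr_running) *
				   (now - rq->nr_last_stamp);
	rq->nr_last_stamp = now;
}

/* Hypothetical sampler: average nr_running between two snapshots, still scaled */
static unsigned long long ave_nr_running(unsigned long long d_integral,
					 unsigned long long d_time)
{
	return d_time ? d_integral / d_time : 0;
}

int main(void)
{
	struct rq_model rq = { .nr_last_stamp = 0, .nr_running = 2 };

	fold_nr_running(&rq, 3000000);		/* 2 tasks runnable for 3 ms */
	rq.nr_running = 4;
	fold_nr_running(&rq, 4000000);		/* then 4 tasks for 1 ms */

	/* (2*3 + 4*1) / 4 = 2.5 average tasks -> 2.5 * 2^11 = 5120 */
	printf("avg = %llu (scaled by 2^%d)\n",
	       ave_nr_running(rq.nr_running_integral, 4000000), FSHIFT);
	return 0;
}

Dividing the printed 5120 by 2^11 gives the expected 2.5 runnable tasks on average over the 4 ms window.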
+#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int walt_ravg_window; +extern unsigned int walt_disabled; + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(util, walt_ravg_window); + } +#endif + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + int rtdl = scr->rt + scr->dl; + /* + * WALT tracks the utilization of a CPU considering the load + * generated by all the scheduling classes. + * Since the following call to: + * update_cpu_capacity + * is already adding the RT and DL utilizations let's remove + * these contributions from the WALT signal. 
+ */ + if (capacity > rtdl) + capacity -= rtdl; + else + capacity = 0; + } +#endif + if (scr->cfs != capacity) { + scr->cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } @@ -1507,6 +1758,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } +extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags); +extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags); + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPT @@ -1579,7 +1833,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) __releases(busiest->lock) { - raw_spin_unlock(&busiest->lock); + if (this_rq != busiest) + raw_spin_unlock(&busiest->lock); lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index cbc67da10954..61f852d46858 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. 
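Before the stop_task.c hunks, one note on the capacity-request plumbing added to sched.h above: each class records its request in the per-cpu sched_capacity_reqs (cfs/rt/dl), and set_cfs_cpu_capacity() strips the RT and DL share out of the WALT signal so that it is not counted twice. How the governor combines the three requests is not shown in this excerpt; the sketch below is standalone, uses hypothetical names, and assumes a simple sum-and-cap aggregation purely for illustration:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

struct capacity_reqs_model {			/* models struct sched_capacity_reqs */
	unsigned long cfs, rt, dl;
};

/* Assumed aggregation policy: request enough capacity for all classes, capped */
static unsigned long total_request(const struct capacity_reqs_model *scr)
{
	unsigned long total = scr->cfs + scr->rt + scr->dl;

	return total < SCHED_CAPACITY_SCALE ? total : SCHED_CAPACITY_SCALE;
}

/*
 * Models the WALT special case in set_cfs_cpu_capacity(): WALT already tracks
 * RT/DL activity, so their share is removed before recording the CFS request
 * to avoid counting it twice.
 */
static unsigned long walt_cfs_request(unsigned long walt_util,
				      const struct capacity_reqs_model *scr)
{
	unsigned long rtdl = scr->rt + scr->dl;

	return walt_util > rtdl ? walt_util - rtdl : 0;
}

int main(void)
{
	struct capacity_reqs_model scr = { .cfs = 0, .rt = 100, .dl = 50 };

	scr.cfs = walt_cfs_request(600, &scr);	/* 600 - (100 + 50) = 450 */
	printf("cfs=%lu total=%lu\n", scr.cfs, total_request(&scr));
	return 0;
}

This prints cfs=450 total=600: a WALT utilization of 600 becomes a 450 CFS request on top of the 150 already requested by RT and DL.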
@@ -42,12 +43,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c new file mode 100644 index 000000000000..079b18802f17 --- /dev/null +++ b/kernel/sched/tune.c @@ -0,0 +1,945 @@ +#include <linux/cgroup.h> +#include <linux/err.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/printk.h> +#include <linux/rcupdate.h> +#include <linux/slab.h> + +#include <trace/events/sched.h> + +#include "sched.h" +#include "tune.h" + +#ifdef CONFIG_CGROUP_SCHEDTUNE +static bool schedtune_initialized = false; +#endif + +unsigned int sysctl_sched_cfs_boost __read_mostly; + +extern struct target_nrg schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 5 }, /* < 10% */ + { 1, 5 }, /* < 20% */ + { 2, 5 }, /* < 30% */ + { 3, 5 }, /* < 40% */ + { 4, 5 }, /* < 50% */ + { 5, 4 }, /* < 60% */ + { 5, 3 }, /* < 70% */ + { 5, 2 }, /* < 80% */ + { 5, 1 }, /* < 90% */ + { 5, 0 } /* <= 100% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + int gain_idx = -1; + + /* Performance Boost (B) region */ + if (nrg_delta >= 0 && cap_delta > 0) + gain_idx = perf_boost_idx; + /* Performance Constraint (C) region */ + else if (nrg_delta < 0 && cap_delta <= 0) + gain_idx = perf_constrain_idx; + + /* Default: reject schedule candidate */ + if (gain_idx == -1) + return payoff; + + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * + * - Performance Boost (B) region + * + * Condition: nrg_delta > 0 && cap_delta > 0 + * Payoff criteria: + * cap_gain / nrg_gain < cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since both nrg_gain and nrg_delta are positive, the + * inequality does not change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * - Performance Constraint (C) region + * + * Condition: nrg_delta < 0 && cap_delta < 0 + * payoff criteria: + * cap_gain / nrg_gain > cap_delta / nrg_delta = + * cap_gain * nrg_delta < cap_delta * nrg_gain + * Note that since nrg_gain > 0 while nrg_delta < 0, the + * inequality change. Thus: + * + * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta) + * + * This means that, in case of same positive defined {cap,nrg}_gain + * for both the B and C regions, we can use the same payoff formula + * where a positive value represents the accept condition. + */ + payoff = cap_delta * threshold_gains[gain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain; + + return payoff; +} + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. 
+ */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, + .prefer_idle = 0, +}; + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); + return INT_MAX; + } + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); + return -INT_MAX; + } + + /* Get task specific perf Boost/Constraints indexes */ + rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx = ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. 
+ */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + bool idle; + int boost_max; + struct { + /* The boost for tasks on that boost group */ + int boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + int boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + + boost_max = max(boost_max, bg->group[idx].boost); + } + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) { + schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); + } + + return 0; +} + +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; + + /* Update boosted tasks count while avoiding to make it negative */ + bg->group[idx].tasks = max(0, tasks); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; + struct schedtune *st; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. 
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accounting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions for example on
+ * do_exit()::cgroup_exit() and task migration.
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct boost_groups *bg;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int src_bg; /* Source boost group index */
+ int dst_bg; /* Destination boost group index */
+ int tasks;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+
+ cgroup_taskset_for_each(task, css, tset) {
+
+ /*
+ * Lock the CPU's RQ the task is enqueued to avoid race
+ * conditions with migration code while the task is being
+ * accounted
+ */
+ rq = lock_rq_of(task, &irq_flags);
+
+ if (!task->on_rq) {
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * Boost group accounting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ cpu = cpu_of(rq);
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ raw_spin_lock(&bg->lock);
+
+ dst_bg = css_st(css)->idx;
+ src_bg = task_schedtune(task)->idx;
+
+ /*
+ * Current task is not changing boostgroup, which can
+ * happen when the new hierarchy is in use.
+ */
+ if (unlikely(dst_bg == src_bg)) {
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * This is the case of a RUNNABLE task which is switching its
+ * current boost group.
+ */
+
+ /* Move task from src to dst boost group */
+ tasks = bg->group[src_bg].tasks - 1;
+ bg->group[src_bg].tasks = max(0, tasks);
+ bg->group[dst_bg].tasks += 1;
+
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+
+ /* Update CPU boost group */
+ if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+ schedtune_cpu_update(task_cpu(task));
+
+ }
+
+ return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+ /* This can happen only if SchedTune controller is mounted with
+ * other hierarchies and one of them fails. Since usually SchedTune is
+ * mounted on its own hierarchy, for the time being we do not implement
+ * a proper rollback mechanism */
+ WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue is already enforced by the do_exit() code path
+ * via schedtune_exit_task().
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accounting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); + rcu_read_lock(); + + st = task_schedtune(p); + idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +void schedtune_exit_task(struct task_struct *tsk) +{ + struct schedtune *st; + unsigned long irq_flags; + unsigned int cpu; + struct rq *rq; + int idx; + + if (!unlikely(schedtune_initialized)) + return; + + rq = lock_rq_of(tsk, &irq_flags); + rcu_read_lock(); + + cpu = cpu_of(rq); + st = task_schedtune(tsk); + idx = st->idx; + schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + unlock_rq_of(rq, tsk, &irq_flags); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = prefer_idle; + + return 0; +} + +static s64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + s64 boost) +{ + struct schedtune *st = css_st(css); + unsigned threshold_idx; + int boost_pct; + + if (boost < -100 || boost > 100) + return -EINVAL; + boost_pct = boost; + + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementatio uses the same cuts for both + * B and C regions. 
+ */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + st->perf_boost_idx = threshold_idx; + st->perf_constrain_idx = threshold_idx; + + st->boost = boost; + if (css == &root_schedtune.css) { + sysctl_sched_cfs_boost = boost; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; + } + + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + + trace_sched_tune_config(st->boost); + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_s64 = boost_read, + .write_s64 = boost_write, + }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + struct boost_groups *bg; + int cpu; + + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) + return &root_schedtune.css; + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; + + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys schedtune_cgrp_subsys = { + .css_alloc = schedtune_css_alloc, + .css_free = schedtune_css_free, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, + .legacy_cftypes = files, + .early_init = 1, +}; + +static inline void +schedtune_init_cgroups(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + raw_spin_lock_init(&bg->lock); + } + + pr_info("schedtune: configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + + schedtune_initialized = true; +} + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0); + return INT_MAX; + } + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) { + trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5); + return -INT_MAX; + } + + return __schedtune_accept_deltas(nrg_delta, 
cap_delta, + perf_boost_idx, perf_constrain_idx); +} + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +int +sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + unsigned threshold_idx; + int boost_pct; + + if (ret || !write) + return ret; + + if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100) + return -EINVAL; + boost_pct = sysctl_sched_cfs_boost; + + /* + * Update threshold params for Performance Boost (B) + * and Performance Constraint (C) regions. + * The current implementatio uses the same cuts for both + * B and C regions. + */ + threshold_idx = clamp(boost_pct, 0, 99) / 10; + perf_boost_idx = threshold_idx; + perf_constrain_idx = threshold_idx; + + return 0; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. 
+ */ +static int +schedtune_init(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. + */ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); + +#ifdef CONFIG_CGROUP_SCHEDTUNE + schedtune_init_cgroups(); +#else + pr_info("schedtune: configured to support global boosting only\n"); +#endif + + return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; +} +postcore_initcall(schedtune_init); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..4f6441771e4c --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,55 @@ + +#ifdef CONFIG_SCHED_TUNE + +#include <linux/reciprocal_div.h> + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); + +int schedtune_prefer_idle(struct task_struct *tsk); + +void schedtune_exit_task(struct task_struct *tsk); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost() +#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost() + +#define schedtune_exit_task(task) do { } while (0) + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_cpu_boost(cpu) 0 +#define schedtune_task_boost(tsk) 0 + +#define schedtune_exit_task(task) do { } while (0) + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + +#endif /* CONFIG_SCHED_TUNE */ diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c new file mode 100644 index 000000000000..6e053bd9830c --- /dev/null +++ b/kernel/sched/walt.c @@ -0,0 +1,1125 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * Window Assisted Load Tracking (WALT) implementation credits: + * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, + * Pavan Kumar Kondeti, Olav Haugan + * + * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla + * and Todd Kjos + */ + +#include <linux/syscore_ops.h> +#include <linux/cpufreq.h> +#include <trace/events/sched.h> +#include "sched.h" +#include "walt.h" + +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +#define EXITING_TASK_MARKER 0xdeaddead + +static __read_mostly unsigned int walt_ravg_hist_size = 5; +static __read_mostly unsigned int walt_window_stats_policy = + WINDOW_STATS_MAX_RECENT_AVG; +static __read_mostly unsigned int walt_account_wait_time = 1; +static __read_mostly unsigned int walt_freq_account_wait_time = 0; +static __read_mostly unsigned int walt_io_is_busy = 0; + +unsigned int sysctl_sched_walt_init_task_load_pct = 15; + +/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ +unsigned int __read_mostly walt_disabled = 0; + +static unsigned int max_possible_efficiency = 1024; +static unsigned int min_possible_efficiency = 1024; + +/* + * Maximum possible frequency across all cpus. Task demand and cpu + * capacity (cpu_power) metrics are scaled in reference to it. + */ +static unsigned int max_possible_freq = 1; + +/* + * Minimum possible max_freq across all cpus. This will be same as + * max_possible_freq on homogeneous systems and could be different from + * max_possible_freq on heterogenous systems. min_max_freq is used to derive + * capacity (cpu_power) of cpus. 
+ */ +static unsigned int min_max_freq = 1; + +static unsigned int max_load_scale_factor = 1024; +static unsigned int max_possible_capacity = 1024; + +/* Mask of all CPUs that have max_possible_capacity */ +static cpumask_t mpc_mask = CPU_MASK_ALL; + +/* Window size (in ns) */ +__read_mostly unsigned int walt_ravg_window = 20000000; + +/* Min window size (in ns) = 10ms */ +#define MIN_SCHED_RAVG_WINDOW 10000000 + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +static unsigned int sync_cpu; +static ktime_t ktime_last; +static bool walt_ktime_suspended; + +static unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +void +walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg += p->ravg.demand; +} + +void +walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ + rq->cumulative_runnable_avg -= p->ravg.demand; + BUG_ON((s64)rq->cumulative_runnable_avg < 0); +} + +static void +fixup_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p, s64 task_load_delta) +{ + rq->cumulative_runnable_avg += task_load_delta; + if ((s64)rq->cumulative_runnable_avg < 0) + panic("cra less than zero: tld: %lld, task_load(p) = %u\n", + task_load_delta, task_load(p)); +} + +u64 walt_ktime_clock(void) +{ + if (unlikely(walt_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void walt_resume(void) +{ + walt_ktime_suspended = false; +} + +static int walt_suspend(void) +{ + ktime_last = ktime_get(); + walt_ktime_suspended = true; + return 0; +} + +static struct syscore_ops walt_syscore_ops = { + .resume = walt_resume, + .suspend = walt_suspend +}; + +static int __init walt_init_ops(void) +{ + register_syscore_ops(&walt_syscore_ops); + return 0; +} +late_initcall(walt_init_ops); + +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg += p->ravg.demand; +} + +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + cfs_rq->cumulative_runnable_avg -= p->ravg.demand; +} + +static int exiting_task(struct task_struct *p) +{ + if (p->flags & PF_EXITING) { + if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) { + p->ravg.sum_history[0] = EXITING_TASK_MARKER; + } + return 1; + } + return 0; +} + +static int __init set_walt_ravg_window(char *str) +{ + get_option(&str, &walt_ravg_window); + + walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + return 0; +} + +early_param("walt_ravg_window", set_walt_ravg_window); + +static void +update_window_start(struct rq *rq, u64 wallclock) +{ + s64 delta; + int nr_windows; + + delta = wallclock - rq->window_start; + /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */ + if (delta < 0) { + delta = 0; + WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n"); + } + + if (delta < walt_ravg_window) + return; + + nr_windows = div64_u64(delta, walt_ravg_window); + rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; +} + +static u64 scale_exec_time(u64 delta, struct rq *rq) +{ + unsigned int cur_freq = rq->cur_freq; + int sf; + + if (unlikely(cur_freq > max_possible_freq)) + cur_freq = rq->max_possible_freq; + + /* round up div64 */ + delta = div64_u64(delta * cur_freq + max_possible_freq - 1, + max_possible_freq); + + sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + + delta *= sf; 
+ delta >>= 10; + + return delta; +} + +static int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!walt_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + + +#define WALT_HIGH_IRQ_TIMEOUT 3 + +u64 walt_irqload(int cpu) { + struct rq *rq = cpu_rq(cpu); + s64 delta; + delta = get_jiffies_64() - rq->irqload_ts; + + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. + */ + + if (delta < WALT_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +int walt_cpu_high_irqload(int cpu) { + return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload; +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE || + event == TASK_UPDATE) + return 1; + + /* Only TASK_MIGRATE && PICK_NEXT_TASK left */ + return walt_freq_account_wait_time; +} + +/* + * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, nr_full_windows = 0; + int p_is_curr_task = (p == rq->curr); + u64 mark_start = p->ravg.mark_start; + u64 window_start = rq->window_start; + u32 window_size = walt_ravg_window; + u64 delta; + + new_window = mark_start < window_start; + if (new_window) { + nr_full_windows = div64_u64((window_start - mark_start), + window_size); + if (p->ravg.active_windows < USHRT_MAX) + p->ravg.active_windows++; + } + + /* Handle per-task window rollover. We don't care about the idle + * task or exiting tasks. */ + if (new_window && !is_idle_task(p) && !exiting_task(p)) { + u32 curr_window = 0; + + if (!nr_full_windows) + curr_window = p->ravg.curr_window; + + p->ravg.prev_window = curr_window; + p->ravg.curr_window = 0; + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) { + /* account_busy_for_cpu_time() = 0, so no update to the + * task's current window needs to be made. 
This could be + * for example + * + * - a wakeup event on a task within the current + * window (!new_window below, no action required), + * - switching to a new task from idle (PICK_NEXT_TASK) + * in a new window where irqtime is 0 and we aren't + * waiting on IO */ + + if (!new_window) + return; + + /* A new window has started. The RQ demand must be rolled + * over if p is the current task. */ + if (p_is_curr_task) { + u64 prev_sum = 0; + + /* p is either idle task or an exiting task */ + if (!nr_full_windows) { + prev_sum = rq->curr_runnable_sum; + } + + rq->prev_runnable_sum = prev_sum; + rq->curr_runnable_sum = 0; + } + + return; + } + + if (!new_window) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + rq->curr_runnable_sum += delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window += delta; + + return; + } + + if (!p_is_curr_task) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!exiting_task(p)) + p->ravg.prev_window += delta; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). */ + delta = scale_exec_time(window_size, rq); + if (!exiting_task(p)) + p->ravg.prev_window = delta; + } + rq->prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum += delta; + if (!exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task or exiting tasks need not + * be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. */ + + if (!nr_full_windows) { + /* A full window hasn't elapsed, account partial + * contribution to previous completed window. */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window += delta; + + delta += rq->curr_runnable_sum; + } else { + /* Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). 
*/ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.prev_window = delta; + + } + /* + * Rollover for normal runnable sum is done here by overwriting + * the values in prev_runnable_sum and curr_runnable_sum. + * Rollover for new task runnable sum has completed by previous + * if-else statement. + */ + rq->prev_runnable_sum = delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + rq->curr_runnable_sum = delta; + if (!is_idle_task(p) && !exiting_task(p)) + p->ravg.curr_window = delta; + + return; + } + + if (irqtime) { + /* account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. */ + + BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. */ + rq->prev_runnable_sum = rq->curr_runnable_sum; + if (mark_start > window_start) { + rq->curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + rq->prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. */ + delta = wallclock - window_start; + rq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + + BUG(); +} + +static int account_busy_for_task_demand(struct task_struct *p, int event) +{ + /* No need to bother updating task demand for exiting tasks + * or the idle task. */ + if (exiting_task(p) || is_idle_task(p)) + return 0; + + /* When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. */ + if (event == TASK_WAKE || (!walt_account_wait_time && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. 
+ */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + u32 *hist = &p->ravg.sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand; + u64 sum = 0; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || exiting_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = walt_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + p->ravg.sum = 0; + + if (walt_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (walt_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, walt_ravg_hist_size); + if (walt_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements hmp stats + * avoid decrementing it here again. + */ + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_cumulative_runnable_avg(rq, p, demand); + + p->ravg.demand = demand; + +done: + trace_walt_update_history(rq, p, runtime, samples, event); + return; +} + +static void add_to_task_demand(struct rq *rq, struct task_struct *p, + u64 delta) +{ + delta = scale_exec_time(delta, rq); + p->ravg.sum += delta; + if (unlikely(p->ravg.sum > walt_ravg_window)) + p->ravg.sum = walt_ravg_window; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = p->ravg.mark_start; + * wc = wallclock + * ws = rq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, p->ravg.sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, p->ravg.sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of p->ravg.sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally p->ravg.sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time() + * depends on it! 
+ */ +static void update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + u64 mark_start = p->ravg.mark_start; + u64 delta, window_start = rq->window_start; + int new_window, nr_full_windows; + u32 window_size = walt_ravg_window; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(p, event)) { + if (new_window) + /* If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. */ + update_history(rq, p, p->ravg.sum, 1, event); + return; + } + + if (!new_window) { + /* The simple case - busy time contained within the existing + * window. */ + add_to_task_demand(rq, p, wallclock - mark_start); + return; + } + + /* Busy time spans at least two windows. Temporarily rewind + * window_start to first window boundary after mark_start. */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, p->ravg.sum, 1, event); + if (nr_full_windows) + update_history(rq, p, scale_exec_time(window_size, rq), + nr_full_windows, event); + + /* Roll window_start back to current to process any remainder + * in current window. */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + add_to_task_demand(rq, p, wallclock - mark_start); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + if (walt_disabled || !rq->window_start) + return; + + lockdep_assert_held(&rq->lock); + + update_window_start(rq, wallclock); + + if (!p->ravg.mark_start) + goto done; + + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + +done: + trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime); + + p->ravg.mark_start = wallclock; +} + +unsigned long __weak arch_get_cpu_efficiency(int cpu) +{ + return SCHED_LOAD_SCALE; +} + +void walt_init_cpu_efficiency(void) +{ + int i, efficiency; + unsigned int max = 0, min = UINT_MAX; + + for_each_possible_cpu(i) { + efficiency = arch_get_cpu_efficiency(i); + cpu_rq(i)->efficiency = efficiency; + + if (efficiency > max) + max = efficiency; + if (efficiency < min) + min = efficiency; + } + + if (max) + max_possible_efficiency = max; + + if (min) + min_possible_efficiency = min; +} + +static void reset_task_stats(struct task_struct *p) +{ + u32 sum = 0; + + if (exiting_task(p)) + sum = EXITING_TASK_MARKER; + + memset(&p->ravg, 0, sizeof(struct ravg)); + /* Retain EXITING_TASK marker */ + p->ravg.sum_history[0] = sum; +} + +void walt_mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + + if (!rq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = walt_ktime_clock(); + p->ravg.mark_start = wallclock; +} + +void walt_set_window_start(struct rq *rq) +{ + int cpu = cpu_of(rq); + struct rq *sync_rq = cpu_rq(sync_cpu); + + if (rq->window_start) + return; + + if (cpu == sync_cpu) { + rq->window_start = walt_ktime_clock(); 
+ } else { + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + rq->window_start = cpu_rq(sync_cpu)->window_start; + rq->curr_runnable_sum = rq->prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + rq->curr->ravg.mark_start = rq->window_start; +} + +void walt_migrate_sync_cpu(int cpu) +{ + if (cpu == sync_cpu) + sync_cpu = smp_processor_id(); +} + +void walt_fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + if (exiting_task(p)) { + return; + } + + if (p->state == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + + if (p->ravg.curr_window) { + src_rq->curr_runnable_sum -= p->ravg.curr_window; + dest_rq->curr_runnable_sum += p->ravg.curr_window; + } + + if (p->ravg.prev_window) { + src_rq->prev_runnable_sum -= p->ravg.prev_window; + dest_rq->prev_runnable_sum += p->ravg.prev_window; + } + + if ((s64)src_rq->prev_runnable_sum < 0) { + src_rq->prev_runnable_sum = 0; + WARN_ON(1); + } + if ((s64)src_rq->curr_runnable_sum < 0) { + src_rq->curr_runnable_sum = 0; + WARN_ON(1); + } + + trace_walt_migration_update_sum(src_rq, p); + trace_walt_migration_update_sum(dest_rq, p); + + if (p->state == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +/* + * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that + * least efficient cpu gets capacity of 1024 + */ +static unsigned long capacity_scale_cpu_efficiency(int cpu) +{ + return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; +} + +/* + * Return 'capacity' of a cpu in reference to cpu with lowest max_freq + * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. + */ +static unsigned long capacity_scale_cpu_freq(int cpu) +{ + return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; +} + +/* + * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so + * that "most" efficient cpu gets a load_scale_factor of 1 + */ +static unsigned long load_scale_cpu_efficiency(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_efficiency, + cpu_rq(cpu)->efficiency); +} + +/* + * Return load_scale_factor of a cpu in reference to cpu with best max_freq + * (max_possible_freq), so that one with best max_freq gets a load_scale_factor + * of 1. + */ +static unsigned long load_scale_cpu_freq(int cpu) +{ + return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); +} + +static int compute_capacity(int cpu) +{ + int capacity = 1024; + + capacity *= capacity_scale_cpu_efficiency(cpu); + capacity >>= 10; + + capacity *= capacity_scale_cpu_freq(cpu); + capacity >>= 10; + + return capacity; +} + +static int compute_load_scale_factor(int cpu) +{ + int load_scale = 1024; + + /* + * load_scale_factor accounts for the fact that task load + * is in reference to "best" performing cpu. Task's load will need to be + * scaled (up) by a factor to determine suitability to be placed on a + * (little) cpu. 
+ */ + load_scale *= load_scale_cpu_efficiency(cpu); + load_scale >>= 10; + + load_scale *= load_scale_cpu_freq(cpu); + load_scale >>= 10; + + return load_scale; +} + +static int cpufreq_notifier_policy(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_policy *policy = (struct cpufreq_policy *)data; + int i, update_max = 0; + u64 highest_mpc = 0, highest_mplsf = 0; + const struct cpumask *cpus = policy->related_cpus; + unsigned int orig_min_max_freq = min_max_freq; + unsigned int orig_max_possible_freq = max_possible_freq; + /* Initialized to policy->max in case policy->related_cpus is empty! */ + unsigned int orig_max_freq = policy->max; + + if (val != CPUFREQ_NOTIFY) + return 0; + + for_each_cpu(i, policy->related_cpus) { + cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, + policy->related_cpus); + orig_max_freq = cpu_rq(i)->max_freq; + cpu_rq(i)->min_freq = policy->min; + cpu_rq(i)->max_freq = policy->max; + cpu_rq(i)->cur_freq = policy->cur; + cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; + } + + max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); + if (min_max_freq == 1) + min_max_freq = UINT_MAX; + min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); + BUG_ON(!min_max_freq); + BUG_ON(!policy->max); + + /* Changes to policy other than max_freq don't require any updates */ + if (orig_max_freq == policy->max) + return 0; + + /* + * A changed min_max_freq or max_possible_freq (possible during bootup) + * needs to trigger re-computation of load_scale_factor and capacity for + * all possible cpus (even those offline). It also needs to trigger + * re-computation of nr_big_task count on all online cpus. + * + * A changed rq->max_freq otoh needs to trigger re-computation of + * load_scale_factor and capacity for just the cluster of cpus involved. + * Since small task definition depends on max_load_scale_factor, a + * changed load_scale_factor of one cluster could influence + * classification of tasks in another cluster. Hence a changed + * rq->max_freq will need to trigger re-computation of nr_big_task + * count on all online cpus. + * + * While it should be sufficient for nr_big_tasks to be + * re-computed for only online cpus, we have inadequate context + * information here (in policy notifier) with regard to hotplug-safety + * context in which notification is issued. As a result, we can't use + * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is + * fixed up to issue notification always in hotplug-safe context, + * re-compute nr_big_task for all possible cpus. + */ + + if (orig_min_max_freq != min_max_freq || + orig_max_possible_freq != max_possible_freq) { + cpus = cpu_possible_mask; + update_max = 1; + } + + /* + * Changed load_scale_factor can trigger reclassification of tasks as + * big or small. 
Make this change "atomic" so that tasks are accounted + * properly due to changed load_scale_factor + */ + for_each_cpu(i, cpus) { + struct rq *rq = cpu_rq(i); + + rq->capacity = compute_capacity(i); + rq->load_scale_factor = compute_load_scale_factor(i); + + if (update_max) { + u64 mpc, mplsf; + + mpc = div_u64(((u64) rq->capacity) * + rq->max_possible_freq, rq->max_freq); + rq->max_possible_capacity = (int) mpc; + + mplsf = div_u64(((u64) rq->load_scale_factor) * + rq->max_possible_freq, rq->max_freq); + + if (mpc > highest_mpc) { + highest_mpc = mpc; + cpumask_clear(&mpc_mask); + cpumask_set_cpu(i, &mpc_mask); + } else if (mpc == highest_mpc) { + cpumask_set_cpu(i, &mpc_mask); + } + + if (mplsf > highest_mplsf) + highest_mplsf = mplsf; + } + } + + if (update_max) { + max_possible_capacity = highest_mpc; + max_load_scale_factor = highest_mplsf; + } + + return 0; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->cpu, new_freq = freq->new; + unsigned long flags; + int i; + + if (val != CPUFREQ_POSTCHANGE) + return 0; + + BUG_ON(!new_freq); + + if (cpu_rq(cpu)->cur_freq == new_freq) + return 0; + + for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { + struct rq *rq = cpu_rq(i); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); + rq->cur_freq = new_freq; + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + return 0; +} + +static struct notifier_block notifier_policy_block = { + .notifier_call = cpufreq_notifier_policy +}; + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static int register_sched_callback(void) +{ + int ret; + + ret = cpufreq_register_notifier(¬ifier_policy_block, + CPUFREQ_POLICY_NOTIFIER); + + if (!ret) + ret = cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); + + return 0; +} + +/* + * cpufreq callbacks can be registered at core_initcall or later time. + * Any registration done prior to that is "forgotten" by cpufreq. See + * initialization of variable init_cpufreq_transition_notifier_list_called + * for further information. + */ +core_initcall(register_sched_callback); + +void walt_init_new_task_load(struct task_struct *p) +{ + int i; + u32 init_load_windows = + div64_u64((u64)sysctl_sched_walt_init_task_load_pct * + (u64)walt_ravg_window, 100); + u32 init_load_pct = current->init_load_pct; + + p->init_load_pct = 0; + memset(&p->ravg, 0, sizeof(struct ravg)); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)walt_ravg_window, 100); + } + + p->ravg.demand = init_load_windows; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + p->ravg.sum_history[i] = init_load_windows; +} diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..e181c87a928d --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p); +void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p); +void walt_fixup_busy_time(struct task_struct *p, int new_cpu); +void walt_init_new_task_load(struct task_struct *p); +void walt_mark_task_starting(struct task_struct *p); +void walt_set_window_start(struct rq *rq); +void walt_migrate_sync_cpu(int cpu); +void walt_init_cpu_efficiency(void); +u64 walt_ktime_clock(void); +void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta, + u64 wallclock); + +u64 walt_irqload(int cpu); +int walt_cpu_high_irqload(int cpu); + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { } +static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq, + struct task_struct *p) { } +static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void walt_init_new_task_load(struct task_struct *p) { } +static inline void walt_mark_task_starting(struct task_struct *p) { } +static inline void walt_set_window_start(struct rq *rq) { } +static inline void walt_migrate_sync_cpu(int cpu) { } +static inline void walt_init_cpu_efficiency(void) { } +static inline u64 walt_ktime_clock(void) { return 0; } + +#endif /* CONFIG_SCHED_WALT */ + +extern unsigned int walt_disabled; + +#endif |
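The core of walt.c above is the per-task demand signal maintained by update_history(): the (frequency-scaled) busy time of the last walt_ravg_hist_size windows is kept in ravg.sum_history[], and with the default WINDOW_STATS_MAX_RECENT_AVG policy the demand becomes the larger of the history average and the most recent window. The standalone userspace sketch below is not kernel code; the helper name demand_from_history() and the sample values are made up purely to illustrate how that policy reacts to a bursty window.

/*
 * Userspace illustration of the demand selection used by update_history()
 * under WINDOW_STATS_MAX_RECENT_AVG (the default policy).
 */
#include <stdio.h>

#define HIST_SIZE 5	/* mirrors walt_ravg_hist_size */

/* hist[0] holds the most recent window's scaled busy time. */
static unsigned int demand_from_history(const unsigned int hist[HIST_SIZE])
{
	unsigned long long sum = 0;
	unsigned int avg;
	int i;

	for (i = 0; i < HIST_SIZE; i++)
		sum += hist[i];
	avg = (unsigned int)(sum / HIST_SIZE);

	/*
	 * max(average, most recent window): a burst raises demand
	 * immediately, while the average keeps it from collapsing
	 * after a single quiet window.
	 */
	return avg > hist[0] ? avg : hist[0];
}

int main(void)
{
	/*
	 * Busy time per 20 ms window, in ns: the most recent window had a
	 * 12 ms burst, the four before it roughly 3 ms each.
	 */
	unsigned int hist[HIST_SIZE] = {
		12000000, 3000000, 3000000, 3000000, 3000000
	};

	printf("demand = %u ns\n", demand_from_history(hist));	/* 12000000 */
	return 0;
}

For comparison, WINDOW_STATS_AVG would report roughly 4.8 ms for the same history and WINDOW_STATS_RECENT would report 12 ms outright; the default policy sits between the two, following bursts quickly while decaying only as the history average drops.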