Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile             |    1
-rw-r--r--  kernel/sched/core.c               |  126
-rw-r--r--  kernel/sched/core_ctl.c           |    4
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  |  154
-rw-r--r--  kernel/sched/cpupri.c             |    2
-rw-r--r--  kernel/sched/cputime.c            |    6
-rw-r--r--  kernel/sched/deadline.c           |    2
-rw-r--r--  kernel/sched/debug.c              |    3
-rw-r--r--  kernel/sched/energy.c             |   13
-rw-r--r--  kernel/sched/fair.c               | 1070
-rw-r--r--  kernel/sched/features.h           |   26
-rw-r--r--  kernel/sched/hmp.c                |   40
-rw-r--r--  kernel/sched/rt.c                 |  343
-rw-r--r--  kernel/sched/sched.h              |   69
-rw-r--r--  kernel/sched/stop_task.c          |    3
-rw-r--r--  kernel/sched/tune.c               |  116
-rw-r--r--  kernel/sched/wait.c               |    8
-rw-r--r--  kernel/sched/walt.c               |    7
-rw-r--r--  kernel/sched/walt.h               |    2
19 files changed, 1322 insertions(+), 673 deletions(-)
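
For reference, the core.c hunks below replace open-coded cpu_online()/cpu_active() tests with an is_cpu_allowed() helper that lets bound per-CPU kthreads run on online-but-inactive CPUs. The following is a minimal user-space sketch of that placement rule, with the scheduler types and CPU-state queries stubbed out purely for illustration (the stubs are assumptions, not kernel API):

#include <stdbool.h>

#define PF_KTHREAD 0x00200000	/* same flag value the kernel uses */

/* Illustrative stand-in for struct task_struct and the cpumask helpers. */
struct task_stub {
	unsigned int flags;
	int nr_cpus_allowed;
	unsigned long cpus_allowed;	/* toy affinity mask: one bit per CPU */
};

static bool cpu_online_stub(int cpu)  { (void)cpu; return true; }
static bool cpu_active_stub(int cpu)  { (void)cpu; return true; }

/* A kthread bound to exactly one CPU is treated as a per-CPU kthread. */
static bool is_per_cpu_kthread(const struct task_stub *p)
{
	return (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1;
}

/*
 * Per-CPU kthreads may be placed on !active && online CPUs (e.g. during
 * hotplug); every other task must target an active CPU in its mask.
 */
static bool is_cpu_allowed(const struct task_stub *p, int cpu)
{
	if (!(p->cpus_allowed & (1UL << cpu)))
		return false;
	if (is_per_cpu_kthread(p))
		return cpu_online_stub(cpu);
	return cpu_active_stub(cpu);
}

In the patch itself, __migrate_task() and select_fallback_rq() then call this single predicate instead of carrying separate affinity/online/active checks.
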
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7dde1b9918e4..ea301717538f 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,6 +19,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f6f8bb2f0d95..d28060bc74fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -78,6 +78,7 @@ #include <linux/irq.h> #include <linux/sched/core_ctl.h> #include <linux/cpufreq_times.h> +#include <linux/prefetch.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -97,6 +98,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> +#include "walt.h" ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); @@ -1084,6 +1086,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). + */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -1141,16 +1170,10 @@ struct migration_arg { */ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) { - int src_cpu; - - if (unlikely(!cpu_active(dest_cpu))) - return rq; - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (!is_cpu_allowed(p, dest_cpu)) return rq; - src_cpu = cpu_of(rq); rq = move_queued_task(rq, p, dest_cpu); return rq; @@ -1364,6 +1387,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; perf_event_task_migrate(p); + walt_fixup_busy_time(p, new_cpu); fixup_busy_time(p, new_cpu); } @@ -1648,9 +1672,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) for (;;) { /* Any allowed, online CPU? 
*/ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) - continue; - if (!cpu_active(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; if (cpu_isolated(dest_cpu)) { if (allow_iso) @@ -1989,6 +2011,9 @@ out: bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } #endif /* CONFIG_SMP */ @@ -2129,9 +2154,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, raw_spin_lock(&rq->lock); old_load = task_load(p); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); raw_spin_unlock(&rq->lock); rcu_read_lock(); @@ -2225,6 +2254,12 @@ static void try_to_wake_up_local(struct task_struct *p) update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + cpufreq_update_util(rq, 0); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); note_task_waking(p, wallclock); } @@ -2357,6 +2392,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2641,6 +2677,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; add_new_task_to_grp(p); + walt_init_new_task_load(p); raw_spin_lock_irqsave(&p->pi_lock, flags); p->state = TASK_RUNNING; @@ -2659,6 +2696,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); mark_task_starting(p); + walt_mark_task_starting(p); update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); @@ -3129,6 +3167,23 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + +/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. @@ -3162,6 +3217,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). 
*/ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } @@ -3189,13 +3245,18 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); old_load = task_load(curr); + walt_set_window_start(rq); set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + + cpufreq_update_util(rq, 0); early_notif = early_detection_notify(rq, wallclock); raw_spin_unlock(&rq->lock); @@ -3554,6 +3615,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -3564,6 +3628,7 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); + cpufreq_update_util(rq, 0); if (!is_idle_task(prev) && !prev->on_rq) update_avg_burst(prev); @@ -3582,6 +3647,7 @@ static void __sched notrace __schedule(bool preempt) cpu = cpu_of(rq); } else { update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); + cpufreq_update_util(rq, 0); lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); } @@ -3819,7 +3885,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(prio)) { struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || - (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; queue_flag |= ENQUEUE_REPLENISH; } else @@ -4892,6 +4959,9 @@ again: retval = -EINVAL; } + if (!retval && !(p->flags & PF_KTHREAD)) + cpumask_and(&p->cpus_requested, in_mask, cpu_possible_mask); + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -4991,14 +5061,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, if (len & (sizeof(unsigned long)-1)) return -EINVAL; - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); if (ret == 0) { size_t retlen = min_t(size_t, len, cpumask_size()); - if (copy_to_user(user_mask_ptr, mask, retlen)) + if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen)) ret = -EFAULT; else ret = retlen; @@ -5913,12 +5983,6 @@ int sched_isolate_cpu(int cpu) cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); - /* We cannot isolate ALL cpus in the system */ - if (cpumask_weight(&avail_cpus) == 1) { - ret_code = -EINVAL; - goto out; - } - if (!cpu_online(cpu)) { ret_code = -EINVAL; goto out; @@ -5927,6 +5991,13 @@ int sched_isolate_cpu(int cpu) if (++cpu_isolation_vote[cpu] > 1) goto out; + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + /* * There is a race between watchdog being enabled by hotplug and * core isolation disabling the watchdog. 
When a CPU is hotplugged in @@ -5950,7 +6021,9 @@ int sched_isolate_cpu(int cpu) smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + irq_lock_sparse(); stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); calc_load_migrate(rq); update_max_interval(); @@ -6309,6 +6382,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); set_window_start(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; @@ -6330,6 +6404,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -8315,6 +8390,7 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); + cpumask_copy(¤t->cpus_requested, cpu_possible_mask); sched_init_granularity(); free_cpumask_var(non_isolated_cpus); @@ -8524,6 +8600,11 @@ void __init sched_init(void) } #endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); @@ -9252,8 +9333,9 @@ int sched_rr_handler(struct ctl_table *table, int write, /* make sure that internally we keep jiffies */ /* also, writing zero resets timeslice to default */ if (!ret && write) { - sched_rr_timeslice = sched_rr_timeslice <= 0 ? - RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); + sched_rr_timeslice = + sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : + msecs_to_jiffies(sysctl_sched_rr_timeslice); } mutex_unlock(&mutex); return ret; @@ -9316,7 +9398,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) * This is called before wake_up_new_task(), therefore we really only * have to set its group bits, all the other stuff does not apply. */ -static void cpu_cgroup_fork(struct task_struct *task, void *private) +static void cpu_cgroup_fork(struct task_struct *task) { unsigned long flags; struct rq *rq; diff --git a/kernel/sched/core_ctl.c b/kernel/sched/core_ctl.c index 70cd0649ac9b..2f060a570061 100644 --- a/kernel/sched/core_ctl.c +++ b/kernel/sched/core_ctl.c @@ -22,6 +22,7 @@ #include <linux/sched/rt.h> #include <trace/events/sched.h> +#include "sched.h" #define MAX_CPUS_PER_CLUSTER 4 #define MAX_CLUSTERS 2 @@ -575,7 +576,8 @@ static bool eval_need(struct cluster_data *cluster) cluster->active_cpus = get_active_cpu_count(cluster); thres_idx = cluster->active_cpus ? 
cluster->active_cpus - 1 : 0; list_for_each_entry(c, &cluster->lru, sib) { - if (c->busy >= cluster->busy_up_thres[thres_idx]) + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) c->is_busy = true; else if (c->busy < cluster->busy_down_thres[thres_idx]) c->is_busy = false; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6c84b4d28914..869a125ebb87 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -33,6 +33,7 @@ struct sugov_tunables { struct gov_attr_set attr_set; unsigned int up_rate_limit_us; unsigned int down_rate_limit_us; + bool iowait_boost_enable; }; struct sugov_policy { @@ -81,6 +82,7 @@ struct sugov_cpu { }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); +static DEFINE_PER_CPU(struct sugov_tunables *, cached_tunables); /************************ Governor internals ***********************/ @@ -88,16 +90,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) { s64 delta_ns; - if (sg_policy->work_in_progress) - return false; - if (unlikely(sg_policy->need_freq_update)) { - sg_policy->need_freq_update = false; - /* - * This happens when limits change, so forget the previous - * next_freq value and force an update. - */ - sg_policy->next_freq = UINT_MAX; return true; } @@ -149,7 +142,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else { + } else if (!sg_policy->work_in_progress) { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -186,8 +179,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = (freq + (freq >> 2)) * util / max; - if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; + + sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -228,6 +223,20 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + + if (!sg_policy->tunables->iowait_boost_enable) + return; + + if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU apprears to have been idle. */ + if (delta_ns > TICK_NSEC) { + sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } + } if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -241,14 +250,6 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, } else { sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; } - } else if (sg_cpu->iowait_boost) { - s64 delta_ns = time - sg_cpu->last_update; - - /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) { - sg_cpu->iowait_boost = 0; - sg_cpu->iowait_boost_pending = false; - } } } @@ -305,6 +306,13 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; + /* + * For slow-switch systems, single policy requests can't run at the + * moment if update is in progress, unless we acquire update_lock. 
+ */ + if (sg_policy->work_in_progress) + return; + if (!sugov_should_update_freq(sg_policy, time)) return; @@ -320,7 +328,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq) { + if (busy && next_f < sg_policy->next_freq && + sg_policy->next_freq != UINT_MAX) { next_f = sg_policy->next_freq; /* Reset cached freq as next_freq has changed */ @@ -360,7 +369,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; - if (j_util * max > j_max * util) { + if (j_util * max >= j_max * util) { util = j_util; max = j_max; } @@ -405,13 +414,27 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + unsigned int freq; + unsigned long flags; + + /* + * Hold sg_policy->update_lock shortly to handle the case where: + * incase sg_policy->next_freq is read here, and then updated by + * sugov_update_shared just before work_in_progress is set to false + * here, we may miss queueing the new update. + * + * Note: If a work was queued after the update_lock is released, + * sugov_work will just be called again by kthread_work code; and the + * request will be proceed before the sugov thread sleeps. + */ + raw_spin_lock_irqsave(&sg_policy->update_lock, flags); + freq = sg_policy->next_freq; + sg_policy->work_in_progress = false; + raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags); mutex_lock(&sg_policy->work_lock); - __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, - CPUFREQ_RELATION_L); + __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L); mutex_unlock(&sg_policy->work_lock); - - sg_policy->work_in_progress = false; } static void sugov_irq_work(struct irq_work *irq_work) @@ -510,12 +533,36 @@ static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, return count; } +static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set, + char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->iowait_boost_enable); +} + +static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + bool enable; + + if (kstrtobool(buf, &enable)) + return -EINVAL; + + tunables->iowait_boost_enable = enable; + + return count; +} + static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); +static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable); static struct attribute *sugov_attributes[] = { &up_rate_limit_us.attr, &down_rate_limit_us.attr, + &iowait_boost_enable.attr, NULL }; @@ -610,6 +657,29 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } +static void sugov_tunables_save(struct cpufreq_policy *policy, + struct sugov_tunables *tunables) +{ + int cpu; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!have_governor_per_policy()) + return; + + if (!cached) { + cached = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (!cached) { + pr_warn("Couldn't allocate tunables for caching\n"); + 
return; + } + for_each_cpu(cpu, policy->related_cpus) + per_cpu(cached_tunables, cpu) = cached; + } + + cached->up_rate_limit_us = tunables->up_rate_limit_us; + cached->down_rate_limit_us = tunables->down_rate_limit_us; +} + static void sugov_tunables_free(struct sugov_tunables *tunables) { if (!have_governor_per_policy()) @@ -618,6 +688,25 @@ static void sugov_tunables_free(struct sugov_tunables *tunables) kfree(tunables); } +static void sugov_tunables_restore(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!cached) + return; + + tunables->up_rate_limit_us = cached->up_rate_limit_us; + tunables->down_rate_limit_us = cached->down_rate_limit_us; + sg_policy->up_rate_delay_ns = + tunables->up_rate_limit_us * NSEC_PER_USEC; + sg_policy->down_rate_delay_ns = + tunables->down_rate_limit_us * NSEC_PER_USEC; + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns, + sg_policy->down_rate_delay_ns); +} + static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; @@ -675,9 +764,13 @@ static int sugov_init(struct cpufreq_policy *policy) } } + tunables->iowait_boost_enable = policy->iowait_boost_enable; + policy->governor_data = sg_policy; sg_policy->tunables = tunables; + sugov_tunables_restore(policy); + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, get_governor_parent_kobj(policy), "%s", cpufreq_gov_schedutil.name); @@ -717,8 +810,10 @@ static int sugov_exit(struct cpufreq_policy *policy) count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; - if (!count) + if (!count) { + sugov_tunables_save(policy, tunables); sugov_tunables_free(tunables); + } mutex_unlock(&global_tunables_lock); @@ -740,7 +835,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; update_min_rate_limit_us(sg_policy); sg_policy->last_freq_update_time = 0; - sg_policy->next_freq = UINT_MAX; + sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; @@ -752,6 +847,11 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_DL; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + } + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, policy_is_shared(policy) ? 
sugov_update_shared : diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 14225d5d8617..867cb7877511 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -133,6 +133,8 @@ retry: if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_andnot(lowest_mask, lowest_mask, + cpu_isolated_mask); if (drop_nopreempts) drop_nopreempt_cpus(lowest_mask); /* diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e6ec68c15aa3..cf6729cb46dd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,6 +6,7 @@ #include <linux/context_tracking.h> #include <linux/cpufreq_times.h> #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -79,9 +80,10 @@ void irqtime_account_irq(struct task_struct *curr) irq_time_write_end(); - if (account) + if (account) { + walt_account_irqtime(cpu, curr, delta, wallclock); sched_account_irqtime(cpu, curr, delta, wallclock); - else if (curr != this_cpu_ksoftirqd()) + } else if (curr != this_cpu_ksoftirqd()) sched_account_irqstart(cpu, curr, wallclock); local_irq_restore(flags); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 188c8388a63f..d40995e9cf5f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1038,6 +1038,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); @@ -1053,6 +1054,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_hmp_sched_stats_dl(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index ed8e6bb4531b..5c8e6e37fce7 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -104,7 +104,8 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; } #endif diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c index 50d183b1e156..770624996f9f 100644 --- a/kernel/sched/energy.c +++ b/kernel/sched/energy.c @@ -91,11 +91,17 @@ void init_sched_energy_costs(void) sge = kcalloc(1, sizeof(struct sched_group_energy), GFP_NOWAIT); + if (!sge) + goto out; nstates = (prop->length / sizeof(u32)) / 2; cap_states = kcalloc(nstates, sizeof(struct capacity_state), GFP_NOWAIT); + if (!cap_states) { + kfree(sge); + goto out; + } for (i = 0, val = prop->value; i < nstates; i++) { cap_states[i].cap = be32_to_cpup(val++); @@ -108,6 +114,8 @@ void init_sched_energy_costs(void) prop = of_find_property(cp, "idle-cost-data", NULL); if (!prop || !prop->value) { pr_warn("No idle-cost data, skipping sched_energy init\n"); + kfree(sge); + kfree(cap_states); goto out; } @@ -115,6 +123,11 @@ void init_sched_energy_costs(void) idle_states = kcalloc(nstates, sizeof(struct idle_state), GFP_NOWAIT); + if (!idle_states) { + kfree(sge); + kfree(cap_states); + goto out; + } for (i = 0, val = prop->value; i < nstates; i++) idle_states[i].power = be32_to_cpup(val++); diff --git a/kernel/sched/fair.c 
b/kernel/sched/fair.c index df2e6dd2c665..43c3d2684f64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,12 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -2504,7 +2510,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -3001,6 +3007,8 @@ struct cpu_select_env *env, struct cluster_cpu_stats *stats) int i; struct cpumask search_cpus; + extern int num_clusters; + while (!bitmap_empty(env->backup_list, num_clusters)) { next = next_candidate(env->backup_list, 0, num_clusters); __clear_bit(next->id, env->backup_list); @@ -3024,6 +3032,8 @@ next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env, { struct sched_cluster *next = NULL; + extern int num_clusters; + __clear_bit(cluster->id, env->candidate_list); if (env->rtg && preferred_cluster(cluster, env->p)) @@ -3680,68 +3690,6 @@ static inline int migration_needed(struct task_struct *p, int cpu) return 0; } -static inline int -kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) -{ - unsigned long flags; - int rc = 0; - - /* Invoke active balance to force migrate currently running task */ - raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq->active_balance) { - rq->active_balance = 1; - rq->push_cpu = new_cpu; - get_task_struct(p); - rq->push_task = p; - rc = 1; - } - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return rc; -} - -static DEFINE_RAW_SPINLOCK(migration_lock); - -static bool do_migration(int reason, int new_cpu, int cpu) -{ - if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) - && same_cluster(new_cpu, cpu)) - return false; - - /* Inter cluster high irqload migrations are OK */ - return new_cpu != cpu; -} - -/* - * Check if currently running task should be migrated to a better cpu. - * - * Todo: Effect this via changes to nohz_balancer_kick() and load balance? 
- */ -void check_for_migration(struct rq *rq, struct task_struct *p) -{ - int cpu = cpu_of(rq), new_cpu; - int active_balance = 0, reason; - - reason = migration_needed(p, cpu); - if (!reason) - return; - - raw_spin_lock(&migration_lock); - new_cpu = select_best_cpu(p, cpu, reason, 0); - - if (do_migration(reason, new_cpu, cpu)) { - active_balance = kick_active_balance(rq, p, new_cpu); - if (active_balance) - mark_reserved(new_cpu); - } - - raw_spin_unlock(&migration_lock); - - if (active_balance) - stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, - &rq->active_balance_work); -} - #ifdef CONFIG_CFS_BANDWIDTH static void init_cfs_rq_hmp_stats(struct cfs_rq *cfs_rq) @@ -4175,6 +4123,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -4292,6 +4270,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) */ #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 +#define SKIP_CPUFREQ 0x4 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct sched_entity *se, int flags) @@ -4312,7 +4291,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ)); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -4488,6 +4467,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define SKIP_CPUFREQ 0x0 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -4710,6 +4690,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int update_flags; + /* * Update run-time statistics of the 'current'. */ @@ -4723,7 +4705,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. 
*/ - update_load_avg(se, UPDATE_TG); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -5011,14 +4998,10 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - if (cfs_b->quota == RUNTIME_INF) return; - now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5040,7 +5023,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -5057,61 +5040,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) - cfs_rq->runtime_expires = expires; return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. 
- */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; @@ -5345,8 +5284,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cpu_temp(cpu_of(rq))); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -5367,7 +5305,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -5392,7 +5329,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -5420,8 +5357,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -5434,8 +5369,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -5475,7 +5409,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { struct hrtimer *refresh_timer = &cfs_b->period_timer; - u64 remaining; + s64 remaining; /* if the call-back is running a quota refresh is already occurring */ if (hrtimer_callback_running(refresh_timer)) @@ -5483,7 +5417,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) /* is a quota refresh about to occur? 
*/ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); - if (remaining < min_expire) + if (remaining < (s64)min_expire) return 1; return 0; @@ -5512,8 +5446,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -5545,7 +5478,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -5562,7 +5494,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -5571,11 +5502,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -5673,20 +5603,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. + */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -5894,6 +5832,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. 
+ * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); #endif /* @@ -5919,6 +5876,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; @@ -5927,6 +5885,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -5942,27 +5901,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - - /* - * Update SchedTune accounting. - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - * - * We do it also in the case where we enqueue a throttled task; - * we could argue that a throttled task should not boost a CPU, - * however: - * a) properly implementing CPU boosting considering throttled - * tasks will increase a lot the complexity of the solution - * b) it's not easy to quantify the benefits introduced by - * such a more complex solution. - * Thus, for the time being we go for the simple solution and boost - * also for throttled RQs. - */ - schedtune_enqueue_task(p, cpu_of(rq)); - if (energy_aware() && !se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -5987,6 +5927,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + if (task_sleep && rq->nr_running == 1) + flags |= DEQUEUE_IDLE; + +#ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. 
+ */ + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6000,6 +5954,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ @@ -6018,14 +5973,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { + int update_flags; + cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); update_cfs_shares(se); } @@ -6035,16 +5998,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - - /* - * Update SchedTune accounting - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - */ - schedtune_dequeue_task(p, cpu_of(rq)); - + if (energy_aware() && !se) + walt_dec_cumulative_runnable_avg(rq, p); #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -6457,28 +6412,79 @@ unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * CPU candidates. + * + * These are labels to reference CPU candidates for an energy_diff. + * Currently we support only two possible candidates: the task's previous CPU + * and another candiate CPU. + * More advanced/aggressive EAS selection policies can consider more + * candidates. + */ +#define EAS_CPU_PRV 0 +#define EAS_CPU_NXT 1 +#define EAS_CPU_BKP 2 +#define EAS_CPU_CNT 3 + +/* + * Returns the current capacity of cpu after applying both + * cpu and min freq scaling. + */ +unsigned long capacity_min_of(int cpu) +{ + if (!sched_feat(MIN_CAPACITY_CAPPING)) + return 0; + return arch_scale_cpu_capacity(NULL, cpu) * + arch_scale_min_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +/* + * energy_diff - supports the computation of the estimated energy impact in + * moving a "task"'s "util_delta" between different CPU candidates. + */ struct energy_env { - struct sched_group *sg_top; - struct sched_group *sg_cap; - int cap_idx; + /* Utilization to move */ + struct task_struct *p; int util_delta; - int src_cpu; - int dst_cpu; - int trg_cpu; - int energy; - int payoff; - struct task_struct *task; - struct { - int before; - int after; - int delta; - int diff; - } nrg; + + /* Mask of CPUs candidates to evaluate */ + cpumask_t cpus_mask; + + /* CPU candidates to evaluate */ struct { - int before; - int after; - int delta; - } cap; + + /* CPU ID, must be in cpus_mask */ + int cpu_id; + + /* + * Index (into sched_group_energy::cap_states) of the OPP the + * CPU needs to run at if the task is placed on it. + * This includes the both active and blocked load, due to + * other tasks on this CPU, as well as the task's own + * utilization. 
+ */ + int cap_idx; + int cap; + + /* Estimated system energy */ + unsigned int energy; + + /* Estimated energy variation wrt EAS_CPU_PRV */ + int nrg_delta; + + } cpu[EAS_CPU_CNT]; + + /* + * Index (into energy_env::cpu) of the morst energy efficient CPU for + * the specified energy_env::task + */ + int next_idx; + + /* Support data */ + struct sched_group *sg_top; + struct sched_group *sg_cap; + struct sched_group *sg; }; static int cpu_util_wake(int cpu, struct task_struct *p); @@ -6506,24 +6512,33 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct energy_env *eenv) +static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx) { unsigned long max_util = 0; unsigned long util; int cpu; for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { - util = cpu_util_wake(cpu, eenv->task); + util = cpu_util_wake(cpu, eenv->p); /* * If we are looking at the target CPU specified by the eenv, * then we should add the (estimated) utilization of the task * assuming we will wake it up on that CPU. */ - if (unlikely(cpu == eenv->trg_cpu)) + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) util += eenv->util_delta; max_util = max(max_util, util); + + /* + * Take into account any minimum frequency imposed + * elsewhere which limits the energy states available + * If the MIN_CAPACITY_CAPPING feature is not enabled + * capacity_min_of will return 0 (not capped). + */ + max_util = max(max_util, capacity_min_of(cpu)); + } return max_util; @@ -6541,21 +6556,21 @@ static unsigned long group_max_util(struct energy_env *eenv) * estimate (more busy). */ static unsigned -long group_norm_util(struct energy_env *eenv, struct sched_group *sg) +long group_norm_util(struct energy_env *eenv, int cpu_idx) { - unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long capacity = eenv->cpu[cpu_idx].cap; unsigned long util, util_sum = 0; int cpu; - for_each_cpu(cpu, sched_group_cpus(sg)) { - util = cpu_util_wake(cpu, eenv->task); + for_each_cpu(cpu, sched_group_cpus(eenv->sg)) { + util = cpu_util_wake(cpu, eenv->p); /* * If we are looking at the target CPU specified by the eenv, * then we should add the (estimated) utilization of the task * assuming we will wake it up on that CPU. 
*/ - if (unlikely(cpu == eenv->trg_cpu)) + if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id)) util += eenv->util_delta; util_sum += __cpu_norm_util(util, capacity); @@ -6564,27 +6579,31 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } -static int find_new_capacity(struct energy_env *eenv, - const struct sched_group_energy * const sge) +static int find_new_capacity(struct energy_env *eenv, int cpu_idx) { + const struct sched_group_energy *sge = eenv->sg->sge; int idx, max_idx = sge->nr_cap_states - 1; - unsigned long util = group_max_util(eenv); + unsigned long util = group_max_util(eenv, cpu_idx); /* default is max_cap if we don't find a match */ - eenv->cap_idx = max_idx; + eenv->cpu[cpu_idx].cap_idx = max_idx; + eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap; for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) { - eenv->cap_idx = idx; + /* Keep track of SG's capacity */ + eenv->cpu[cpu_idx].cap_idx = idx; + eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap; break; } } - return eenv->cap_idx; + return eenv->cpu[cpu_idx].cap_idx; } -static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) +static int group_idle_state(struct energy_env *eenv, int cpu_idx) { + struct sched_group *sg = eenv->sg; int i, state = INT_MAX; int src_in_grp, dst_in_grp; long grp_util = 0; @@ -6596,8 +6615,10 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; - src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); - dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id, + sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id, + sched_group_cpus(sg)); if (src_in_grp == dst_in_grp) { /* both CPUs under consideration are in the same group or not in * either group, migration should leave idle state the same. @@ -6610,8 +6631,8 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) * achievable when we move the task. */ for_each_cpu(i, sched_group_cpus(sg)) { - grp_util += cpu_util_wake(i, eenv->task); - if (unlikely(i == eenv->trg_cpu)) + grp_util += cpu_util_wake(i, eenv->p); + if (unlikely(i == eenv->cpu[cpu_idx].cpu_id)) grp_util += eenv->util_delta; } @@ -6647,19 +6668,65 @@ end: } /* - * sched_group_energy(): Computes the absolute energy consumption of cpus - * belonging to the sched_group including shared resources shared only by - * members of the group. Iterates over all cpus in the hierarchy below the - * sched_group starting from the bottom working it's way up before going to - * the next cpu until all cpus are covered at all levels. The current - * implementation is likely to gather the same util statistics multiple times. - * This can probably be done in a faster but more complex way. - * Note: sched_group_energy() may fail when racing with sched_domain updates. + * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg). + * + * This works in iterations to compute the SG's energy for each CPU + * candidate defined by the energy_env's cpu array. + * + * NOTE: in the following computations for busy_energy and idle_energy we do + * not shift by SCHED_CAPACITY_SHIFT in order to reduce rounding errors. 
+ * The required scaling will be performed just one time, by the calling + * functions, once we accumulated the contributons for all the SGs. */ -static int sched_group_energy(struct energy_env *eenv) +static void calc_sg_energy(struct energy_env *eenv) +{ + struct sched_group *sg = eenv->sg; + int busy_energy, idle_energy; + unsigned int busy_power; + unsigned int idle_power; + unsigned long sg_util; + int cap_idx, idle_idx; + int total_energy = 0; + int cpu_idx; + + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + + + if (eenv->cpu[cpu_idx].cpu_id == -1) + continue; + /* Compute ACTIVE energy */ + cap_idx = find_new_capacity(eenv, cpu_idx); + busy_power = sg->sge->cap_states[cap_idx].power; + /* + * in order to calculate cpu_norm_util, we need to know which + * capacity level the group will be at, so calculate that first + */ + sg_util = group_norm_util(eenv, cpu_idx); + + busy_energy = sg_util * busy_power; + + /* Compute IDLE energy */ + idle_idx = group_idle_state(eenv, cpu_idx); + idle_power = sg->sge->idle_states[idle_idx].power; + + idle_energy = SCHED_CAPACITY_SCALE - sg_util; + idle_energy *= idle_power; + + total_energy = busy_energy + idle_energy; + eenv->cpu[cpu_idx].energy += total_energy; + } +} + +/* + * compute_energy() computes the absolute variation in energy consumption by + * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT. + * + * NOTE: compute_energy() may fail when racing with sched_domain updates, in + * which case we abort by returning -EINVAL. + */ +static int compute_energy(struct energy_env *eenv) { struct cpumask visit_cpus; - u64 total_energy = 0; int cpu_count; WARN_ON(!eenv->sg_top->sge); @@ -6701,41 +6768,18 @@ static int sched_group_energy(struct energy_env *eenv) break; do { - unsigned long group_util; - int sg_busy_energy, sg_idle_energy; - int cap_idx, idle_idx; - + eenv->sg_cap = sg; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; - else - eenv->sg_cap = sg; - - cap_idx = find_new_capacity(eenv, sg->sge); - - if (sg->group_weight == 1) { - /* Remove capacity of src CPU (before task move) */ - if (eenv->trg_cpu == eenv->src_cpu && - cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { - eenv->cap.before = sg->sge->cap_states[cap_idx].cap; - eenv->cap.delta -= eenv->cap.before; - } - /* Add capacity of dst CPU (after task move) */ - if (eenv->trg_cpu == eenv->dst_cpu && - cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { - eenv->cap.after = sg->sge->cap_states[cap_idx].cap; - eenv->cap.delta += eenv->cap.after; - } - } - - idle_idx = group_idle_state(eenv, sg); - group_util = group_norm_util(eenv, sg); - - sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power); - sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) - * sg->sge->idle_states[idle_idx].power); - total_energy += sg_busy_energy + sg_idle_energy; + /* + * Compute the energy for all the candidate + * CPUs in the current visited SG. + */ + eenv->sg = sg; + calc_sg_energy(eenv); + /* remove CPUs we have just visited */ if (!sd->child) { /* * cpu_count here is the number of @@ -6776,7 +6820,6 @@ next_cpu: continue; } - eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT; return 0; } @@ -6785,180 +6828,100 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } -static inline unsigned long task_util(struct task_struct *p); - /* - * energy_diff(): Estimate the energy impact of changing the utilization - * distribution. 
eenv specifies the change: utilisation amount, source, and - * destination cpu. Source or destination cpu may be -1 in which case the - * utilization is removed from or added to the system (e.g. task wake-up). If - * both are specified, the utilization is migrated. + * select_energy_cpu_idx(): estimate the energy impact of changing the + * utilization distribution. + * + * The eenv parameter specifies the changes: utilisation amount and a pair of + * possible CPU candidates (the previous CPU and a different target CPU). + * + * This function returns the index of a CPU candidate specified by the + * energy_env which corresponds to the first CPU saving energy. + * Thus, 0 (EAS_CPU_PRV) means that non of the CPU candidate is more energy + * efficient than running on prev_cpu. This is also the value returned in case + * of abort due to error conditions during the computations. + * A value greater than zero means that the first energy-efficient CPU is the + * one represented by eenv->cpu[eenv->next_idx].cpu_id. */ -static inline int __energy_diff(struct energy_env *eenv) +static inline int select_energy_cpu_idx(struct energy_env *eenv) { struct sched_domain *sd; struct sched_group *sg; - int sd_cpu = -1, energy_before = 0, energy_after = 0; - int diff, margin; - - struct energy_env eenv_before = { - .util_delta = task_util(eenv->task), - .src_cpu = eenv->src_cpu, - .dst_cpu = eenv->dst_cpu, - .trg_cpu = eenv->src_cpu, - .nrg = { 0, 0, 0, 0}, - .cap = { 0, 0, 0 }, - .task = eenv->task, - }; + int sd_cpu = -1; + int cpu_idx; + int margin; - if (eenv->src_cpu == eenv->dst_cpu) - return 0; - - sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu; + sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id; sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); - if (!sd) - return 0; /* Error */ + return EAS_CPU_PRV; - sg = sd->groups; + cpumask_clear(&eenv->cpus_mask); + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + int cpu = eenv->cpu[cpu_idx].cpu_id; - do { - if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { - eenv_before.sg_top = eenv->sg_top = sg; + if (cpu < 0) + continue; + cpumask_set_cpu(cpu, &eenv->cpus_mask); + } - if (sched_group_energy(&eenv_before)) - return 0; /* Invalid result abort */ - energy_before += eenv_before.energy; + sg = sd->groups; + do { + /* Skip SGs which do not contains a candidate CPU */ + if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg))) + continue; - /* Keep track of SRC cpu (before) capacity */ - eenv->cap.before = eenv_before.cap.before; - eenv->cap.delta = eenv_before.cap.delta; + eenv->sg_top = sg; + /* energy is unscaled to reduce rounding errors */ + if (compute_energy(eenv) == -EINVAL) + return EAS_CPU_PRV; - if (sched_group_energy(eenv)) - return 0; /* Invalid result abort */ - energy_after += eenv->energy; - } } while (sg = sg->next, sg != sd->groups); - eenv->nrg.before = energy_before; - eenv->nrg.after = energy_after; - eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; - eenv->payoff = 0; -#ifndef CONFIG_SCHED_TUNE - trace_sched_energy_diff(eenv->task, - eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, - eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, - eenv->cap.before, eenv->cap.after, eenv->cap.delta, - eenv->nrg.delta, eenv->payoff); -#endif + /* Scale energy before comparisons */ + for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) + eenv->cpu[cpu_idx].energy >>= SCHED_CAPACITY_SHIFT; + /* - * Dead-zone margin preventing too many migrations. 
+ * Compute the dead-zone margin used to prevent too many task + * migrations with negligible energy savings. + * An energy saving is considered meaningful if it reduces the energy + * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56% */ + margin = eenv->cpu[EAS_CPU_PRV].energy >> 6; - margin = eenv->nrg.before >> 6; /* ~1.56% */ - - diff = eenv->nrg.after - eenv->nrg.before; - - eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; - - return eenv->nrg.diff; -} - -#ifdef CONFIG_SCHED_TUNE - -struct target_nrg schedtune_target_nrg; - -#ifdef CONFIG_CGROUP_SCHEDTUNE -extern bool schedtune_initialized; -#endif /* CONFIG_CGROUP_SCHEDTUNE */ - -/* - * System energy normalization - * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], - * corresponding to the specified energy variation. - */ -static inline int -normalize_energy(int energy_diff) -{ - u32 normalized_nrg; - -#ifdef CONFIG_CGROUP_SCHEDTUNE - /* during early setup, we don't know the extents */ - if (unlikely(!schedtune_initialized)) - return energy_diff < 0 ? -1 : 1 ; -#endif /* CONFIG_CGROUP_SCHEDTUNE */ - -#ifdef CONFIG_SCHED_DEBUG - { - int max_delta; - - /* Check for boundaries */ - max_delta = schedtune_target_nrg.max_power; - max_delta -= schedtune_target_nrg.min_power; - WARN_ON(abs(energy_diff) >= max_delta); - } -#endif - - /* Do scaling using positive numbers to increase the range */ - normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; - - /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_CAPACITY_SHIFT; - - /* Normalize on max energy for target platform */ - normalized_nrg = reciprocal_divide( - normalized_nrg, schedtune_target_nrg.rdiv); - - return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; -} - -static inline int -energy_diff(struct energy_env *eenv) -{ - int boost = schedtune_task_boost(eenv->task); - int nrg_delta; - - /* Conpute "absolute" energy diff */ - __energy_diff(eenv); - - /* Return energy diff when boost margin is 0 */ - if (boost == 0) { - trace_sched_energy_diff(eenv->task, - eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, - eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, - eenv->cap.before, eenv->cap.after, eenv->cap.delta, - 0, -eenv->nrg.diff); - return eenv->nrg.diff; - } - - /* Compute normalized energy diff */ - nrg_delta = normalize_energy(eenv->nrg.diff); - eenv->nrg.delta = nrg_delta; - - eenv->payoff = schedtune_accept_deltas( - eenv->nrg.delta, - eenv->cap.delta, - eenv->task); - - trace_sched_energy_diff(eenv->task, - eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, - eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, - eenv->cap.before, eenv->cap.after, eenv->cap.delta, - eenv->nrg.delta, eenv->payoff); + /* + * By default the EAS_CPU_PRV CPU is considered the most energy + * efficient, with a 0 energy variation. + */ + eenv->next_idx = EAS_CPU_PRV; /* - * When SchedTune is enabled, the energy_diff() function will return - * the computed energy payoff value. 
Since the energy_diff() return - * value is expected to be negative by its callers, this evaluation - * function return a negative value each time the evaluation return a - * positive payoff, which is the condition for the acceptance of - * a scheduling decision + * Compare the other CPU candidates to find a CPU which can be + * more energy efficient then EAS_CPU_PRV */ - return -eenv->payoff; + for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) { + /* Skip not valid scheduled candidates */ + if (eenv->cpu[cpu_idx].cpu_id < 0) + continue; + /* Compute energy delta wrt EAS_CPU_PRV */ + eenv->cpu[cpu_idx].nrg_delta = + eenv->cpu[cpu_idx].energy - + eenv->cpu[EAS_CPU_PRV].energy; + /* filter energy variations within the dead-zone margin */ + if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin) + eenv->cpu[cpu_idx].nrg_delta = 0; + /* update the schedule candidate with min(nrg_delta) */ + if (eenv->cpu[cpu_idx].nrg_delta < + eenv->cpu[eenv->next_idx].nrg_delta) { + eenv->next_idx = cpu_idx; + if (sched_feat(FBT_STRICT_ORDER)) + break; + } + } + + return eenv->next_idx; } -#else /* CONFIG_SCHED_TUNE */ -#define energy_diff(eenv) __energy_diff(eenv) -#endif /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. @@ -7054,12 +7017,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return 1; } -static inline unsigned long task_util(struct task_struct *p) -{ - return p->se.avg.util_avg; -} - -static inline unsigned long boosted_task_util(struct task_struct *task); +static inline unsigned long boosted_task_util(struct task_struct *p); static inline bool __task_fits(struct task_struct *p, int cpu, int util) { @@ -7136,16 +7094,16 @@ schedtune_cpu_margin(unsigned long util, int cpu) } static inline long -schedtune_task_margin(struct task_struct *task) +schedtune_task_margin(struct task_struct *p) { - int boost = schedtune_task_boost(task); + int boost = schedtune_task_boost(p); unsigned long util; long margin; if (boost == 0) return 0; - util = task_util(task); + util = task_util(p); margin = schedtune_margin(util, boost); return margin; @@ -7160,7 +7118,7 @@ schedtune_cpu_margin(unsigned long util, int cpu) } static inline int -schedtune_task_margin(struct task_struct *task) +schedtune_task_margin(struct task_struct *p) { return 0; } @@ -7179,12 +7137,12 @@ boosted_cpu_util(int cpu) } static inline unsigned long -boosted_task_util(struct task_struct *task) +boosted_task_util(struct task_struct *p) { - unsigned long util = task_util(task); - long margin = schedtune_task_margin(task); + unsigned long util = task_util(p); + long margin = schedtune_task_margin(p); - trace_sched_boost_task(task, util, margin); + trace_sched_boost_task(p, util, margin); return util + margin; } @@ -7554,6 +7512,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; unsigned long best_active_util = ULONG_MAX; + unsigned long target_idle_max_spare_cap = 0; int best_idle_cstate = INT_MAX; struct sched_domain *sd; struct sched_group *sg; @@ -7589,7 +7548,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { unsigned long capacity_curr = capacity_curr_of(i); unsigned long capacity_orig = capacity_orig_of(i); - unsigned long wake_util, new_util; + unsigned long wake_util, new_util, min_capped_util; if (!cpu_online(i)) continue; @@ -7611,6 +7570,16 @@ static inline int 
find_best_target(struct task_struct *p, int *backup_cpu, * than the one required to boost the task. */ new_util = max(min_util, new_util); + + /* + * Include minimum capacity constraint: + * new_util contains the required utilization including + * boost. min_capped_util also takes into account a + * minimum capacity cap imposed on the CPU by external + * actors. + */ + min_capped_util = max(new_util, capacity_min_of(i)); + if (new_util > capacity_orig) continue; @@ -7733,6 +7702,12 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, /* Select idle CPU with lower cap_orig */ if (capacity_orig > best_idle_min_cap_orig) continue; + /* Favor CPUs that won't end up running at a + * high OPP. + */ + if ((capacity_orig - min_capped_util) < + target_idle_max_spare_cap) + continue; /* * Skip CPUs in deeper idle state, but only @@ -7746,6 +7721,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, /* Keep track of best idle CPU */ best_idle_min_cap_orig = capacity_orig; + target_idle_max_spare_cap = capacity_orig - + min_capped_util; best_idle_cstate = idle_idx; best_idle_cpu = i; continue; @@ -7776,10 +7753,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, continue; /* Favor CPUs with maximum spare capacity */ - if ((capacity_orig - new_util) < target_max_spare_cap) + if ((capacity_orig - min_capped_util) < + target_max_spare_cap) continue; - target_max_spare_cap = capacity_orig - new_util; + target_max_spare_cap = capacity_orig - min_capped_util; target_capacity = capacity_orig; target_cpu = i; } @@ -7851,9 +7829,11 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) { - struct sched_domain *sd; - int target_cpu = prev_cpu, tmp_target, tmp_backup; bool boosted, prefer_idle; + struct sched_domain *sd; + int target_cpu; + int backup_cpu; + int next_cpu; schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); schedstat_inc(this_rq(), eas_stats.secb_attempts); @@ -7868,7 +7848,6 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } } - rcu_read_lock(); #ifdef CONFIG_CGROUP_SCHEDTUNE boosted = schedtune_task_boost(p) > 0; prefer_idle = schedtune_prefer_idle(p) > 0; @@ -7877,31 +7856,49 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync prefer_idle = 0; #endif - sync_entity_load_avg(&p->se); + rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + if (!sd) { + target_cpu = prev_cpu; + goto unlock; + } + + sync_entity_load_avg(&p->se); + /* Find a cpu with sufficient capacity */ - tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); + next_cpu = find_best_target(p, &backup_cpu, boosted, prefer_idle); + if (next_cpu == -1) { + target_cpu = prev_cpu; + goto unlock; + } - if (!sd) + /* Unconditionally prefer IDLE CPUs for boosted/prefer_idle tasks */ + if ((boosted || prefer_idle) && idle_cpu(next_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); + target_cpu = next_cpu; goto unlock; - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); - schedstat_inc(this_rq(), eas_stats.secb_idle_bt); - goto unlock; - } } - if (target_cpu != prev_cpu) { + target_cpu = prev_cpu; + if (next_cpu != prev_cpu) { int delta = 0; struct energy_env eenv = { + .p = p, 
.util_delta = task_util(p), - .src_cpu = prev_cpu, - .dst_cpu = target_cpu, - .task = p, - .trg_cpu = target_cpu, + /* Task's previous CPU candidate */ + .cpu[EAS_CPU_PRV] = { + .cpu_id = prev_cpu, + }, + /* Main alternative CPU candidate */ + .cpu[EAS_CPU_NXT] = { + .cpu_id = next_cpu, + }, + /* Backup alternative CPU candidate */ + .cpu[EAS_CPU_BKP] = { + .cpu_id = backup_cpu, + }, }; @@ -7914,26 +7911,21 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync if (__cpu_overutilized(prev_cpu, delta)) { schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); + target_cpu = next_cpu; goto unlock; } - if (energy_diff(&eenv) >= 0) { - /* No energy saving for target_cpu, try backup */ - target_cpu = tmp_backup; - eenv.dst_cpu = target_cpu; - eenv.trg_cpu = target_cpu; - if (tmp_backup < 0 || - tmp_backup == prev_cpu || - energy_diff(&eenv) >= 0) { - schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); - target_cpu = prev_cpu; - goto unlock; - } + /* Check if EAS_CPU_NXT is a more energy efficient CPU */ + if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + target_cpu = eenv.cpu[eenv.next_idx].cpu_id; + goto unlock; } - schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); - schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; goto unlock; } @@ -8898,12 +8890,30 @@ redo: if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load / 2) > env->imbalance) + /* + * p is not running task when we goes until here, so if p is one + * of the 2 task in src cpu rq and not the running one, + * that means it is the only task that can be balanced. + * So only when there is other tasks can be balanced or + * there is situation to ignore big task, it is needed + * to skip the task load bigger than 2*imbalance. + */ + if (((cpu_rq(env->src_cpu)->nr_running > 2) || + (env->flags & LBF_IGNORE_BIG_TASKS)) && + ((load / 2) > env->imbalance)) goto next; detach_task(p, env); @@ -9013,6 +9023,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. 
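
The max_t() clamp applied to task_h_load() above deserves a quick illustration: if the per-task load is reported as zero, nothing ever shrinks env->imbalance and the detach loop only stops once loop_max tasks have been pulled. The toy model below (plain C with invented numbers, not the kernel loop) contrasts the unclamped and clamped behaviour.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the detach loop; not the kernel implementation. */
static void detach_tasks_model(unsigned long h_load, bool clamp,
			       unsigned long imbalance, int loop_max)
{
	int detached = 0, loop = 0;
	unsigned long load = (clamp && !h_load) ? 1 : h_load;

	while (loop++ < loop_max && imbalance) {
		if (load / 2 > imbalance)	/* same bail-out as the kernel */
			break;
		detached++;
		imbalance -= (load < imbalance) ? load : imbalance;
	}
	printf("h_load=%lu clamp=%d: detached %d tasks, %lu imbalance left\n",
	       h_load, clamp, detached, imbalance);
}

int main(void)
{
	detach_tasks_model(0, false, 128, 32);	/* no progress, stops only at loop_max */
	detach_tasks_model(0, true, 128, 32);	/* every detach reduces the imbalance  */
	return 0;
}
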
*/ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -9021,9 +9033,10 @@ static void update_blocked_averages(int cpu) true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -9292,6 +9305,9 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity_orig = capacity; + capacity *= arch_scale_max_freq_capacity(sd, cpu); + capacity >>= SCHED_CAPACITY_SHIFT; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; raw_spin_lock_irqsave(&mcc->lock, flags); @@ -10326,6 +10342,17 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a cpu that could + * eventually lead to active_balancing high->low capacity. + * Higher per-cpu capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(i); /* @@ -10393,8 +10420,10 @@ static int need_active_balance(struct lb_env *env) * It's worth migrating the task if the src_cpu's capacity is reduced * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. + * Avoid pulling the CFS task if it is the only task running. */ if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->nr_running > 1) && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) @@ -10536,7 +10565,6 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); @@ -10550,6 +10578,12 @@ more_balance: } /* + * Set loop_max when rq's lock is taken to prevent a race. + */ + env.loop_max = min(sysctl_sched_nr_migrate, + busiest->nr_running); + + /* * cur_ld_moved - load moved in current iteration * ld_moved - cumulative load moved across iterations */ @@ -10627,7 +10661,24 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * dst_cpu is not a valid busiest cpu in the following + * check since load cannot be pulled from dst_cpu to be + * put on dst_cpu. + */ + cpumask_clear_cpu(env.dst_cpu, cpus); + /* + * Go back to "redo" iff the load-balance cpumask + * contains other potential busiest cpus for the + * current sched domain. + */ + if (cpumask_intersects(cpus, sched_domain_span(env.sd))) { + /* + * Now that the check has passed, reenable + * dst_cpu so that load can be calculated on + * it in the redo path. 
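
The arch_scale_max_freq_capacity() factor applied in update_cpu_capacity() above is a plain fixed-point multiply-and-shift. The standalone sketch below (not kernel code; the 60% frequency cap is an invented example) shows how a policy limited below fmax shrinks the capacity the load balancer works with.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1 << SCHED_CAPACITY_SHIFT)

/* Apply a max-frequency scale factor to a CPU capacity value. */
static unsigned long cap_after_freq_limit(unsigned long capacity,
					  unsigned long max_freq_scale)
{
	return (capacity * max_freq_scale) >> SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	/* Assume a policy capped at 60% of fmax: scale ~= 614/1024. */
	unsigned long scale = (60 * SCHED_CAPACITY_SCALE) / 100;

	printf("1024 -> %lu after a 60%% fmax cap\n",
	       cap_after_freq_limit(SCHED_CAPACITY_SCALE, scale));
	return 0;
}
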
+ */ + cpumask_set_cpu(env.dst_cpu, cpus); env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; @@ -11606,6 +11657,92 @@ static void rq_offline_fair(struct rq *rq) unthrottle_offline_cfs_rqs(rq); } +static inline int +kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu) +{ + unsigned long flags; + int rc = 0; + + /* Invoke active balance to force migrate currently running task */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (!rq->active_balance) { + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + rq->push_task = p; + rc = 1; + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return rc; +} + +#ifdef CONFIG_SCHED_HMP +static DEFINE_RAW_SPINLOCK(migration_lock); + +static bool do_migration(int reason, int new_cpu, int cpu) +{ + if ((reason == UP_MIGRATION || reason == DOWN_MIGRATION) + && same_cluster(new_cpu, cpu)) + return false; + + /* Inter cluster high irqload migrations are OK */ + return new_cpu != cpu; +} + +/* + * Check if currently running task should be migrated to a better cpu. + * + * Todo: Effect this via changes to nohz_balancer_kick() and load balance? + */ +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq), new_cpu; + int active_balance = 0, reason; + + reason = migration_needed(p, cpu); + if (!reason) + return; + + raw_spin_lock(&migration_lock); + new_cpu = select_best_cpu(p, cpu, reason, 0); + + if (do_migration(reason, new_cpu, cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + mark_reserved(new_cpu); + } + + raw_spin_unlock(&migration_lock); + + if (active_balance) + stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, rq, + &rq->active_balance_work); +} +#else +void check_for_migration(struct rq *rq, struct task_struct *p) +{ + int new_cpu; + int active_balance; + int cpu = task_cpu(p); + + if (rq->misfit_task) { + if (rq->curr->state != TASK_RUNNING || + rq->curr->nr_cpus_allowed == 1) + return; + + new_cpu = select_energy_cpu_brute(p, cpu, 0); + if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) { + active_balance = kick_active_balance(rq, p, new_cpu); + if (active_balance) + stop_one_cpu_nowait(cpu, + active_load_balance_cpu_stop, + rq, &rq->active_balance_work); + } + } +} +#endif + #endif /* CONFIG_SMP */ /* @@ -11714,7 +11851,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). */ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_class == &fair_sched_class)) return true; return false; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index c30c48fde7e6..c3e301589515 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -78,3 +78,29 @@ SCHED_FEAT(ENERGY_AWARE, true) #else SCHED_FEAT(ENERGY_AWARE, false) #endif + +/* + * Minimum capacity capping. Keep track of minimum capacity factor when + * minimum frequency available to a policy is modified. + * If enabled, this can be used to inform the scheduler about capacity + * restrictions. + */ +SCHED_FEAT(MIN_CAPACITY_CAPPING, true) + +/* + * Enforce the priority of candidates selected by find_best_target() + * ON: If the target CPU saves any energy, use that. + * OFF: Use whichever of target or backup saves most. + */ +SCHED_FEAT(FBT_STRICT_ORDER, true) + +/* + * Apply schedtune boost hold to tasks of all sched classes. 
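
FBT_STRICT_ORDER, defined just above, maps onto the early break in select_energy_cpu_idx() shown earlier. The snippet below is purely illustrative (invented delta values and a flat candidate array rather than the energy_env layout): with strict ordering the first candidate that saves energy wins, otherwise the candidate with the largest saving does.

#include <stdbool.h>
#include <stdio.h>

/* Pick a candidate index from energy deltas relative to the previous CPU. */
static int pick_candidate(const int *nrg_delta, int nr, bool strict_order)
{
	int best = 0;	/* index 0 models EAS_CPU_PRV, whose delta is 0 by definition */
	int i;

	for (i = 1; i < nr; i++) {
		if (nrg_delta[i] < nrg_delta[best]) {
			best = i;
			if (strict_order)	/* first saving candidate wins */
				break;
		}
	}
	return best;
}

int main(void)
{
	/* Invented deltas: the "next" CPU saves a little, the backup saves more. */
	int delta[] = { 0, -5, -20 };

	printf("strict order: candidate %d\n", pick_candidate(delta, 3, true));
	printf("best saving:  candidate %d\n", pick_candidate(delta, 3, false));
	return 0;
}
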
+ * If enabled, schedtune will hold the boost applied to a CPU + * for 50ms regardless of task activation - if the task is + * still running 50ms later, the boost hold expires and schedtune + * boost will expire immediately the task stops. + * If disabled, this behaviour will only apply to tasks of the + * RT class. + */ +SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 5337ac7fcba1..649d6a437a13 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -764,13 +764,16 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ -/* Min window size (in ns) = 10ms */ -#define MIN_SCHED_RAVG_WINDOW 10000000 +/* Min window size (in ns) = 20ms */ +#define MIN_SCHED_RAVG_WINDOW ((20000000 / TICK_NSEC) * TICK_NSEC) /* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) -/* Window size (in ns) */ +/* + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. + */ __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; /* Maximum allowed threshold before freq aggregation must be enabled */ @@ -1616,17 +1619,20 @@ static inline int exiting_task(struct task_struct *p) static int __init set_sched_ravg_window(char *str) { + unsigned int adj_window; unsigned int window_size; get_option(&str, &window_size); - if (window_size < MIN_SCHED_RAVG_WINDOW || - window_size > MAX_SCHED_RAVG_WINDOW) { - WARN_ON(1); - return -EINVAL; - } + /* Adjust for CONFIG_HZ */ + adj_window = (window_size / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < window_size - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + window_size); - sched_ravg_window = window_size; + sched_ravg_window = adj_window; return 0; } @@ -3217,6 +3223,13 @@ void sched_get_cpus_busy(struct sched_load *busy, update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(), 0); + /* + * Ensure that we don't report load for 'cpu' again via the + * cpufreq_update_util path in the window that started at + * rq->window_start + */ + rq->load_reported_window = rq->window_start; + account_load_subtractions(rq); load[i] = rq->prev_runnable_sum; nload[i] = rq->nt_prev_runnable_sum; @@ -3649,6 +3662,13 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) migrate_top_tasks(p, src_rq, dest_rq); + if (!same_freq_domain(new_cpu, task_cpu(p))) { + cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG | + SCHED_CPUFREQ_WALT); + cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG | + SCHED_CPUFREQ_WALT); + } + if (p == src_rq->ed_task) { src_rq->ed_task = NULL; if (!dest_rq->ed_task) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index af21389466b8..9d7f6998edd5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -12,8 +12,10 @@ #include <linux/hrtimer.h> #include "tune.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; +int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); @@ -1437,6 +1439,25 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) } /* + * Keep track of whether each cpu has an RT task that will + * soon schedule on that core. 
The problem this is intended + * to address is that we want to avoid entering a non-preemptible + * softirq handler if we are about to schedule a real-time + * task on that core. Ideally, we could just check whether + * the RT runqueue on that core had a runnable task, but the + * window between choosing to schedule a real-time task + * on a core and actually enqueueing it on that run-queue + * is large enough to lose races at an unacceptably high rate. + * + * This variable attempts to reduce that window by indicating + * when we have decided to schedule an RT task on a core + * but not yet enqueued it. + * This variable is a heuristic only: it is not guaranteed + * to be correct and may be updated without synchronization. + */ +DEFINE_PER_CPU(bool, incoming_rt_task); + +/* * Adding/removing a task to/from a priority array: */ static void @@ -1444,14 +1465,20 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; +#ifdef CONFIG_SMP + schedtune_enqueue_task(p, cpu_of(rq)); +#endif + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + *per_cpu_ptr(&incoming_rt_task, cpu_of(rq)) = false; if (!schedtune_task_boost(p)) return; @@ -1485,8 +1512,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; +#ifdef CONFIG_SMP + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); @@ -1539,8 +1571,19 @@ static void yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr, 0); } +/* + * Return whether the given cpu has (or will shortly have) an RT task + * ready to run. NB: This is a heuristic and is subject to races. + */ +bool +cpu_has_rt_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + return rq->rt.rt_nr_running > 0 || per_cpu(incoming_rt_task, cpu); +} + #ifdef CONFIG_SMP -static int find_lowest_rq(struct task_struct *task); +static int find_lowest_rq(struct task_struct *task, int sync); #ifdef CONFIG_SCHED_HMP static int @@ -1549,7 +1592,7 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) int target; rcu_read_lock(); - target = find_lowest_rq(p); + target = find_lowest_rq(p, 0); if (target != -1) cpu = target; rcu_read_unlock(); @@ -1561,8 +1604,10 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) /* * Return whether the task on the given cpu is currently non-preemptible * while handling a potentially long softint, or if the task is likely - * to block preemptions soon because it is a ksoftirq thread that is - * handling slow softints. + * to block preemptions soon because (a) it is a ksoftirq thread that is + * handling slow softints, (b) it is idle and therefore likely to start + * processing the irq's immediately, (c) the cpu is currently handling + * hard irq's and will soon move on to the softirq handler. 
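
The conditions listed in points (a)-(c) can be summarised as a single predicate. The sketch below is only a model outside the kernel: the booleans stand in for the LONG_SOFTIRQ_MASK test and for the HARDIRQ_MASK/SOFTIRQ_MASK bits of preempt_count that the real code inspects.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of the predicate; the kernel checks preempt_count bits. */
static bool may_not_preempt(bool long_softirq_pending, bool curr_is_ksoftirqd,
			    bool curr_is_idle, bool in_hard_or_soft_irq)
{
	return long_softirq_pending &&
	       (curr_is_ksoftirqd || curr_is_idle || in_hard_or_soft_irq);
}

int main(void)
{
	/* Idle CPU with a slow softirq pending: avoid queueing the RT task there. */
	printf("%d\n", may_not_preempt(true, false, true, false));
	/* No long softirq pending: the RT task can preempt right away. */
	printf("%d\n", may_not_preempt(false, false, false, true));
	return 0;
}
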
*/ bool task_may_not_preempt(struct task_struct *task, int cpu) @@ -1572,8 +1617,9 @@ task_may_not_preempt(struct task_struct *task, int cpu) struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu); return ((softirqs & LONG_SOFTIRQ_MASK) && - (task == cpu_ksoftirqd || - task_thread_info(task)->preempt_count & SOFTIRQ_MASK)); + (task == cpu_ksoftirqd || is_idle_task(task) || + (task_thread_info(task)->preempt_count + & (HARDIRQ_MASK | SOFTIRQ_MASK)))); } /* @@ -1606,9 +1652,11 @@ static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, int sibling_count_hint) { - struct task_struct *curr; + struct task_struct *curr, *tgt_task; struct rq *rq; bool may_not_preempt; + int target; + int sync = flags & WF_SYNC; #ifdef CONFIG_SCHED_HMP return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); @@ -1623,58 +1671,28 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, rcu_read_lock(); curr = READ_ONCE(rq->curr); /* unlocked access */ - /* - * If the current task on @p's runqueue is a softirq task, - * it may run without preemption for a time that is - * ill-suited for a waiting RT task. Therefore, try to - * wake this RT task on another runqueue. - * - * Also, if the current task on @p's runqueue is an RT task, then - * it may run without preemption for a time that is - * ill-suited for a waiting RT task. Therefore, try to - * wake this RT task on another runqueue. - * - * Also, if the current task on @p's runqueue is an RT task, then - * try to see if we can wake this RT task up on another - * runqueue. Otherwise simply start this RT task - * on its current runqueue. - * - * We want to avoid overloading runqueues. If the woken - * task is a higher priority, then it will stay on this CPU - * and the lower prio task should be moved to another CPU. - * Even though this will probably make the lower prio task - * lose its cache, we do not want to bounce a higher task - * around just because it gave up its CPU, perhaps for a - * lock? - * - * For equal prio tasks, we just let the scheduler sort it out. - * - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - * - * This test is optimistic, if we get it wrong the load-balancer - * will have to sort it out. - */ may_not_preempt = task_may_not_preempt(curr, cpu); - if (may_not_preempt || - (unlikely(rt_task(curr)) && - (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio))) { - int target = find_lowest_rq(p); + target = find_lowest_rq(p, sync); - /* - * If cpu is non-preemptible, prefer remote cpu - * even if it's running a higher-prio task. - * Otherwise: Don't bother moving it if the - * destination CPU is not running a lower priority task. - */ - if (target != -1 && - (may_not_preempt || - p->prio < cpu_rq(target)->rt.highest_prio.curr)) - cpu = target; + /* + * Check once for losing a race with the other core's irq handler. + * This does not happen frequently, but it can avoid delaying + * the execution of the RT task in those cases. + */ + if (target != -1) { + tgt_task = READ_ONCE(cpu_rq(target)->curr); + if (task_may_not_preempt(tgt_task, target)) + target = find_lowest_rq(p, sync); } + /* + * Possible race. Don't bother moving it if the + * destination CPU is not running a lower priority task. 
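
One subtlety worth spelling out: in the kernel's internal scale a numerically smaller prio means a higher priority, so p->prio < cpu_rq(target)->rt.highest_prio.curr reads "p outranks everything runnable on the target". The helper below is purely illustrative (its name and inputs are invented) and restates the decision made by that check.

#include <stdbool.h>
#include <stdio.h>

/* Lower numeric prio values mean higher priority for RT tasks. */
static bool worth_moving(bool curr_may_not_preempt, int p_prio,
			 int target_highest_prio)
{
	/* A non-preemptible current CPU justifies moving regardless. */
	if (curr_may_not_preempt)
		return true;
	/* Otherwise only move if the target runs strictly lower-priority work. */
	return p_prio < target_highest_prio;
}

int main(void)
{
	printf("%d\n", worth_moving(false, 10, 50));	/* move: target is less urgent */
	printf("%d\n", worth_moving(false, 50, 10));	/* stay: target already busier */
	return 0;
}
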
+ */ + if (target != -1 && + (may_not_preempt || p->prio < cpu_rq(target)->rt.highest_prio.curr)) + cpu = target; + *per_cpu_ptr(&incoming_rt_task, cpu) = true; rcu_read_unlock(); - out: /* * If previous CPU was different, make sure to cancel any active @@ -1718,7 +1736,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) requeue_task_rt(rq, p, 1); resched_curr(rq); } - #endif /* CONFIG_SMP */ /* @@ -1982,12 +1999,108 @@ retry: } #endif /* CONFIG_SCHED_HMP */ -static int find_lowest_rq(struct task_struct *task) +static int find_best_rt_target(struct task_struct* task, int cpu, + struct cpumask* lowest_mask, + bool boosted, bool prefer_idle) { + int iter_cpu; + int target_cpu = -1; + int boosted_cpu = -1; + int backup_cpu = -1; + int boosted_orig_capacity = capacity_orig_of(0); + int backup_capacity = 0; + int best_idle_cpu = -1; + unsigned long target_util = 0; + unsigned long new_util; + /* We want to elect the best one based on task class, + * idleness, and utilization. + */ + for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { + int cur_capacity; + /* + * Iterate from higher cpus for boosted tasks. + */ + int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; + if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(task))) + continue; + + new_util = cpu_util(i) + task_util(task); + + if (new_util > capacity_orig_of(i)) + continue; + + /* + * Unconditionally favoring tasks that prefer idle cpus to + * improve latency. + */ + if (idle_cpu(i) && prefer_idle + && cpumask_test_cpu(i, lowest_mask) && best_idle_cpu < 0) { + best_idle_cpu = i; + continue; + } + + if (cpumask_test_cpu(i, lowest_mask)) { + /* Bias cpu selection towards cpu with higher original + * capacity if task is boosted. + * Assumption: Higher cpus are exclusively alloted for + * boosted tasks. 
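
The fit test above is a simple headroom check: a candidate is skipped when its current utilization plus the waking task's utilization would exceed its original capacity. The sketch below uses invented capacity and utilization numbers for a little/big CPU pair.

#include <stdbool.h>
#include <stdio.h>

/* A CPU fits the task if its utilization plus the task's stays under capacity_orig. */
static bool rt_candidate_fits(unsigned long cpu_util, unsigned long task_util,
			      unsigned long capacity_orig)
{
	return cpu_util + task_util <= capacity_orig;
}

int main(void)
{
	/* Little CPU (capacity 430) already half busy vs. a 300-unit task. */
	printf("little: %d\n", rt_candidate_fits(215, 300, 430));
	/* Big CPU (capacity 1024) easily absorbs the same task. */
	printf("big:    %d\n", rt_candidate_fits(215, 300, 1024));
	return 0;
}
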
+ */ + if (boosted && boosted_cpu < 0 + && boosted_orig_capacity < capacity_orig_of(i)) { + boosted_cpu = i; + boosted_orig_capacity = capacity_orig_of(i); + } + cur_capacity = capacity_curr_of(i); + if (new_util < cur_capacity && cpu_rq(i)->nr_running) { + if(!boosted) { + /* Find a target cpu with highest utilization.*/ + if (target_util < new_util) { + target_cpu = i; + target_util = new_util; + } + } else { + if (target_util == 0 || target_util > new_util) { + /* Find a target cpu with lowest utilization.*/ + target_cpu = i; + target_util = new_util; + } + } + } else if (backup_capacity == 0 || backup_capacity < cur_capacity) { + /* Select a backup CPU with highest capacity.*/ + backup_capacity = cur_capacity; + backup_cpu = i; + } + } + } + + if (boosted && boosted_cpu >=0 && boosted_cpu > best_idle_cpu) + target_cpu = boosted_cpu; + else if (prefer_idle && best_idle_cpu >= 0) + target_cpu = best_idle_cpu; + + if (target_cpu < 0) { + if (backup_cpu >= 0) + return backup_cpu; + + /* Select current cpu if it is present in the mask.*/ + if (cpumask_test_cpu(cpu, lowest_mask)) + return cpu; + + /* Pick a random cpu from lowest_mask */ + target_cpu = cpumask_any(lowest_mask); + if (target_cpu < nr_cpu_ids) + return target_cpu; + return -1; + } + return target_cpu; +} + +static int find_lowest_rq(struct task_struct *task, int sync) { struct sched_domain *sd; struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + bool boosted, prefer_idle; #ifdef CONFIG_SCHED_HMP return find_lowest_rq_hmp(task); @@ -2000,64 +2113,88 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ + /* Constructing cpumask of lowest priorities */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return -1; /* No targets found */ - /* - * At this point we have built a mask of cpus representing the - * lowest priority tasks in the system. Now we want to elect - * the best one based on our affinity and topology. - * - * We prioritize the last cpu that the task executed on since - * it is most likely cache-hot in that location. + /* Return current cpu if WF_SYNC hint is set and present in + * lowest_mask. Improves data locality. */ - if (cpumask_test_cpu(cpu, lowest_mask)) - return cpu; + if (sysctl_sched_sync_hint_enable && sync) { + cpumask_t search_cpus; + cpumask_and(&search_cpus, tsk_cpus_allowed(task), lowest_mask); + if (cpumask_test_cpu(cpu, &search_cpus)) + return cpu; + } /* - * Otherwise, we consult the sched_domains span maps to figure - * out which cpu is logically closest to our hot cache data. + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. */ - if (!cpumask_test_cpu(this_cpu, lowest_mask)) - this_cpu = -1; /* Skip this_cpu opt if not among lowest */ - - rcu_read_lock(); - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - int best_cpu; - /* - * "this_cpu" is cheaper to preempt than a - * remote processor. - */ - if (this_cpu != -1 && - cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { - rcu_read_unlock(); - return this_cpu; - } + boosted = schedtune_task_boost(task) > 0; + prefer_idle = schedtune_prefer_idle(task) > 0; + if(boosted || prefer_idle) { + return find_best_rt_target(task, cpu, lowest_mask, boosted, prefer_idle); + } else { + /* Now we want to elect the best one based on on our affinity + * and topology. 
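
The sync fast path added above is easiest to see with flat bitmasks instead of struct cpumask. In the model below (not kernel code; the mask values are invented) the waking CPU is returned directly when the WF_SYNC hint is set and that CPU is both allowed for the task and present in the lowest-priority mask.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* CPU masks modeled as plain bitmasks; the kernel uses struct cpumask. */
static int sync_fast_path(bool sync_hint_enabled, bool sync, int waking_cpu,
			  uint64_t allowed_mask, uint64_t lowest_mask)
{
	uint64_t search = allowed_mask & lowest_mask;

	if (sync_hint_enabled && sync && (search & (1ULL << waking_cpu)))
		return waking_cpu;	/* keep the wakee near the waker */

	return -1;			/* fall through to the full search */
}

int main(void)
{
	/* CPU 2 is allowed and currently among the lowest-priority CPUs. */
	printf("%d\n", sync_fast_path(true, true, 2, 0x0f, 0x0c));
	/* Without the sync hint the fast path is skipped. */
	printf("%d\n", sync_fast_path(true, false, 2, 0x0f, 0x0c));
	return 0;
}
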
+ * We prioritize the last cpu that the task executed on since + * it is most likely cache-hot in that location. + */ + struct task_struct* curr; + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + curr = cpu_rq(this_cpu)->curr; + /* Ensuring that boosted/prefer idle + * tasks are not pre-empted even if low + * priority*/ + if (!curr || (schedtune_task_boost(curr) == 0 + && schedtune_prefer_idle(curr) == 0)) { + rcu_read_unlock(); + return this_cpu; + } + } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); - if (best_cpu < nr_cpu_ids) { - rcu_read_unlock(); - return best_cpu; + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) { + curr = cpu_rq(best_cpu)->curr; + /* Ensuring that boosted/prefer idle + * tasks are not pre-empted even if low + * priority*/ + if(!curr || (schedtune_task_boost(curr) == 0 + && schedtune_prefer_idle(curr) == 0)) { + rcu_read_unlock(); + return best_cpu; + } + } } } - } - rcu_read_unlock(); + rcu_read_unlock(); - /* - * And finally, if there were no matches within the domains - * just give the caller *something* to work with from the compatible - * locations. - */ - if (this_cpu != -1) - return this_cpu; + /* And finally, if there were no matches within the domains just + * give the caller *something* to work with from the compatible + * locations. + */ + if (this_cpu != -1) + return this_cpu; - cpu = cpumask_any(lowest_mask); - if (cpu < nr_cpu_ids) - return cpu; - return -1; + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; + } } /* Will lock the rq it finds */ @@ -2068,7 +2205,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) int cpu; for (tries = 0; tries < RT_MAX_TRIES; tries++) { - cpu = find_lowest_rq(task); + cpu = find_lowest_rq(task, 0); if ((cpu == -1) || (cpu == rq->cpu)) break; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 07a3cd3c6fbc..fa4d0ab014b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -33,8 +33,10 @@ extern long calc_load_fold_active(struct rq *this_rq); #ifdef CONFIG_SMP extern void update_cpu_load_active(struct rq *this_rq); +extern void check_for_migration(struct rq *rq, struct task_struct *p); #else static inline void update_cpu_load_active(struct rq *this_rq) { } +static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } #endif /* @@ -226,9 +228,8 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchical_quota; - u64 runtime_expires; - int idle, period_active; + short idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -430,7 +431,6 @@ struct related_thread_group { }; extern struct list_head cluster_head; -extern int num_clusters; extern struct sched_cluster *sched_cluster[NR_CPUS]; struct cpu_cycle { @@ -441,6 +441,7 @@ struct cpu_cycle { #define for_each_sched_cluster(cluster) \ list_for_each_entry_rcu(cluster, &cluster_head, list) +extern unsigned int sched_disable_window_stats; #endif /* CONFIG_SCHED_HMP */ /* CFS-related fields in a runqueue */ @@ -511,6 +512,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this 
runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_SCHED_HMP @@ -518,7 +523,6 @@ struct cfs_rq { #endif int runtime_enabled; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock, throttled_clock_task; @@ -793,6 +797,7 @@ struct rq { int cstate, wakeup_latency, wakeup_energy; u64 window_start; + u64 load_reported_window; unsigned long hmp_flags; u64 cur_irqload; @@ -818,6 +823,7 @@ struct rq { #endif #ifdef CONFIG_SCHED_WALT + unsigned int cur_freq; u64 cumulative_runnable_avg; u64 window_start; u64 curr_runnable_sum; @@ -1466,7 +1472,6 @@ static inline bool is_short_burst_task(struct task_struct *p) p->ravg.avg_sleep_time > sysctl_sched_short_sleep; } -extern void check_for_migration(struct rq *rq, struct task_struct *p); extern void pre_big_task_count_change(const struct cpumask *cpus); extern void post_big_task_count_change(const struct cpumask *cpus); extern void set_hmp_defaults(void); @@ -1726,7 +1731,6 @@ static inline int same_freq_domain(int src_cpu, int dst_cpu) return 1; } -static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } static inline void pre_big_task_count_change(void) { } static inline void post_big_task_count_change(void) { } static inline void set_hmp_defaults(void) { } @@ -1853,7 +1857,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; @@ -2026,6 +2030,7 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 @@ -2352,6 +2357,26 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_max_freq_capacity +static __always_inline +unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +#ifndef arch_scale_min_freq_capacity +static __always_inline +unsigned long arch_scale_min_freq_capacity(struct sched_domain *sd, int cpu) +{ + /* + * Multiplied with any capacity value, this scale factor will return + * 0, which represents an un-capped state + */ + return 0; +} +#endif + #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -2378,6 +2403,19 @@ extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int walt_ravg_window; extern bool walt_disabled; +static inline unsigned long task_util(struct task_struct *p) +{ + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_task_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif + return p->se.avg.util_avg; +} + + /* * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. 
The unit of the return value must be the one of capacity so we can @@ -2852,7 +2890,22 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); +#ifdef CONFIG_SCHED_HMP + /* + * Skip if we've already reported, but not if this is an inter-cluster + * migration. Also only allow WALT update sites. + */ + if (!(flags & SCHED_CPUFREQ_WALT)) + return; + if (!sched_disable_window_stats && + (rq->load_reported_window == rq->window_start) && + !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG)) + return; + rq->load_reported_window = rq->window_start; +#endif + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, rq_clock(rq), flags); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 3278c81cefb1..0fa11d86599e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -78,6 +79,7 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); inc_hmp_sched_stats_stop(rq, p); } @@ -85,6 +87,7 @@ static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); dec_hmp_sched_stats_stop(rq, p); } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b84d13750604..728553403c2b 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -17,8 +17,11 @@ bool schedtune_initialized = false; unsigned int sysctl_sched_cfs_boost __read_mostly; +/* We hold schedtune boost in effect for at least this long */ +#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL + extern struct reciprocal_value schedtune_spc_rdiv; -extern struct target_nrg schedtune_target_nrg; +struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ static int perf_boost_idx; @@ -240,7 +243,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, * implementation especially for the computation of the per-CPU boost * value */ -#define BOOSTGROUPS_COUNT 5 +#define BOOSTGROUPS_COUNT 6 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { @@ -260,11 +263,14 @@ struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ bool idle; int boost_max; + u64 boost_ts; struct { /* The boost for tasks on that boost group */ int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; + /* Timestamp of boost activation */ + u64 ts; } group[BOOSTGROUPS_COUNT]; /* CPU's boost group locking */ raw_spinlock_t lock; @@ -388,32 +394,52 @@ static inline void init_sched_boost(struct schedtune *st) { } #endif /* CONFIG_SCHED_HMP */ +static inline bool schedtune_boost_timeout(u64 now, u64 ts) +{ + return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS); +} + +static inline bool +schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now) +{ + if (bg->group[idx].tasks) + return true; + + return !schedtune_boost_timeout(now, bg->group[idx].ts); +} + static void -schedtune_cpu_update(int cpu) +schedtune_cpu_update(int cpu, u64 now) { struct boost_groups *bg; - int boost_max; + u64 boost_ts = now; + int boost_max = INT_MIN; int idx; bg = &per_cpu(cpu_boost_groups, cpu); - /* The root boost group is always active */ - boost_max = bg->group[0].boost; - for (idx = 1; idx < BOOSTGROUPS_COUNT; 
++idx) { + for (idx = 0; idx < BOOSTGROUPS_COUNT; ++idx) { /* * A boost group affects a CPU only if it has - * RUNNABLE tasks on that CPU + * RUNNABLE tasks on that CPU or it has hold + * in effect from a previous task. */ - if (bg->group[idx].tasks == 0) + if (!schedtune_boost_group_active(idx, bg, now)) + continue; + + /* this boost group is active */ + if (boost_max > bg->group[idx].boost) continue; - boost_max = max(boost_max, bg->group[idx].boost); + boost_max = bg->group[idx].boost; + boost_ts = bg->group[idx].ts; } - /* Ensures boost_max is non-negative when all cgroup boost values - * are neagtive. Avoids under-accounting of cpu capacity which may cause - * task stacking and frequency spikes.*/ - boost_max = max(boost_max, 0); + + /* If there are no active boost groups on the CPU, set no boost */ + if (boost_max == INT_MIN) + boost_max = 0; bg->boost_max = boost_max; + bg->boost_ts = boost_ts; } static int @@ -423,6 +449,7 @@ schedtune_boostgroup_update(int idx, int boost) int cur_boost_max; int old_boost; int cpu; + u64 now; /* Update per CPU boost groups */ for_each_possible_cpu(cpu) { @@ -439,16 +466,22 @@ schedtune_boostgroup_update(int idx, int boost) /* Update the boost value of this boost group */ bg->group[idx].boost = boost; - /* Check if this update increase current max */ - if (boost > cur_boost_max && bg->group[idx].tasks) { + now = sched_clock_cpu(cpu); + /* + * Check if this update increase current max. + */ + if (boost > cur_boost_max && + schedtune_boost_group_active(idx, bg, now)) { bg->boost_max = boost; + bg->boost_ts = bg->group[idx].ts; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ if (cur_boost_max == old_boost && old_boost > boost) { - schedtune_cpu_update(cpu); + schedtune_cpu_update(cpu, now); trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); continue; } @@ -462,21 +495,38 @@ schedtune_boostgroup_update(int idx, int boost) #define ENQUEUE_TASK 1 #define DEQUEUE_TASK -1 +static inline bool +schedtune_update_timestamp(struct task_struct *p) +{ + if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL)) + return true; + + return task_has_rt_policy(p); +} + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); int tasks = bg->group[idx].tasks + task_count; + u64 now; /* Update boosted tasks count while avoiding to make it negative */ bg->group[idx].tasks = max(0, tasks); + /* Update timeout on enqueue */ + if (task_count > 0) { + now = sched_clock_cpu(cpu); + if (schedtune_update_timestamp(p)) + bg->group[idx].ts = now; + + /* Boost group activation or deactivation on that RQ */ + if (bg->group[idx].tasks == 1) + schedtune_cpu_update(cpu, now); + } trace_sched_tune_tasks_update(p, cpu, tasks, idx, - bg->group[idx].boost, bg->boost_max); - - /* Boost group activation or deactivation on that RQ */ - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + bg->group[idx].boost, bg->boost_max, + bg->group[idx].ts); } /* @@ -529,6 +579,7 @@ int schedtune_can_attach(struct cgroup_taskset *tset) int src_bg; /* Source boost group index */ int dst_bg; /* Destination boost group index */ int tasks; + u64 now; if (!unlikely(schedtune_initialized)) return 0; @@ -574,18 +625,19 @@ int schedtune_can_attach(struct cgroup_taskset *tset) * current boost group. 
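
The hold logic used throughout this file boils down to two small predicates, sketched below outside the kernel (the timestamps are invented): a boost group keeps contributing to a CPU's boost_max while it still has runnable tasks, or while less than SCHEDTUNE_BOOST_HOLD_NS (50ms) has passed since its last qualifying enqueue.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCHEDTUNE_BOOST_HOLD_NS	50000000ULL	/* 50ms */

static bool boost_timeout(uint64_t now, uint64_t ts)
{
	return (now - ts) > SCHEDTUNE_BOOST_HOLD_NS;
}

/* A group stays active while it has runnable tasks or the hold has not expired. */
static bool boost_group_active(unsigned int tasks, uint64_t now, uint64_t ts)
{
	return tasks > 0 || !boost_timeout(now, ts);
}

int main(void)
{
	uint64_t enqueue_ts = 1000000000ULL;	/* last activation at t = 1s */

	/* 20ms after the last task left: the hold keeps the boost in effect. */
	printf("%d\n", boost_group_active(0, enqueue_ts + 20000000ULL, enqueue_ts));
	/* 80ms later the hold has expired and the group stops contributing. */
	printf("%d\n", boost_group_active(0, enqueue_ts + 80000000ULL, enqueue_ts));
	return 0;
}
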
*/ + now = sched_clock_cpu(cpu); + /* Move task from src to dst boost group */ tasks = bg->group[src_bg].tasks - 1; bg->group[src_bg].tasks = max(0, tasks); bg->group[dst_bg].tasks += 1; + bg->group[dst_bg].ts = now; + + /* update next time someone asks */ + bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS; raw_spin_unlock(&bg->lock); unlock_rq_of(rq, task, &irq_flags); - - /* Update CPU boost group */ - if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) - schedtune_cpu_update(task_cpu(task)); - } return 0; @@ -666,8 +718,15 @@ void schedtune_exit_task(struct task_struct *tsk) int schedtune_cpu_boost(int cpu) { struct boost_groups *bg; + u64 now; bg = &per_cpu(cpu_boost_groups, cpu); + now = sched_clock_cpu(cpu); + + /* check to see if we have a hold in effect */ + if (schedtune_boost_timeout(now, bg->boost_ts)) + schedtune_cpu_update(cpu, now); + return bg->boost_max; } @@ -770,6 +829,7 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static void schedtune_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_SCHED_HMP struct task_struct *task; struct cgroup_subsys_state *css; struct schedtune *st; @@ -782,6 +842,7 @@ static void schedtune_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sync_cgroup_colocation(task, colocate); +#endif } static struct cftype files[] = { @@ -829,6 +890,7 @@ schedtune_boostgroup_init(struct schedtune *st) bg = &per_cpu(cpu_boost_groups, cpu); bg->group[st->idx].boost = 0; bg->group[st->idx].tasks = 0; + bg->group[st->idx].ts = 0; } return 0; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f15d6b6a538a..675228037d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -10,6 +10,7 @@ #include <linux/wait.h> #include <linux/hash.h> #include <linux/kthread.h> +#include <linux/poll.h> void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -156,6 +157,13 @@ void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(wait_queue_head_t *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, (void *)(POLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 8d25ffbe4fed..0162ff4647b6 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -55,12 +55,7 @@ __read_mostly unsigned int walt_ravg_window = static unsigned int sync_cpu; static ktime_t ktime_last; -static bool walt_ktime_suspended; - -static unsigned int task_load(struct task_struct *p) -{ - return p->ravg.demand; -} +static __read_mostly bool walt_ktime_suspended; static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) { diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index de7edac43674..34c72a0fcf39 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -54,6 +54,8 @@ static inline void walt_set_window_start(struct rq *rq) { } static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +static inline void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } #define walt_cpu_high_irqload(cpu) false |
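
Finally, the gating added to cpufreq_update_util() under CONFIG_SCHED_HMP can be summarised by the small model below (not kernel code: the flag values are invented and the sched_disable_window_stats check is omitted for brevity): WALT-originated updates fire at most once per window for a given runqueue, except when they are flagged as inter-cluster migrations.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Flag names from the patch; the numeric values here are made up. */
#define SCHED_CPUFREQ_WALT			(1U << 0)
#define SCHED_CPUFREQ_INTERCLUSTER_MIG		(1U << 1)

struct rq_model {
	uint64_t window_start;
	uint64_t load_reported_window;
};

/* Decide whether a governor callback should fire for this rq and flag set. */
static bool should_report(struct rq_model *rq, unsigned int flags)
{
	if (!(flags & SCHED_CPUFREQ_WALT))
		return false;			/* only WALT update sites report */
	if (rq->load_reported_window == rq->window_start &&
	    !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG))
		return false;			/* already reported this window */
	rq->load_reported_window = rq->window_start;
	return true;
}

int main(void)
{
	struct rq_model rq = { .window_start = 20000000, .load_reported_window = 0 };

	printf("%d\n", should_report(&rq, SCHED_CPUFREQ_WALT));	/* 1: first report */
	printf("%d\n", should_report(&rq, SCHED_CPUFREQ_WALT));	/* 0: same window  */
	printf("%d\n", should_report(&rq, SCHED_CPUFREQ_WALT |
					  SCHED_CPUFREQ_INTERCLUSTER_MIG)); /* 1: migration */
	return 0;
}
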