path: root/kernel/sched/hmp.c
Diffstat (limited to 'kernel/sched/hmp.c')
-rw-r--r--  kernel/sched/hmp.c | 386
1 file changed, 179 insertions(+), 207 deletions(-)
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index c1c35bb582fb..968a41e0e81e 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -17,8 +17,6 @@
#include <linux/cpufreq.h>
#include <linux/list_sort.h>
#include <linux/syscore_ops.h>
-#include <linux/of.h>
-#include <linux/sched/core_ctl.h>
#include "sched.h"
@@ -231,52 +229,6 @@ fail:
return ret;
}
-/*
- * It is possible that CPUs of the same micro architecture can have slight
- * difference in the efficiency due to other factors like cache size. The
- * BOOST_ON_BIG policy may not be optimial for such systems. The required
- * boost policy can be specified via device tree to handle this.
- */
-static int __read_mostly sched_boost_policy = SCHED_BOOST_NONE;
-
-/*
- * This should be called after clusters are populated and
- * the respective efficiency values are initialized.
- */
-void init_sched_hmp_boost_policy(void)
-{
- /*
- * Initialize the boost type here if it is not passed from
- * device tree.
- */
- if (sched_boost_policy == SCHED_BOOST_NONE) {
- if (max_possible_efficiency != min_possible_efficiency)
- sched_boost_policy = SCHED_BOOST_ON_BIG;
- else
- sched_boost_policy = SCHED_BOOST_ON_ALL;
- }
-}
-
-void sched_hmp_parse_dt(void)
-{
- struct device_node *sn;
- const char *boost_policy;
-
- if (!sched_enable_hmp)
- return;
-
- sn = of_find_node_by_path("/sched-hmp");
- if (!sn)
- return;
-
- if (!of_property_read_string(sn, "boost-policy", &boost_policy)) {
- if (!strcmp(boost_policy, "boost-on-big"))
- sched_boost_policy = SCHED_BOOST_ON_BIG;
- else if (!strcmp(boost_policy, "boost-on-all"))
- sched_boost_policy = SCHED_BOOST_ON_ALL;
- }
-}
-
unsigned int max_possible_efficiency = 1;
unsigned int min_possible_efficiency = UINT_MAX;
@@ -680,29 +632,6 @@ int __init set_sched_enable_hmp(char *str)
early_param("sched_enable_hmp", set_sched_enable_hmp);
-int got_boost_kick(void)
-{
- int cpu = smp_processor_id();
- struct rq *rq = cpu_rq(cpu);
-
- return test_bit(BOOST_KICK, &rq->hmp_flags);
-}
-
-inline void clear_boost_kick(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- clear_bit(BOOST_KICK, &rq->hmp_flags);
-}
-
-inline void boost_kick(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
- smp_send_reschedule(cpu);
-}
-
/* Clear any HMP scheduler related requests pending from or on cpu */
void clear_hmp_request(int cpu)
{
@@ -840,6 +769,9 @@ min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;
+/* Maximum allowed threshold before freq aggregation must be enabled */
+#define MAX_FREQ_AGGR_THRESH 1000
+
/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;
@@ -919,8 +851,8 @@ static const unsigned int top_tasks_bitmap_size =
* C1 busy time = 5 + 5 + 6 = 16ms
*
*/
-static __read_mostly unsigned int sched_freq_aggregate;
-__read_mostly unsigned int sysctl_sched_freq_aggregate;
+static __read_mostly unsigned int sched_freq_aggregate = 1;
+__read_mostly unsigned int sysctl_sched_freq_aggregate = 1;
unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
static unsigned int __read_mostly sched_freq_aggregate_threshold;
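With sched_freq_aggregate now defaulting to on, the busy time a cluster reports to cpufreq is meant to include a related thread group's demand summed across all CPUs of that cluster (the "C1 busy time = 5 + 5 + 6 = 16ms" example in the comment above). A minimal userspace sketch of that summation, using made-up per-CPU busy times; the real accounting lives in the window-stats code and is not shown in this hunk:

#include <stdio.h>

/* Made-up per-CPU busy time, in ms, for one related thread group on cluster C1. */
static const unsigned int group_busy_ms[] = { 5, 5, 6 };

int main(void)
{
	unsigned int total = 0;
	unsigned int i;

	/* With aggregation enabled, the group's load is summed across the cluster. */
	for (i = 0; i < sizeof(group_busy_ms) / sizeof(group_busy_ms[0]); i++)
		total += group_busy_ms[i];

	printf("C1 busy time reported for the group: %u ms\n", total);	/* 16 */
	return 0;
}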
@@ -937,14 +869,6 @@ unsigned int max_task_load(void)
/* Use this knob to turn on or off HMP-aware task placement logic */
unsigned int __read_mostly sched_enable_hmp;
-/*
- * Scheduler boost is a mechanism to temporarily place tasks on CPUs
- * with higher capacity than those where a task would have normally
- * ended up with their load characteristics. Any entity enabling
- * boost is responsible for disabling it as well.
- */
-unsigned int sysctl_sched_boost;
-
/* A cpu can no longer accommodate more tasks if:
*
* rq->nr_running > sysctl_sched_spill_nr_run ||
@@ -996,6 +920,21 @@ unsigned int __read_mostly sched_downmigrate;
unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
/*
+ * Task groups whose aggregate demand on a cpu is more than
+ * sched_group_upmigrate need to be up-migrated if possible.
+ */
+unsigned int __read_mostly sched_group_upmigrate;
+unsigned int __read_mostly sysctl_sched_group_upmigrate_pct = 100;
+
+/*
+ * Task groups, once up-migrated, will need to drop their aggregate
+ * demand to less than sched_group_downmigrate before they are "down"
+ * migrated.
+ */
+unsigned int __read_mostly sched_group_downmigrate;
+unsigned int __read_mostly sysctl_sched_group_downmigrate_pct = 95;
+
+/*
* The load scale factor of a CPU gets boosted when its max frequency
* is restricted due to which the tasks are migrating to higher capacity
* CPUs early. The sched_upmigrate threshold is auto-upgraded by
@@ -1017,33 +956,46 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
-void update_up_down_migrate(void)
+static void
+_update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
{
- unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
- unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
unsigned int delta;
if (up_down_migrate_scale_factor == 1024)
- goto done;
+ return;
- delta = up_migrate - down_migrate;
+ delta = *up_migrate - *down_migrate;
- up_migrate /= NSEC_PER_USEC;
- up_migrate *= up_down_migrate_scale_factor;
- up_migrate >>= 10;
- up_migrate *= NSEC_PER_USEC;
+ *up_migrate /= NSEC_PER_USEC;
+ *up_migrate *= up_down_migrate_scale_factor;
+ *up_migrate >>= 10;
+ *up_migrate *= NSEC_PER_USEC;
- up_migrate = min(up_migrate, sched_ravg_window);
+ *up_migrate = min(*up_migrate, sched_ravg_window);
- down_migrate /= NSEC_PER_USEC;
- down_migrate *= up_down_migrate_scale_factor;
- down_migrate >>= 10;
- down_migrate *= NSEC_PER_USEC;
+ *down_migrate /= NSEC_PER_USEC;
+ *down_migrate *= up_down_migrate_scale_factor;
+ *down_migrate >>= 10;
+ *down_migrate *= NSEC_PER_USEC;
- down_migrate = min(down_migrate, up_migrate - delta);
-done:
+ *down_migrate = min(*down_migrate, *up_migrate - delta);
+}
+
+static void update_up_down_migrate(void)
+{
+ unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
+ unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
+
+ _update_up_down_migrate(&up_migrate, &down_migrate);
sched_upmigrate = up_migrate;
sched_downmigrate = down_migrate;
+
+ up_migrate = pct_to_real(sysctl_sched_group_upmigrate_pct);
+ down_migrate = pct_to_real(sysctl_sched_group_downmigrate_pct);
+
+ _update_up_down_migrate(&up_migrate, &down_migrate);
+ sched_group_upmigrate = up_migrate;
+ sched_group_downmigrate = down_migrate;
}
void set_hmp_defaults(void)
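To make the refactored arithmetic above concrete, here is a small userspace sketch that walks the same steps for the new group thresholds: convert the percentage tunables to load units, then apply the up_down_migrate_scale_factor adjustment with its hysteresis clamp. The window size, the scale factor, and the assumption that pct_to_real() scales a percentage of the maximum task load are illustrative, not taken from this patch:

#include <stdio.h>

#define NSEC_PER_USEC		1000ULL
#define SCHED_RAVG_WINDOW_NS	10000000ULL	/* assumed 10 ms window */
#define SCALE_FACTOR		2048		/* assumed: max freq restricted */

/* Assumed behaviour of pct_to_real(): a percentage of the maximum task load. */
static unsigned long long pct_to_load(unsigned int pct)
{
	return SCHED_RAVG_WINDOW_NS * pct / 100;
}

static void scale_thresholds(unsigned long long *up, unsigned long long *down)
{
	unsigned long long delta = *up - *down;

	/* Mirror _update_up_down_migrate(): scale in usec, then clamp. */
	*up /= NSEC_PER_USEC;
	*up *= SCALE_FACTOR;
	*up >>= 10;
	*up *= NSEC_PER_USEC;
	if (*up > SCHED_RAVG_WINDOW_NS)
		*up = SCHED_RAVG_WINDOW_NS;

	*down /= NSEC_PER_USEC;
	*down *= SCALE_FACTOR;
	*down >>= 10;
	*down *= NSEC_PER_USEC;
	/* Preserve at least the original up/down gap so groups don't ping-pong. */
	if (*down > *up - delta)
		*down = *up - delta;
}

int main(void)
{
	unsigned long long up   = pct_to_load(100);	/* sysctl_sched_group_upmigrate_pct */
	unsigned long long down = pct_to_load(95);	/* sysctl_sched_group_downmigrate_pct */

	scale_thresholds(&up, &down);
	printf("sched_group_upmigrate   = %llu ns\n", up);	/* 10000000 (clamped) */
	printf("sched_group_downmigrate = %llu ns\n", down);	/* 9500000 */
	return 0;
}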
@@ -1134,82 +1086,6 @@ u64 cpu_load_sync(int cpu, int sync)
return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
}
-static int boost_refcount;
-static DEFINE_SPINLOCK(boost_lock);
-static DEFINE_MUTEX(boost_mutex);
-
-static void boost_kick_cpus(void)
-{
- int i;
-
- for_each_online_cpu(i) {
- if (cpu_capacity(i) != max_capacity)
- boost_kick(i);
- }
-}
-
-int sched_boost(void)
-{
- return boost_refcount > 0;
-}
-
-int sched_set_boost(int enable)
-{
- unsigned long flags;
- int ret = 0;
- int old_refcount;
-
- if (!sched_enable_hmp)
- return -EINVAL;
-
- spin_lock_irqsave(&boost_lock, flags);
-
- old_refcount = boost_refcount;
-
- if (enable == 1) {
- boost_refcount++;
- } else if (!enable) {
- if (boost_refcount >= 1)
- boost_refcount--;
- else
- ret = -EINVAL;
- } else {
- ret = -EINVAL;
- }
-
- if (!old_refcount && boost_refcount)
- boost_kick_cpus();
-
- if (boost_refcount <= 1)
- core_ctl_set_boost(boost_refcount == 1);
- trace_sched_set_boost(boost_refcount);
- spin_unlock_irqrestore(&boost_lock, flags);
-
- return ret;
-}
-
-int sched_boost_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
-{
- int ret;
-
- mutex_lock(&boost_mutex);
- if (!write)
- sysctl_sched_boost = sched_boost();
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret || !write)
- goto done;
-
- ret = (sysctl_sched_boost <= 1) ?
- sched_set_boost(sysctl_sched_boost) : -EINVAL;
-
-done:
- mutex_unlock(&boost_mutex);
- return ret;
-}
-
/*
* Task will fit on a cpu if it's bandwidth consumption on that cpu
* will be less than sched_upmigrate. A big task that was previously
@@ -1219,60 +1095,63 @@ done:
* tasks with load close to the upmigrate threshold
*/
int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
- enum sched_boost_type boost_type)
+ enum sched_boost_policy boost_policy)
{
- int upmigrate;
+ int upmigrate = sched_upmigrate;
if (cpu_capacity(cpu) == max_capacity)
return 1;
- if (boost_type != SCHED_BOOST_ON_BIG) {
+ if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
+ upmigrate = sched_downmigrate;
+
+ if (boost_policy != SCHED_BOOST_ON_BIG) {
if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
upmigrate_discouraged(p))
return 1;
- upmigrate = sched_upmigrate;
- if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
- upmigrate = sched_downmigrate;
-
if (task_load < upmigrate)
return 1;
+ } else {
+ if (task_sched_boost(p) || task_load >= upmigrate)
+ return 0;
+
+ return 1;
}
return 0;
}
-enum sched_boost_type sched_boost_type(void)
-{
- if (sched_boost())
- return sched_boost_policy;
-
- return SCHED_BOOST_NONE;
-}
-
int task_will_fit(struct task_struct *p, int cpu)
{
u64 tload = scale_load_to_cpu(task_load(p), cpu);
- return task_load_will_fit(p, tload, cpu, sched_boost_type());
+ return task_load_will_fit(p, tload, cpu, sched_boost_policy());
}
-int group_will_fit(struct sched_cluster *cluster,
- struct related_thread_group *grp, u64 demand)
+static int
+group_will_fit(struct sched_cluster *cluster, struct related_thread_group *grp,
+ u64 demand, bool group_boost)
{
int cpu = cluster_first_cpu(cluster);
int prev_capacity = 0;
- unsigned int threshold = sched_upmigrate;
+ unsigned int threshold = sched_group_upmigrate;
u64 load;
if (cluster->capacity == max_capacity)
return 1;
+ if (group_boost)
+ return 0;
+
+ if (!demand)
+ return 1;
+
if (grp->preferred_cluster)
prev_capacity = grp->preferred_cluster->capacity;
if (cluster->capacity < prev_capacity)
- threshold = sched_downmigrate;
+ threshold = sched_group_downmigrate;
load = scale_load_to_cpu(demand, cpu);
if (load < threshold)
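The reworked task_load_will_fit() above now picks the hysteresis threshold (sched_downmigrate when the task would move to a lower-capacity CPU) before branching on the boost policy, so it applies under boost-on-big as well. A condensed, self-contained sketch of the resulting decision, with the nice/upmigrate_discouraged overrides folded into one flag and all helpers treated as assumptions:

#include <stdbool.h>
#include <stdio.h>

static bool task_fits(bool cpu_is_max_capacity, bool moving_to_smaller_cpu,
		      bool boost_on_big, bool task_boosted, bool placement_exempt,
		      unsigned long long load,
		      unsigned long long upmigrate, unsigned long long downmigrate)
{
	/* Hysteresis: use the lower threshold when heading to a smaller CPU. */
	unsigned long long threshold = moving_to_smaller_cpu ? downmigrate : upmigrate;

	if (cpu_is_max_capacity)
		return true;			/* the biggest CPUs fit anything */

	if (boost_on_big)
		/* Boosted tasks and big tasks are kept off smaller CPUs. */
		return !task_boosted && load < threshold;

	/* placement_exempt stands in for the nice/upmigrate_discouraged checks. */
	return placement_exempt || load < threshold;
}

int main(void)
{
	/* A boosted task never fits a small CPU while boost-on-big is active. */
	printf("%d\n", task_fits(false, false, true, true, false, 100, 8000, 6000));	/* 0 */
	/* The same task fits once boost ends, because its load is small. */
	printf("%d\n", task_fits(false, false, false, true, false, 100, 8000, 6000));	/* 1 */
	return 0;
}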
@@ -1495,6 +1374,23 @@ void post_big_task_count_change(const struct cpumask *cpus)
DEFINE_MUTEX(policy_mutex);
+unsigned int update_freq_aggregate_threshold(unsigned int threshold)
+{
+ unsigned int old_threshold;
+
+ mutex_lock(&policy_mutex);
+
+ old_threshold = sysctl_sched_freq_aggregate_threshold_pct;
+
+ sysctl_sched_freq_aggregate_threshold_pct = threshold;
+ sched_freq_aggregate_threshold =
+ pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
+
+ mutex_unlock(&policy_mutex);
+
+ return old_threshold;
+}
+
static inline int invalid_value_freq_input(unsigned int *data)
{
if (data == &sysctl_sched_freq_aggregate)
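update_freq_aggregate_threshold() returns the previous percentage, which lets a caller raise the threshold and restore it later. A hypothetical kernel-side caller as a sketch; only update_freq_aggregate_threshold() and MAX_FREQ_AGGR_THRESH come from this patch, the surrounding function names are assumptions:

/* Hypothetical save/restore around a phase that wants the threshold out of the way. */
static unsigned int saved_aggr_thresh_pct;

static void coloc_phase_start(void)
{
	/* The default colocation group created below raises the threshold the same way. */
	saved_aggr_thresh_pct = update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
}

static void coloc_phase_end(void)
{
	update_freq_aggregate_threshold(saved_aggr_thresh_pct);
}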
@@ -1578,7 +1474,9 @@ int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
if (write && (old_val == *data))
goto done;
- if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
+ if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct ||
+ sysctl_sched_group_downmigrate_pct >
+ sysctl_sched_group_upmigrate_pct) {
*data = old_val;
ret = -EINVAL;
goto done;
@@ -3801,13 +3699,13 @@ static void check_for_up_down_migrate_update(const struct cpumask *cpus)
}
/* Return cluster which can offer required capacity for group */
-static struct sched_cluster *
-best_cluster(struct related_thread_group *grp, u64 total_demand)
+static struct sched_cluster *best_cluster(struct related_thread_group *grp,
+ u64 total_demand, bool group_boost)
{
struct sched_cluster *cluster = NULL;
for_each_sched_cluster(cluster) {
- if (group_will_fit(cluster, grp, total_demand))
+ if (group_will_fit(cluster, grp, total_demand, group_boost))
return cluster;
}
@@ -3818,6 +3716,9 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
{
struct task_struct *p;
u64 combined_demand = 0;
+ bool boost_on_big = sched_boost_policy() == SCHED_BOOST_ON_BIG;
+ bool group_boost = false;
+ u64 wallclock;
if (!sysctl_sched_enable_colocation) {
grp->last_update = sched_ktime_clock();
@@ -3825,19 +3726,36 @@ static void _set_preferred_cluster(struct related_thread_group *grp)
return;
}
+ if (list_empty(&grp->tasks))
+ return;
+
+ wallclock = sched_ktime_clock();
+
/*
* wakeup of two or more related tasks could race with each other and
* could result in multiple calls to _set_preferred_cluster being issued
* at same time. Avoid overhead in such cases of rechecking preferred
* cluster
*/
- if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
+ if (wallclock - grp->last_update < sched_ravg_window / 10)
return;
- list_for_each_entry(p, &grp->tasks, grp_list)
+ list_for_each_entry(p, &grp->tasks, grp_list) {
+ if (boost_on_big && task_sched_boost(p)) {
+ group_boost = true;
+ break;
+ }
+
+ if (p->ravg.mark_start < wallclock -
+ (sched_ravg_window * sched_ravg_hist_size))
+ continue;
+
combined_demand += p->ravg.demand;
- grp->preferred_cluster = best_cluster(grp, combined_demand);
+ }
+
+ grp->preferred_cluster = best_cluster(grp,
+ combined_demand, group_boost);
grp->last_update = sched_ktime_clock();
trace_sched_set_preferred_cluster(grp, combined_demand);
}
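The new loop in _set_preferred_cluster() skips tasks whose windowed statistics are stale, so long-idle group members no longer inflate the combined demand. A self-contained sketch of that filter, with the window size and history depth assumed for illustration (the real values are runtime tunables):

#include <stdbool.h>
#include <stdio.h>

#define RAVG_WINDOW_NS	10000000ULL	/* assumed sched_ravg_window */
#define RAVG_HIST_SIZE	5		/* assumed sched_ravg_hist_size */

/*
 * A task contributes to the group's combined demand only if it has run
 * recently enough that its demand statistics are still meaningful.
 */
static bool task_demand_counts(unsigned long long mark_start,
			       unsigned long long wallclock)
{
	return mark_start >= wallclock - RAVG_WINDOW_NS * RAVG_HIST_SIZE;
}

int main(void)
{
	unsigned long long now = 1000000000ULL;	/* 1 s, arbitrary */

	/* Ran within the last 5 windows: counted. */
	printf("%d\n", task_demand_counts(now - 20000000ULL, now));	/* 1 */
	/* Idle for longer than 5 windows: ignored as stale. */
	printf("%d\n", task_demand_counts(now - 60000000ULL, now));	/* 0 */
	return 0;
}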
@@ -3852,6 +3770,8 @@ void set_preferred_cluster(struct related_thread_group *grp)
#define ADD_TASK 0
#define REM_TASK 1
+#define DEFAULT_CGROUP_COLOC_ID 1
+
static inline void free_group_cputime(struct related_thread_group *grp)
{
free_percpu(grp->cpu_time);
@@ -4109,7 +4029,8 @@ static void remove_task_from_group(struct task_struct *p)
raw_spin_unlock(&grp->lock);
- if (empty_group) {
+ /* Reserved groups cannot be destroyed */
+ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
list_del(&grp->list);
call_rcu(&grp->rcu, free_related_thread_group);
}
@@ -4143,20 +4064,25 @@ void add_new_task_to_grp(struct task_struct *new)
{
unsigned long flags;
struct related_thread_group *grp;
- struct task_struct *parent;
+ struct task_struct *leader = new->group_leader;
+ unsigned int leader_grp_id = sched_get_group_id(leader);
- if (!sysctl_sched_enable_thread_grouping)
+ if (!sysctl_sched_enable_thread_grouping &&
+ leader_grp_id != DEFAULT_CGROUP_COLOC_ID)
return;
if (thread_group_leader(new))
return;
- parent = new->group_leader;
+ if (leader_grp_id == DEFAULT_CGROUP_COLOC_ID) {
+ if (!same_schedtune(new, leader))
+ return;
+ }
write_lock_irqsave(&related_thread_group_lock, flags);
rcu_read_lock();
- grp = task_related_thread_group(parent);
+ grp = task_related_thread_group(leader);
rcu_read_unlock();
/*
@@ -4179,6 +4105,47 @@ void add_new_task_to_grp(struct task_struct *new)
write_unlock_irqrestore(&related_thread_group_lock, flags);
}
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore, this colocation group cannot
+ * be destroyed once it has been created. All of this is done as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+ struct related_thread_group *grp = NULL;
+ unsigned long flags;
+
+ grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+ if (IS_ERR(grp)) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
+ write_lock_irqsave(&related_thread_group_lock, flags);
+ list_add(&grp->list, &related_thread_groups);
+ write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+ update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+ return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+ unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+ return sched_set_group_id(p, grp_id);
+}
+#endif
+
int sched_set_group_id(struct task_struct *p, unsigned int group_id)
{
int rc = 0;
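sync_cgroup_colocation() above gives the schedtune side a single call to move a task into or out of the reserved colocation group. A hypothetical caller, shown only as a sketch; the function name and warning are assumptions, not part of this patch:

/* Hypothetical: toggle colocation for one task when its cgroup flag changes. */
static void schedtune_task_colocation_changed(struct task_struct *p, bool colocate)
{
	int err = sync_cgroup_colocation(p, colocate);

	if (err)
		pr_warn("colocation sync failed for pid %d: %d\n", p->pid, err);
}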
@@ -4201,6 +4168,12 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
grp = lookup_related_thread_group(group_id);
if (!grp) {
+ /* This is a reserved id */
+ if (group_id == DEFAULT_CGROUP_COLOC_ID) {
+ rc = -EINVAL;
+ goto done;
+ }
+
grp = alloc_related_thread_group(group_id);
if (IS_ERR(grp)) {
rc = -ENOMEM;
@@ -4210,7 +4183,6 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
list_add(&grp->list, &related_thread_groups);
}
- BUG_ON(!grp);
rc = add_task_to_group(p, grp);
done:
write_unlock(&related_thread_group_lock);
@@ -4459,7 +4431,7 @@ bool early_detection_notify(struct rq *rq, u64 wallclock)
struct task_struct *p;
int loop_max = 10;
- if (!sched_boost() || !rq->cfs.h_nr_running)
+ if (sched_boost_policy() == SCHED_BOOST_NONE || !rq->cfs.h_nr_running)
return 0;
rq->ed_task = NULL;