Diffstat (limited to 'kernel/sched/hmp.c')
| -rw-r--r-- | kernel/sched/hmp.c | 305 |
1 file changed, 186 insertions(+), 119 deletions(-)
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index 301ed592f50f..a8bf39c6d7d7 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -74,11 +74,6 @@ inline void clear_ed_task(struct task_struct *p, struct rq *rq)
         rq->ed_task = NULL;
 }
 
-inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
-{
-        p->last_wake_ts = wallclock;
-}
-
 inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
 {
         p->last_switch_out_ts = wallclock;
@@ -641,14 +636,18 @@ void clear_hmp_request(int cpu)
         clear_boost_kick(cpu);
         clear_reserved(cpu);
         if (rq->push_task) {
+                struct task_struct *push_task = NULL;
+
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->push_task) {
                         clear_reserved(rq->push_cpu);
-                        put_task_struct(rq->push_task);
+                        push_task = rq->push_task;
                         rq->push_task = NULL;
                 }
                 rq->active_balance = 0;
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+                if (push_task)
+                        put_task_struct(push_task);
         }
 }
 
@@ -784,11 +783,12 @@ __read_mostly unsigned int sched_major_task_runtime = 10000000;
 
 static unsigned int sync_cpu;
 
-static LIST_HEAD(related_thread_groups);
+struct related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID];
+static LIST_HEAD(active_related_thread_groups);
 static DEFINE_RWLOCK(related_thread_group_lock);
 
 #define for_each_related_thread_group(grp) \
-        list_for_each_entry(grp, &related_thread_groups, list)
+        list_for_each_entry(grp, &active_related_thread_groups, list)
 
 /*
  * Task load is categorized into buckets for the purpose of top task tracking.
@@ -956,6 +956,16 @@ sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
 
 unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
 
+/*
+ * Scheduler tries to avoid waking up idle CPUs for tasks running
+ * in short bursts. If the task average burst is less than
+ * sysctl_sched_short_burst nanoseconds and it sleeps on an average
+ * for more than sysctl_sched_short_sleep nanoseconds, then the
+ * task is eligible for packing.
+ */
+unsigned int __read_mostly sysctl_sched_short_burst;
+unsigned int __read_mostly sysctl_sched_short_sleep = 1 * NSEC_PER_MSEC;
+
 static void
 _update_up_down_migrate(unsigned int *up_migrate, unsigned int *down_migrate)
 {
@@ -1547,6 +1557,15 @@ void init_new_task_load(struct task_struct *p, bool idle_task)
         INIT_LIST_HEAD(&p->grp_list);
         memset(&p->ravg, 0, sizeof(struct ravg));
         p->cpu_cycles = 0;
+        p->ravg.curr_burst = 0;
+        /*
+         * Initialize the avg_burst to twice the threshold, so that
+         * a task would not be classified as short burst right away
+         * after fork. It takes at least 6 sleep-wakeup cycles for
+         * the avg_burst to go below the threshold.
+         */
+        p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+        p->ravg.avg_sleep_time = 0;
         p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
         p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32), GFP_KERNEL);
 
@@ -1767,20 +1786,20 @@ static int send_notification(struct rq *rq, int check_pred, int check_groups)
                 if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
                         return 0;
         } else {
-                read_lock(&related_thread_group_lock);
+                read_lock_irqsave(&related_thread_group_lock, flags);
                 /*
                  * Protect from concurrent update of rq->prev_runnable_sum and
                  * group cpu load
                  */
-                raw_spin_lock_irqsave(&rq->lock, flags);
+                raw_spin_lock(&rq->lock);
                 if (check_groups)
                         _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
 
                 new_load = rq->prev_runnable_sum + group_load;
                 new_load = freq_policy_load(rq, new_load);
 
-                raw_spin_unlock_irqrestore(&rq->lock, flags);
-                read_unlock(&related_thread_group_lock);
+                raw_spin_unlock(&rq->lock);
+                read_unlock_irqrestore(&related_thread_group_lock, flags);
 
                 cur_freq = load_to_freq(rq, rq->old_busy_time);
                 freq_required = load_to_freq(rq, new_load);
@@ -2733,12 +2752,14 @@ done:
         trace_sched_update_history(rq, p, runtime, samples, event);
 }
 
-static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
+static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
 {
         delta = scale_exec_time(delta, rq);
         p->ravg.sum += delta;
         if (unlikely(p->ravg.sum > sched_ravg_window))
                 p->ravg.sum = sched_ravg_window;
+
+        return delta;
 }
 
 /*
@@ -2791,13 +2812,14 @@ static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
  * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
  * depends on it!
  */
-static void update_task_demand(struct task_struct *p, struct rq *rq,
+static u64 update_task_demand(struct task_struct *p, struct rq *rq,
                                int event, u64 wallclock)
 {
         u64 mark_start = p->ravg.mark_start;
         u64 delta, window_start = rq->window_start;
         int new_window, nr_full_windows;
         u32 window_size = sched_ravg_window;
+        u64 runtime;
 
         new_window = mark_start < window_start;
         if (!account_busy_for_task_demand(p, event)) {
@@ -2811,7 +2833,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
                  * it is not necessary to account those.
                  */
                 update_history(rq, p, p->ravg.sum, 1, event);
-                return;
+                return 0;
         }
 
         if (!new_window) {
@@ -2819,8 +2841,7 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
                 /*
                  * The simple case - busy time contained within the existing
                  * window.
                  */
-                add_to_task_demand(rq, p, wallclock - mark_start);
-                return;
+                return add_to_task_demand(rq, p, wallclock - mark_start);
         }
         /*
@@ -2832,13 +2853,16 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
         window_start -= (u64)nr_full_windows * (u64)window_size;
 
         /* Process (window_start - mark_start) first */
-        add_to_task_demand(rq, p, window_start - mark_start);
+        runtime = add_to_task_demand(rq, p, window_start - mark_start);
 
         /* Push new sample(s) into task's demand history */
         update_history(rq, p, p->ravg.sum, 1, event);
-        if (nr_full_windows)
-                update_history(rq, p, scale_exec_time(window_size, rq),
-                               nr_full_windows, event);
+        if (nr_full_windows) {
+                u64 scaled_window = scale_exec_time(window_size, rq);
+
+                update_history(rq, p, scaled_window, nr_full_windows, event);
+                runtime += nr_full_windows * scaled_window;
+        }
 
         /*
          * Roll window_start back to current to process any remainder
@@ -2848,13 +2872,31 @@ static void update_task_demand(struct task_struct *p, struct rq *rq,
 
         /* Process (wallclock - window_start) next */
         mark_start = window_start;
-        add_to_task_demand(rq, p, wallclock - mark_start);
+        runtime += add_to_task_demand(rq, p, wallclock - mark_start);
+
+        return runtime;
+}
+
+static inline void
+update_task_burst(struct task_struct *p, struct rq *rq, int event, int runtime)
+{
+        /*
+         * update_task_demand() has checks for idle task and
+         * exit task. The runtime may include the wait time,
+         * so update the burst only for the cases where the
+         * task is running.
+         */
+        if (event == PUT_PREV_TASK || (event == TASK_UPDATE &&
+                                rq->curr == p))
+                p->ravg.curr_burst += runtime;
 }
 
 /* Reflect task activity on its demand and cpu's busy time statistics */
 void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
                                                 u64 wallclock, u64 irqtime)
 {
+        u64 runtime;
+
         if (!rq->window_start || sched_disable_window_stats ||
             p->ravg.mark_start == wallclock)
                 return;
@@ -2869,7 +2911,9 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
         }
 
         update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
-        update_task_demand(p, rq, event, wallclock);
+        runtime = update_task_demand(p, rq, event, wallclock);
+        if (runtime)
+                update_task_burst(p, rq, event, runtime);
         update_cpu_busy_time(p, rq, event, wallclock, irqtime);
         update_task_pred_demand(rq, p, event);
 done:
@@ -2955,6 +2999,8 @@ void reset_task_stats(struct task_struct *p)
         p->ravg.curr_window_cpu = curr_window_ptr;
         p->ravg.prev_window_cpu = prev_window_ptr;
 
+        p->ravg.avg_burst = 2 * (u64)sysctl_sched_short_burst;
+
         /* Retain EXITING_TASK marker */
         p->ravg.sum_history[0] = sum;
 }
@@ -3053,7 +3099,7 @@ void reset_all_window_stats(u64 window_start, unsigned int window_size)
 
         read_unlock(&tasklist_lock);
 
-        list_for_each_entry(grp, &related_thread_groups, list) {
+        list_for_each_entry(grp, &active_related_thread_groups, list) {
                 int j;
 
                 for_each_possible_cpu(j) {
@@ -3203,14 +3249,16 @@ void sched_get_cpus_busy(struct sched_load *busy,
 
         if (unlikely(cpus == 0))
                 return;
 
+        local_irq_save(flags);
+
+        read_lock(&related_thread_group_lock);
+
         /*
          * This function could be called in timer context, and the
          * current task may have been executing for a long time. Ensure
         * that the window stats are current by doing an update.
         */
-        read_lock(&related_thread_group_lock);
-        local_irq_save(flags);
 
         for_each_cpu(cpu, query_cpus)
                 raw_spin_lock(&cpu_rq(cpu)->lock);
@@ -3310,10 +3358,11 @@ skip_early:
 
         for_each_cpu(cpu, query_cpus)
                 raw_spin_unlock(&(cpu_rq(cpu))->lock);
 
-        local_irq_restore(flags);
         read_unlock(&related_thread_group_lock);
 
+        local_irq_restore(flags);
+
         i = 0;
         for_each_cpu(cpu, query_cpus) {
                 rq = cpu_rq(cpu);
@@ -3966,47 +4015,54 @@ _group_cpu_time(struct related_thread_group *grp, int cpu)
         return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
 }
 
-struct related_thread_group *alloc_related_thread_group(int group_id)
+static inline struct related_thread_group*
+lookup_related_thread_group(unsigned int group_id)
 {
-        struct related_thread_group *grp;
-
-        grp = kzalloc(sizeof(*grp), GFP_ATOMIC);
-        if (!grp)
-                return ERR_PTR(-ENOMEM);
-
-        if (alloc_group_cputime(grp)) {
-                kfree(grp);
-                return ERR_PTR(-ENOMEM);
-        }
-
-        grp->id = group_id;
-        INIT_LIST_HEAD(&grp->tasks);
-        INIT_LIST_HEAD(&grp->list);
-        raw_spin_lock_init(&grp->lock);
-
-        return grp;
+        return related_thread_groups[group_id];
 }
 
-struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
+int alloc_related_thread_groups(void)
 {
+        int i, ret;
         struct related_thread_group *grp;
 
-        list_for_each_entry(grp, &related_thread_groups, list) {
-                if (grp->id == group_id)
-                        return grp;
+        /* groupd_id = 0 is invalid as it's special id to remove group. */
+        for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+                grp = kzalloc(sizeof(*grp), GFP_NOWAIT);
+                if (!grp) {
+                        ret = -ENOMEM;
+                        goto err;
+                }
+
+                if (alloc_group_cputime(grp)) {
+                        kfree(grp);
+                        ret = -ENOMEM;
+                        goto err;
+                }
+
+                grp->id = i;
+                INIT_LIST_HEAD(&grp->tasks);
+                INIT_LIST_HEAD(&grp->list);
+                raw_spin_lock_init(&grp->lock);
+
+                related_thread_groups[i] = grp;
         }
 
-        return NULL;
-}
+        return 0;
 
-/* See comments before preferred_cluster() */
-static void free_related_thread_group(struct rcu_head *rcu)
-{
-        struct related_thread_group *grp = container_of(rcu, struct
-                        related_thread_group, rcu);
+err:
+        for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) {
+                grp = lookup_related_thread_group(i);
+                if (grp) {
+                        free_group_cputime(grp);
+                        kfree(grp);
+                        related_thread_groups[i] = NULL;
+                } else {
+                        break;
+                }
+        }
 
-        free_group_cputime(grp);
-        kfree(grp);
+        return ret;
 }
 
 static void remove_task_from_group(struct task_struct *p)
@@ -4031,10 +4087,12 @@ static void remove_task_from_group(struct task_struct *p)
         raw_spin_unlock(&grp->lock);
 
         /* Reserved groups cannot be destroyed */
-        if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) {
-                list_del(&grp->list);
-                call_rcu(&grp->rcu, free_related_thread_group);
-        }
+        if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID)
+                /*
+                 * We test whether grp->list is attached with list_empty()
+                 * hence re-init the list after deletion.
+                 */
+                list_del_init(&grp->list);
 }
 
 static int
@@ -4106,53 +4164,15 @@ void add_new_task_to_grp(struct task_struct *new)
         write_unlock_irqrestore(&related_thread_group_lock, flags);
 }
 
-#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
-/*
- * We create a default colocation group at boot. There is no need to
- * synchronize tasks between cgroups at creation time because the
- * correct cgroup hierarchy is not available at boot. Therefore cgroup
- * colocation is turned off by default even though the colocation group
- * itself has been allocated. Furthermore this colocation group cannot
- * be destroyted once it has been created. All of this has been as part
- * of runtime optimizations.
- *
- * The job of synchronizing tasks to the colocation group is done when
- * the colocation flag in the cgroup is turned on.
- */
-static int __init create_default_coloc_group(void)
-{
-        struct related_thread_group *grp = NULL;
-        unsigned long flags;
-
-        grp = alloc_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
-        if (IS_ERR(grp)) {
-                WARN_ON(1);
-                return -ENOMEM;
-        }
-
-        write_lock_irqsave(&related_thread_group_lock, flags);
-        list_add(&grp->list, &related_thread_groups);
-        write_unlock_irqrestore(&related_thread_group_lock, flags);
-
-        update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
-        return 0;
-}
-late_initcall(create_default_coloc_group);
-
-int sync_cgroup_colocation(struct task_struct *p, bool insert)
-{
-        unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
-
-        return sched_set_group_id(p, grp_id);
-}
-#endif
-
-int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+static int __sched_set_group_id(struct task_struct *p, unsigned int group_id)
 {
         int rc = 0;
         unsigned long flags;
         struct related_thread_group *grp = NULL;
 
+        if (group_id >= MAX_NUM_CGROUP_COLOC_ID)
+                return -EINVAL;
+
         raw_spin_lock_irqsave(&p->pi_lock, flags);
         write_lock(&related_thread_group_lock);
 
@@ -4168,29 +4188,26 @@ int sched_set_group_id(struct task_struct *p, unsigned int group_id)
         }
 
         grp = lookup_related_thread_group(group_id);
-        if (!grp) {
-                /* This is a reserved id */
-                if (group_id == DEFAULT_CGROUP_COLOC_ID) {
-                        rc = -EINVAL;
-                        goto done;
-                }
-
-                grp = alloc_related_thread_group(group_id);
-                if (IS_ERR(grp)) {
-                        rc = -ENOMEM;
-                        goto done;
-                }
-
-                list_add(&grp->list, &related_thread_groups);
-        }
+        if (list_empty(&grp->list))
+                list_add(&grp->list, &active_related_thread_groups);
 
         rc = add_task_to_group(p, grp);
 done:
         write_unlock(&related_thread_group_lock);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
         return rc;
 }
 
+int sched_set_group_id(struct task_struct *p, unsigned int group_id)
+{
+        /* DEFAULT_CGROUP_COLOC_ID is a reserved id */
+        if (group_id == DEFAULT_CGROUP_COLOC_ID)
+                return -EINVAL;
+
+        return __sched_set_group_id(p, group_id);
+}
+
 unsigned int sched_get_group_id(struct task_struct *p)
 {
         unsigned int group_id;
@@ -4204,6 +4221,42 @@ unsigned int sched_get_group_id(struct task_struct *p)
         return group_id;
 }
 
+#if defined(CONFIG_SCHED_TUNE) && defined(CONFIG_CGROUP_SCHEDTUNE)
+/*
+ * We create a default colocation group at boot. There is no need to
+ * synchronize tasks between cgroups at creation time because the
+ * correct cgroup hierarchy is not available at boot. Therefore cgroup
+ * colocation is turned off by default even though the colocation group
+ * itself has been allocated. Furthermore this colocation group cannot
+ * be destroyted once it has been created. All of this has been as part
+ * of runtime optimizations.
+ *
+ * The job of synchronizing tasks to the colocation group is done when
+ * the colocation flag in the cgroup is turned on.
+ */
+static int __init create_default_coloc_group(void)
+{
+        struct related_thread_group *grp = NULL;
+        unsigned long flags;
+
+        grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID);
+        write_lock_irqsave(&related_thread_group_lock, flags);
+        list_add(&grp->list, &active_related_thread_groups);
+        write_unlock_irqrestore(&related_thread_group_lock, flags);
+
+        update_freq_aggregate_threshold(MAX_FREQ_AGGR_THRESH);
+        return 0;
+}
+late_initcall(create_default_coloc_group);
+
+int sync_cgroup_colocation(struct task_struct *p, bool insert)
+{
+        unsigned int grp_id = insert ? DEFAULT_CGROUP_COLOC_ID : 0;
+
+        return __sched_set_group_id(p, grp_id);
+}
+#endif
+
 static void update_cpu_cluster_capacity(const cpumask_t *cpus)
 {
         int i;
@@ -4451,6 +4504,20 @@ bool early_detection_notify(struct rq *rq, u64 wallclock)
         return 0;
 }
 
+void update_avg_burst(struct task_struct *p)
+{
+        update_avg(&p->ravg.avg_burst, p->ravg.curr_burst);
+        p->ravg.curr_burst = 0;
+}
+
+void note_task_waking(struct task_struct *p, u64 wallclock)
+{
+        u64 sleep_time = wallclock - p->last_switch_out_ts;
+
+        p->last_wake_ts = wallclock;
+        update_avg(&p->ravg.avg_sleep_time, sleep_time);
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
                                         struct cftype *cft)
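
Note on the short-burst packing tunables: this diff only adds the two sysctls and the per-task avg_burst/avg_sleep_time statistics maintained by update_task_burst(), update_avg_burst() and note_task_waking(); the wakeup-path check that consumes them is not part of these hunks. A minimal sketch of the eligibility test described in the comment above sysctl_sched_short_burst, with a hypothetical helper name, would look roughly like this:

    /*
     * Illustrative sketch only -- the helper name and its call site are
     * assumptions, not part of this diff. It combines the two sysctls
     * with the per-task averages exactly as the added comment describes:
     * short average bursts plus long average sleeps make the task a
     * candidate for packing instead of waking an idle CPU.
     */
    static inline bool task_eligible_for_packing(struct task_struct *p)
    {
            return p->ravg.avg_burst < sysctl_sched_short_burst &&
                   p->ravg.avg_sleep_time > sysctl_sched_short_sleep;
    }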
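The init_new_task_load() comment claims that seeding avg_burst at twice the threshold delays short-burst classification by at least 6 sleep-wakeup cycles. That number follows if update_avg() is assumed to be the scheduler's usual 1/8-weight moving average (avg += (sample - avg) / 8): even with zero-length bursts the average decays by only one eighth per cycle, and 2T * (7/8)^n first drops below T at n = 6. A standalone sketch of that arithmetic (the threshold T is an arbitrary example value; sysctl_sched_short_burst itself is declared here without an initializer, so it is 0 until configured):

    /*
     * Best-case decay of avg_burst from its fork-time value of 2 * T,
     * assuming a 1/8-weight EWMA with zero-length burst samples and
     * ignoring integer-rounding details of the in-kernel helper.
     * Prints cycles 1..6; the value crosses below T only on the 6th update.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long T = 1000000ULL;   /* example threshold, ns */
            unsigned long long avg = 2 * T;      /* fork-time initialisation */
            int n;

            for (n = 1; avg >= T; n++) {
                    avg -= avg / 8;              /* avg += (0 - avg) / 8 */
                    printf("cycle %d: avg_burst = %llu\n", n, avg);
            }
            return 0;
    }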
