Diffstat (limited to 'kernel/sched/hmp.c')
-rw-r--r--  kernel/sched/hmp.c | 78
1 file changed, 67 insertions, 11 deletions
diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c
index df47c26ab6d2..ae6876e62c0f 100644
--- a/kernel/sched/hmp.c
+++ b/kernel/sched/hmp.c
@@ -1602,7 +1602,7 @@ unsigned int nr_eligible_big_tasks(int cpu)
 	int nr_big = rq->hmp_stats.nr_big_tasks;
 	int nr = rq->nr_running;
 
-	if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
+	if (!is_max_capacity_cpu(cpu))
 		return nr_big;
 
 	return nr;
@@ -2521,10 +2521,42 @@ static inline u32 predict_and_update_buckets(struct rq *rq,
 	return pred_demand;
 }
 
-static void update_task_cpu_cycles(struct task_struct *p, int cpu)
+#define THRESH_CC_UPDATE (2 * NSEC_PER_USEC)
+
+/*
+ * Assumes rq_lock is held and wallclock was recorded in the same critical
+ * section as this function's invocation.
+ */
+static inline u64 read_cycle_counter(int cpu, u64 wallclock)
+{
+	struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
+	u64 delta;
+
+	if (unlikely(!cluster))
+		return cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+
+	/*
+	 * Why don't we need locking here? Let's say that delta is negative
+	 * because some other CPU happened to update last_cc_update with a
+	 * more recent timestamp. We simply read the counter again in that case
+	 * with no harmful side effects. This can happen if there is an FIQ
+	 * between when we read the wallclock and when we use it here.
+	 */
+	delta = wallclock - atomic64_read(&cluster->last_cc_update);
+	if (delta > THRESH_CC_UPDATE) {
+		atomic64_set(&cluster->cycles,
+			     cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu));
+		atomic64_set(&cluster->last_cc_update, wallclock);
+	}
+
+	return atomic64_read(&cluster->cycles);
+}
+
+static void update_task_cpu_cycles(struct task_struct *p, int cpu,
+				   u64 wallclock)
 {
 	if (use_cycle_counter)
-		p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+		p->cpu_cycles = read_cycle_counter(cpu, wallclock);
 }
 
 static void
@@ -2542,7 +2574,7 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
 		return;
 	}
 
-	cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
+	cur_cycles = read_cycle_counter(cpu, wallclock);
 
 	/*
 	 * If current task is idle task and irqtime == 0 CPU was
@@ -2579,7 +2611,8 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
 	trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
 }
 
-static int account_busy_for_task_demand(struct task_struct *p, int event)
+static int
+account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
 {
 	/*
 	 * No need to bother updating task demand for exiting tasks
@@ -2598,6 +2631,17 @@ static int account_busy_for_task_demand(struct task_struct *p, int event)
 	    (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
 		return 0;
 
+	/*
+	 * TASK_UPDATE can be called on a sleeping task, when it is moved
+	 * between related groups.
+	 */
+	if (event == TASK_UPDATE) {
+		if (rq->curr == p)
+			return 1;
+
+		return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
+	}
+
 	return 1;
 }
 
@@ -2738,7 +2782,7 @@ static u64 update_task_demand(struct task_struct *p, struct rq *rq,
 	u64 runtime;
 
 	new_window = mark_start < window_start;
-	if (!account_busy_for_task_demand(p, event)) {
+	if (!account_busy_for_task_demand(rq, p, event)) {
 		if (new_window)
 			/*
 			 * If the time accounted isn't being accounted as
@@ -2822,7 +2866,7 @@ void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
 	update_window_start(rq, wallclock);
 
 	if (!p->ravg.mark_start) {
-		update_task_cpu_cycles(p, cpu_of(rq));
+		update_task_cpu_cycles(p, cpu_of(rq), wallclock);
 		goto done;
 	}
 
@@ -2890,7 +2934,7 @@ void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
 	if (is_idle_task(curr)) {
 		/* We're here without rq->lock held, IRQ disabled */
 		raw_spin_lock(&rq->lock);
-		update_task_cpu_cycles(curr, cpu);
+		update_task_cpu_cycles(curr, cpu, sched_ktime_clock());
 		raw_spin_unlock(&rq->lock);
 	}
 }
@@ -2935,7 +2979,7 @@ void mark_task_starting(struct task_struct *p)
 	p->ravg.mark_start = p->last_wake_ts = wallclock;
 	p->last_cpu_selected_ts = wallclock;
 	p->last_switch_out_ts = 0;
-	update_task_cpu_cycles(p, cpu_of(rq));
+	update_task_cpu_cycles(p, cpu_of(rq), wallclock);
 }
 
 void set_window_start(struct rq *rq)
@@ -3548,7 +3592,7 @@ void fixup_busy_time(struct task_struct *p, int new_cpu)
 
 	update_task_ravg(p, task_rq(p), TASK_MIGRATE,
 			 wallclock, 0);
-	update_task_cpu_cycles(p, new_cpu);
+	update_task_cpu_cycles(p, new_cpu, wallclock);
 
 	new_task = is_new_task(p);
 
 	/* Protected by rq_lock */
@@ -4303,8 +4347,20 @@ void note_task_waking(struct task_struct *p, u64 wallclock)
 {
 	u64 sleep_time = wallclock - p->last_switch_out_ts;
 
-	p->last_wake_ts = wallclock;
+	/*
+	 * When a short burst and short sleeping task goes for a long
+	 * sleep, the task's avg_sleep_time gets boosted. It will not
+	 * come below the short_sleep threshold for a long time, which
+	 * results in incorrect packing. The idea behind tracking
+	 * avg_sleep_time is to detect whether a task is short sleeping
+	 * or not. So limit the sleep time to twice the short sleep
	 * threshold. For regular long sleeping tasks, the avg_sleep_time
+	 * would be higher than the threshold, and packing happens correctly.
+	 */
+	sleep_time = min_t(u64, sleep_time, 2 * sysctl_sched_short_sleep);
 	update_avg(&p->ravg.avg_sleep_time, sleep_time);
+
+	p->last_wake_ts = wallclock;
 }
 
 #ifdef CONFIG_CGROUP_SCHED
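A note on the pattern introduced by read_cycle_counter(): reading the per-CPU cycle counter through cpu_cycle_counter_cb can be expensive, so the patch caches one reading per cluster and refreshes it only when the caller's wallclock has advanced more than THRESH_CC_UPDATE (2 us) past the last refresh; a racing update from another CPU at worst triggers one redundant hardware read, which is why plain atomic64 loads and stores suffice. The sketch below is a minimal userspace analogue of that rate-limited, lockless cache, assuming C11 atomics in place of the kernel's atomic64_* helpers and a hypothetical read_hw_counter() standing in for the cycle-counter callback; it illustrates the idea only and is not the kernel code.

/*
 * Userspace sketch of the rate-limited, lockless counter cache used by
 * read_cycle_counter() in the patch above. read_hw_counter() and the 2 us
 * threshold are stand-ins, not kernel interfaces.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define THRESH_CC_UPDATE_NS	2000ULL		/* 2 microseconds */

static _Atomic uint64_t cached_cycles;		/* last value read from "hardware" */
static _Atomic uint64_t last_cc_update;		/* wallclock of that read */

static uint64_t wallclock_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* Pretend hardware read; stands in for the (expensive) counter callback. */
static uint64_t read_hw_counter(void)
{
	static _Atomic uint64_t fake;

	return atomic_fetch_add(&fake, 1000) + 1000;
}

/*
 * Return a recent counter value. A stale or racing last_cc_update only
 * causes a redundant hardware read; both stores are independent atomics,
 * so the worst case is extra work, never a torn value.
 */
static uint64_t read_cached_counter(uint64_t wallclock)
{
	uint64_t delta = wallclock - atomic_load(&last_cc_update);

	if (delta > THRESH_CC_UPDATE_NS) {
		atomic_store(&cached_cycles, read_hw_counter());
		atomic_store(&last_cc_update, wallclock);
	}

	return atomic_load(&cached_cycles);
}

int main(void)
{
	uint64_t w = wallclock_ns();

	/* Two calls with the same wallclock: the second one hits the cache. */
	printf("%llu\n", (unsigned long long)read_cached_counter(w));
	printf("%llu\n", (unsigned long long)read_cached_counter(w));
	return 0;
}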
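Similarly, the note_task_waking() change clamps each sleep sample before it is folded into p->ravg.avg_sleep_time, so that a single long sleep cannot push the average far above the short-sleep threshold and defeat packing for many subsequent wakeups. The sketch below illustrates that effect under two assumptions of mine: update_avg() is modelled as a one-eighth-weight moving average (avg += (sample - avg) / 8), and the short-sleep threshold is a made-up 10 ms standing in for sysctl_sched_short_sleep.

/*
 * Sketch of why sleep_time is clamped to 2 * threshold before update_avg().
 * The averaging function and the 10 ms threshold are assumptions for
 * illustration, not the kernel's exact definitions.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC		1000000ULL
#define SHORT_SLEEP_NS		(10 * NSEC_PER_MSEC)	/* hypothetical threshold */

/* One-eighth-weight moving average, as assumed above. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff / 8;
}

int main(void)
{
	uint64_t unclamped = 5 * NSEC_PER_MSEC;		/* task was short sleeping */
	uint64_t clamped = 5 * NSEC_PER_MSEC;
	uint64_t long_sleep = 10000 * NSEC_PER_MSEC;	/* one 10 s sleep */
	uint64_t sample;
	int i;

	/* Fold in the single long sleep, with and without the clamp. */
	update_avg(&unclamped, long_sleep);
	sample = long_sleep < 2 * SHORT_SLEEP_NS ? long_sleep : 2 * SHORT_SLEEP_NS;
	update_avg(&clamped, sample);

	/* Then a burst of 2 ms sleeps: watch how fast each average recovers. */
	for (i = 0; i < 8; i++) {
		update_avg(&unclamped, 2 * NSEC_PER_MSEC);
		update_avg(&clamped, 2 * NSEC_PER_MSEC);
	}

	printf("unclamped avg: %llu ms, clamped avg: %llu ms\n",
	       (unsigned long long)(unclamped / NSEC_PER_MSEC),
	       (unsigned long long)(clamped / NSEC_PER_MSEC));
	return 0;
}

The unclamped average stays hundreds of milliseconds above the threshold after the burst, while the clamped one drops back under it within a few wakeups, which is the behaviour the comment in note_task_waking() is after.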