Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c     5
-rw-r--r--  kernel/sched/fair.c   516
-rw-r--r--  kernel/sched/rt.c      50
-rw-r--r--  kernel/sched/sched.h   12
-rw-r--r--  kernel/sysctl.c         7
5 files changed, 355 insertions, 235 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8bd6fbde7efe..8cdd373a8980 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1287,7 +1287,7 @@ static int compute_load_scale_factor(struct sched_cluster *cluster)
return load_scale;
}
-static struct list_head cluster_head;
+struct list_head cluster_head;
static DEFINE_MUTEX(cluster_lock);
static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
@@ -1311,9 +1311,6 @@ static struct sched_cluster init_cluster = {
.dstate_wakeup_latency = 0,
};
-#define for_each_sched_cluster(cluster) \
- list_for_each_entry_rcu(cluster, &cluster_head, list)
-
void update_all_clusters_stats(void)
{
struct sched_cluster *cluster;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index be7b44f9a85f..950ab9229cfc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2702,15 +2702,9 @@ unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
*/
unsigned int __read_mostly sysctl_sched_enable_power_aware = 0;
-/*
- * This specifies the maximum percent power difference between 2
- * CPUs for them to be considered identical in terms of their
- * power characteristics (i.e. they are in the same power band).
- */
-unsigned int __read_mostly sysctl_sched_powerband_limit_pct;
-
unsigned int __read_mostly sysctl_sched_lowspill_freq;
unsigned int __read_mostly sysctl_sched_pack_freq = UINT_MAX;
+
/*
* CPUs with load greater than the sched_spill_load_threshold are not
* eligible for task placement. When all CPUs in a cluster achieve a
@@ -2894,18 +2888,6 @@ static inline u64 cpu_load_sync(int cpu, int sync)
return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
}
-static int
-spill_threshold_crossed(u64 task_load, u64 cpu_load, struct rq *rq)
-{
- u64 total_load = task_load + cpu_load;
-
- if (total_load > sched_spill_load ||
- (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
- return 1;
-
- return 0;
-}
-
static int boost_refcount;
static DEFINE_SPINLOCK(boost_lock);
static DEFINE_MUTEX(boost_mutex);
@@ -3010,33 +2992,11 @@ static int task_will_fit(struct task_struct *p, int cpu)
return task_load_will_fit(p, tload, cpu);
}
-static int eligible_cpu(u64 task_load, u64 cpu_load, int cpu, int sync)
-{
- if (sched_cpu_high_irqload(cpu))
- return 0;
-
- return !spill_threshold_crossed(task_load, cpu_load, cpu_rq(cpu));
-}
-
struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
{
return NULL;
}
-int power_delta_exceeded(unsigned int cpu_cost, unsigned int base_cost)
-{
- int delta, cost_limit;
-
- if (!base_cost || cpu_cost == base_cost ||
- !sysctl_sched_powerband_limit_pct)
- return 0;
-
- delta = cpu_cost - base_cost;
- cost_limit = div64_u64((u64)sysctl_sched_powerband_limit_pct *
- (u64)base_cost, 100);
- return abs(delta) > cost_limit;
-}
-
/*
* Return the cost of running task p on CPU cpu. This function
* currently assumes that task p is the only task which will run on
@@ -3108,59 +3068,270 @@ unlock:
}
+struct cpu_select_env {
+ struct task_struct *p;
+ u8 reason;
+ u8 need_idle:1;
+ u8 boost:1;
+ u8 sync:1;
+ u8 ignore_prev_cpu:1;
+ int prev_cpu;
+ DECLARE_BITMAP(candidate_list, NR_CPUS);
+ DECLARE_BITMAP(backup_list, NR_CPUS);
+ u64 task_load;
+ u64 cpu_load;
+};
+
+struct cluster_cpu_stats {
+ int best_idle_cpu, best_capacity_cpu, best_cpu, best_sibling_cpu;
+ int min_cost, best_sibling_cpu_cost;
+ u64 min_load, best_sibling_cpu_load;
+ s64 highest_spare_capacity;
+};
+
#define UP_MIGRATION 1
#define DOWN_MIGRATION 2
-#define IRQLOAD_MIGRATION 4
+#define IRQLOAD_MIGRATION 3
-static int skip_cluster(int tcpu, int cpu, int reason)
+static int
+spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
{
- int skip;
+ u64 total_load;
- if (!reason)
+ total_load = env->task_load + env->cpu_load;
+
+ if (total_load > sched_spill_load ||
+ (rq->nr_running + 1) > sysctl_sched_spill_nr_run)
+ return 1;
+
+ return 0;
+}
+
+static int skip_cpu(int cpu, struct cpu_select_env *env)
+{
+ int tcpu = task_cpu(env->p);
+ int skip = 0;
+
+ if (!env->reason)
return 0;
- switch (reason) {
- case UP_MIGRATION:
- skip = (cpu_capacity(cpu) <= cpu_capacity(tcpu));
- break;
+ if (is_reserved(cpu))
+ return 1;
- case DOWN_MIGRATION:
- skip = (cpu_capacity(cpu) >= cpu_capacity(tcpu));
+ switch (env->reason) {
+ case UP_MIGRATION:
+ skip = !idle_cpu(cpu);
break;
-
case IRQLOAD_MIGRATION:
/* Purposely fall through */
-
default:
- return 0;
+ skip = (cpu == tcpu);
+ break;
}
return skip;
}
-static int skip_cpu(struct rq *task_rq, struct rq *rq, int cpu, int reason)
+static inline int
+acceptable_capacity(struct sched_cluster *cluster, struct cpu_select_env *env)
{
- int skip;
+ int tcpu;
- if (!reason)
- return 0;
-
- if (is_reserved(cpu))
+ if (!env->reason)
return 1;
- switch (reason) {
+ tcpu = task_cpu(env->p);
+ switch (env->reason) {
case UP_MIGRATION:
- skip = !idle_cpu(cpu);
- break;
+ return cluster->capacity > cpu_capacity(tcpu);
- case IRQLOAD_MIGRATION:
- /* Purposely fall through */
+ case DOWN_MIGRATION:
+ return cluster->capacity < cpu_capacity(tcpu);
default:
- skip = (rq == task_rq);
+ break;
}
- return skip;
+ return 1;
+}
+
+static int
+skip_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ if (!acceptable_capacity(cluster, env)) {
+ __clear_bit(cluster->id, env->candidate_list);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct sched_cluster *
+select_least_power_cluster(struct cpu_select_env *env)
+{
+ struct sched_cluster *cluster;
+
+ for_each_sched_cluster(cluster) {
+ if (!skip_cluster(cluster, env)) {
+ int cpu = cluster_first_cpu(cluster);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cpu);
+ if (task_load_will_fit(env->p, env->task_load, cpu))
+ return cluster;
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ }
+ }
+
+ return NULL;
+}
+
+static struct sched_cluster *
+next_candidate(const unsigned long *list, int start, int end)
+{
+ int cluster_id;
+
+ cluster_id = find_next_bit(list, end, start - 1 + 1);
+ if (cluster_id >= end)
+ return NULL;
+
+ return sched_cluster[cluster_id];
+}
+
+static void update_spare_capacity(
+struct cluster_cpu_stats *stats, int cpu, int capacity, u64 cpu_load)
+{
+ s64 spare_capacity = sched_ravg_window - cpu_load;
+
+ if (spare_capacity > 0 &&
+ (spare_capacity > stats->highest_spare_capacity ||
+ (spare_capacity == stats->highest_spare_capacity &&
+ capacity > cpu_capacity(stats->best_capacity_cpu)))) {
+ stats->highest_spare_capacity = spare_capacity;
+ stats->best_capacity_cpu = cpu;
+ }
+}
+
+static inline void find_backup_cluster(
+struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ struct sched_cluster *next = NULL;
+ int i;
+
+ while (!bitmap_empty(env->backup_list, num_clusters)) {
+ next = next_candidate(env->backup_list, 0, num_clusters);
+ __clear_bit(next->id, env->backup_list);
+ for_each_cpu_and(i, &env->p->cpus_allowed, &next->cpus) {
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i), power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ update_spare_capacity(stats, i, next->capacity,
+ cpu_load_sync(i, env->sync));
+ }
+ }
+}
+
+struct sched_cluster *
+next_best_cluster(struct sched_cluster *cluster, struct cpu_select_env *env)
+{
+ struct sched_cluster *next = NULL;
+
+ __clear_bit(cluster->id, env->candidate_list);
+
+ do {
+ if (bitmap_empty(env->candidate_list, num_clusters))
+ return NULL;
+
+ next = next_candidate(env->candidate_list, 0, num_clusters);
+ if (next)
+ if (skip_cluster(next, env))
+ next = NULL;
+ } while (!next);
+
+ env->task_load = scale_load_to_cpu(task_load(env->p),
+ cluster_first_cpu(next));
+ return next;
+}
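
The cluster walk above is driven by two per-wakeup bitmaps: clusters the task does not fit in are demoted from candidate_list to backup_list, and next_candidate()/next_best_cluster() keep handing back the lowest-numbered remaining candidate. A minimal user-space sketch of that bookkeeping (illustrative names only, assuming sched_cluster[] is ordered from least to most power; not the kernel code):

/*
 * Model: visit clusters in ascending power order; a cluster the task
 * does not fit in is moved from the candidate set to the backup set
 * and only revisited by the backup (spare-capacity) pass.
 */
#include <stdbool.h>
#include <stdio.h>

#define NUM_CLUSTERS 3

static int pick_cluster(const bool task_fits[], unsigned int *candidates,
                        unsigned int *backups)
{
        for (int id = 0; id < NUM_CLUSTERS; id++) {
                if (!(*candidates & (1u << id)))
                        continue;
                if (task_fits[id])
                        return id;              /* least-power fitting cluster */
                *backups |= 1u << id;           /* remember as spill target */
                *candidates &= ~(1u << id);
        }
        return -1;                              /* caller falls back to backups */
}

int main(void)
{
        bool fits[NUM_CLUSTERS] = { false, true, true }; /* e.g. a big task */
        unsigned int candidates = (1u << NUM_CLUSTERS) - 1, backups = 0;

        printf("chosen cluster: %d, backups: 0x%x\n",
               pick_cluster(fits, &candidates, &backups), backups);
        return 0;
}
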
+
+static void update_cluster_stats(int cpu, struct cluster_cpu_stats *stats,
+ struct cpu_select_env *env)
+{
+ int cpu_cost;
+ int prev_cpu = env->prev_cpu;
+
+ cpu_cost = power_cost(cpu, task_load(env->p) +
+ cpu_cravg_sync(cpu, env->sync));
+ if (cpu_cost > stats->min_cost)
+ return;
+
+ if (cpu != prev_cpu && cpus_share_cache(prev_cpu, cpu)) {
+ if (stats->best_sibling_cpu_cost > cpu_cost ||
+ (stats->best_sibling_cpu_cost == cpu_cost &&
+ stats->best_sibling_cpu_load > env->cpu_load)) {
+
+ stats->best_sibling_cpu_cost = cpu_cost;
+ stats->best_sibling_cpu_load = env->cpu_load;
+ stats->best_sibling_cpu = cpu;
+ }
+ }
+
+ if ((cpu_cost < stats->min_cost) ||
+ ((stats->best_cpu != prev_cpu && stats->min_load > env->cpu_load) ||
+ cpu == prev_cpu)) {
+ if (env->need_idle) {
+ if (idle_cpu(cpu)) {
+ stats->min_cost = cpu_cost;
+ stats->best_idle_cpu = cpu;
+ }
+ } else {
+ stats->min_cost = cpu_cost;
+ stats->min_load = env->cpu_load;
+ stats->best_cpu = cpu;
+ }
+ }
+}
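
update_cluster_stats() orders CPUs primarily by power cost and secondarily by busy load, with the task's previous CPU favoured on ties; the sibling and need_idle bookkeeping is layered on top of that. A simplified, stand-alone model of the core ordering (hypothetical names and values, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct cpu_sample { int cpu; int cost; unsigned long long load; };

/* cost first, then load, then stick with prev_cpu on a full tie */
static bool better(const struct cpu_sample *a, const struct cpu_sample *b,
                   int prev_cpu)
{
        if (a->cost != b->cost)
                return a->cost < b->cost;
        if (a->load != b->load)
                return a->load < b->load;
        return a->cpu == prev_cpu;
}

int main(void)
{
        struct cpu_sample cpus[] = {
                { .cpu = 0, .cost = 120, .load = 300 },
                { .cpu = 1, .cost = 100, .load = 500 },
                { .cpu = 2, .cost = 100, .load = 400 },
        };
        int best = 0, prev_cpu = 0;

        for (int i = 1; i < 3; i++)
                if (better(&cpus[i], &cpus[best], prev_cpu))
                        best = i;
        printf("best cpu: %d\n", cpus[best].cpu);       /* prints 2 */
        return 0;
}
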
+
+static void find_best_cpu_in_cluster(struct sched_cluster *c,
+ struct cpu_select_env *env, struct cluster_cpu_stats *stats)
+{
+ int i;
+ struct cpumask search_cpus;
+
+ cpumask_and(&search_cpus, tsk_cpus_allowed(env->p), &c->cpus);
+ if (env->ignore_prev_cpu)
+ cpumask_clear_cpu(env->prev_cpu, &search_cpus);
+
+ for_each_cpu(i, &search_cpus) {
+ env->cpu_load = cpu_load_sync(i, env->sync);
+
+ trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
+ sched_irqload(i),
+ power_cost(i, task_load(env->p) +
+ cpu_cravg_sync(i, env->sync)), 0);
+
+ if (unlikely(!cpu_active(i)) || skip_cpu(i, env))
+ continue;
+
+ update_spare_capacity(stats, i, c->capacity, env->cpu_load);
+
+ if (env->boost || sched_cpu_high_irqload(i) ||
+ spill_threshold_crossed(env, cpu_rq(i)))
+ continue;
+
+ update_cluster_stats(i, stats, env);
+ }
+}
+
+static inline void init_cluster_cpu_stats(struct cluster_cpu_stats *stats)
+{
+ stats->best_cpu = stats->best_idle_cpu = -1;
+ stats->best_capacity_cpu = stats->best_sibling_cpu = -1;
+ stats->min_cost = stats->best_sibling_cpu_cost = INT_MAX;
+ stats->min_load = stats->best_sibling_cpu_load = ULLONG_MAX;
+ stats->highest_spare_capacity = 0;
}
/*
@@ -3175,163 +3346,118 @@ static int skip_cpu(struct rq *task_rq, struct rq *rq, int cpu, int reason)
static inline int wake_to_idle(struct task_struct *p)
{
return (current->flags & PF_WAKE_UP_IDLE) ||
- (p->flags & PF_WAKE_UP_IDLE);
+ (p->flags & PF_WAKE_UP_IDLE);
}
-static inline bool short_sleep_task_waking(struct task_struct *p, int prev_cpu,
- const cpumask_t *search_cpus)
+static inline bool
+bias_to_prev_cpu(struct cpu_select_env *env, struct cluster_cpu_stats *stats)
{
+ int prev_cpu;
+ struct task_struct *task = env->p;
+ struct sched_cluster *cluster;
+
+ if (env->boost || env->reason || env->need_idle ||
+ !sched_short_sleep_task_threshold)
+ return false;
+
+ prev_cpu = env->prev_cpu;
+ if (!cpumask_test_cpu(prev_cpu, tsk_cpus_allowed(task)) ||
+ unlikely(!cpu_active(prev_cpu)))
+ return false;
+
/*
* This function should be used by task wake up path only as it's
* assuming p->last_switch_out_ts as last sleep time.
* p->last_switch_out_ts can denote last preemption time as well as
* last sleep time.
*/
- return (sched_short_sleep_task_threshold &&
- (p->ravg.mark_start - p->last_switch_out_ts <
- sched_short_sleep_task_threshold) &&
- cpumask_test_cpu(prev_cpu, search_cpus));
+ if (task->ravg.mark_start - task->last_switch_out_ts >=
+ sched_short_sleep_task_threshold)
+ return false;
+
+ env->task_load = scale_load_to_cpu(task_load(task), prev_cpu);
+ cluster = cpu_rq(prev_cpu)->cluster;
+
+ if (!task_load_will_fit(task, env->task_load, prev_cpu)) {
+
+ __set_bit(cluster->id, env->backup_list);
+ __clear_bit(cluster->id, env->candidate_list);
+ return false;
+ }
+
+ env->cpu_load = cpu_load_sync(prev_cpu, env->sync);
+ if (sched_cpu_high_irqload(prev_cpu) ||
+ spill_threshold_crossed(env, cpu_rq(prev_cpu))) {
+ update_spare_capacity(stats, prev_cpu,
+ cluster->capacity, env->cpu_load);
+ env->ignore_prev_cpu = 1;
+ return false;
+ }
+
+ return true;
}
/* return cheapest cpu that can fit this task */
static int select_best_cpu(struct task_struct *p, int target, int reason,
int sync)
{
- int i, best_cpu = -1, best_idle_cpu = -1, best_capacity_cpu = -1;
- int prev_cpu = task_cpu(p), best_sibling_cpu = -1;
- int cpu_cost, min_cost = INT_MAX, best_sibling_cpu_cost = INT_MAX;
- u64 tload, cpu_load, best_sibling_cpu_load = ULLONG_MAX;
- u64 min_load = ULLONG_MAX;
- s64 spare_capacity, highest_spare_capacity = 0;
- int boost = sched_boost();
- int need_idle = wake_to_idle(p);
+ struct sched_cluster *cluster;
+ struct cluster_cpu_stats stats;
bool fast_path = false;
- cpumask_t search_cpus;
- struct rq *trq;
-
- cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
-
- if (!boost && !reason && !need_idle &&
- short_sleep_task_waking(p, prev_cpu, &search_cpus)) {
- cpu_load = cpu_load_sync(prev_cpu, sync);
- tload = scale_load_to_cpu(task_load(p), prev_cpu);
- if (eligible_cpu(tload, cpu_load, prev_cpu, sync) &&
- task_load_will_fit(p, tload, prev_cpu)) {
- fast_path = true;
- best_cpu = prev_cpu;
- goto done;
- }
-
- spare_capacity = sched_ravg_window - cpu_load;
- if (spare_capacity > 0) {
- highest_spare_capacity = spare_capacity;
- best_capacity_cpu = prev_cpu;
- }
- cpumask_clear_cpu(prev_cpu, &search_cpus);
- }
-
- trq = task_rq(p);
- for_each_cpu(i, &search_cpus) {
- struct rq *rq = cpu_rq(i);
-
- trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
- sched_irqload(i),
- power_cost(i, task_load(p) + cpu_cravg_sync(i, sync)),
- cpu_temp(i));
-
- if (skip_cluster(task_cpu(p), i, reason)) {
- cpumask_andnot(&search_cpus, &search_cpus,
- &rq->cluster->cpus);
- continue;
- }
- if (skip_cpu(task_rq(p), rq, i, reason))
- continue;
-
- cpu_load = cpu_load_sync(i, sync);
- spare_capacity = sched_ravg_window - cpu_load;
+ struct cpu_select_env env = {
+ .p = p,
+ .reason = reason,
+ .need_idle = wake_to_idle(p),
+ .boost = sched_boost(),
+ .sync = sync,
+ .prev_cpu = target,
+ .ignore_prev_cpu = 0,
+ };
- /* Note the highest spare capacity CPU in the system */
- if (spare_capacity > 0 &&
- (spare_capacity > highest_spare_capacity ||
- (spare_capacity == highest_spare_capacity &&
- cpu_capacity(i) > cpu_capacity(best_capacity_cpu)))) {
- highest_spare_capacity = spare_capacity;
- best_capacity_cpu = i;
- }
+ bitmap_copy(env.candidate_list, all_cluster_ids, NR_CPUS);
+ bitmap_zero(env.backup_list, NR_CPUS);
- if (boost)
- continue;
+ init_cluster_cpu_stats(&stats);
- tload = scale_load_to_cpu(task_load(p), i);
- if (!eligible_cpu(tload, cpu_load, i, sync) ||
- !task_load_will_fit(p, tload, i))
- continue;
+ if (bias_to_prev_cpu(&env, &stats)) {
+ fast_path = true;
+ goto out;
+ }
- /*
- * The task will fit on this CPU and the CPU can accommodate it
- * under spill.
- */
+ rcu_read_lock();
+ cluster = select_least_power_cluster(&env);
- cpu_cost = power_cost(i, task_load(p) +
- cpu_cravg_sync(i, sync));
+ if (!cluster) {
+ rcu_read_unlock();
+ goto out;
+ }
- if (cpu_cost > min_cost)
- continue;
+ do {
+ find_best_cpu_in_cluster(cluster, &env, &stats);
- /*
- * If the task fits in a CPU in a lower power band, that
- * overrides all other considerations.
- */
- if (power_delta_exceeded(cpu_cost, min_cost)) {
- min_cost = cpu_cost;
- min_load = ULLONG_MAX;
- best_cpu = -1;
- }
+ } while ((cluster = next_best_cluster(cluster, &env)));
- if (i != prev_cpu && cpus_share_cache(prev_cpu, i)) {
- if (best_sibling_cpu_cost > cpu_cost ||
- (best_sibling_cpu_cost == cpu_cost &&
- best_sibling_cpu_load > cpu_load)) {
- best_sibling_cpu_cost = cpu_cost;
- best_sibling_cpu_load = cpu_load;
- best_sibling_cpu = i;
- }
- }
+ rcu_read_unlock();
- if ((cpu_cost < min_cost) ||
- ((best_cpu != prev_cpu && min_load > cpu_load) ||
- i == prev_cpu)) {
- if (need_idle) {
- if (idle_cpu(i)) {
- min_cost = cpu_cost;
- best_idle_cpu = i;
- }
- } else {
- min_cost = cpu_cost;
- min_load = cpu_load;
- best_cpu = i;
- }
- }
- }
+ if (stats.best_idle_cpu >= 0) {
+ target = stats.best_idle_cpu;
+ } else if (stats.best_cpu >= 0) {
+ if (stats.best_cpu != task_cpu(p) &&
+ stats.min_cost == stats.best_sibling_cpu_cost)
+ stats.best_cpu = stats.best_sibling_cpu;
- if (best_idle_cpu >= 0) {
- best_cpu = best_idle_cpu;
- } else if (best_cpu < 0 || boost) {
- if (unlikely(best_capacity_cpu < 0))
- best_cpu = prev_cpu;
- else
- best_cpu = best_capacity_cpu;
+ target = stats.best_cpu;
} else {
- if (best_cpu != prev_cpu && min_cost == best_sibling_cpu_cost)
- best_cpu = best_sibling_cpu;
+ find_backup_cluster(&env, &stats);
+ if (stats.best_capacity_cpu >= 0)
+ target = stats.best_capacity_cpu;
}
-done:
- trace_sched_task_load(p, boost, reason, sync, need_idle, fast_path,
- best_cpu);
-
- return best_cpu;
+out:
+ trace_sched_task_load(p, sched_boost(), env.reason, env.sync,
+ env.need_idle, fast_path, target);
+ return target;
}
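
After the cluster walk, the final target is resolved in a fixed order: an idle CPU found for a wake-to-idle request, otherwise the cheapest fitting CPU (swapped for an equal-cost cache-sharing sibling of the previous CPU when one was recorded), otherwise the highest-spare-capacity CPU found in the backup clusters, and finally the caller's original target. A compressed sketch of that fallback chain (illustrative only, sibling swap omitted):

#include <stdio.h>

static int resolve_target(int best_idle_cpu, int best_cpu,
                          int best_capacity_cpu, int orig_target)
{
        if (best_idle_cpu >= 0)
                return best_idle_cpu;
        if (best_cpu >= 0)
                return best_cpu;
        if (best_capacity_cpu >= 0)
                return best_capacity_cpu;
        return orig_target;
}

int main(void)
{
        /* no idle or fitting CPU found; fall back to a backup-cluster CPU */
        printf("target: %d\n", resolve_target(-1, -1, 4, 0)); /* prints 4 */
        return 0;
}
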
static void
@@ -3920,7 +4046,7 @@ unsigned int power_cost(int cpu, u64 demand)
}
static inline int
-spill_threshold_crossed(u64 task_load, u64 cpu_load, struct rq *rq)
+spill_threshold_crossed(struct cpu_select_env *env, struct rq *rq)
{
return 0;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1fd22539a334..e698cc3438c7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1661,13 +1661,15 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
#ifdef CONFIG_SCHED_HMP
+
static int find_lowest_rq_hmp(struct task_struct *task)
{
struct cpumask *lowest_mask = *this_cpu_ptr(&local_cpu_mask);
- int cpu_cost, min_cost = INT_MAX;
- u64 cpu_load, min_load = ULLONG_MAX;
+ struct cpumask candidate_mask = CPU_MASK_NONE;
+ struct sched_cluster *cluster;
int best_cpu = -1;
int prev_cpu = task_cpu(task);
+ u64 cpu_load, min_load = ULLONG_MAX;
int i;
/* Make sure the mask is initialized first */
@@ -1686,36 +1688,26 @@ static int find_lowest_rq_hmp(struct task_struct *task)
* the best one based on our affinity and topology.
*/
- /* Skip performance considerations and optimize for power.
- * Worst case we'll be iterating over all CPUs here. CPU
- * online mask should be taken care of when constructing
- * the lowest_mask.
- */
- for_each_cpu(i, lowest_mask) {
- cpu_load = scale_load_to_cpu(
- cpu_rq(i)->hmp_stats.cumulative_runnable_avg, i);
- cpu_cost = power_cost(i, cpu_cravg_sync(i, 0));
- trace_sched_cpu_load_wakeup(cpu_rq(i), idle_cpu(i),
- sched_irqload(i), cpu_cost, cpu_temp(i));
-
- if (power_delta_exceeded(cpu_cost, min_cost)) {
- if (cpu_cost > min_cost)
- continue;
+ for_each_sched_cluster(cluster) {
+ cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask);
- min_cost = cpu_cost;
- min_load = ULLONG_MAX;
- best_cpu = -1;
- }
-
- if (sched_cpu_high_irqload(i))
+ if (cpumask_empty(&candidate_mask))
continue;
- if (cpu_load < min_load ||
- (cpu_load == min_load &&
- (i == prev_cpu || (best_cpu != prev_cpu &&
- cpus_share_cache(prev_cpu, i))))) {
- min_load = cpu_load;
- best_cpu = i;
+ for_each_cpu(i, &candidate_mask) {
+ if (sched_cpu_high_irqload(i))
+ continue;
+
+ cpu_load = scale_load_to_cpu(
+ cpu_rq(i)->hmp_stats.cumulative_runnable_avg, i);
+
+ if (cpu_load < min_load ||
+ (cpu_load == min_load &&
+ (i == prev_cpu || (best_cpu != prev_cpu &&
+ cpus_share_cache(prev_cpu, i))))) {
+ min_load = cpu_load;
+ best_cpu = i;
+ }
}
}
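
find_lowest_rq_hmp() now intersects the RT lowest_mask with each cluster's CPU mask and tracks the least-loaded, low-IRQ candidate across the clusters it visits. A small user-space model of the masking and minimum-load scan (illustrative values and names, not the kernel code):

#include <stdio.h>

#define NR_MODEL_CPUS 8

int main(void)
{
        /* two clusters of four CPUs each; lowest_mask allows CPUs 2, 5, 6 */
        unsigned int cluster_cpus[] = { 0x0f, 0xf0 };
        unsigned int lowest_mask = (1u << 2) | (1u << 5) | (1u << 6);
        unsigned long long load[NR_MODEL_CPUS] = { 9, 9, 7, 9, 9, 3, 5, 9 };
        unsigned long long min_load = ~0ULL;
        int best_cpu = -1;

        for (int c = 0; c < 2; c++) {
                unsigned int candidates = cluster_cpus[c] & lowest_mask;

                if (!candidates)
                        continue;       /* no eligible CPU in this cluster */

                for (int i = 0; i < NR_MODEL_CPUS; i++) {
                        if (!(candidates & (1u << i)))
                                continue;
                        if (load[i] < min_load) {
                                min_load = load[i];
                                best_cpu = i;
                        }
                }
        }
        printf("best cpu: %d (load %llu)\n", best_cpu, min_load); /* 5, 3 */
        return 0;
}
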
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 35a13974f34a..9e4f0887136c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -395,6 +395,13 @@ static inline int cluster_first_cpu(struct sched_cluster *cluster)
return cpumask_first(&cluster->cpus);
}
+extern struct list_head cluster_head;
+extern int num_clusters;
+extern struct sched_cluster *sched_cluster[NR_CPUS];
+
+#define for_each_sched_cluster(cluster) \
+ list_for_each_entry_rcu(cluster, &cluster_head, list)
+
#endif
/* CFS-related fields in a runqueue */
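
for_each_sched_cluster() expands to an RCU list walk over cluster_head, so callers outside an existing RCU or cluster_lock section are expected to bracket the iteration with rcu_read_lock()/rcu_read_unlock(), as select_best_cpu() does in fair.c. A usage sketch (not part of this patch; the helper name is hypothetical):

static int count_cluster_cpus_sketch(void)
{
        struct sched_cluster *cluster;
        int cpus = 0;

        rcu_read_lock();
        for_each_sched_cluster(cluster)
                cpus += cpumask_weight(&cluster->cpus);
        rcu_read_unlock();

        return cpus;
}
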
@@ -1031,6 +1038,11 @@ unsigned int cpu_temp(int cpu);
extern unsigned int nr_eligible_big_tasks(int cpu);
extern void update_up_down_migrate(void);
+static inline struct sched_cluster *cpu_cluster(int cpu)
+{
+ return cpu_rq(cpu)->cluster;
+}
+
static inline int cpu_capacity(int cpu)
{
return cpu_rq(cpu)->cluster->capacity;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dcb852652bc8..1f2afa6eefaf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -411,13 +411,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_hmp_proc_update_handler,
},
{
- .procname = "sched_power_band_limit",
- .data = &sysctl_sched_powerband_limit_pct,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_hmp_proc_update_handler,
- },
- {
.procname = "sched_lowspill_freq",
.data = &sysctl_sched_lowspill_freq,
.maxlen = sizeof(unsigned int),