4 files changed, 132 insertions, 84 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b1d48c53bf7e..d7f43c26e0c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -860,7 +860,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 1);
-	rq->cumulative_runnable_avg += p->se.ravg.demand;
+	inc_cumulative_runnable_avg(rq, p);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -870,8 +870,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 	trace_sched_enq_deq_task(p, 0);
-	rq->cumulative_runnable_avg -= p->se.ravg.demand;
-	BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+	dec_cumulative_runnable_avg(rq, p);
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1150,11 +1149,7 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_
 
 		mnd.src_cpu = src_cpu;
 		mnd.dest_cpu = dest_cpu;
-		if (sysctl_sched_ravg_window)
-			 mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
-			 	(u64)(sysctl_sched_ravg_window));
-		else
-			mnd.load = 0;
+		mnd.load = pct_task_load(p);
 		atomic_notifier_call_chain(&migration_notifier_head,
 					   0, (void *)&mnd);
 	}
@@ -1757,6 +1752,15 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
 		wq_worker_waking_up(p, cpu_of(rq));
 }
 
+/* Window size (in ns); default is 10ms, same as min_sched_ravg_window */
+__read_mostly unsigned int sched_ravg_window = 10000000;
+
+/* Min window size (in ns) = 10ms */
+__read_mostly unsigned int min_sched_ravg_window = 10000000;
+
+/* Max window size (in ns) = 1s */
+__read_mostly unsigned int max_sched_ravg_window = 1000000000;
+
 /*
  * Called when new window is starting for a task, to record cpu usage over
  * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
@@ -1766,9 +1770,9 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
 static inline void
 update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
 {
-	u32 *hist = &p->se.ravg.sum_history[0];
+	u32 *hist = &p->ravg.sum_history[0];
 	int ridx, widx;
-	u32 max = 0;
+	u32 sum = 0, avg;
 
 	/* Ignore windows where task had no activity */
 	if (!runtime)
@@ -1779,86 +1783,97 @@ update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples)
 	ridx = widx - samples;
 	for (; ridx >= 0; --widx, --ridx) {
 		hist[widx] = hist[ridx];
-		if  (hist[widx] > max)
-			max = hist[widx];
+		sum += hist[widx];
 	}
 
 	for (widx = 0; widx < samples && widx < RAVG_HIST_SIZE; widx++) {
 		hist[widx] = runtime;
-		if  (hist[widx] > max)
-			max = hist[widx];
+		sum += hist[widx];
 	}
 
-	p->se.ravg.sum = 0;
+	p->ravg.sum = 0;
 	if (p->on_rq) {
-		rq->cumulative_runnable_avg -= p->se.ravg.demand;
+		rq->cumulative_runnable_avg -= p->ravg.demand;
 		BUG_ON((s64)rq->cumulative_runnable_avg < 0);
 	}
-	/*
-	 * Maximum demand seen over previous RAVG_HIST_SIZE windows drives
-	 * frequency demand for a task. Record maximum in 'demand' attribute.
-	 */
-	p->se.ravg.demand = max;
+
+	avg = sum / RAVG_HIST_SIZE;
+
+	p->ravg.demand = max(avg, runtime);
+
 	if (p->on_rq)
-		rq->cumulative_runnable_avg += p->se.ravg.demand;
+		rq->cumulative_runnable_avg += p->ravg.demand;
 }
 
-/* Window size (in ns) */
-__read_mostly unsigned int sysctl_sched_ravg_window = 50000000;
+static int __init set_sched_ravg_window(char *str)
+{
+	get_option(&str, &sched_ravg_window);
+	/* Boot-time override; NOTE(review): no range check on the value */
+	return 0;
+}
+
+early_param("sched_ravg_window", set_sched_ravg_window);
 
 void update_task_ravg(struct task_struct *p, struct rq *rq, int update_sum)
 {
-	u32 window_size = sysctl_sched_ravg_window;
+	u32 window_size = sched_ravg_window;
 	int new_window;
 	u64 wallclock = sched_clock();
 
+	if (sched_ravg_window < min_sched_ravg_window)
+		return;
+
 	do {
 		s64 delta = 0;
 		int n;
 		u64 now = wallclock;
 
 		new_window = 0;
-		delta = now - p->se.ravg.window_start;
+		delta = now - p->ravg.window_start;
 		BUG_ON(delta < 0);
 		if (delta > window_size) {
-			p->se.ravg.window_start += window_size;
-			now = p->se.ravg.window_start;
+			p->ravg.window_start += window_size;
+			now = p->ravg.window_start;
 			new_window = 1;
 		}
 
 		if (update_sum) {
-			delta = now - p->se.ravg.mark_start;
+			unsigned int cur_freq = rq->cur_freq;
+
+			delta = now - p->ravg.mark_start;
 			BUG_ON(delta < 0);
 
-			if (likely(rq->cur_freq &&
-					rq->cur_freq <= max_possible_freq))
-				delta = div64_u64(delta  * rq->cur_freq,
+			if (unlikely(cur_freq > max_possible_freq))
+				cur_freq = max_possible_freq;
+
+			delta = div64_u64(delta  * cur_freq,
 							max_possible_freq);
-			p->se.ravg.sum += delta;
-			WARN_ON(p->se.ravg.sum > window_size);
+			p->ravg.sum += delta;
+			WARN_ON(p->ravg.sum > window_size);
 		}
 
 		if (!new_window)
 			break;
 
-		update_history(rq, p, p->se.ravg.sum, 1);
+		update_history(rq, p, p->ravg.sum, 1);
 
-		delta = wallclock - p->se.ravg.window_start;
+		delta = wallclock - p->ravg.window_start;
 		BUG_ON(delta < 0);
 		n = div64_u64(delta, window_size);
 		if (n) {
 			if (!update_sum)
-				p->se.ravg.window_start = wallclock;
+				p->ravg.window_start = wallclock;
 			else
-				p->se.ravg.window_start += n * window_size;
-			BUG_ON(p->se.ravg.window_start > wallclock);
+				p->ravg.window_start += (u64)n *
+							 (u64)window_size;
+			BUG_ON(p->ravg.window_start > wallclock);
 			if (update_sum)
 				update_history(rq, p, window_size, n);
 		}
-		p->se.ravg.mark_start =  p->se.ravg.window_start;
+		p->ravg.mark_start =  p->ravg.window_start;
 	} while (new_window);
 
-	p->se.ravg.mark_start = wallclock;
+	p->ravg.mark_start = wallclock;
 }
 
 /*
@@ -2162,11 +2177,8 @@ out:
 
 		mnd.src_cpu = src_cpu;
 		mnd.dest_cpu = cpu;
-		if (sysctl_sched_ravg_window)
-			mnd.load = div64_u64((u64)p->se.ravg.demand * 100,
-				(u64)(sysctl_sched_ravg_window));
-		else
-			mnd.load = 0;
+		mnd.load = pct_task_load(p);
+
 		/*
 		 * Call the migration notifier with mnd for foreground task
 		 * migrations as well as for wakeups if their load is above
@@ -2281,8 +2293,6 @@ void __dl_clear_params(struct task_struct *p)
  */
 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
-	int i;
-
 	p->on_rq			= 0;
 
 	p->se.on_rq			= 0;
@@ -2291,12 +2301,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
-	p->se.ravg.sum			= 0;
-	p->se.ravg.demand		= 0;
-	p->se.ravg.window_start		= 0;
-	p->se.ravg.mark_start		= 0;
-	for (i = 0; i < RAVG_HIST_SIZE; ++i)
-		p->se.ravg.sum_history[i] = 0;
+	init_new_task_load(p);
 
 	INIT_LIST_HEAD(&p->se.group_node);
 
@@ -2564,7 +2569,6 @@ void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	u64 wallclock = sched_clock();
 
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	/* Initialize new task's runnable average */
@@ -2580,8 +2584,6 @@ void wake_up_new_task(struct task_struct *p)
 
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	p->se.ravg.window_start	= wallclock;
-	p->se.ravg.mark_start	= wallclock;
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
 	check_preempt_curr(rq, p, WF_FORK);
@@ -7577,6 +7579,7 @@ static int cpufreq_notifier_trans(struct notifier_block *nb,
 	if (val != CPUFREQ_POSTCHANGE)
 		return 0;
 
+	BUG_ON(!new_freq);
 	cpu_rq(cpu)->cur_freq = new_freq;
 
 	return 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbf3d3697322..f62f9b7fd049 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2449,6 +2449,39 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+/* Demand currently recorded for @p (written by update_history()) */
+static inline unsigned int task_load(struct task_struct *p)
+{
+	return p->ravg.demand;
+}
+
+/* Full-scale value used to normalise task_load(): the ravg window (ns) */
+static inline unsigned int max_task_load(void)
+{
+	return sched_ravg_window;
+}
+
+/* Return task demand in percentage scale; 0 when the window is zero */
+unsigned int pct_task_load(struct task_struct *p)
+{
+	u64 window = max_task_load();
+
+	return window ? div64_u64((u64)task_load(p) * 100, window) : 0;
+}
+
+/* Start @p with zero demand/history and both window stamps set to now */
+void init_new_task_load(struct task_struct *p)
+{
+	u64 now = sched_clock();
+	int idx = RAVG_HIST_SIZE;
+
+	while (idx--)
+		p->ravg.sum_history[idx] = 0;
+	p->ravg.sum = p->ravg.demand = 0;
+	p->ravg.window_start = now;
+	p->ravg.mark_start = now;
+}
+
 #ifdef CONFIG_SMP
 /* Precomputed fixed inverse multiplies for multiplication by y^n */
 static const u32 runnable_avg_yN_inv[] = {
@@ -5558,6 +5591,7 @@ struct lb_env {
 };
 
 static DEFINE_PER_CPU(bool, dbs_boost_needed);
+static DEFINE_PER_CPU(int, dbs_boost_load_moved);
 
 /*
  * Is this task likely cache-hot:
@@ -5746,7 +5780,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  *
  * Returns a task if successful and NULL otherwise.
  */
-static struct task_struct *detach_one_task(struct lb_env *env, int *total_run_moved)
+static struct task_struct *detach_one_task(struct lb_env *env)
 {
 	struct task_struct *p, *n;
 
@@ -5765,10 +5799,7 @@ static struct task_struct *detach_one_task(struct lb_env *env, int *total_run_mo
 		 * inside detach_tasks().
 		 */
 		schedstat_inc(env->sd, lb_gained[env->idle]);
-
-		if (sysctl_sched_ravg_window)
-			*total_run_moved += div64_u64((u64)p->se.ravg.demand *
-					100, (u64)(sysctl_sched_ravg_window));
+		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
 
 		return p;
 	}
@@ -5783,7 +5814,7 @@ static const unsigned int sched_nr_migrate_break = 32;
  *
  * Returns number of detached tasks if successful and 0 otherwise.
  */
-static int detach_tasks(struct lb_env *env, int *total_run_moved)
+static int detach_tasks(struct lb_env *env)
 {
 	struct list_head *tasks = &env->src_rq->cfs_tasks;
 	struct task_struct *p;
@@ -5833,9 +5864,7 @@ static int detach_tasks(struct lb_env *env, int *total_run_moved)
 
 		detached++;
 		env->imbalance -= load;
-		if (sysctl_sched_ravg_window)
-			*total_run_moved += div64_u64((u64)p->se.ravg.demand *
-					100, (u64)(sysctl_sched_ravg_window));
+		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
 
 #ifdef CONFIG_PREEMPT
 		/*
@@ -6975,7 +7004,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			int *continue_balancing)
 {
 	int ld_moved = 0, cur_ld_moved, active_balance = 0;
-	int total_run_moved = 0;
 	struct sched_domain *sd_parent = sd->parent;
 	struct sched_group *group = NULL;
 	struct rq *busiest = NULL;
@@ -7004,6 +7032,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
+	per_cpu(dbs_boost_load_moved, this_cpu) = 0;
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
@@ -7049,7 +7078,7 @@ more_balance:
 		 * cur_ld_moved - load moved in current iteration
 		 * ld_moved     - cumulative load moved across iterations
 		 */
-		cur_ld_moved = detach_tasks(&env, &total_run_moved);
+		cur_ld_moved = detach_tasks(&env);
 
 		/*
 		 * We've detached some tasks from busiest_rq. Every
@@ -7187,13 +7216,16 @@ more_balance:
 		if (per_cpu(dbs_boost_needed, this_cpu)) {
 			struct migration_notify_data mnd;
 
-			per_cpu(dbs_boost_needed, this_cpu) = false;
-
 			mnd.src_cpu = cpu_of(busiest);
 			mnd.dest_cpu = this_cpu;
-			mnd.load = total_run_moved;
+			mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
+			if (mnd.load > 100)
+				mnd.load = 100;
 			atomic_notifier_call_chain(&migration_notifier_head,
 						   0, (void *)&mnd);
+			per_cpu(dbs_boost_needed, this_cpu) = false;
+			per_cpu(dbs_boost_load_moved, this_cpu) = 0;
+
 		}
 	}
 	if (likely(!active_balance)) {
@@ -7391,13 +7423,14 @@ static int active_load_balance_cpu_stop(void *data)
 	struct rq *busiest_rq = data;
 	int busiest_cpu = cpu_of(busiest_rq);
 	int target_cpu = busiest_rq->push_cpu;
-	int total_run_moved = 0;
 	struct rq *target_rq = cpu_rq(target_cpu);
 	struct sched_domain *sd;
 	struct task_struct *p = NULL;
 
 	raw_spin_lock_irq(&busiest_rq->lock);
 
+	per_cpu(dbs_boost_load_moved, target_cpu) = 0;
+
 	/* make sure the requested cpu hasn't gone down in the meantime */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
 		     !busiest_rq->active_balance))
@@ -7434,7 +7467,7 @@ static int active_load_balance_cpu_stop(void *data)
 
 		schedstat_inc(sd, alb_count);
 
-		p = detach_one_task(&env, &total_run_moved);
+		p = detach_one_task(&env);
 		if (p)
 			schedstat_inc(sd, alb_pushed);
 		else
@@ -7453,13 +7486,16 @@ out_unlock:
 	if (per_cpu(dbs_boost_needed, target_cpu)) {
 		struct migration_notify_data mnd;
 
-		per_cpu(dbs_boost_needed, target_cpu) = false;
-
 		mnd.src_cpu = cpu_of(busiest_rq);
 		mnd.dest_cpu = target_cpu;
-		mnd.load = total_run_moved;
+		mnd.load = per_cpu(dbs_boost_load_moved, target_cpu);
+		if (mnd.load > 100)
+			mnd.load = 100;
 		atomic_notifier_call_chain(&migration_notifier_head,
 					   0, (void *)&mnd);
+
+		per_cpu(dbs_boost_needed, target_cpu) = false;
+		per_cpu(dbs_boost_load_moved, target_cpu) = 0;
 	}
 	return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b85da1bb4b49..fc3f624ca8c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -35,7 +35,6 @@ extern void update_cpu_load_active(struct rq *this_rq);
 static inline void update_cpu_load_active(struct rq *this_rq) { }
 #endif
 
-extern unsigned int sysctl_sched_ravg_window;
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */
@@ -912,6 +911,23 @@ static inline void sched_ttwu_pending(void) { }
 #include "stats.h"
 #include "auto_group.h"
 
+extern unsigned int sched_ravg_window;
+extern unsigned int pct_task_load(struct task_struct *p);
+extern void init_new_task_load(struct task_struct *p);
+/* Add/remove a task's demand to/from rq->cumulative_runnable_avg */
+static inline void
+inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+static inline void
+dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p)
+{
+	rq->cumulative_runnable_avg -= p->ravg.demand;
+	BUG_ON((s64)rq->cumulative_runnable_avg < 0);	/* total went negative */
+}
+
 #ifdef CONFIG_CGROUP_SCHED
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1314618f07f8..292979f360d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -293,13 +293,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
-		.procname	= "sched_ravg_window",
-		.data		= &sysctl_sched_ravg_window,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
 		.procname	= "sched_wakeup_load_threshold",
 		.data		= &sysctl_sched_wakeup_load_threshold,
 		.maxlen		= sizeof(unsigned int),